In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [3]:
# Load the dataset
data = pd.read_csv('../Data/churn_data.csv')  # Replace 'your_dataset_path.csv' with your actual file path

# Display the first few rows of the dataset
print(data.head())

# Check the dataset for any missing values
print(data.isnull().sum())

# Split the dataset into features (X) and target (y)
X = data.drop('churn', axis=1)
y = data['churn']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


   gender senior_citizen partner dependents  tenure phone_service  \
0  Female             No     Yes         No       1            No   
1    Male             No      No         No      34           Yes   
2    Male             No      No         No       2           Yes   
3    Male             No      No         No      45            No   
4  Female             No      No         No       2           Yes   

     multiple_lines internet_service online_security online_backup  \
0  No phone service              DSL              No           Yes   
1                No              DSL             Yes            No   
2                No              DSL             Yes           Yes   
3  No phone service              DSL             Yes            No   
4                No      Fiber optic              No            No   

  device_protection tech_support streaming_tv streaming_movies  \
0                No           No           No               No   
1               Yes           No

In [4]:
print("\nData Types:")
print(data.dtypes)


Data Types:
gender                object
senior_citizen        object
partner               object
dependents            object
tenure                 int64
phone_service         object
multiple_lines        object
internet_service      object
online_security       object
online_backup         object
device_protection     object
tech_support          object
streaming_tv          object
streaming_movies      object
contract              object
paperless_billing     object
payment_method        object
monthly_charges      float64
total_charges        float64
churn                 object
dtype: object


In [5]:
# List of categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Display unique values for each categorical column
for col in categorical_cols:
    print(f"Unique values in {col}:")
    print(X[col].unique())
    print("-------------------------------")


Unique values in gender:
['Female' 'Male']
-------------------------------
Unique values in senior_citizen:
['No' 'Yes']
-------------------------------
Unique values in partner:
['Yes' 'No']
-------------------------------
Unique values in dependents:
['No' 'Yes']
-------------------------------
Unique values in phone_service:
['No' 'Yes']
-------------------------------
Unique values in multiple_lines:
['No phone service' 'No' 'Yes']
-------------------------------
Unique values in internet_service:
['DSL' 'Fiber optic' 'No']
-------------------------------
Unique values in online_security:
['No' 'Yes' 'No internet service']
-------------------------------
Unique values in online_backup:
['Yes' 'No' 'No internet service']
-------------------------------
Unique values in device_protection:
['No' 'Yes' 'No internet service']
-------------------------------
Unique values in tech_support:
['No' 'Yes' 'No internet service']
-------------------------------
Unique values in streaming_tv:
['

In [6]:
# Display value counts for the target feature 'churn'
print(data['churn'].value_counts())


churn
No     5153
Yes    1857
Name: count, dtype: int64


In [7]:
# Identify categorical columns (assuming you've already identified them as categorical_cols)
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']

# Create a ColumnTransformer to apply one-hot encoding to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), categorical_cols)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_encoded = preprocessor.fit_transform(X_train)

# Transform the testing data
X_test_encoded = preprocessor.transform(X_test)

# Convert the encoded data back to DataFrame for better understanding
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=preprocessor.get_feature_names_out())
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=preprocessor.get_feature_names_out())

# Display the first few rows of the encoded training data
print(X_train_encoded_df.head())


   onehot__gender_Male  onehot__senior_citizen_Yes  onehot__partner_Yes  \
0                  0.0                         0.0                  1.0   
1                  1.0                         0.0                  0.0   
2                  1.0                         0.0                  0.0   
3                  0.0                         1.0                  1.0   
4                  1.0                         0.0                  0.0   

   onehot__dependents_Yes  onehot__phone_service_Yes  \
0                     0.0                        1.0   
1                     1.0                        1.0   
2                     0.0                        0.0   
3                     0.0                        1.0   
4                     0.0                        1.0   

   onehot__multiple_lines_No phone service  onehot__multiple_lines_Yes  \
0                                      0.0                         0.0   
1                                      0.0                      

In [8]:
# Identify numerical columns
numerical_cols = ['monthly_charges', 'total_charges']

# Create a ColumnTransformer to standardize numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), categorical_cols),
        ('scaler', StandardScaler(), numerical_cols)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Transform the testing data
X_test_preprocessed = preprocessor.transform(X_test)

# Convert the preprocessed data back to DataFrame for better understanding
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, columns=preprocessor.get_feature_names_out())
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed, columns=preprocessor.get_feature_names_out())

# Display the first few rows of the preprocessed training data
print(X_train_preprocessed_df.head())


   onehot__gender_Male  onehot__senior_citizen_Yes  onehot__partner_Yes  \
0                  0.0                         0.0                  1.0   
1                  1.0                         0.0                  0.0   
2                  1.0                         0.0                  0.0   
3                  0.0                         1.0                  1.0   
4                  1.0                         0.0                  0.0   

   onehot__dependents_Yes  onehot__phone_service_Yes  \
0                     0.0                        1.0   
1                     1.0                        1.0   
2                     0.0                        0.0   
3                     0.0                        1.0   
4                     0.0                        1.0   

   onehot__multiple_lines_No phone service  onehot__multiple_lines_Yes  \
0                                      0.0                         0.0   
1                                      0.0                      

In [9]:
# Display the first few rows of the preprocessed training data
print(X_train_preprocessed_df.head())


   onehot__gender_Male  onehot__senior_citizen_Yes  onehot__partner_Yes  \
0                  0.0                         0.0                  1.0   
1                  1.0                         0.0                  0.0   
2                  1.0                         0.0                  0.0   
3                  0.0                         1.0                  1.0   
4                  1.0                         0.0                  0.0   

   onehot__dependents_Yes  onehot__phone_service_Yes  \
0                     0.0                        1.0   
1                     1.0                        1.0   
2                     0.0                        0.0   
3                     0.0                        1.0   
4                     0.0                        1.0   

   onehot__multiple_lines_No phone service  onehot__multiple_lines_Yes  \
0                                      0.0                         0.0   
1                                      0.0                      

In [11]:
# Display value counts for the target feature 'churn'
print(y_train.value_counts())


churn
No     4072
Yes    1536
Name: count, dtype: int64


In [12]:
from imblearn.combine import SMOTEENN

# Initialize SMOTEENN
sm = SMOTEENN()

# Resample the dataset
X_res, y_res = sm.fit_resample(X_train_preprocessed_df, y_train)

# Display the value counts after resampling
print(pd.Series(y_res).value_counts())



churn
Yes    3148
No     2049
Name: count, dtype: int64


In [13]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable
y_res_encoded = label_encoder.fit_transform(y_res)

# Convert the encoded target variable back to original classes for better understanding
encoded_classes = label_encoder.classes_

# Display the encoded target variable and its classes
print("Encoded target variable:")
print(y_res_encoded)
print("\nEncoded classes:")
print(encoded_classes)



Encoded target variable:
[0 0 0 ... 1 1 1]

Encoded classes:
['No' 'Yes']


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier on the preprocessed and balanced training data
rf_classifier.fit(X_res, y_res_encoded)

# Predict on the preprocessed testing data
y_pred = rf_classifier.predict(X_test_preprocessed_df)

# Convert the predicted values back to original classes
y_pred_original = label_encoder.inverse_transform(y_pred)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_original)
print(f"Accuracy: {accuracy:.4f}\n")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_original))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_original))




Accuracy: 0.7183

Classification Report:
              precision    recall  f1-score   support

          No       0.94      0.68      0.79      1081
         Yes       0.44      0.85      0.58       321

    accuracy                           0.72      1402
   macro avg       0.69      0.77      0.68      1402
weighted avg       0.83      0.72      0.74      1402

Confusion Matrix:
[[733 348]
 [ 47 274]]


In [16]:
from sklearn.model_selection import cross_val_score

# Initialize RandomForestClassifier
rf_classifier_cv = RandomForestClassifier(random_state=42)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(rf_classifier_cv, X_res, y_res_encoded, cv=5, scoring='accuracy')

# Print cross-validation scores
print("Cross-validation scores:", cv_scores)
print(f"Mean Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")


Cross-validation scores: [0.94280267 0.93898951 0.97712107 0.98187023 0.97137405]
Mean Accuracy: 0.9624
Standard Deviation: 0.0179


In [17]:
from sklearn.model_selection import cross_val_predict

# Perform cross-validation predictions
y_pred_cv = cross_val_predict(rf_classifier_cv, X_res, y_res_encoded, cv=5)

# Convert the predicted values back to original classes
y_pred_cv_original = label_encoder.inverse_transform(y_pred_cv)

# Print classification report after cross-validation
print("Classification Report after Cross-Validation:")
print(classification_report(y_res, y_pred_cv_original))


Classification Report after Cross-Validation:
              precision    recall  f1-score   support

          No       0.95      0.95      0.95      2038
         Yes       0.97      0.97      0.97      3205

    accuracy                           0.96      5243
   macro avg       0.96      0.96      0.96      5243
weighted avg       0.96      0.96      0.96      5243



In [18]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the data
scaler = StandardScaler()
X_res_scaled = scaler.fit_transform(X_res)

# Apply PCA
pca = PCA(n_components=0.95, random_state=42)  # Keep 95% of the variance
X_res_pca = pca.fit_transform(X_res_scaled)

# Print the number of components selected
print(f"Number of components selected: {pca.n_components_}")

# Perform cross-validation with PCA-transformed data
cv_scores_pca = cross_val_score(rf_classifier_cv, X_res_pca, y_res_encoded, cv=5, scoring='accuracy')

# Print cross-validation scores with PCA
print("Cross-validation scores with PCA:", cv_scores_pca)
print(f"Mean Accuracy with PCA: {cv_scores_pca.mean():.4f}")
print(f"Standard Deviation with PCA: {cv_scores_pca.std():.4f}")


Number of components selected: 17
Cross-validation scores with PCA: [0.95042898 0.93422307 0.93326978 0.94942748 0.94179389]
Mean Accuracy with PCA: 0.9418
Standard Deviation with PCA: 0.0072


In [19]:
# Initialize RandomForestClassifier
rf_classifier_pca = RandomForestClassifier(random_state=42)

# Perform 5-fold cross-validation with PCA-transformed data
cv_scores_pca = cross_val_score(rf_classifier_pca, X_res_pca, y_res_encoded, cv=5, scoring='accuracy')

# Print cross-validation scores with PCA-transformed data
print("Cross-validation scores with PCA-transformed data:", cv_scores_pca)
print(f"Mean Accuracy with PCA: {cv_scores_pca.mean():.4f}")
print(f"Standard Deviation with PCA: {cv_scores_pca.std():.4f}")

# Fit the model on the entire PCA-transformed training data
rf_classifier_pca.fit(X_res_pca, y_res_encoded)


Cross-validation scores with PCA-transformed data: [0.95042898 0.93422307 0.93326978 0.94942748 0.94179389]
Mean Accuracy with PCA: 0.9418
Standard Deviation with PCA: 0.0072


In [4]:
import joblib

# 1. Save PCA object
joblib.dump(pca, 'Exported/pca_transformer.pkl')

# 2. Save StandardScaler object
joblib.dump(scaler, 'Exported/standard_scaler.pkl')

# 4. Save trained RandomForestClassifier model
joblib.dump(rf_classifier_pca, 'Exported/rf_classifier_pca_model.pkl')

joblib.dump(preprocessor, 'Exported/preprocessor.pkl')
# Save the list of categorical columns
joblib.dump(categorical_cols, 'Exported/categorical_cols.pkl')

NameError: name 'pca' is not defined

In [7]:
import joblib
# Load PCA object
pca_loaded = joblib.load('../Exported/pca_transformer.pkl')

# Load StandardScaler object
scaler_loaded = joblib.load('../Exported/standard_scaler.pkl')

# Load the saved ColumnTransformer object
preprocessor_loaded = joblib.load('../Exported/preprocessor.pkl')

# Load the list of categorical columns
categorical_cols_loaded = joblib.load('../Exported/categorical_cols.pkl')

# Load trained RandomForestClassifier model
rf_classifier_loaded = joblib.load('../Exported/rf_classifier_pca_model.pkl')



In [9]:
import pandas as pd

# Load the new dataset
test_data = pd.read_csv('../Data/test.csv')

# Display the first few rows of the new dataset
print(test_data.head())


   gender senior_citizen partner dependents  tenure phone_service  \
0  Female             No     Yes         No      30           Yes   
1  Female             No      No         No       8           Yes   
2  Female            Yes     Yes         No      50           Yes   
3  Female             No     Yes         No      41           Yes   
4    Male             No      No         No       3            No   

     multiple_lines internet_service online_security online_backup  \
0               Yes      Fiber optic              No           Yes   
1               Yes      Fiber optic              No            No   
2               Yes      Fiber optic              No           Yes   
3                No              DSL              No           Yes   
4  No phone service              DSL             Yes           Yes   

  device_protection tech_support streaming_tv streaming_movies  \
0                No          Yes          Yes               No   
1                No           No

In [10]:
# Identify numerical columns
numerical_cols = ['monthly_charges', 'total_charges']

# Transform the new data using the loaded ColumnTransformer
X_new_preprocessed = preprocessor_loaded.transform(test_data)

# Apply PCA transformation
X_new_pca = pca_loaded.transform(X_new_preprocessed)

# Make predictions
y_pred_new = rf_classifier_loaded.predict(X_new_pca)

# Create a DataFrame to store predictions
predictions_df = pd.DataFrame({
    'Predicted_Churn': y_pred_new
})

# Display all the predictions without truncation
pd.set_option('display.max_rows', None)  # Set the option to display all rows
print("\nAll Predictions for the new dataset:")
print(predictions_df)



All Predictions for the new dataset:
    Predicted_Churn
0                 0
1                 1
2                 0
3                 0
4                 0
5                 0
6                 0
7                 1
8                 0
9                 1
10                0
11                1
12                0
13                0
14                1
15                0
16                1
17                0
18                0
19                0
20                0
21                0
22                0
23                0
24                0
25                0
26                0
27                0
28                0
29                0
30                0
31                0
32                0
33                0
34                0
35                0
36                0
37                1
38                0
39                1
40                0
41                0
42                0
43                0
44                0
45                0
46                0
47