In [1]:
# Standard data science imports
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Statistics packages
import pylab
from pylab import rcParams
import statsmodels.api as sm
import statistics
from scipy import stats
# Scikit-learn
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score,confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score,log_loss,mean_squared_error
import statsmodels.api as sm
# Ignore Warning Code
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the data
df = pd.read_csv('C:\\Users\\josue\\Desktop\\WGU\\D209\Resources\churn_cleanedtest.csv')
# Load the necessary columns into a DataFrame
selected_columns = [
    'Age', 'Income', 'Education', 'Employment', 'Marital', 'Gender', 
    'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year', 'Outage_sec_perweek', 
    'Yearly_equip_failure', 'Contract', 'Techie', 'Contacts', 
    'InternetService', 'Phone', 'Multiple', 'OnlineSecurity', 'OnlineBackup', 
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 
    'Reliability', 'Options', 'Respectfulness', 'Courteous', 'Listening', 
    'Responses', 'Solutions', 'Replacements', 'Churn'
]
df_selected = df[selected_columns]

# Display the first few rows to verify the selection
df_selected.head()

Unnamed: 0,Age,Income,Education,Employment,Marital,Gender,Tenure,MonthlyCharge,Bandwidth_GB_Year,Outage_sec_perweek,...,StreamingMovies,Reliability,Options,Respectfulness,Courteous,Listening,Responses,Solutions,Replacements,Churn
0,68,28561.99,Master's Degree,Part Time,Widowed,Male,6.795513,171.449762,904.53611,6.972566,...,Yes,3,4,4,3,4,5,5,5,No
1,27,21704.77,Regular High School Diploma,Retired,Married,Female,1.156681,242.948015,800.982766,12.014541,...,Yes,3,4,3,4,4,3,4,3,Yes
2,50,33186.785,Regular High School Diploma,Student,Widowed,Female,15.754144,159.440398,2054.706961,10.245616,...,Yes,4,4,3,3,3,4,4,2,No
3,48,18925.23,Doctorate Degree,Retired,Married,Male,17.087227,120.249493,2164.579412,15.206193,...,No,2,5,4,3,3,4,4,4,No
4,83,40074.19,Master's Degree,Student,Separated,Male,1.670972,150.761216,271.493436,8.960316,...,No,3,4,4,4,5,4,4,4,Yes


In [3]:
# Check unique values in the original 'Churn' column
unique_values = df_selected['Churn'].unique()
print(f"Original unique values in 'Churn': {unique_values}")

Original unique values in 'Churn': ['No' 'Yes']


In [4]:
 # Correct the mapping based on actual unique values
df_selected['Churn'] = df_selected['Churn'].map({'yes': 1, 'no': 0, 'Yes': 1, 'No': 0})
# Check the result after conversion
print(f"Unique values in 'Churn' after conversion: {df_selected['Churn'].unique()}")
print(df_selected['Churn'].value_counts())

Unique values in 'Churn' after conversion: [0 1]
Churn
0    7350
1    2650
Name: count, dtype: int64


In [5]:
# List of columns that are binary categorical variables
binary_cols = [
 'Techie', 'Phone', 'Multiple', 'OnlineSecurity', 
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 
    'StreamingTV', 'StreamingMovies'
]

# Convert 'Yes' to 1 and 'No' to 0
for col in binary_cols:
    df_selected[col] = df_selected[col].map({'Yes': 1, 'No': 0})

# Now, apply the One-Hot Encoding as before
# Print the shape of the dataframe before encoding
print("Before encoding:")
print(df_selected.shape)
print(df_selected.head())

# Apply One-Hot Encoding with drop_first=True to create one dummy variable for binary categories
df_encoded = pd.get_dummies(df_selected, columns=binary_cols, drop_first=True)

# Print the shape of the dataframe after encoding
print("After encoding:")
print(df_encoded.shape)
print(df_encoded.head()) 

Before encoding:
(10000, 32)
   Age     Income                    Education Employment    Marital  Gender  \
0   68  28561.990              Master's Degree  Part Time    Widowed    Male   
1   27  21704.770  Regular High School Diploma    Retired    Married  Female   
2   50  33186.785  Regular High School Diploma    Student    Widowed  Female   
3   48  18925.230             Doctorate Degree    Retired    Married    Male   
4   83  40074.190              Master's Degree    Student  Separated    Male   

      Tenure  MonthlyCharge  Bandwidth_GB_Year  Outage_sec_perweek  ...  \
0   6.795513     171.449762         904.536110            6.972566  ...   
1   1.156681     242.948015         800.982766           12.014541  ...   
2  15.754144     159.440398        2054.706961           10.245616  ...   
3  17.087227     120.249493        2164.579412           15.206193  ...   
4   1.670972     150.761216         271.493436            8.960316  ...   

   StreamingMovies Reliability  Options

In [6]:
# List of columns with more than two categories that need one-hot encoding without drop_first
multi_category_cols = ['Contract', 'InternetService','Education','Employment','Marital','Gender']

# Print the shape of the dataframe before encoding
print("Before encoding multi-category columns:")
print(df_selected.shape)

# Apply One-Hot Encoding without drop_first for multi-category variables
df_encoded = pd.get_dummies(df_selected, columns=multi_category_cols, drop_first=False)


# Print the shape of the dataframe after encoding
print("After encoding multi-category columns:")
print(df_encoded.shape)

Before encoding multi-category columns:
(10000, 32)
After encoding multi-category columns:
(10000, 57)


In [7]:
print("\nAfter one-hot encoding,dataframe shape:")
print(df_encoded.shape)
print(df_encoded.head())


After one-hot encoding,dataframe shape:
(10000, 57)
   Age     Income     Tenure  MonthlyCharge  Bandwidth_GB_Year  \
0   68  28561.990   6.795513     171.449762         904.536110   
1   27  21704.770   1.156681     242.948015         800.982766   
2   50  33186.785  15.754144     159.440398        2054.706961   
3   48  18925.230  17.087227     120.249493        2164.579412   
4   83  40074.190   1.670972     150.761216         271.493436   

   Outage_sec_perweek  Yearly_equip_failure  Techie  Contacts  Phone  ...  \
0            6.972566                     1       0         0      1  ...   
1           12.014541                     1       1         0      1  ...   
2           10.245616                     1       1         0      1  ...   
3           15.206193                     0       1         2      1  ...   
4            8.960316                     1       0         2      0  ...   

   Employment_Student  Employment_Unemployed  Marital_Divorced  \
0               False

In [8]:
#export clean dataset 
df_cleaned = df_encoded
df_cleaned.to_csv("df_encoded.csv", index=False)

In [9]:
# Define features (X) and target (y)
X = df_cleaned.drop('Churn', axis=1)
y = df_cleaned['Churn']

# Split the data into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Display the shapes of the resulting datasets
print(f"Training data shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Testing data shape: X_test={X_test.shape}, y_test={y_test.shape}")

Training data shape: X_train=(7000, 56), y_train=(7000,)
Testing data shape: X_test=(3000, 56), y_test=(3000,)


In [10]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Print the features and the corresponding importance
print("\nFeatures and their importance in the model:")
for feature, importance in zip(X.columns, rf_model.feature_importances_):
    print(f"{feature}: {importance:.4f}")


Features and their importance in the model:
Age: 0.0288
Income: 0.0317
Tenure: 0.1678
MonthlyCharge: 0.1211
Bandwidth_GB_Year: 0.1408
Outage_sec_perweek: 0.0356
Yearly_equip_failure: 0.0086
Techie: 0.0072
Contacts: 0.0128
Phone: 0.0038
Multiple: 0.0133
OnlineSecurity: 0.0056
OnlineBackup: 0.0066
DeviceProtection: 0.0066
TechSupport: 0.0057
StreamingTV: 0.0356
StreamingMovies: 0.0531
Reliability: 0.0146
Options: 0.0150
Respectfulness: 0.0135
Courteous: 0.0145
Listening: 0.0142
Responses: 0.0137
Solutions: 0.0135
Replacements: 0.0140
Contract_Month-to-month: 0.0543
Contract_One year: 0.0141
Contract_Two Year: 0.0187
InternetService_DSL: 0.0104
InternetService_Fiber Optic: 0.0087
InternetService_No service: 0.0043
Education_9th Grade to 12th Grade, No Diploma: 0.0031
Education_Associate's Degree: 0.0035
Education_Bachelor's Degree: 0.0051
Education_Doctorate Degree: 0.0010
Education_GED or Alternative Credential: 0.0026
Education_Master's Degree: 0.0034
Education_No Schooling Completed: 

In [11]:
# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Print the first few predictions
print("\nFirst few predictions:")
print(y_pred[:10])

# Print the actual values for comparison
print("\nActual values for comparison:")
print(y_test[:10].values)



First few predictions:
[0 1 1 0 0 0 0 0 0 0]

Actual values for comparison:
[0 1 1 0 0 0 0 0 0 0]


In [12]:
# Evaluate the model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      2156
           1       0.85      0.72      0.78       844

    accuracy                           0.88      3000
   macro avg       0.87      0.83      0.85      3000
weighted avg       0.88      0.88      0.88      3000



In [13]:
# Calculate and display the ROC-AUC score
roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC-AUC Score: {roc_auc:.4f}")

ROC-AUC Score: 0.8344


In [14]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

conf_matrix = confusion_matrix(y_test, y_pred)
# Calculate confusion matrix
print("Confusion matrix:")
print(conf_matrix)

# If using probabilities, calculate log loss
y_prob = rf_model.predict_proba(X_test)

Accuracy: 0.8843
Confusion matrix:
[[2045  111]
 [ 236  608]]


In [17]:
# Step 1: Predict the probabilities for the positive class (Churn)
y_pred_prob = rf_model.predict_proba(X_test)[:, 1]

# Step 2: Compute the Mean Squared Error
mse = mean_squared_error(y_test, y_pred_prob)

print(f"Mean Square Error: {mse}")

Mean Square Error: 0.08684139999999999


In [16]:
# save out data frames
train2_df  = pd.concat([X_train, y_train], axis=1)
test2_df  = pd.concat([X_test, y_test], axis=1)

train2_df.to_csv('churn2_train.csv', index=False)
test2_df.to_csv('churn2_test.csv', index=False)