In [78]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Load the churn data set (churn_clean.csv).
# The location can be overridden with the CHURN_DATA_PATH environment variable so the
# notebook can run on another machine; without it, the original local path is used.
DATA_PATH = os.environ.get('CHURN_DATA_PATH', '/Users/justinhuynh/Desktop/churn_clean.csv')
df = pd.read_csv(DATA_PATH)

# summarize column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 50 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CaseOrder             10000 non-null  int64  
 1   Customer_id           10000 non-null  object 
 2   Interaction           10000 non-null  object 
 3   UID                   10000 non-null  object 
 4   City                  10000 non-null  object 
 5   State                 10000 non-null  object 
 6   County                10000 non-null  object 
 7   Zip                   10000 non-null  int64  
 8   Lat                   10000 non-null  float64
 9   Lng                   10000 non-null  float64
 10  Population            10000 non-null  int64  
 11  Area                  10000 non-null  object 
 12  TimeZone              10000 non-null  object 
 13  Job                   10000 non-null  object 
 14  Children              10000 non-null  int64  
 15  Age                 

In [79]:
# Columns excluded from the feature set.
item_columns = [f'Item{number}' for number in range(1, 9)]  # 'Item1' .. 'Item8'
non_relevant_columns = [
    'CaseOrder', 'Customer_id', 'Interaction', 'UID',
    'City', 'State', 'County', 'Zip', 'Lat', 'Lng', 'Population', 'Area', 'TimeZone',
    'Port_modem', 'Tablet',
    *item_columns,
]

# Rebind df to a copy without those columns.
df = df.drop(non_relevant_columns, axis='columns')

# Show which columns remain.
print(df.columns)

Index(['Job', 'Children', 'Age', 'Income', 'Marital', 'Gender', 'Churn',
       'Outage_sec_perweek', 'Email', 'Contacts', 'Yearly_equip_failure',
       'Techie', 'Contract', 'InternetService', 'Phone', 'Multiple',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'PaymentMethod',
       'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year'],
      dtype='object')


In [80]:
# Report every remaining column that contains missing values
# (prints an empty Series when there are none).
relevant_columns = df.columns
missing_values = df[relevant_columns].isnull().sum()
print(missing_values[missing_values > 0])

# Encode the target: 'Yes' -> 1, 'No' -> 0.
# The mapping is skipped when 'Churn' is already numeric, so re-running this cell
# does not turn the 0/1 values into NaN. Labels outside the mapping raise an error
# instead of silently becoming NaN targets.
churn_mapping = {'Yes': 1, 'No': 0}
if not pd.api.types.is_numeric_dtype(df['Churn']):
    churn_encoded = df['Churn'].map(churn_mapping)
    unmapped_rows = churn_encoded.isnull()
    if unmapped_rows.any():
        unexpected_labels = df.loc[unmapped_rows, 'Churn'].unique().tolist()
        raise ValueError(f"Unexpected 'Churn' labels: {unexpected_labels}")
    df['Churn'] = churn_encoded.astype('int64')

Series([], dtype: int64)


In [81]:
# Categorical predictors to one-hot encode.
categorical_columns = ['Contract', 'DeviceProtection', 'Gender', 'InternetService', 'Job',
                       'Marital', 'Multiple', 'OnlineBackup', 'OnlineSecurity', 'PaperlessBilling',
                       'PaymentMethod', 'Phone', 'StreamingMovies', 'StreamingTV', 'TechSupport', 'Techie']

# drop='first' omits one indicator column per feature; sparse_output=False makes
# fit_transform return a dense array that can be wrapped in a DataFrame directly.
# NOTE(review): a high-cardinality column (presumably 'Job') expands into a large
# number of indicator columns - confirm this is intended.
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Learn the categories and produce the indicator matrix.
encoded_categorical_data = encoder.fit_transform(df[categorical_columns])

# Wrap the matrix in a DataFrame that reuses df's index. With a default RangeIndex
# on the encoded frame, the column-wise concat below would align on labels and
# introduce NaN rows whenever df's index is not exactly 0..n-1.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_data,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

# Replace the original categorical columns with their encoded counterparts.
df = df.drop(columns=categorical_columns)
df = pd.concat([df, encoded_categorical_df], axis=1)

# Verify the columns in the feature set.
print(df.columns)

Index(['Children', 'Age', 'Income', 'Churn', 'Outage_sec_perweek', 'Email',
       'Contacts', 'Yearly_equip_failure', 'Tenure', 'MonthlyCharge',
       ...
       'OnlineSecurity_Yes', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit Card (automatic)',
       'PaymentMethod_Electronic Check', 'PaymentMethod_Mailed Check',
       'Phone_Yes', 'StreamingMovies_Yes', 'StreamingTV_Yes',
       'TechSupport_Yes', 'Techie_Yes'],
      dtype='object', length=672)


In [82]:
# Numeric predictors to standardize.
numeric_columns = [
    'Age', 'Bandwidth_GB_Year', 'Children', 'Contacts', 'Income',
    'MonthlyCharge', 'Outage_sec_perweek', 'Tenure', 'Yearly_equip_failure',
]

# Fit the scaler on these columns and overwrite them with the scaled values.
# NOTE(review): the scaler is fit on every row of df; if the train/test split is
# made afterwards, the test rows contribute to the fitted statistics - confirm
# this is acceptable.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_columns])
df[numeric_columns] = scaled_values

# Preview the first rows of the scaled data.
print(df.head())

   Children       Age    Income  Churn  Outage_sec_perweek  Email  Contacts  \
0 -0.972338  0.720925 -0.398778      0           -0.679978     10 -1.005852   
1 -0.506592 -1.259957 -0.641954      1            0.570331     12 -1.005852   
2  0.890646 -0.148730 -1.070885      0            0.252347      9 -1.005852   
3 -0.506592 -0.245359 -0.740525      0            1.650506     15  1.017588   
4 -0.972338  1.445638  0.009478      1           -0.623156     16  1.017588   

   Yearly_equip_failure    Tenure  MonthlyCharge  ...  OnlineSecurity_Yes  \
0              0.946658 -1.048746      -0.003943  ...                 1.0   
1              0.946658 -1.262001       1.630326  ...                 1.0   
2              0.946658 -0.709940      -0.295225  ...                 0.0   
3             -0.625864 -0.659524      -1.226521  ...                 1.0   
4              0.946658 -1.242551      -0.528086  ...                 0.0   

   PaperlessBilling_Yes  PaymentMethod_Credit Card (automatic)

In [83]:
# Write the prepared data set to a CSV file in the working directory,
# without the index column.
prepared_data_file = 'cleaned_churn_data_d209_task2.csv'
df.to_csv(prepared_data_file, index=False)

In [84]:
# Separate the predictors (X) from the target variable (y).
X = df.drop(columns=['Churn'])
y = df['Churn']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# train_test_split must be imported from sklearn.model_selection for this cell to
# run on a fresh kernel.
TEST_SIZE = 0.2
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Sanity check: every row ended up in exactly one of the two partitions.
assert len(X_train) + len(X_test) == len(X), "train/test sizes do not add up to the full data set"
assert len(X_train) == len(y_train) and len(X_test) == len(y_test), "features and target are misaligned"

# Save each partition to its own CSV file (index omitted).
split_files = {
    'X_train_task_2.csv': X_train,
    'X_test_task_2.csv': X_test,
    'y_train_task_2.csv': y_train,
    'y_test_task_2.csv': y_test,
}
for file_name, partition in split_files.items():
    partition.to_csv(file_name, index=False)

# Display the shapes of the training and testing sets.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8000, 671) (2000, 671) (8000,) (2000,)


In [85]:
def _yes_no_to_binary(value):
    """Map a 'yes'/'no' string to 1/0; return any other value unchanged.

    Matching ignores case and surrounding whitespace. Non-string values
    (for example NaN in an object column) are passed through untouched
    instead of raising AttributeError on ``.strip()``.
    """
    if not isinstance(value, str):
        return value
    normalized = value.strip().lower()
    if normalized == 'yes':
        return 1
    if normalized == 'no':
        return 0
    return value


def _convert_yes_no_columns(features):
    """Report every object-dtype column and convert its 'yes'/'no' entries to 1/0.

    The frame is modified in place and also returned. Columns holding other
    strings keep those strings, so they remain non-numeric.
    """
    for column in features.columns:
        if features[column].dtype == object:
            print(f"Column {column} has string values:")
            print(features[column].unique())
            # Convert column to numeric, if possible
            features[column] = features[column].apply(_yes_no_to_binary)
    return features


# Load the processed training and testing data.
X_train = pd.read_csv('X_train_task_2.csv')
X_test = pd.read_csv('X_test_task_2.csv')
y_train = pd.read_csv('y_train_task_2.csv')
y_test = pd.read_csv('y_test_task_2.csv')

# Apply the same string-column handling to both feature sets.
X_train = _convert_yes_no_columns(X_train)
X_test = _convert_yes_no_columns(X_test)

# Verify the conversion.
print(X_train.dtypes)
print(X_test.dtypes)

Children               float64
Age                    float64
Income                 float64
Outage_sec_perweek     float64
Email                    int64
                        ...   
Phone_Yes              float64
StreamingMovies_Yes    float64
StreamingTV_Yes        float64
TechSupport_Yes        float64
Techie_Yes             float64
Length: 671, dtype: object
Children               float64
Age                    float64
Income                 float64
Outage_sec_perweek     float64
Email                    int64
                        ...   
Phone_Yes              float64
StreamingMovies_Yes    float64
StreamingTV_Yes        float64
TechSupport_Yes        float64
Techie_Yes             float64
Length: 671, dtype: object


In [86]:
# Flatten the target frames into 1-D arrays before fitting.
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

# Build a decision tree with a fixed seed and fit it on the training data.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the class of every row in the test set.
y_pred = clf.predict(X_test)

# Share of test rows classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision, recall, f1-score and support.
report = classification_report(y_test, y_pred)
print(report)

# Counts of true classes (rows) against predicted classes (columns).
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

Accuracy: 0.8695
              precision    recall  f1-score   support

           0       0.91      0.92      0.91      1456
           1       0.77      0.75      0.76       544

    accuracy                           0.87      2000
   macro avg       0.84      0.83      0.83      2000
weighted avg       0.87      0.87      0.87      2000

[[1333  123]
 [ 138  406]]


In [87]:
# Predicted class probabilities for the test set; the column at index 1 is kept.
class_probabilities = clf.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

# Mean squared error between the test labels and those probabilities.
mse = mean_squared_error(y_test, y_prob)
print(f"MSE: {mse}")

MSE: 0.1305
