In [1]:
import pandas as pd

customer_df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
customer_df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# TODO:
## Data Preprocessing:

In [2]:
# first converting the 'TotalCharges' column to numeric values and filling missing values with 0
customer_df["TotalCharges"] = pd.to_numeric(customer_df["TotalCharges"], errors="coerce")
customer_df["TotalCharges"].fillna(0, inplace=True)
customer_df["TotalCharges"].isnull().sum()

0

In [3]:
# Convert the 'Churn' column to binary values, where 'No' is mapped to 0 and 'Yes' is mapped to 1
customer_df["Churn"] = customer_df["Churn"].map({"No": 0, "Yes": 1})
pd.unique(customer_df["Churn"])

array([0, 1], dtype=int64)

In [4]:
print(customer_df.isnull().sum())

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [5]:
# Split the data into an 80-20 train-test split with a random state of “1”
from sklearn.model_selection import train_test_split

X = customer_df.drop("Churn", axis=1)
Y = customer_df["Churn"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

print(f"This is X_train shape:{X_train.shape}")
print(f"This is X_test shape:{X_test.shape}")
print(f"This is Y_train shape:{Y_train.shape}")
print(f"This is Y_test shape:{Y_test.shape}")

This is X_train shape:(5634, 20)
This is X_test shape:(1409, 20)
This is Y_train shape:(5634,)
This is Y_test shape:(1409,)


## Feature Engineering

In [6]:
# Selected features
categorical_col = ["gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService", "MultipleLines", "InternetService",
               "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
               "Contract", "PaperlessBilling", "PaymentMethod"]

numerical_col = ["tenure", "MonthlyCharges", "TotalCharges"]

In [7]:
from sklearn.impute import SimpleImputer

# Potential nan values were addressed using SimpleImputer:

numerical_imputer = SimpleImputer(strategy="mean")
categorical_imputer = SimpleImputer(strategy="most_frequent")

X_train[numerical_col] = numerical_imputer.fit_transform(X_train[numerical_col])
X_test[numerical_col] = numerical_imputer.transform(X_test[numerical_col])
X_train[numerical_col] = pd.DataFrame(X_train[numerical_col], columns=numerical_col, index=X_train.index)
X_test[numerical_col] = pd.DataFrame(X_test[numerical_col], columns=numerical_col, index=X_test.index)

X_train[categorical_col] = categorical_imputer.fit_transform(X_train[categorical_col])
X_test[categorical_col] = categorical_imputer.transform(X_test[categorical_col])

In [8]:
# numerical features should be scaled using StandardScaler, convert the output back to a dataframe and put back the column names
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[numerical_col] = scaler.fit_transform(X_train[numerical_col])
X_test[numerical_col] = scaler.transform(X_test[numerical_col])

In [9]:
# categorical features are one-hot encoded using OneHotEncoder(set sparse_output to false), 
# convert the output back to a dataframe and put back the column names
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)
X_train_cat = encoder.fit_transform(X_train[categorical_col])
X_test_cat = encoder.transform(X_test[categorical_col])

X_train_cat = pd.DataFrame(X_train_cat, columns=encoder.get_feature_names_out(input_features=categorical_col), index=X_train.index)
X_test_cat = pd.DataFrame(X_test_cat, columns=encoder.get_feature_names_out(input_features=categorical_col), index=X_test.index)

In [10]:
# Combine scaled numerical and one-hot encoded categorical features into train and test set dataframes (use pd.concat)

X_train_processed = pd.concat([X_train[numerical_col], X_train_cat], axis=1)
X_test_processed = pd.concat([X_test[numerical_col], X_test_cat], axis=1)

In [11]:
print(X_train_processed.isnull().sum())

tenure                                     0
MonthlyCharges                             0
TotalCharges                               0
gender_Female                              0
gender_Male                                0
SeniorCitizen_0                            0
SeniorCitizen_1                            0
Partner_No                                 0
Partner_Yes                                0
Dependents_No                              0
Dependents_Yes                             0
PhoneService_No                            0
PhoneService_Yes                           0
MultipleLines_No                           0
MultipleLines_No phone service             0
MultipleLines_Yes                          0
InternetService_DSL                        0
InternetService_Fiber optic                0
InternetService_No                         0
OnlineSecurity_No                          0
OnlineSecurity_No internet service         0
OnlineSecurity_Yes                         0
OnlineBack

In [12]:
# Use scikit learn to train a random forest and extra trees classifier, and use xgboost and lightgbm to 
# train an extreme boosting model and a light gradient boosting model. Use random_state = 1 for 
# training all models and evaluate on the test set. 

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
from tabulate import tabulate
import warnings
warnings.filterwarnings("ignore")

results = []

models = [
    ("Random Forest Classifier", RandomForestClassifier(random_state=1)),
    ("Extra Trees Classifier", ExtraTreesClassifier(random_state=1)),
    ("XGBoost Classifier", XGBClassifier(random_state=1)),
    ("LightGBM Classifier", lgb.LGBMClassifier(random_state=1))
]

for name, model in models:
    model.fit(X_train_processed, Y_train)
    y_pred = model.predict(X_test_processed)
    accuracy = accuracy_score(Y_test, y_pred)
    report = classification_report(Y_test, y_pred)
    results.append([name, accuracy, report])

table = tabulate(results, headers=["Model", "Accuracy", "Classification Report"], tablefmt="pipe")
print(table)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000658 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
| Model                    |   Accuracy | Classification Report                                 |
|:-------------------------|-----------:|:------------------------------------------------------|
| Random Forest Classifier |   0.791341 | precision    recall  f1-score   support               |
|                          |            |                                                       |
|                          |            |            0

In [13]:
# getting the best hyperparameters from the randomized search CV?
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

etc = ExtraTreesClassifier(random_state=1)

random_search = RandomizedSearchCV(
    etc, 
    param_distributions=hyperparameter_grid,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=1
)

random_search.fit(X_train_processed, Y_train)

best_hyperparameters = random_search.best_params_
print(f"Best Hyperparameters: {best_hyperparameters}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


In [14]:
# Feature importance using the optimal ExtraTreesClassifier model to get the two most important features
etc = random_search.best_estimator_

feature_importances = etc.feature_importances_
feature_names = X_train_processed.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

two_most_important_features = feature_importance_df.head(2)

print("Two most important features:")
print(two_most_important_features)

Two most important features:
                    Feature  Importance
37  Contract_Month-to-month    0.152237
0                    tenure    0.092800


In [15]:
spamclass = 2 * (0.90 * 0.75) / (0.90 + 0.75)
not_spamclass = 2 * (0.95 * 0.98) / (0.95 + 0.98)
print(spamclass)
print(not_spamclass)
overall_score = (spamclass * 1380 + not_spamclass * 320) / 1700
print("This is the overall f1 score:", overall_score)

0.8181818181818182
0.9647668393782383
This is the overall f1 score: 0.845774292759968
