In [51]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

In [52]:
# Load Dataset
customer_data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [53]:
# Explore data (head, info)
customer_data.head()
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [54]:
# Handle missing values in 'TotalCharges'
customer_data['TotalCharges'] = pd.to_numeric(customer_data['TotalCharges'], errors='coerce')
customer_data.fillna(value={'TotalCharges': 0}, inplace=True)

In [55]:
# Encode target variable
customer_data['Churn'] = customer_data['Churn'].map({'Yes': 1, 'No': 0})

In [56]:
# Separate features
categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
                        'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                        'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical_features = ['tenure', 'MonthlyCharges']

customer_data = customer_data.drop('customerID', axis=1)

In [57]:
# Scale numerical features
scaler = StandardScaler()
customer_data[numerical_features] = scaler.fit_transform(customer_data[numerical_features])

In [58]:
# Encode categorical features
encoder = OneHotEncoder(sparse=False)
encoded_categorical = encoder.fit_transform(customer_data[categorical_features])

encoded_feature_names = encoder.get_feature_names_out(categorical_features)

encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoded_feature_names, index=customer_data.index)



In [59]:
# Combine processed features
processed_data = pd.concat([customer_data[numerical_features], encoded_categorical_df], axis=1)

In [60]:
# Split data into features and target
target_variable = customer_data['Churn']
training_features, testing_features, training_target, testing_target = train_test_split(processed_data, target_variable, test_size=0.2, random_state=1)

In [62]:
# Train and evaluate models
models = {
    'Random Forest': RandomForestClassifier(random_state=1),
    'Extra Trees': ExtraTreesClassifier(random_state=1),
    'XGBoost': XGBClassifier(random_state=1, use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(random_state=1)
}

for model_name, model in models.items():
    model.fit(training_features, training_target)
    predictions = model.predict(testing_features)
    accuracy = accuracy_score(testing_target, predictions)
    print(f"{model_name} Accuracy: {accuracy:.4f}")

Random Forest Accuracy: 0.7835
Extra Trees Accuracy: 0.7608
XGBoost Accuracy: 0.7942
[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 415
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
LightGBM Accuracy: 0.8126


In [63]:
# Compare model accuracies
model_performance = pd.DataFrame({
    'Model': list(models.keys()),
    'Accuracy': [accuracy_score(testing_target, model.predict(testing_features)) for model in models.values()]
})

print(model_performance)

           Model  Accuracy
0  Random Forest  0.783534
1    Extra Trees  0.760823
2        XGBoost  0.794180
3       LightGBM  0.812633


In [68]:
# Hyperparameter tuning for Extra Trees Classifier
n_estimators_grid = [50, 100, 300, 500, 1000]
min_samples_split_grid = [2, 3, 5, 7, 9]
min_samples_leaf_grid = [1, 2, 4, 6, 8]
max_features_grid = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators_grid,
    'min_samples_split': min_samples_split_grid,
    'min_samples_leaf':min_samples_leaf_grid,
    'max_features': max_features_grid
}

In [74]:
# Randomized search for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=et_model,
                                   param_distributions=hyperparameter_grid,
                                   n_iter=10,
                                   scoring='accuracy',
                                   cv=5,
                                   n_jobs=-1,
                                   verbose=1,
                                   random_state=1)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\nerdr\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\nerdr\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\nerdr\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\nerdr\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

In [75]:
# Get the best hyperparameters found by RandomizedSearchCV
best_hyperparameters = random_search.best_params_
print("Best Hyperparameters:", best_hyperparameters)

Best Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


In [76]:
# Create a new Extra Trees Classifier with the best hyperparameters
et_model_optimized = ExtraTreesClassifier(**best_hyperparameters, random_state=1)
et_model_optimized.fit(training_features, training_target)
predictions_optimized = et_model_optimized.predict(testing_features)
accuracy_optimized = accuracy_score(testing_target, predictions_optimized)
print(f'Optimized Extra Trees Classifier Accuracy: {accuracy_optimized:.4f}')

Optimized Extra Trees Classifier Accuracy: 0.8006


In [78]:
# Feature importances
feature_importances = et_model_optimized.feature_importances_

feature_importance_df = pd.DataFrame({'Feature': processed_data.columns, 'Importance': feature_importances})

feature_importance_df_sorted = feature_importance_df.sort_values(by='Importance', ascending=False)

print("The two most important features are:")
print(feature_importance_df_sorted.head(2))

The two most important features are:
                    Feature  Importance
36  Contract_Month-to-month    0.166931
0                    tenure    0.100847
