<a href="https://colab.research.google.com/github/iamcbn/Hamoye/blob/Stage-C/Hamoye_Stage_C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
# Telco customer-churn CSV, read directly from the project's GitHub branch.
DATA_URL = 'https://raw.githubusercontent.com/iamcbn/Hamoye/Stage-C/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first rows of the loaded frame as a sanity check on the read.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# List the column names available in the loaded frame.
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
# Inspect the dtype of every column. In the recorded output TotalCharges is
# reported as object rather than a numeric dtype.
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
# TotalCharges is loaded as text: strip embedded spaces, then parse as numbers.
# Casting through astype(str) first keeps this cell safe to re-run -- once the
# column is already float64, calling `.str` on it directly raises AttributeError.
total_charges_text = df['TotalCharges'].astype(str).str.replace(' ', '')

# Blank entries parse to NaN; treat those as a charge of 0.
df['TotalCharges'] = pd.to_numeric(total_charges_text).fillna(0)

In [7]:
# Confirm the conversion produced a numeric dtype.
df['TotalCharges'].dtype

dtype('float64')

In [8]:
# Replace the Churn labels with integer class codes
# (LabelEncoder assigns codes in sorted class order).
churn_encoder = LabelEncoder()
df['Churn'] = churn_encoder.fit_transform(df['Churn'])

In [9]:
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
  """
  Build a model-ready feature matrix from the raw churn dataframe.

  The three numeric columns are standardised with StandardScaler and the
  sixteen categorical columns are one-hot encoded with OneHotEncoder. Columns
  not named below (for example the ID and target columns) are left out of the
  result.

  The returned frame keeps the row index of `df`, so it stays aligned with any
  Series taken from the same frame even when that index is not 0..n-1.

  NOTE(review): both transformers are fit on whatever frame is passed in. When
  this is called on the full dataset before a train/test split, statistics of
  the test rows leak into the scaled training features.

  Args:
      df (pandas.DataFrame): The input dataframe. Must contain every column
          listed in `categorical` and `numerical`.

  Returns:
      pandas.DataFrame: Scaled numerical columns followed by the one-hot
      encoded categorical columns, indexed like `df`.
  """
  categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                 'Contract', 'PaperlessBilling', 'PaymentMethod']
  numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

  # Scale numerical features. The transformers return bare arrays, so the
  # caller's index is re-attached explicitly instead of defaulting to 0..n-1.
  scaler = StandardScaler()
  df_numerical_scaled = pd.DataFrame(
      scaler.fit_transform(df[numerical]),
      columns=numerical,
      index=df.index,
  )

  # One-hot encode categorical features (dense output so it fits in a DataFrame).
  encoder = OneHotEncoder(sparse_output=False)
  encoded_values = encoder.fit_transform(df[categorical])
  df_categorical_encoded = pd.DataFrame(
      encoded_values,
      columns=encoder.get_feature_names_out(categorical),
      index=df.index,
  )

  # Both pieces share df's index, so the column-wise concat lines up row for row.
  return pd.concat([df_numerical_scaled, df_categorical_encoded], axis=1)

In [10]:
# Feature engineering: build the model matrix from a copy of df. The result
# holds the scaled numeric columns plus the one-hot encoded categoricals;
# customerID and Churn are not among the columns feature_engineering selects.
df_processed = feature_engineering(df.copy())



In [11]:
# Features come from the engineered frame; the label is the encoded Churn column.
target = 'Churn'
X, y = df_processed, df[target]

# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

In [12]:
# Baseline tree ensembles, all seeded with random_state=1 and otherwise left
# at their defaults. fit() returns the estimator, so construction and training
# are chained.
model_rf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
model_et = ExtraTreesClassifier(random_state=1).fit(X_train, y_train)

# XGBoost: the binary logistic objective is set explicitly.
model_xgb = XGBClassifier(objective='binary:logistic', random_state=1).fit(X_train, y_train)

# LightGBM: the fit call is left as the cell's last expression so the notebook
# still displays the fitted estimator.
model_lgb = LGBMClassifier(random_state=1)
model_lgb.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Random Forest Accuracy: 0.7906316536550745
Extra Trees Accuracy: 0.7700496806245565
XGBoost Accuracy: 0.7934705464868701
LightGBM Accuracy: 0.8133427963094393


In [13]:
# Test-set accuracy of each baseline model, reported in training order.
baseline_models = [
    ("Random Forest", model_rf),
    ("Extra Trees", model_et),
    ("XGBoost", model_xgb),
    ("LightGBM", model_lgb),
]
for label, fitted_model in baseline_models:
    print(f"{label} Accuracy:", accuracy_score(y_test, fitted_model.predict(X_test)))

Random Forest Accuracy: 0.7906316536550745
Extra Trees Accuracy: 0.7700496806245565
XGBoost Accuracy: 0.7934705464868701
LightGBM Accuracy: 0.8133427963094393


In [14]:
# Candidate hyperparameter values for the Extra Trees search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Unfitted template estimator for the search.
# NOTE: this rebinds `model_et`, replacing the fitted baseline from the earlier
# training cell with an unfitted estimator.
model_et = ExtraTreesClassifier(random_state=1)

# Randomized search (n_iter is left at its default) with 5-fold CV on accuracy
random_search = RandomizedSearchCV(estimator=model_et, param_distributions=param_grid, cv=5, scoring='accuracy', random_state=1)
random_search.fit(X_train, y_train)

# With the default refit=True the search has already retrained the winning
# configuration on all of X_train / y_train, so best_estimator_ is ready to
# use; fitting it a second time would only repeat the same work.
best_et_model = random_search.best_estimator_

# Evaluate the best model on the held-out test data
y_pred = best_et_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("ExtraTreesClassifier Accuracy with RandomizedSearchCV:", accuracy)


ExtraTreesClassifier Accuracy with RandomizedSearchCV: 0.8076650106458482


In [15]:
# Define the hyperparameter grid.
# The first max_features entry used to be 'auto'. scikit-learn deprecated that
# value in 1.1 and removed it in 1.3; for ExtraTreesClassifier it meant 'sqrt'.
# It is replaced in place (not dropped) so the grid keeps its size and the
# seeded sampler below draws the same candidates as before.
hyperparameter_grid = {
    'n_estimators': [50, 100, 300, 500, 1000],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'min_samples_split': [2, 3, 5, 7, 9],
    'max_features': ['sqrt', 'sqrt', 'log2', None]
}

# Unfitted template estimator for the search (rebinds `model_et`)
model_et = ExtraTreesClassifier(random_state=1)

# 10 sampled candidates x 5 folds, scored on accuracy, using all CPU cores
random_search = RandomizedSearchCV(estimator=model_et, param_distributions=hyperparameter_grid, cv=5, n_iter=10, scoring='accuracy', n_jobs=-1, verbose=1, random_state=1)

# Fit the RandomizedSearchCV model (This will take time)
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [18]:
# Requires `best_et_model` (the tuned Extra Trees model) from an earlier cell.

# Importance scores, one per column of X_train
feature_importances = best_et_model.feature_importances_

# Column positions ordered from most to least important, and the matching names
importances_sorted_idx = np.argsort(feature_importances)[::-1]
features_sorted = X_train.columns[importances_sorted_idx]

# Headline: the two highest-ranked features
print("Top 2 Most Important Features:")
for rank in (1, 2):
    print(f"{rank}. {features_sorted[rank - 1]}")

print()
# Full ranking: every feature with its importance to four decimals
ranked_scores = feature_importances[importances_sorted_idx]
for name, score in zip(features_sorted, ranked_scores):
    print(f"{name}: {score:.4f}")


Top 2 Most Important Features:
1. Contract_Month-to-month
2. tenure

Contract_Month-to-month: 0.1582
tenure: 0.0835
OnlineSecurity_No: 0.0708
InternetService_Fiber optic: 0.0646
TechSupport_No: 0.0611
Contract_Two year: 0.0600
PaymentMethod_Electronic check: 0.0536
TotalCharges: 0.0460
InternetService_DSL: 0.0324
Contract_One year: 0.0313
OnlineBackup_No: 0.0284
OnlineSecurity_Yes: 0.0230
TechSupport_Yes: 0.0167
DeviceProtection_No: 0.0163
MonthlyCharges: 0.0152
OnlineBackup_Yes: 0.0121
OnlineBackup_No internet service: 0.0118
PaperlessBilling_No: 0.0117
PaperlessBilling_Yes: 0.0113
gender_Male: 0.0095
PaymentMethod_Credit card (automatic): 0.0093
TechSupport_No internet service: 0.0093
StreamingMovies_Yes: 0.0093
gender_Female: 0.0090
Partner_Yes: 0.0090
StreamingMovies_No: 0.0090
StreamingTV_Yes: 0.0086
MultipleLines_Yes: 0.0085
Partner_No: 0.0082
StreamingTV_No: 0.0081
DeviceProtection_Yes: 0.0081
PaymentMethod_Bank transfer (automatic): 0.0081
DeviceProtection_No internet service: 