In [5]:
import pandas as pd

In [6]:
file_path = r"C:\Users\TBeharrie-Green\Desktop\B.AN SENECA\BAN240\telco.csv"
df = pd.read_csv(file_path)

In [7]:
df.head()

Unnamed: 0,Customer ID,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents,Country,State,...,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Satisfaction Score,Customer Status,Churn Label,Churn Score,CLTV,Churn Category,Churn Reason
0,8779-QRDMV,Male,78,No,Yes,No,No,0,United States,California,...,20,0.0,59.65,3,Churned,Yes,91,5433,Competitor,Competitor offered more data
1,7495-OOKFY,Female,74,No,Yes,Yes,Yes,1,United States,California,...,0,390.8,1024.1,3,Churned,Yes,69,5302,Competitor,Competitor made better offer
2,1658-BYGOY,Male,71,No,Yes,No,Yes,3,United States,California,...,0,203.94,1910.88,2,Churned,Yes,81,3179,Competitor,Competitor made better offer
3,4598-XLKNJ,Female,78,No,Yes,Yes,Yes,1,United States,California,...,0,494.0,2995.07,2,Churned,Yes,88,5337,Dissatisfaction,Limited range of services
4,4846-WHAFZ,Female,80,No,Yes,Yes,Yes,1,United States,California,...,0,234.21,3102.36,2,Churned,Yes,67,2793,Price,Extra data charges


In [9]:
#Checking for missing values
missing_values = df.isnull().sum()

In [11]:
data_types = df.dtypes

In [13]:
missing_values

Customer ID                             0
Gender                                  0
Age                                     0
Under 30                                0
Senior Citizen                          0
Married                                 0
Dependents                              0
Number of Dependents                    0
Country                                 0
State                                   0
City                                    0
Zip Code                                0
Latitude                                0
Longitude                               0
Population                              0
Quarter                                 0
Referred a Friend                       0
Number of Referrals                     0
Tenure in Months                        0
Offer                                3877
Phone Service                           0
Avg Monthly Long Distance Charges       0
Multiple Lines                          0
Internet Service                  

In [14]:
data_types

Customer ID                           object
Gender                                object
Age                                    int64
Under 30                              object
Senior Citizen                        object
Married                               object
Dependents                            object
Number of Dependents                   int64
Country                               object
State                                 object
City                                  object
Zip Code                               int64
Latitude                             float64
Longitude                            float64
Population                             int64
Quarter                               object
Referred a Friend                     object
Number of Referrals                    int64
Tenure in Months                       int64
Offer                                 object
Phone Service                         object
Avg Monthly Long Distance Charges    float64
Multiple L

In [16]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [17]:
#remove target variables from features
categorical_cols.remove('Customer ID')
categorical_cols.remove('Churn Label')
numerical_cols.remove('Churn Score')


In [18]:
#pipelines for preprocessing
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [19]:
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

In [20]:
#combine the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [21]:
#apply transformations to the data
data_processed = preprocessor.fit_transform(df)

In [22]:
data_processed[:5]

<5x1209 sparse matrix of type '<class 'numpy.float64'>'
	with 235 stored elements in Compressed Sparse Row format>

Model Training and Evaluation


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [24]:
#separate the feature and target variable
X = data_processed
y = df['Churn Label'].apply(lambda x:1 if x== 'Yes' else 0)

In [25]:
#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
#define the models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

In [27]:
#train and evaluate the models
for model_name, model in models.items():
    #train the model
    model.fit(X_train, y_train)
    
    #predict on the test set
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:,1]
    
    #evaluate the model
    print(f"Model: {model_name}")
    print(classification_report(y_test, y_pred))
    print(f"AUC-ROC: {roc_auc_score(y_test, y_pred_prob): .2f}\n")

Model: Logistic Regression
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1009
           1       1.00      1.00      1.00       400

    accuracy                           1.00      1409
   macro avg       1.00      1.00      1.00      1409
weighted avg       1.00      1.00      1.00      1409

AUC-ROC:  1.00

Model: Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1009
           1       1.00      1.00      1.00       400

    accuracy                           1.00      1409
   macro avg       1.00      1.00      1.00      1409
weighted avg       1.00      1.00      1.00      1409

AUC-ROC:  1.00

Model: Gradient Boosting
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1009
           1       1.00      1.00      1.00       400

    accuracy                           1.00      1409
   macro avg       1.00

Model Interpretation and Recommendations

In [28]:
import numpy as np

In [34]:
#get feature names after preprocessing
from sklearn.compose import ColumnTransformer
feature_names = preprocessor.get_feature_names_out()


In [35]:
#getting the feature importances from the Random Forest model
feature_importances = models['Random Forest'].feature_importances_

In [36]:
#creating a DataFrame for feature importances
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

In [37]:
#sorting features by importance
feature_importance_df = feature_importance_df.sort_values(by= 'Importance', ascending=False)

In [38]:
#displaying the top 10 most features
top_features = feature_importance_df.head(10)
top_features

Unnamed: 0,Feature,Importance
1179,cat__Customer Status_Churned,0.167602
1187,cat__Churn Category_nan,0.154737
1208,cat__Churn Reason_nan,0.151925
16,num__Satisfaction Score,0.082872
1181,cat__Customer Status_Stayed,0.082154
1183,cat__Churn Category_Competitor,0.071121
1171,cat__Contract_Month-to-Month,0.022849
1190,cat__Churn Reason_Competitor had better devices,0.020137
1184,cat__Churn Category_Dissatisfaction,0.015034
1182,cat__Churn Category_Attitude,0.013118
