# Importing Libraries

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Reading Data

In [3]:
customer_churn = pd.read_csv("telecom_customer_churn.csv")

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
offer_dummies = pd.get_dummies(customer_churn['Offer'], prefix='Offer', dummy_na=True)
customer_churn = pd.concat([customer_churn, offer_dummies], axis=1)
customer_churn.drop(columns=['Offer'], inplace=True)

In [6]:
Contract_dummies = pd.get_dummies(customer_churn['Contract'], prefix='Contract', dummy_na=True)
customer_churn = pd.concat([customer_churn, Contract_dummies], axis=1)
customer_churn.drop(columns=['Contract'], inplace=True)

In [7]:
internet_dummies = pd.get_dummies(customer_churn['Internet Type'], prefix='internet', dummy_na=True)
customer_churn = pd.concat([customer_churn, internet_dummies], axis=1)
customer_churn.drop(columns=['Internet Type'], inplace=True)

In [8]:
payment_dummies = pd.get_dummies(customer_churn['Payment Method'], prefix='payment', dummy_na=True)
customer_churn = pd.concat([customer_churn, payment_dummies], axis=1)
customer_churn.drop(columns=['Payment Method'], inplace=True)

In [9]:
customer_churn = customer_churn.where(pd.notnull(customer_churn), None)

In [10]:
customer_churn['Gender'] = customer_churn['Gender'].map({'Male': 1, 'Female': 0}).fillna(0)
customer_churn['Married'] = customer_churn['Married'].map({'Yes': 1, 'No': 0}).fillna(0)
customer_churn['Offer_Offer A'] = customer_churn['Offer_Offer A'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['Offer_Offer B'] = customer_churn['Offer_Offer B'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['Offer_Offer C'] = customer_churn['Offer_Offer C'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['Offer_Offer D'] = customer_churn['Offer_Offer D'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['Offer_Offer E'] = customer_churn['Offer_Offer E'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['Offer_nan'] = customer_churn['Offer_nan'].map({'True': 1, 'False': 0}).fillna(0)

customer_churn['payment_Bank Withdrawal'] = customer_churn['payment_Bank Withdrawal'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['payment_Credit Card'] = customer_churn['payment_Credit Card'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['payment_Mailed Check'] = customer_churn['payment_Mailed Check'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['payment_nan'] = customer_churn['payment_nan'].map({'True': 1, 'False': 0}).fillna(0)

customer_churn['Multiple Lines'] = customer_churn['Multiple Lines'].map({'Yes': 1, 'No': 0}).fillna(0)
customer_churn['Internet Service'] = customer_churn['Internet Service'].map({'Yes': 1, 'No': 0}).fillna(0)
customer_churn['Phone Service'] = customer_churn['Phone Service'].map({'Yes': 1, 'No': 0}).fillna(0)

customer_churn['Online Security'] = customer_churn['Online Security'].map({'Yes': 1, 'No': 0}).fillna(0)
customer_churn['Online Backup'] = customer_churn['Online Backup'].map({'Yes': 1, 'No': 0}).fillna(0)
customer_churn['Device Protection Plan'] = customer_churn['Device Protection Plan'].map({'Yes': 1, 'No': 0}).fillna(0)
customer_churn['Premium Tech Support'] = customer_churn['Premium Tech Support'].map({'Yes': 1, 'No': 0}).fillna(0)
customer_churn['Streaming TV'] = customer_churn['Streaming TV'].map({'Yes': 1, 'No': 0}).fillna(0)
customer_churn['Streaming Movies'] = customer_churn['Streaming Movies'].map({'Yes': 1, 'No': 0}).fillna(0)
customer_churn['Streaming Music'] = customer_churn['Streaming Music'].map({'Yes': 1, 'No': 0}).fillna(0)
customer_churn['Unlimited Data'] = customer_churn['Unlimited Data'].map({'Yes': 1, 'No': 0}).fillna(0)		

customer_churn['internet_Cable'] = customer_churn['internet_Cable'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['internet_DSL'] = customer_churn['internet_DSL'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['internet_Fiber Optic'] = customer_churn['internet_Fiber Optic'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['internet_nan'] = customer_churn['internet_nan'].map({'True': 1, 'False': 0}).fillna(0)

customer_churn['Contract_Month-to-Month'] = customer_churn['Contract_Month-to-Month'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['Contract_One Year'] = customer_churn['Contract_One Year'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['Contract_Two Year'] = customer_churn['Contract_Two Year'].map({'True': 1, 'False': 0}).fillna(0)
customer_churn['Contract_nan'] = customer_churn['Contract_nan'].map({'True': 1, 'False': 0}).fillna(0)

customer_churn['Paperless Billing'] = customer_churn['Paperless Billing'].map({'Yes': 1, 'No': 0}).fillna(0)

customer_churn['Customer Status'] = customer_churn['Customer Status'].map({'Churned': 1,'Stayed': 0,'Joined': 0}).fillna(0)

In [11]:
# Revenue-related columns to process
revenue_cols = [
    'Monthly Charge', 'Total Charges', 'Total Refunds',
    'Total Extra Data Charges', 'Total Long Distance Charges',
    'Total Revenue', 'Avg Monthly Long Distance Charges', 'Avg Monthly GB Download'
]

# Numeric columns to scale (including Age, Referrals, Tenure)
num_cols = ['Age', 'Number of Referrals', 'Tenure in Months']

# Step 1: Fill missing values
customer_churn[revenue_cols] = customer_churn[revenue_cols].fillna(0)
customer_churn[num_cols] = customer_churn[num_cols].fillna(customer_churn[num_cols].median())

# Step 2: Clip negatives in revenue columns to zero (logical for revenue)
customer_churn[revenue_cols] = customer_churn[revenue_cols].clip(lower=0)

# Step 3: Log transform revenue columns
for col in revenue_cols:
    customer_churn[f'{col}_log'] = np.log1p(customer_churn[col])

log_cols = [f'{col}_log' for col in revenue_cols]

# Step 4: Initialize scaler once
scaler = StandardScaler()

# Step 5: Scale log-transformed revenue columns
customer_churn[[f'{col}_scaled' for col in revenue_cols]] = scaler.fit_transform(customer_churn[log_cols])

# Step 6: Scale other numeric columns (Age, Referrals, Tenure)
for col in num_cols:
    customer_churn[f'{col}_scaled'] = scaler.fit_transform(customer_churn[[col]])

# Step 7: Drop original, log columns for revenue and original numeric columns
cols_to_drop = revenue_cols + log_cols + num_cols
customer_churn.drop(columns=cols_to_drop, inplace=True)

# Step 8: Drop other unwanted columns
cols_to_drop_extra = [
    "Customer ID", "City", "Zip Code", "Latitude", "Longitude",
    "Churn Reason", "Churn Category"
]
customer_churn.drop(columns=cols_to_drop_extra, inplace=True)

In [12]:
customer_churn.head()

Unnamed: 0,Gender,Married,Number of Dependents,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection Plan,Premium Tech Support,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,Paperless Billing,Customer Status,Offer_Offer A,Offer_Offer B,Offer_Offer C,Offer_Offer D,Offer_Offer E,Offer_nan,Contract_Month-to-Month,Contract_One Year,Contract_Two Year,Contract_nan,internet_Cable,internet_DSL,internet_Fiber Optic,internet_nan,payment_Bank Withdrawal,payment_Credit Card,payment_Mailed Check,payment_nan,Monthly Charge_scaled,Total Charges_scaled,Total Refunds_scaled,Total Extra Data Charges_scaled,Total Long Distance Charges_scaled,Total Revenue_scaled,Avg Monthly Long Distance Charges_scaled,Avg Monthly GB Download_scaled,Age_scaled,Number of Referrals_scaled,Tenure in Months_scaled
0,0,1,0,1,0.0,1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.287808,-0.358594,-0.275856,-0.329738,0.264472,-0.275361,0.87552,0.319127,-0.567773,0.016039,-0.952994
1,1,0,0,1,1.0,1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.110033,-0.416516,4.094447,1.642307,-0.334335,-0.59014,-0.262005,0.013462,-0.030433,-0.650409,-0.952994
2,1,0,0,1,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.438801,-0.841182,-0.275856,-0.329738,-0.188845,-0.848437,0.680426,0.740969,0.208385,-0.650409,-1.15674
3,1,1,0,1,0.0,1,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.797439,0.116583,-0.275856,-0.329738,0.241178,0.057655,0.520636,-0.540164,1.88011,-0.317185,-0.789997
4,0,1,0,1,0.0,1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.599912,-0.872813,-0.275856,-0.329738,-0.961743,-1.09074,-0.550733,0.074558,1.700997,0.349263,-1.197489


# Split Training and Testing Datasets

In [16]:
x_train = customer_churn.drop(columns='Customer Status')
y_train = customer_churn["Customer Status"]

In [17]:
# First, rename the columns
x_train = x_train.rename(columns={
    'Gender': 'gender',  # Assuming 'Gender' from your initial prompt
    'Married': 'married', # Assuming 'Married' from your initial prompt
    'Number of Dependents': 'number_of_dependents',
    'Phone Service': 'phone_service',
    'Multiple Lines': 'multiple_lines',
    'Internet Service': 'internet_service',
    'Online Security': 'online_security',
    'Online Backup': 'online_backup',
    'Device Protection Plan': 'device_protection_plan',
    'Premium Tech Support': 'premium_tech_support',
    'Streaming TV': 'streaming_tv',
    'Streaming Movies': 'streaming_movies',
    'Streaming Music': 'streaming_music',
    'Unlimited Data': 'unlimited_data',
    'Paperless Billing': 'paperless_billing',
    'Offer_Offer A': 'offer_Offer A',
    'Offer_Offer B': 'offer_Offer B',
    'Offer_Offer C': 'offer_Offer C',
    'Offer_Offer D': 'offer_Offer D',
    'Offer_Offer E': 'offer_Offer E',
    'Offer_nan': 'offer_nan', # This was in your first "to be renamed" list, assuming it stays
    'Contract_Month-to-Month': 'contract_Month-to-Month',
    'Contract_One Year': 'contract_One Year',
    'Contract_Two Year': 'contract_Two Year',
    'internet_Cable': 'internet_type_Cable',
    'internet_DSL': 'internet_type_DSL',
    'internet_Fiber Optic': 'internet_type_Fiber Optic',
    'internet_nan': 'internet_type_nan', # This was in your first "to be renamed" list, assuming it stays
    'payment_Bank Withdrawal': 'payment_method_Bank Withdrawal',
    'payment_Credit Card': 'payment_method_Credit Card',
    'payment_Mailed Check': 'payment_method_Mailed Check',
    'Monthly Charge_scaled': 'monthly_charge_log_scaled',
    'Total Charges_scaled': 'total_charges_log_scaled',
    'Total Refunds_scaled': 'total_refunds_log_scaled',
    'Total Extra Data Charges_scaled': 'total_extra_data_charges_log_scaled',
    'Total Long Distance Charges_scaled': 'total_long_distance_charges_log_scaled',
    'Total Revenue_scaled': 'total_revenue_log_scaled',
    'Avg Monthly Long Distance Charges_scaled': 'avg_monthly_long_distance_charges_log_scaled',
    'Avg Monthly GB Download_scaled': 'avg_monthly_gb_download_log_scaled',
    'Age_scaled': 'age_scaled',
    'Number of Referrals_scaled': 'number_of_referrals_scaled',
    'Tenure in Months_scaled': 'tenure_in_months_scaled'
})

# Then, drop the specified columns
columns_to_drop = ['Contract_nan', 'payment_nan']
x_train = x_train.drop(columns=columns_to_drop)

In [18]:
x_train.shape

(7043, 42)

In [19]:
x_train.columns

Index(['gender', 'married', 'number_of_dependents', 'phone_service',
       'multiple_lines', 'internet_service', 'online_security',
       'online_backup', 'device_protection_plan', 'premium_tech_support',
       'streaming_tv', 'streaming_movies', 'streaming_music', 'unlimited_data',
       'paperless_billing', 'offer_Offer A', 'offer_Offer B', 'offer_Offer C',
       'offer_Offer D', 'offer_Offer E', 'offer_nan',
       'contract_Month-to-Month', 'contract_One Year', 'contract_Two Year',
       'internet_type_Cable', 'internet_type_DSL', 'internet_type_Fiber Optic',
       'internet_type_nan', 'payment_method_Bank Withdrawal',
       'payment_method_Credit Card', 'payment_method_Mailed Check',
       'monthly_charge_log_scaled', 'total_charges_log_scaled',
       'total_refunds_log_scaled', 'total_extra_data_charges_log_scaled',
       'total_long_distance_charges_log_scaled', 'total_revenue_log_scaled',
       'avg_monthly_long_distance_charges_log_scaled',
       'avg_monthly_gb_d

In [20]:
x_train.shape

(7043, 42)

# Classifier

In [21]:
clf = RandomForestClassifier(max_depth=2, random_state=0, n_estimators = 500)

grid = {'n_estimators': [100, 200, 500, 1000, 1500],
          'max_depth': [None],
          'max_features': ['auto', 'sqrt'],
          'min_samples_split': [6],
          'min_samples_leaf': [1, 2, 3, 4]}

In [22]:
# Setup GridSearchCV
grid_search = GridSearchCV(estimator=clf,
                      param_grid=grid, 
                      cv=5,
                      scoring='roc_auc',
                      verbose=2)

# Hyperparameter Tuning

In [23]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6,

100 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\user\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterE

In [24]:
grid_search.best_params_

{'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 6,
 'n_estimators': 1500}

In [25]:
grid_search.best_score_

0.8736692446447114

In [26]:
grid_search.best_estimator_

In [27]:
clf = grid_search.best_estimator_

# Classifier Fitting

In [28]:
clf.fit(x_train, y_train)

In [29]:
import joblib

joblib.dump(clf, 'churn_model.pkl')

['churn_model.pkl']