In [5]:
!pip install xgboost
!pip install joblib
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.1-cp311-cp311-win_amd64.whl (8.9 MB)
   ---------------------------------------- 0.0/8.9 MB ? eta -:--:--
   ---- ----------------------------------- 1.0/8.9 MB 10.1 MB/s eta 0:00:01
   ----- ---------------------------------- 1.3/8.9 MB 4.8 MB/s eta 0:00:02
   --------------- ------------------------ 3.4/8.9 MB 6.1 MB/s eta 0:00:01
   ----------------------- ---------------- 5.2/8.9 MB 6.8 MB/s eta 0:00:01
   ---------------------------- ----------- 6.3/8.9 MB 6.4 MB/s eta 0:00:01
   ------------------------------------- -- 8.4/8.9 MB 7.1 MB/s eta 0:00:01
   ---------------------------------------- 8.9/8.9 MB 6.8 MB/s eta 0:00:00
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-

In [6]:
# Import Necessary Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
import joblib
from sklearn import metrics
import warnings

In [7]:
# Settings
pd.set_option('display.max_columns',None)
warnings.filterwarnings('ignore')

In [8]:
data = pd.read_csv("data_cleaned.csv")
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
# Data Backup
df= data.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 


In [10]:
# Encoding 'Churn'
df['Churn']=df['Churn'].map({'No':0,'Yes':1})

# Separate target and feature
X=df.drop(columns='Churn',axis=1)
y=df['Churn']

# Separating Numeric and Categorical Columns
numeric_columns=X.select_dtypes(include=[int,float]).columns.to_list()
categorical_columns=X.select_dtypes(include=[object]).columns.to_list()
print('Numeric columns are:\n',numeric_columns)
print('Categorical Columns are \n',categorical_columns)

# Split dataset
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

Numeric columns are:
 ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical Columns are 
 ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [12]:
# Data Processing
preprocessor= ColumnTransformer(transformers=[
    ('num',StandardScaler(), numeric_columns),
    ('cat',OneHotEncoder(handle_unknown='ignore',sparse_output=False),categorical_columns)
])

# Fit and Transform training data, transform test data
X_train_processed=preprocessor.fit_transform(X_train)
X_test_processed=preprocessor.transform(X_test)

In [15]:
# Model Details
model_dict={
    'LogisticRegression':LogisticRegression(max_iter=1000),
    'SVC': SVC(probability=True),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False,eval_metrics='logclass')
}
# Hyper Parameter Details
search_space={
    'C':[0.1,1,10],
    'kernel': ['linear','rbf'],
    'n_estimator': [100,200],
    'max_depth': [None,10],
    'learning_rate':[0.5,1]
}
# Custom Function to Filter HyperParameter
def filterhyperparameter(model,space):
    valid_key=model.get_params().keys()
    param_grid={k:v for k,v in space.items() if k in valid_key}
    return param_grid
    

In [17]:
# Model Training with Grid Search
result=[]
for name,model in model_dict.items():
    print(f'Tuning For:{name}')

    param_grid=filterhyperparameter(model,search_space)
    
    grid=GridSearchCV(estimator=model,param_grid=param_grid,cv=5,scoring='accuracy',n_jobs=-1)
    grid.fit(X_train_processed,y_train)
    
    best_model=grid.best_estimator_
    y_pred=best_model.predict(X_test_processed)
    
    report=metrics.classification_report(y_test,y_pred,output_dict=True)
    best_params=grid.best_params_

    print(f"Best Params for {name}:{best_params}")
    
    result.append({
    'Model_Name': name,
    'Best_Parameter': best_params,
    'Accuracy': round(metrics.accuracy_score(y_test,y_pred),4),
    'F1_score': round(report['weighted avg']['f1-score'],4)
    })

print('Tuning is complete')

Tuning For:LogisticRegression
Best Params for LogisticRegression:{'C': 10}
Tuning For:SVC
Best Params for SVC:{'C': 1, 'kernel': 'linear'}
Tuning For:Random Forest
Best Params for Random Forest:{'max_depth': 10}
Tuning For:AdaBoost
Best Params for AdaBoost:{'learning_rate': 1}
Tuning For:XGBoost
Best Params for XGBoost:{'learning_rate': 0.5, 'max_depth': None}
Tuning is complete


In [19]:
# Model Comparisison
model_df=pd.DataFrame(result)
model_df

Unnamed: 0,Model_Name,Best_Parameter,Accuracy,F1_score
0,LogisticRegression,{'C': 10},0.7882,0.7815
1,SVC,"{'C': 1, 'kernel': 'linear'}",0.7953,0.7895
2,Random Forest,{'max_depth': 10},0.7918,0.7825
3,AdaBoost,{'learning_rate': 1},0.7875,0.7791
4,XGBoost,"{'learning_rate': 0.5, 'max_depth': None}",0.7591,0.7542


In [20]:
# Best Model
model_df_sorted=model_df.sort_values(by='Accuracy', ascending=False)
best_row=model_df_sorted.iloc[0]
best_model_name=best_row['Model_Name']
best_parameter=best_row['Best_Parameter']
print(f'Best Model is:{best_model_name}')
print("Best Hyperparameter are:\n",best_parameter)

Best Model is:SVC
Best Hyperparameter are:
 {'C': 1, 'kernel': 'linear'}


In [22]:
#Final Model
final_model=model_dict[best_model_name].set_params(**best_parameter)

# Retrain 
X_full_processed= preprocessor.transform(X)
final_model.fit(X_full_processed,y)

0,1,2
,C,1
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [23]:
# Save Pipeline
deployment_pipeline = ([
    ('preprocessor',preprocessor),
    ('Model',final_model),
])

# Save Model
joblib.dump(deployment_pipeline,"Churn_pipeline.pkl")
print('Final Model is saved as "Churn_pipeline"')

Final Model is saved as "Churn_pipeline"
