In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [3]:
df=pd.read_csv(r'./data/final_df.csv')

In [4]:
df

Unnamed: 0,gender,Partner,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group,is_long_term_contract,senior_with_dependents
0,0,1,0,2,2,0,1,0,0,0,0,1,0,0.115423,0.001275,0,0,0,0
1,1,0,1,0,2,1,0,1,0,0,0,0,1,0.385075,0.215867,0,2,0,0
2,1,0,1,0,2,1,1,0,0,0,0,1,1,0.354229,0.010310,1,0,0,0
3,1,0,0,2,2,1,0,1,1,0,0,0,2,0.239303,0.210241,0,3,0,0
4,0,0,1,0,1,0,0,0,0,0,0,1,0,0.521891,0.015330,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,1,1,1,2,1,0,1,1,1,1,1,1,0.662189,0.227521,0,1,0,0
7039,0,1,1,1,1,0,1,1,0,1,1,1,3,0.845274,0.847461,0,5,0,0
7040,0,1,0,2,2,1,0,0,0,0,0,1,0,0.112935,0.037809,0,0,0,0
7041,1,1,1,1,1,0,0,0,0,0,0,1,1,0.558706,0.033210,1,0,0,0


In [5]:
X=df.drop('Churn',axis=1)
y=df['Churn']

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [7]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
print(f'PCA reduced features from {X_train.shape[1]} to {X_train_pca.shape[1]}')

PCA reduced features from 18 to 10


In [8]:
import joblib
joblib.dump(pca, '../models/pca.pkl')

['../models/pca.pkl']

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
models={
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=100,random_state=42),
    'Support Vector Machine':SVC(kernel='linear',random_state=42),
    'CatBoost': CatBoostClassifier(iterations=100,random_state=42,verbose=0)
}

In [10]:
# Create a function which can evaluate models and return a report 
def evaluate_models(X, y, models):
    
    
    
    

    models_list = []
    scores = []
    
    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train_pca, y_train) # Train model

        # Make predictions
        y_pred = model.predict(X_test_pca)

        score = accuracy_score(y_test,y_pred)
        
        model_name = list(models.keys())[i]
        print(f'---- score for --- {model_name} ----')
        print(f"{score}")
        models_list.append(model_name)
        scores.append(score)
    
    print()
    
    report = pd.DataFrame()
    report['Model_name'] = models_list
    report['Score'] = scores        
    return report

In [11]:
report = evaluate_models(X, y, models)

---- score for --- Logistic Regression ----
0.8168914123491838
---- score for --- Random Forest ----
0.7785663591199432
---- score for --- Support Vector Machine ----
0.8147622427253371
---- score for --- CatBoost ----
0.8041163946061036



In [12]:
report.sort_values('Score')

Unnamed: 0,Model_name,Score
1,Random Forest,0.778566
3,CatBoost,0.804116
2,Support Vector Machine,0.814762
0,Logistic Regression,0.816891


In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [10, 100, 1000],
    'solver': ['liblinear', 'lbfgs', 'saga'],
    'penalty': ['l2'],
    'max_iter': [1000]
}

logreg = LogisticRegression()
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_pca, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
best_lr_model = grid_search.best_estimator_

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters: {'C': 10, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation score: 0.7957052865164909


In [14]:
best_lr_model = LogisticRegression(
    C = 10, 
    max_iter =  1000, 
    penalty =  'l2', 
    solver =  'liblinear'
)

In [16]:
from sklearn.metrics import classification_report
best_model = best_lr_model.fit(X_train_pca,y_train)
y_pred = best_model.predict(X_test_pca)
score = accuracy_score(y_test,y_pred)
cr = classification_report(y_test,y_pred)

print("Logistic regression")
print ("Accuracy Score value: {:.4f}".format(score))
print (cr)

Logistic regression
Accuracy Score value: 0.8176
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1036
           1       0.69      0.55      0.62       373

    accuracy                           0.82      1409
   macro avg       0.77      0.73      0.75      1409
weighted avg       0.81      0.82      0.81      1409



In [17]:
joblib.dump(best_model, '../models/logistic_regression.pkl')

['../models/logistic_regression.pkl']