In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.linear_model import SGDClassifier

In [2]:
# Read each split, discard rows with missing values, and remove the
# CustomerID column.
train = (
    pd.read_csv('customer_churn_dataset-training-master.csv')
    .dropna()
    .drop(columns='CustomerID')
)

test = (
    pd.read_csv('customer_churn_dataset-testing-master.csv')
    .dropna()
    .drop(columns='CustomerID')
)

In [3]:
def age_cat(df, col):
    """Replace numeric ages with age-band labels.

    Reads the numeric values in ``df[col]`` and stores one of the labels
    ``'18-25'``, ``'26-30'``, ``'31-40'``, ``'41-50'`` or ``'51+'`` in
    ``df['Age']``. Ages below 18 and missing ages are stored as ``NaN``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the age column. It is modified in place.
    col : str
        Name of the column containing the numeric ages.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``'Age'`` holding the band labels.
    """
    ages = df[col]
    # (label, membership mask) pairs; bands are contiguous and each upper
    # bound is inclusive.
    bands = [
        ('18-25', (ages >= 18) & (ages <= 25)),
        ('26-30', (ages > 25) & (ages <= 30)),
        ('31-40', (ages > 30) & (ages <= 40)),
        ('41-50', (ages > 40) & (ages <= 50)),
        # "> 50" (not ">= 51") so values between 50 and 51 still get a label.
        ('51+', ages > 50),
    ]
    # Fill an object array that starts as real NaN. Passing string choices
    # together with a float NaN default to np.select either stringifies the
    # default to 'nan' or fails on dtype promotion, depending on the NumPy
    # version.
    labels = np.full(len(df), np.nan, dtype=object)
    for label, mask in bands:
        labels[mask.to_numpy()] = label
    df['Age'] = labels
    return df

In [4]:
# Bin the ages of both splits into labelled bands.
# NOTE(review): age_cat compares the column against numbers, so this cell
# presumably cannot be re-run once 'Age' holds labels - reload the data first.
train, test = (age_cat(split, 'Age') for split in (train, test))

In [5]:
# Split each frame into predictors and the 'Churn' target; 'Payment Delay'
# is left out of the predictors as well.
x_train, y_train = train.drop(columns=['Churn', 'Payment Delay']), train['Churn']
x_test, y_test = test.drop(columns=['Churn', 'Payment Delay']), test['Churn']

In [6]:
# Display the training feature frame (the 'Churn' and 'Payment Delay' columns have been removed).
x_train

Unnamed: 0,Age,Gender,Tenure,Usage Frequency,Support Calls,Subscription Type,Contract Length,Total Spend,Last Interaction
0,26-30,Female,39.0,14.0,5.0,Standard,Annual,932.00,17.0
1,51+,Female,49.0,1.0,10.0,Basic,Monthly,557.00,6.0
2,51+,Female,14.0,4.0,6.0,Basic,Quarterly,185.00,3.0
3,51+,Male,38.0,21.0,7.0,Standard,Monthly,396.00,29.0
4,18-25,Male,32.0,20.0,5.0,Basic,Monthly,617.00,20.0
...,...,...,...,...,...,...,...,...,...
440828,41-50,Male,54.0,15.0,1.0,Premium,Annual,716.38,8.0
440829,18-25,Female,8.0,13.0,1.0,Premium,Annual,745.38,2.0
440830,26-30,Male,35.0,27.0,1.0,Standard,Quarterly,977.31,9.0
440831,26-30,Male,55.0,14.0,2.0,Standard,Quarterly,602.55,2.0


In [9]:
# Print column dtypes and non-null counts for the training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 440832 entries, 0 to 440832
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Age                440832 non-null  object 
 1   Gender             440832 non-null  object 
 2   Tenure             440832 non-null  float64
 3   Usage Frequency    440832 non-null  float64
 4   Support Calls      440832 non-null  float64
 5   Subscription Type  440832 non-null  object 
 6   Contract Length    440832 non-null  object 
 7   Total Spend        440832 non-null  float64
 8   Last Interaction   440832 non-null  float64
dtypes: float64(5), object(4)
memory usage: 33.6+ MB


In [10]:
# Preprocessing by column position in the feature frame
# (0 Age, 1 Gender, 2 Tenure, 3 Usage Frequency, 4 Support Calls,
#  5 Subscription Type, 6 Contract Length, 7 Total Spend, 8 Last Interaction).
transformation = ColumnTransformer(
    transformers=[
        # Gender: one-hot encoded with the first category dropped.
        (
            'One_hot_encoding',
            OneHotEncoder(sparse_output=False, drop='first'),
            [1],
        ),
        # Subscription Type, Contract Length, Age: integer codes that follow
        # the explicit category order listed for each column.
        (
            'ordinal_encoding',
            OrdinalEncoder(
                categories=[
                    ['Basic', 'Standard', 'Premium'],
                    ['Monthly', 'Quarterly', 'Annual'],
                    ['18-25', '26-30', '31-40', '41-50', '51+'],
                ]
            ),
            [5, 6, 0],
        ),
        # Numeric columns: standardized.
        (
            'Standardization',
            StandardScaler(),
            [2, 3, 4, 7, 8],
        ),
    ],
    remainder='passthrough',
)

In [11]:
# Preprocessing followed by a linear classifier trained with SGD.
pipe = Pipeline([
    ('Step_1', transformation),
    (
        'Step_2',
        SGDClassifier(
            # NOTE(review): scikit-learn documents 'epsilon_insensitive' as a
            # regression-oriented loss; confirm it is the intended choice here.
            loss='epsilon_insensitive',
            penalty='elasticnet',
            max_iter=1000,
            shuffle=True,
            learning_rate='invscaling',
            eta0=0.01,
            early_stopping=True,
            n_iter_no_change=200,
            # Fixed seed: shuffling and the early-stopping validation split
            # are random, so without it every run yields a different model.
            random_state=42,
        ),
    ),
])

pipe.fit(x_train, y_train)

In [12]:
# Predict labels for the test features with the fitted pipeline.
pred = pipe.predict(x_test)

In [13]:
# Accuracy of the predictions against the test labels.
accuracy_score(y_test, pred)

0.6043433684406748

In [14]:
# Confusion matrix of the test labels versus the predictions.
confusion_matrix(y_test, pred)

array([[11278, 22603],
       [ 2867, 27626]], dtype=int64)

In [16]:
import pickle

In [17]:
# Persist the fitted pipeline; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('churn_pred_model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)