In [25]:
import pandas as pd 
df =pd.read_csv(r".\data\customer_churn.csv")

In [26]:
df.columns

Index(['Names', 'Age', 'Total_Purchase', 'Account_Manager', 'Years',
       'Num_Sites', 'Onboard_date', 'Location', 'Company', 'Churn'],
      dtype='object')

In [27]:
df.head()

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn
0,Cameron Williams,42.0,11066.8,0,7.22,8.0,2013-08-30 07:00:40,"10265 Elizabeth Mission Barkerburgh, AK 89518",Harvey LLC,1
1,Kevin Mueller,41.0,11916.22,0,6.5,11.0,2013-08-13 00:38:46,"6157 Frank Gardens Suite 019 Carloshaven, RI 1...",Wilson PLC,1
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,2016-06-29 06:20:07,"1331 Keith Court Alyssahaven, DE 90114","Miller, Johnson and Wallace",1
3,Phillip White,42.0,8010.76,0,6.71,10.0,2014-04-22 12:43:12,"13120 Daniel Mount Angelabury, WY 30645-4695",Smith Inc,1
4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,2016-01-19 15:31:15,"765 Tricia Row Karenshire, MH 71730",Love-Jones,1


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Names            900 non-null    object 
 1   Age              900 non-null    float64
 2   Total_Purchase   900 non-null    float64
 3   Account_Manager  900 non-null    int64  
 4   Years            900 non-null    float64
 5   Num_Sites        900 non-null    float64
 6   Onboard_date     900 non-null    object 
 7   Location         900 non-null    object 
 8   Company          900 non-null    object 
 9   Churn            900 non-null    int64  
dtypes: float64(4), int64(2), object(4)
memory usage: 70.4+ KB


In [29]:
df.drop(columns =['Names'], axis =1, inplace =True)

In [30]:
date_column = 'Onboard_date'
categorical_features = [feature for feature in df.columns if df[feature].dtype=='object' and feature!=date_column]

In [31]:
categorical_features

['Location', 'Company']

In [33]:
numerical_features =df.columns[~df.columns.isin(categorical_features + [date_column])].to_list()

In [27]:
for feature in categorical_features:
    print(f"The '{feature}' has {df[feature].nunique()} values.")

The 'Onboard_date' has 900 values.
The 'Location' has 900 values.
The 'Company' has 873 values.


In [34]:
df['Onboard_date'] = pd.to_datetime(df['Onboard_date'])


In [35]:
df['Onboard_year'] = df['Onboard_date'].dt.year
df['Onboard_month'] = df['Onboard_date'].dt.month
df['Onboard_day'] = df['Onboard_date'].dt.day
df['Onboard_dayofweek'] = df['Onboard_date'].dt.dayofweek
df['Onboard_weekofyear'] = df['Onboard_date'].dt.isocalendar().week

In [36]:
df.drop(columns =['Onboard_date'], axis =1, inplace =True)

In [37]:
# Example with target 'Churn'
from category_encoders import TargetEncoder


In [38]:
encoder = TargetEncoder()
df['Location_encoded'] = encoder.fit_transform(df['Location'], df['Churn'])

In [39]:
df.drop(columns =['Location'], axis =1, inplace =True)

In [40]:
freq = df['Company'].value_counts() / len(df)
df['Company_freq'] = df['Company'].map(freq)
df.drop(columns =['Company'], axis =1, inplace =True)

In [41]:
df.head()

Unnamed: 0,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn,Onboard_year,Onboard_month,Onboard_day,Onboard_dayofweek,Onboard_weekofyear,Location_encoded,Company_freq
0,42.0,11066.8,0,7.22,8.0,1,2013,8,30,4,35,0.27509,0.001111
1,41.0,11916.22,0,6.5,11.0,1,2013,8,13,1,33,0.27509,0.003333
2,38.0,12884.75,0,6.67,12.0,1,2016,6,29,2,26,0.27509,0.001111
3,42.0,8010.76,0,6.71,10.0,1,2014,4,22,1,17,0.27509,0.002222
4,37.0,9191.58,0,5.56,9.0,1,2016,1,19,1,3,0.27509,0.001111


In [43]:
X =df.drop(columns=['Churn'], axis =1)
y =df['Churn']

In [42]:
from sklearn.model_selection import train_test_split

In [44]:
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size= 0.25, random_state= 42)

In [45]:
print(f"Shape of training data: {X_train.size}")
print(f"Shape of validation data: {y_train.size}")
print(f"Shape of testing data: {X_test.size}")
print(f"Shape of testing validation data: {y_test.size}")

Shape of training data: 8100
Shape of validation data: 675
Shape of testing data: 2700
Shape of testing validation data: 225


In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import make_pipeline

In [51]:
model =make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=2000, solver='saga')
)

In [52]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('standardscaler', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,2000


In [53]:
from sklearn.metrics import classification_report

In [54]:
y_pred =model.predict(X_test)

In [56]:
report =classification_report(y_test, y_pred)

In [62]:
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       186
           1       1.00      1.00      1.00        39

    accuracy                           1.00       225
   macro avg       1.00      1.00      1.00       225
weighted avg       1.00      1.00      1.00       225



In [70]:
X_test.iloc[0,:]

Age                       41.0
Total_Purchase        13365.66
Account_Manager            1.0
Years                     8.36
Num_Sites                  9.0
Onboard_year            2008.0
Onboard_month              7.0
Onboard_day               26.0
Onboard_dayofweek          5.0
Onboard_weekofyear        30.0
Location_encoded       0.27509
Company_freq          0.001111
Name: 70, dtype: Float64

In [72]:
res =model.predict([X_test.iloc[0,:]])



1