In [67]:
# Import necessary functions

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report

sns.set(font_scale = 1)

In [68]:
# Import main dataset as dataframe

df = pd.read_csv('Data/churn.csv')

In [69]:
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   phone number            3333 non-null   object 
 4   international plan      3333 non-null   object 
 5   voice mail plan         3333 non-null   object 
 6   number vmail messages   3333 non-null   int64  
 7   total day minutes       3333 non-null   float64
 8   total day calls         3333 non-null   int64  
 9   total day charge        3333 non-null   float64
 10  total eve minutes       3333 non-null   float64
 11  total eve calls         3333 non-null   int64  
 12  total eve charge        3333 non-null   float64
 13  total night minutes     3333 non-null   float64
 14  total night calls       3333 non-null   

In [71]:
df['area code'].value_counts()

415    1655
510     840
408     838
Name: area code, dtype: int64

In [72]:
df_clean = df.drop(columns=['area code', 'phone number'], axis=1)
df_clean

Unnamed: 0,state,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,OH,107,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,NJ,137,no,no,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,yes,no,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,no,yes,36,156.2,77,26.55,215.5,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False
3329,WV,68,no,no,0,231.1,57,39.29,153.4,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False
3330,RI,28,no,no,0,180.8,109,30.74,288.8,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False
3331,CT,184,yes,no,0,213.8,105,36.35,159.6,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False


In [73]:
df_clean['international plan'].replace(['no','yes'], [0,1], inplace = True)
df_clean['voice mail plan'].replace(['no','yes'], [0,1], inplace = True)
df_clean['churn'].replace([False, True], [0,1], inplace = True)

In [74]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   international plan      3333 non-null   int64  
 3   voice mail plan         3333 non-null   int64  
 4   number vmail messages   3333 non-null   int64  
 5   total day minutes       3333 non-null   float64
 6   total day calls         3333 non-null   int64  
 7   total day charge        3333 non-null   float64
 8   total eve minutes       3333 non-null   float64
 9   total eve calls         3333 non-null   int64  
 10  total eve charge        3333 non-null   float64
 11  total night minutes     3333 non-null   float64
 12  total night calls       3333 non-null   int64  
 13  total night charge      3333 non-null   float64
 14  total intl minutes      3333 non-null   

In [75]:
df_clean['churn'].value_counts(normalize=True)

0    0.855086
1    0.144914
Name: churn, dtype: float64

In [76]:
# Correlation heatmap for intial cleansed dataset

#fig, ax = plt.subplots(figsize=(15, 18))

#sns.heatmap(df_clean.corr(), annot=True);

In [77]:
X = df_clean.drop(columns = ['churn', 'state'], axis = 1)
y = df_clean['churn']

dt = DecisionTreeClassifier(random_state=42)

dt.fit(X, y)

for fi, feature in zip(dt.feature_importances_, feature_used):
    print(fi, feature)

NameError: name 'feature_used' is not defined

In [None]:
# Separate data into feature and target DataFrames
X = df_clean.drop(columns = ['churn', 'state'], axis = 1)
y = df_clean['churn']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=1)

# Scale the data for modeling
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# Train a logistic regresssion model with the train data
log_model = LogisticRegression(random_state=42)
log_model.fit(X_train_sc, y_train)

In [None]:
log_model.score(X_test_sc, y_test)

In [None]:
# get importance
importance = log_model.coef_[0]

# summarize feature importance
for i,v in enumerate(importance):
 print('Feature: %0d, Score: %.5f' % (i,v))

In [None]:
X.info()

In [None]:
# plot feature importance
fig, ax = plt.subplots(figsize=(15, 18))
plt.bar([x for x in range(len(importance))], importance)
plt.show()

In [None]:
df_clean['customer service calls'].value_counts(normalize=True)

In [78]:
csc_calls = df_clean.groupby('customer service calls').sum()
csc_calls.reset_index(drop=False, inplace=True)

In [79]:
csc_calls

Unnamed: 0,customer service calls,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,churn
0,0,70608,83,193,5676,126739.3,70327,21546.13,140107.7,69770,11909.35,140946.9,69354,6342.71,7239.9,3098,1955.11,92
1,1,120202,112,344,9960,211936.3,119343,36029.75,239056.2,118254,20319.96,236393.6,118679,10637.73,11986.1,5365,3236.91,122
2,2,75312,62,215,6293,134156.1,75342,22807.02,152042.4,76194,12923.8,152518.0,76987,6863.37,7800.5,3397,2106.54,87
3,3,43515,38,98,2928,78642.8,42864,13369.41,84952.2,42390,7221.03,86780.0,42472,3905.12,4348.8,1932,1174.33,44
4,4,17042,21,42,1245,30589.2,16895,5200.25,33543.7,16948,2851.21,33050.7,16130,1487.31,1697.0,714,458.31,76
5,5,6769,6,17,479,11573.7,6568,1967.59,13073.9,6697,1111.31,12668.3,6450,570.13,698.7,292,188.67,40
6,6,1984,0,9,284,3322.2,2142,564.78,4335.6,2027,368.54,4448.1,2191,200.17,221.1,73,59.71,14
7,7,1045,0,2,69,1404.1,793,238.7,1967.8,969,167.26,1818.5,994,81.83,88.2,42,23.81,5
8,8,167,0,1,40,360.6,241,61.3,401.8,215,34.15,390.0,198,17.55,16.3,12,4.4,1
9,9,205,1,1,20,466.1,237,79.24,386.2,217,32.83,492.4,204,22.15,24.3,5,6.56,2
