In [141]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import f1_score

In [120]:
# Load the SyriaTel customer churn CSV (path is relative to the notebook's location).
data = pd.read_csv(filepath_or_buffer='../../src/data/syriatel_customer_churn.csv')

In [121]:
# Display the raw dataframe; the notebook renders a truncated preview of its rows and columns.
data

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.90,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,414-4276,no,yes,36,156.2,77,26.55,...,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False
3329,WV,68,415,370-3271,no,no,0,231.1,57,39.29,...,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False
3330,RI,28,510,328-8230,no,no,0,180.8,109,30.74,...,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False
3331,CT,184,510,364-6381,yes,no,0,213.8,105,36.35,...,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False


In [173]:
# Summarize the raw dataframe: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   phone number            3333 non-null   object 
 4   international plan      3333 non-null   object 
 5   voice mail plan         3333 non-null   object 
 6   number vmail messages   3333 non-null   int64  
 7   total day minutes       3333 non-null   float64
 8   total day calls         3333 non-null   int64  
 9   total day charge        3333 non-null   float64
 10  total eve minutes       3333 non-null   float64
 11  total eve calls         3333 non-null   int64  
 12  total eve charge        3333 non-null   float64
 13  total night minutes     3333 non-null   float64
 14  total night calls       3333 non-null   

In [174]:
# Target balance: how many customers churned (True) versus stayed (False).
data['churn'].value_counts()

False    2850
True      483
Name: churn, dtype: int64

In [175]:
# Checking phone number: count how often each distinct value occurs.
data['phone number'].value_counts()

# Every value occurs exactly once (3333 distinct values for 3333 rows), so this column is dropped.

360-3126    1
394-8086    1
416-1845    1
361-9839    1
344-5973    1
           ..
400-4344    1
334-4354    1
364-3567    1
337-3868    1
335-4584    1
Name: phone number, Length: 3333, dtype: int64

In [176]:
# Remove the phone number column. The result is bound to a new name, df,
# so the original dataframe (data) is preserved unchanged.
df = data.drop(columns=['phone number'])

In [177]:
# Re-run info() on df to confirm that 'phone number' is no longer among the columns.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   international plan      3333 non-null   object 
 4   voice mail plan         3333 non-null   object 
 5   number vmail messages   3333 non-null   int64  
 6   total day minutes       3333 non-null   float64
 7   total day calls         3333 non-null   int64  
 8   total day charge        3333 non-null   float64
 9   total eve minutes       3333 non-null   float64
 10  total eve calls         3333 non-null   int64  
 11  total eve charge        3333 non-null   float64
 12  total night minutes     3333 non-null   float64
 13  total night calls       3333 non-null   int64  
 14  total night charge      3333 non-null   

In [178]:
# Checking area code value counts.
df['area code'].value_counts()

# Will keep this column.
# NOTE(review): area code is int64 with only three distinct values, so the later
# dtype-based numeric/categorical split standardizes it as a number instead of
# one-hot encoding it -- confirm that is intended.

415    1655
510     840
408     838
Name: area code, dtype: int64

In [179]:
# Separate the target (churn) from the remaining feature columns.
y = df['churn']
X = df.drop(columns=['churn'])

In [180]:
# Count how many feature columns there are of each dtype
# (object columns are the ones handled as categoricals during preprocessing).
X.dtypes.value_counts()

float64    8
int64      8
object     3
dtype: int64

In [181]:
# Hold out a test set; random_state pins the shuffle so the split is reproducible.
# test_size=0.25 is scikit-learn's default, spelled out here for clarity.
# NOTE(review): the split is not stratified although churn is imbalanced
# (483 True vs 2850 False) -- consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [182]:
# Sanity check: features and target must have the same number of rows in each split.
print(f"{X_train.shape} {y_train.shape}")
print(f"{X_test.shape} {y_test.shape}")

(2499, 19) (2499,)
(834, 19) (834,)


### Checking for duplicates

In [183]:
# Checking for duplicates: duplicated() flags rows that repeat an earlier row,
# value_counts() then tallies flagged (True) vs. unflagged (False) rows.
# Only the training features are checked here.

X_train.duplicated().value_counts()

False    2499
dtype: int64

### Preprocessing

#### Train data

In [206]:
# Partition the training features by dtype: object columns are treated as
# categorical, every other column as numerical.

X_tr_cat = X_train.select_dtypes(include=['object'])  # categorical cols
X_tr_num = X_train.select_dtypes(exclude=['object'])  # numerical cols

In [185]:
# One-hot encode the categorical training columns. drop='first' omits the first
# level of each feature; sparse=False makes the encoder return a dense array.
# NOTE(review): `sparse=` and `get_feature_names` are the older scikit-learn
# spellings (newer releases use `sparse_output=` / `get_feature_names_out`) --
# confirm against the installed version before upgrading.

ohe = OneHotEncoder(drop='first', sparse=False)
ohe.fit(X_tr_cat)

X_tr_ohe = pd.DataFrame(
    data=ohe.transform(X_tr_cat),
    columns=ohe.get_feature_names(X_tr_cat.columns),
    index=X_tr_cat.index,
)
print(X_tr_ohe.shape)


# Standardize the numerical training columns; the scaler is fit on these
# training columns so the same fitted scaler can be reused later.

ss = StandardScaler()
ss.fit(X_tr_num)

X_tr_sc = pd.DataFrame(
    data=ss.transform(X_tr_num),
    columns=X_tr_num.columns,
    index=X_tr_num.index,
)
print(X_tr_sc.shape)

(2499, 52)
(2499, 16)


In [186]:
# Combine the scaled numerical columns with the one-hot encoded columns,
# aligned on the shared row index, then preview the first five rows.

X_tr_final = X_tr_sc.join(X_tr_ohe, how='left')
X_tr_final.head(n=5)

Unnamed: 0,account length,area code,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,...,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,international plan_yes,voice mail plan_yes
367,-1.404508,-0.512381,-0.5847,-1.883677,1.330852,-1.88417,1.037727,0.40134,1.037905,1.069609,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3103,0.366388,-0.512381,-0.5847,0.294083,0.529165,0.293703,0.516178,0.40134,0.517286,2.214376,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
549,0.518179,-0.679077,1.685101,1.056392,-1.875896,1.056666,0.093407,0.849774,0.094283,-0.077125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2531,2.010792,-0.512381,-0.5847,-0.679156,1.68159,-0.67932,-0.402459,0.65047,-0.403094,-0.322994,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2378,0.290493,1.749923,-0.5847,0.48466,1.080325,0.484172,-0.718549,-0.296224,-0.719184,-1.186487,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## FSM

#### Preprocessing Test Data (same steps as applied to the train data)

In [205]:
# Partition the test features by dtype, mirroring the split used for the
# training data: object columns are categorical, the rest numerical.

X_te_cat = X_test.select_dtypes(include=['object'])  # categorical cols
X_te_num = X_test.select_dtypes(exclude=['object'])  # numerical cols

In [188]:
# One-hot encode the categorical test columns with the already-fitted encoder
# (transform only -- the encoder is not re-fit here).

X_te_ohe = pd.DataFrame(
    data=ohe.transform(X_te_cat),
    columns=ohe.get_feature_names(X_te_cat.columns),
    index=X_te_cat.index,
)
print(X_te_ohe.shape)


# Standardize the numerical test columns with the already-fitted scaler
# (transform only -- the scaler is not re-fit here).

X_te_sc = pd.DataFrame(
    data=ss.transform(X_te_num),
    columns=X_te_num.columns,
    index=X_te_num.index,
)
print(X_te_sc.shape)

(834, 52)
(834, 16)


In [189]:
# Combine the scaled numerical test columns with the one-hot encoded test
# columns, aligned on the shared row index, then preview the first five rows.

X_te_final = X_te_sc.join(X_te_ohe, how='left')
X_te_final.head(n=5)

Unnamed: 0,account length,area code,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,...,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,international plan_yes,voice mail plan_yes
438,0.315791,1.749923,-0.5847,-0.462675,-0.372733,-0.46273,2.562862,0.301688,2.562574,-0.220713,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2674,-0.847941,-0.512381,-0.5847,-1.311946,0.829797,-1.311676,0.326524,1.198556,0.326702,-0.240382,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1345,-0.063687,-0.512381,-0.5847,-3.330584,-5.032539,-3.330643,-0.815352,1.497512,-0.814476,-0.659343,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1957,1.175941,-0.679077,-0.5847,0.606778,-1.074209,0.60716,0.063774,-0.445702,0.064068,-0.873741,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2148,-0.114284,-0.679077,-0.5847,-0.666204,0.078216,-0.666259,0.47074,-1.342571,0.470802,0.53263,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [202]:
# lg1: logistic regression with default hyperparameters, fit on the fully
# preprocessed training data (fit() returns the fitted estimator itself).

lg1 = LogisticRegression().fit(X_tr_final, y_train)

# Predict on the preprocessed test data and report the F1 score of those predictions.

lg1_y_pred = lg1.predict(X_te_final)

f1_score(y_true=y_test, y_pred=lg1_y_pred)

0.29585798816568043

In [203]:
# Accuracy score of lg1 on the preprocessed test data
# (score() on a classifier returns mean accuracy).

lg1.score(X_te_final, y_test)

0.8573141486810552

In [196]:
# 5-fold cross-validated mean F1 and accuracy, computed separately on the
# train X/y and on the test X/y, each time with a fresh LogisticRegression.

train_f1 = cross_val_score(LogisticRegression(), X_tr_final, y_train, cv=5, scoring='f1').mean()
train_acc = cross_val_score(LogisticRegression(), X_tr_final, y_train, cv=5, scoring='accuracy').mean()
print(f"train f1: {train_f1},",
      f"train accuracy: {train_acc},")

test_f1 = cross_val_score(LogisticRegression(), X_te_final, y_test, cv=5, scoring='f1').mean()
test_acc = cross_val_score(LogisticRegression(), X_te_final, y_test, cv=5, scoring='accuracy').mean()
print(f"test f1: {test_f1},",
      f"test accuracy: {test_acc}")

# Train and test figures are close, so it doesn't seem like we're overfitting.
# NOTE(review): the "test" figures come from new models fit on folds of the test
# set, not from lg1 -- so they do not measure how the train-fitted model
# generalizes. Confirm whether comparing against lg1's held-out scores is
# what was intended for the overfitting check.

train f1: 0.30778284332902006, train accuracy: 0.861543887775551,
test f1: 0.32727757661968193, test accuracy: 0.8573046677728879


In [204]:
# Checking coefs of lg1: the fitted coefficient array, one value per column of X_tr_final.

lg1.coef_

array([[ 3.42282690e-02,  7.48929488e-02,  4.07617559e-02,
         3.15815996e-01,  1.53552083e-02,  3.18313663e-01,
         1.78013397e-01,  2.98169143e-02,  1.68843811e-01,
         7.82387562e-02, -3.91069798e-03,  7.05477538e-02,
         1.11785181e-01, -2.35494667e-01,  1.38455643e-01,
         6.82969187e-01, -6.47462821e-01,  4.17927040e-01,
        -4.84305568e-01,  4.61975769e-01, -2.88514498e-01,
         7.55128067e-02, -1.73909943e-03, -1.51947316e-01,
        -1.19945057e-01,  2.18493991e-01, -5.24436540e-01,
        -7.30276249e-01, -7.05429473e-02, -6.91447231e-01,
         1.11098377e-01,  3.10835920e-01,  2.35532947e-01,
        -2.64898676e-01,  2.15474131e-01,  4.23443500e-01,
         5.53475117e-01,  4.25963327e-01,  3.92810968e-01,
        -1.84611332e-01,  4.88414230e-01,  9.14469511e-01,
         1.61533517e-01, -1.06253891e+00,  9.82361505e-02,
         1.29649364e-01,  6.33578349e-01, -1.05375801e-01,
         5.79007935e-01,  2.99309649e-01,  8.03035209e-0