In [2]:
import pandas as pd
from imblearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [23]:
# pulled in dataframe
df = pd.read_csv('../../src/data/syriatel_customer_churn.csv')

In [25]:
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [26]:
# dropped area code and phone number. still looking into if area code should be included
df = df.drop(['area code', 'phone number'], axis = 1)

In [27]:
X = df.drop('churn', axis=1)
y = df['churn']

In [33]:
X.to_csv('../../src/data/X_dataframe.csv')


In [32]:
y.to_csv('../../src/data/y_dataframe.csv')

In [6]:
from imblearn.pipeline import make_pipeline, Pipeline

In [7]:
# train-test-split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
# Our first model should have accuracy above 86 percent.
dummy_model = DummyClassifier(strategy="most_frequent")
dummy_model.fit(X_train, y_train)
dummy_model.score(X_train, y_train)

0.8567426970788315

In [9]:
def One_hot(X):
    '''
    Returns data frame of X_train and X_test of catagorical columns one 
    hot encoded merged with continous columns. 
    Catagorical anything that is an object and continous is not object
    '''
    X_num = X.select_dtypes(exclude='object') # numerical cols
    X_cat = X.select_dtypes(include='object') # categorical cols
    
    ohe = OneHotEncoder(drop='first', sparse=False)
    ohe.fit(X_cat)

    X_catagory = pd.DataFrame(ohe.transform(X_cat),
                                      columns=ohe.get_feature_names(),
                                        index=X.index)
    X_ohe = X_num.merge(X_catagory,
                           left_index=True,
                          right_index=True)
    


    return X_ohe

In [10]:
X_train_ohe = One_hot(X_train)

In [11]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train_ss = pd.DataFrame(ss.fit_transform(X_train_ohe), index= X_train_ohe.index, columns= X_train_ohe.columns)

In [21]:
X_train_ss.head(2)

Unnamed: 0,account length,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,...,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY,x1_yes,x2_yes
367,-1.404508,-0.5847,-1.883677,1.330852,-1.88417,1.037727,0.40134,1.037905,1.069609,0.00494,...,-0.150014,-0.144338,-0.156845,-0.144338,-0.147201,-0.163417,-0.175899,-0.1555,-0.327448,-0.611418
3103,0.366388,-0.5847,0.294083,0.529165,0.293703,0.516178,0.40134,0.517286,2.214376,0.670832,...,-0.150014,-0.144338,-0.156845,-0.144338,-0.147201,-0.163417,-0.175899,-0.1555,-0.327448,-0.611418


In [12]:
# for reproducibility purposes
sm = SMOTE(random_state=42)


# SMOTE number of neighbors


X_resample, y_resampled = sm.fit_resample(X_train_ss, y_train)

In [13]:
y_resampled.value_counts()

True     2141
False    2141
Name: churn, dtype: int64

In [14]:
X_test_ohe = One_hot(X_test)

In [18]:
ss = StandardScaler()
ss.fit(X_train_ohe)
X_test_ss = pd.DataFrame(ss.transform(X_test_ohe), index= X_test_ohe.index, columns= X_test_ohe.columns)

In [19]:
lg1 = LogisticRegression(random_state=42)

lg1.fit(X_resample, y_resampled)
y_hat_log1 = lg1.predict(X_test_ss)

print(lg1.score(X_test_ss, y_test))
print(confusion_matrix(y_test, y_hat_log1))
print(classification_report(y_test, y_hat_log1))

0.7841726618705036
[[557 152]
 [ 28  97]]
              precision    recall  f1-score   support

       False       0.95      0.79      0.86       709
        True       0.39      0.78      0.52       125

    accuracy                           0.78       834
   macro avg       0.67      0.78      0.69       834
weighted avg       0.87      0.78      0.81       834



In [20]:
forest = RandomForestClassifier()
forest.fit(X_resample, y_resampled)
y_hat_forest = lg1.predict(X_test_ss)

print(forest.score(X_test_ss, y_hat_forest))
print(confusion_matrix(y_test, y_hat_forest))
print(classification_report(y_test, y_hat_forest))

0.7973621103117506
[[557 152]
 [ 28  97]]
              precision    recall  f1-score   support

       False       0.95      0.79      0.86       709
        True       0.39      0.78      0.52       125

    accuracy                           0.78       834
   macro avg       0.67      0.78      0.69       834
weighted avg       0.87      0.78      0.81       834

