In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

from scipy import stats

import acquire as ac
import prepare as pr

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Pull the raw telco data, then let the project's prepare module produce the
# train / validate / test frames (it also hands back the encoder it used).
df = ac.get_telco_data()
encoder, train, validate, test = pr.prep_telco(df)

# The three splits are rebound to whatever pr.min_max_scaler returns for them.
scaler, train, validate, test = pr.min_max_scaler(train, validate, test)

# NOTE(review): the recorded output shows 21 columns for train but 20 for
# validate and test — confirm whether one of the prepare helpers adds or drops
# a column on train only.
print(
    f"   train shape = {train.shape}",
    f"validate shape = {validate.shape}",
    f"    test shape = {test.shape}",
    sep="\n",
)

   train shape = (4218, 21)
validate shape = (1407, 20)
    test shape = (1407, 20)


In [3]:
# Peek at the first five prepared training rows to sanity-check the columns.
train.head(n=5)

Unnamed: 0,customer_id,gender,senior_citizen,tenure,internet_service_type_id,internet_service_type,device_protection,tech_support,contract_type_id,contract_type,paperless_billing,payment_type_id,payment_type,monthly_charges,total_charges,churn,phone_service_type,family,streaming,online_protection,tenure_years
5767,6885-PKOAM,Female,0,0.985915,1,DSL,Yes,Yes,3,Two year,No,4,Credit card (automatic),0.671642,6223.8,0,Multiple lines,Partner,TV and movies,Security and backup,6.0
601,4923-ADWXJ,Female,0,0.338028,1,DSL,No,Yes,1,Month-to-month,Yes,3,Bank transfer (automatic),0.473134,1679.65,0,Single line,Single,Movies,Backup,2.0
6469,1099-GODLO,Female,0,0.0,3,,No internet service,No internet service,3,Two year,No,2,Mailed check,0.020896,20.35,0,Single line,Partner and dependents,No internet service,No internet service,0.0
5729,6374-AFWOX,Male,0,0.633803,1,DSL,Yes,Yes,3,Two year,No,2,Mailed check,0.471642,3047.15,0,Single line,Partner,,Security and backup,4.0
1163,9489-UTFKA,Male,1,0.098592,1,DSL,No,No,1,Month-to-month,No,3,Bank transfer (automatic),0.424378,551.95,0,Single line,Partner,TV,Security,1.0


# Modeling

In [4]:
# Predictors shared by every model in this notebook, and the label column.
feature_cols = ["senior_citizen", "tenure", "monthly_charges"]
target_cols = ["churn"]

# X_* hold the predictor columns. y_* are deliberately kept as single-column
# DataFrames because the modelling cells read the labels via `y_train.churn`.
X_train, y_train = train[feature_cols], train[target_cols]
X_validate, y_validate = validate[feature_cols], validate[target_cols]
X_test, y_test = test[feature_cols], test[target_cols]

In [5]:
# Scoreboard frame: the true churn labels of the training rows under the column
# name "actual", indexed like `train`. Each model cell appends its in-sample
# predictions as an additional column.
evaluation = train["churn"].to_frame(name="actual").copy()

---
## Logistic Regression

In [6]:
# Logistic regression with scikit-learn's default settings on the three training features.
logit_col = "churn ~ senior_citizen + tenure + monthly_charges (logit)"
logit = LogisticRegression().fit(X_train, y_train["churn"])

# Keep the in-sample predictions next to the actual labels for the report cells.
evaluation[logit_col] = logit.predict(X_train)

print(f"Train Accuracy: {logit.score(X_train, y_train.churn):.2%}")

Train Accuracy: 79.23%


In [7]:
# Confusion table for the logistic model (rows = actual, columns = predicted).
# normalize=True expresses every cell as a fraction of all rows instead of a count.
logit_col = "churn ~ senior_citizen + tenure + monthly_charges (logit)"
cm = pd.crosstab(
    index=evaluation["actual"],
    columns=evaluation[logit_col],
    normalize=True,
)
cm

churn ~ senior_citizen + tenure + monthly_charges (logit),0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.67449,0.059744
1,0.147937,0.117828


In [23]:
# Precision / recall / f1 / support for the logistic model, requested as a dict
# so it can be rendered as a DataFrame rather than a plain-text table.
logit_col = "churn ~ senior_citizen + tenure + monthly_charges (logit)"
logit_report = classification_report(
    evaluation["actual"],
    evaluation[logit_col],
    output_dict=True,
)
cr = pd.DataFrame(logit_report)
cr

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.820121,0.663551,0.792319,0.741836,0.77851
recall,0.918631,0.443354,0.792319,0.680993,0.792319
f1-score,0.866585,0.531551,0.792319,0.699068,0.777545
support,3097.0,1121.0,0.792319,4218.0,4218.0


---
## Decision Tree

In [10]:
# Shallow decision tree (max_depth=2) on the same three features.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so equally good splits can otherwise be resolved differently
# between runs and the reported accuracy would not be reproducible.
# NOTE(review): the column label below ends in "(clf3)" while the tree is built
# with max_depth=2 — confirm which one is intended. The label is left untouched
# because the report cell that follows looks the column up by this exact name.
clf = DecisionTreeClassifier(max_depth=2, random_state=42)
clf.fit(X_train, y_train.churn)
evaluation["churn ~ senior_citizen + tenure + monthly_charges (clf3)"] = clf.predict(X_train)

print(f"Train Accuracy: {clf.score(X_train, y_train.churn):.2%}")

Train Accuracy: 78.26%


In [13]:
# Plain-text precision / recall / f1 table for the predictions stored under the "(clf3)" label.
clf3_col = "churn ~ senior_citizen + tenure + monthly_charges (clf3)"
clf3_report = classification_report(evaluation["actual"], evaluation[clf3_col])
print(clf3_report)

              precision    recall  f1-score   support

           0       0.81      0.91      0.86      3097
           1       0.64      0.42      0.51      1121

    accuracy                           0.78      4218
   macro avg       0.73      0.67      0.68      4218
weighted avg       0.77      0.78      0.77      4218



In [14]:
# Deeper decision tree (max_depth=5) on the same three features. This rebinds
# `clf`, so the earlier shallower tree is no longer reachable after this cell.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so ties between equally good splits would otherwise make the
# fitted tree (and the accuracy printed below) vary between runs.
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train.churn)
evaluation["churn ~ senior_citizen + tenure + monthly_charges (clf5)"] = clf.predict(X_train)

print(f"Train Accuracy: {clf.score(X_train, y_train.churn):.2%}")

Train Accuracy: 79.61%


In [15]:
# Plain-text precision / recall / f1 table for the predictions stored under the "(clf5)" label.
clf5_col = "churn ~ senior_citizen + tenure + monthly_charges (clf5)"
clf5_report = classification_report(evaluation["actual"], evaluation[clf5_col])
print(clf5_report)

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      3097
           1       0.67      0.46      0.55      1121

    accuracy                           0.80      4218
   macro avg       0.75      0.69      0.71      4218
weighted avg       0.78      0.80      0.78      4218



---
## Random Forest

In [16]:
# Random forest limited to depth 6 with at least 3 samples per leaf (the "3/6"
# in the column label). The forest draws bootstrap samples and feature subsets
# at random, so random_state is pinned; without it the accuracy printed below
# changes on every Restart & Run All.
rf = RandomForestClassifier(min_samples_leaf=3, max_depth=6, random_state=42)
rf.fit(X_train, y_train.churn)
evaluation["churn ~ senior_citizen + tenure + monthly_charges (rf; 3/6)"] = rf.predict(X_train)

print(f"Train Accuracy: {rf.score(X_train, y_train.churn):.2%}")

Train Accuracy: 80.61%


In [17]:
# Plain-text precision / recall / f1 table for the random forest's training predictions.
rf_col = "churn ~ senior_citizen + tenure + monthly_charges (rf; 3/6)"
rf_report = classification_report(evaluation["actual"], evaluation[rf_col])
print(rf_report)

              precision    recall  f1-score   support

           0       0.82      0.94      0.88      3097
           1       0.72      0.45      0.55      1121

    accuracy                           0.81      4218
   macro avg       0.77      0.69      0.71      4218
weighted avg       0.80      0.81      0.79      4218



---
## KNN

In [18]:
# K-nearest-neighbours with scikit-learn's default settings on the three features.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train.churn)
evaluation["churn ~ senior_citizen + tenure + monthly_charges (knn)"] = knn.predict(X_train)

print(f"Train Accuracy: {knn.score(X_train, y_train.churn):.2%}")

# In-sample accuracy flatters KNN: when a training row is scored, that same row
# is typically among its own nearest neighbours and votes for its own label.
# Report the score on the validate split as well so the model can be compared
# fairly with the others.
print(f"Validate Accuracy: {knn.score(X_validate, y_validate.churn):.2%}")

Train Accuracy: 83.40%


In [19]:
# Plain-text precision / recall / f1 table for the KNN model's training predictions.
knn_col = "churn ~ senior_citizen + tenure + monthly_charges (knn)"
knn_report = classification_report(evaluation["actual"], evaluation[knn_col])
print(knn_report)

              precision    recall  f1-score   support

           0       0.86      0.92      0.89      3097
           1       0.74      0.58      0.65      1121

    accuracy                           0.83      4218
   macro avg       0.80      0.75      0.77      4218
weighted avg       0.83      0.83      0.83      4218

