In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Load the Telco customer-churn CSV; the relative path is resolved against the
# current working directory.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

## 1. Quick Cleaning

In [4]:
# Drop the rows whose TotalCharges is a single space, then renumber the index
# from 0 so later positional operations line up.
df = df[df["TotalCharges"] != " "].reset_index(drop=True)

In [5]:
df.nunique()

customerID          7032
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                72
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1584
TotalCharges        6530
Churn                  2
dtype: int64

In [6]:
df.nunique()[df.nunique() == 3]

MultipleLines       3
InternetService     3
OnlineSecurity      3
OnlineBackup        3
DeviceProtection    3
TechSupport         3
StreamingTV         3
StreamingMovies     3
Contract            3
dtype: int64

In [7]:
def get_cols_list(df, nunique, smaller_or_equal = True):
    """Return the names of columns selected by their number of distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected with ``DataFrame.nunique()``.
    nunique : int
        Threshold for the per-column count of distinct values.
    smaller_or_equal : bool, default True
        If True, keep columns with at most ``nunique`` distinct values;
        if False, keep only columns with exactly ``nunique`` distinct values.

    Returns
    -------
    list
        Matching column names, in the frame's column order.
    """
    distinct_counts = df.nunique()
    if smaller_or_equal:
        selected = distinct_counts <= nunique
    else:
        selected = distinct_counts == nunique
    return distinct_counts[selected].keys().tolist()

In [8]:
df[get_cols_list(df, 3, smaller_or_equal = False)]

Unnamed: 0,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract
0,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month
1,No,DSL,Yes,No,Yes,No,No,No,One year
2,No,DSL,Yes,Yes,No,No,No,No,Month-to-month
3,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year
4,No,Fiber optic,No,No,No,No,No,No,Month-to-month
...,...,...,...,...,...,...,...,...,...
7027,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year
7028,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year
7029,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month
7030,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month


In [9]:
# Per-column value counts for every two-valued column.
# `pd.value_counts` is deprecated in recent pandas; the Series method is the
# supported equivalent.
df[get_cols_list(df, 2, smaller_or_equal = False)].apply(pd.Series.value_counts)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,Churn
0,,5890.0,,,,,
1,,1142.0,,,,,
Female,3483.0,,,,,,
Male,3549.0,,,,,,
No,,,3639.0,4933.0,680.0,2864.0,5163.0
Yes,,,3393.0,2099.0,6352.0,4168.0,1869.0


In [10]:
# Columns whose only labels are 'Yes' / 'No' (see the value counts above).
to_bool = ['Partner','Dependents','PhoneService','PaperlessBilling','Churn']

# Encode all listed columns in one vectorised step: 'Yes' -> 1, anything else -> 0.
df[to_bool] = np.where(df[to_bool] == 'Yes',1,0)

In [11]:
# Re-check the two-valued columns after the Yes/No encoding.
# `pd.value_counts` is deprecated in recent pandas; use the Series method.
df[get_cols_list(df,2,False)].apply(pd.Series.value_counts)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,Churn
0,,5890.0,3639.0,4933.0,680.0,2864.0,5163.0
1,,1142.0,3393.0,2099.0,6352.0,4168.0,1869.0
Female,3483.0,,,,,,
Male,3549.0,,,,,,


In [12]:
# Replace the string `gender` column with a 0/1 `Female` indicator appended as
# the last column. The right-hand side still reads `gender` from the current
# `df`, because the name is only rebound after the whole expression is evaluated.
df = (
    df
    .drop(columns=['gender'])
    .assign(Female=np.where(df['gender'] == 'Female', 1, 0))
)

In [13]:
# All two-valued columns should now be 0/1, including the new `Female` flag.
# `pd.value_counts` is deprecated in recent pandas; use the Series method.
df[get_cols_list(df,2,False)].apply(pd.Series.value_counts)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,Churn,Female
0,5890,3639,4933,680,2864,5163,3549
1,1142,3393,2099,6352,4168,1869,3483


    check columns with 3 unique values

In [14]:
# Per-column value counts for every three-valued column.
# `pd.value_counts` is deprecated in recent pandas; use the Series method.
df[get_cols_list(df,3,False)].apply(pd.Series.value_counts)

Unnamed: 0,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract
DSL,,2416.0,,,,,,,
Fiber optic,,3096.0,,,,,,,
Month-to-month,,,,,,,,,3875.0
No,3385.0,1520.0,3497.0,3087.0,3094.0,3472.0,2809.0,2781.0,
No internet service,,,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0,
No phone service,680.0,,,,,,,,
One year,,,,,,,,,1472.0
Two year,,,,,,,,,1685.0
Yes,2967.0,,2015.0,2425.0,2418.0,2040.0,2703.0,2731.0,


In [15]:
# Three-valued columns that are genuinely categorical rather than
# Yes / No / "no service" flags; they are skipped in the binary encoding below.
exclude  = ['InternetService', 'Contract']
# `pd.value_counts` is deprecated in recent pandas; use the Series method.
df[[col for col in get_cols_list(df,3,False) if col not in exclude]].apply(pd.Series.value_counts)

Unnamed: 0,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
No,3385.0,3497.0,3087.0,3094.0,3472.0,2809.0,2781.0
No internet service,,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0
No phone service,680.0,,,,,,
Yes,2967.0,2015.0,2425.0,2418.0,2040.0,2703.0,2731.0


In [16]:
to_bool_again = [col for col in get_cols_list(df,3,False) if col not in exclude]

In [17]:
# 'Yes' -> 1; every other label ('No', 'No internet service', 'No phone service')
# -> 0, so the "no service" categories are merged with 'No'.
df[to_bool_again] = np.where(df[to_bool_again] == 'Yes', 1, 0)

In [18]:
# Confirm that the newly encoded columns now hold only 0/1.
# `pd.value_counts` is deprecated in recent pandas; use the Series method.
df[[col for col in get_cols_list(df,2,False) if col not in exclude]].apply(pd.Series.value_counts)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,Churn,Female
0,5890,3639,4933,680,4065,5017,4607,4614,4992,4329,4301,2864,5163,3549
1,1142,3393,2099,6352,2967,2015,2425,2418,2040,2703,2731,4168,1869,3483


In [19]:
df.dtypes

customerID           object
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService      object
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract             object
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                 int64
Female                int64
dtype: object

In [20]:
# TotalCharges is still an object (string) column at this point; cast it to
# float so it can be used as a numeric feature.
df['TotalCharges'] = df['TotalCharges'].astype(float)

In [21]:
get_cols_list(df, 4)

['SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn',
 'Female']

In [22]:
df.nunique()

customerID          7032
SeniorCitizen          2
Partner                2
Dependents             2
tenure                72
PhoneService           2
MultipleLines          2
InternetService        3
OnlineSecurity         2
OnlineBackup           2
DeviceProtection       2
TechSupport            2
StreamingTV            2
StreamingMovies        2
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1584
TotalCharges        6530
Churn                  2
Female                 2
dtype: int64

## 2. Create model evaluation function

In [23]:
def evaluate_model(df,seed=666):
    """Fit the baseline logistic regression and return its test-set accuracy.

    Columns are classified by their number of distinct values (via
    ``get_cols_list``): columns with at most four distinct values are treated
    as categorical, those with exactly two as already binary, and every other
    column apart from ``customerID`` and ``Churn`` as numeric. Non-binary
    categoricals are one-hot encoded with the first level dropped, the frame
    is split 80/20, numeric columns are standardised with a scaler fitted on
    the training rows only, and a liblinear ``LogisticRegression`` is fitted
    to predict ``Churn``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``Churn`` column.
    seed : int, default 666
        Passed as ``random_state`` to ``train_test_split``.

    Returns
    -------
    The value returned by ``accuracy_score`` on the held-out 20 % of rows.
    """
    label = ["Churn"]
    identifier = ["customerID"]

    # Column roles, decided before any encoding takes place
    low_cardinality = [name for name in get_cols_list(df, 4) if name not in label]
    two_valued = [name for name in get_cols_list(df, 2, False) if name not in label]
    non_numeric = low_cardinality + label + identifier
    numeric = [name for name in df.columns if name not in non_numeric]

    # Create dummy variables for the categoricals that are not already binary
    encoded = pd.get_dummies(
        data=df,
        columns=[name for name in low_cardinality if name not in two_valued],
        drop_first=True,
    )
    categorical = [name for name in get_cols_list(encoded, 4) if name not in label]
    features = categorical + numeric

    # Split, then reset indexes so the scaled columns can be re-attached by position
    train_part, test_part = train_test_split(encoded, test_size=.2, random_state=seed)
    train_X = train_part[features].reset_index(drop=True)
    train_Y = train_part[label].reset_index(drop=True)
    test_X = test_part[features].reset_index(drop=True)
    test_Y = test_part[label].reset_index(drop=True)

    # The scaler only ever sees training rows
    standardiser = StandardScaler().fit(train_X[numeric])

    def swap_in_scaled(frame):
        # Replace the raw numeric columns of `frame` with their scaled values
        rescaled = pd.DataFrame(standardiser.transform(frame[numeric]), columns=numeric)
        return frame.drop(columns=numeric).merge(
            rescaled, left_index=True, right_index=True, how="left"
        )

    classifier = LogisticRegression(solver="liblinear")
    classifier.fit(swap_in_scaled(train_X), train_Y.values.ravel())

    # Predict on the test set and report accuracy
    predicted = classifier.predict(swap_in_scaled(test_X))
    return accuracy_score(test_Y, predicted)

In [24]:
evaluate_model(df)

  return self.partial_fit(X, y)


0.80099502487562191

In [25]:
evaluate_model(df)

  return self.partial_fit(X, y)


0.80099502487562191

    This is our baseline model. Every change we make from now on should improve accuracy compared to this model.

## 3. Create different inputs

In [26]:
df['InternetService'].value_counts()

Fiber optic    3096
DSL            2416
No             1520
Name: InternetService, dtype: int64

In [27]:
df_i = df.copy()

In [28]:
# Collapse InternetService to a binary flag: 0 for 'No', 1 for any other value
# (i.e. 'DSL' and 'Fiber optic' are merged).
df_i['InternetService'] = np.where(df_i['InternetService'] == 'No', 0, 1)

In [29]:
evaluate_model(df_i)

  return self.partial_fit(X, y)


0.79815209665955933

### Model from 25.11

#### Another function

In [30]:
def evaluate_model(df,scaler = StandardScaler,seed=666):
    """Return the test accuracy of a liblinear logistic regression on ``df``.

    Same pipeline as the baseline, but the numeric scaler is configurable:
    low-cardinality non-binary columns are one-hot encoded, rows are split
    80/20, numeric columns are rescaled with a scaler fitted on the training
    rows only, and ``LogisticRegression(solver="liblinear")`` is trained to
    predict ``Churn``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``Churn`` column.
    scaler : callable, default StandardScaler
        Called with no arguments to build the scaler. The object returned by
        its ``fit`` is used to ``transform`` both the train and test rows.
    seed : int, default 666
        Passed as ``random_state`` to ``train_test_split``.

    Returns
    -------
    The value returned by ``accuracy_score`` on the held-out 20 % of rows.
    """
    target_col = ["Churn"]

    def without_target(names):
        # Keep every name except the target column
        return [n for n in names if n not in target_col]

    # Assign column roles from distinct-value counts
    binary_cols = without_target(get_cols_list(df, 2, False))
    categorical_cols = without_target(get_cols_list(df, 4))
    reserved = set(categorical_cols) | {"Churn", "customerID"}
    numeric_cols = [n for n in df.columns if n not in reserved]

    # Create dummy variables for the non-binary categoricals
    dummy_source = [n for n in categorical_cols if n not in binary_cols]
    df = pd.get_dummies(data=df, columns=dummy_source, drop_first=True)
    feature_cols = without_target(get_cols_list(df, 4)) + numeric_cols

    # Split the encoded frame
    train, test = train_test_split(df, test_size=.2, random_state=seed)
    X_train = train[feature_cols].reset_index(drop=True)
    y_train = train[target_col].reset_index(drop=True)
    X_test = test[feature_cols].reset_index(drop=True)
    y_test = test[target_col].reset_index(drop=True)

    # Fit the scaler on training rows, then rescale the training numerics
    fitted_scaler = scaler().fit(X_train[numeric_cols])
    train_numeric = pd.DataFrame(
        fitted_scaler.transform(X_train[numeric_cols]), columns=numeric_cols
    )
    X_train = X_train.drop(columns=numeric_cols).join(train_numeric, how="left")

    lr = LogisticRegression(solver="liblinear")
    lr.fit(X_train, y_train.values.ravel())

    # Apply the already-fitted scaler to the test rows and predict
    test_numeric = pd.DataFrame(
        fitted_scaler.transform(X_test[numeric_cols]), columns=numeric_cols
    )
    X_test = X_test.drop(columns=numeric_cols).join(test_numeric, how="left")

    # Test accuracy
    return accuracy_score(y_test, lr.predict(X_test))

    use MinMaxScaler

In [31]:
from sklearn.preprocessing import MinMaxScaler

In [32]:
evaluate_model(df, scaler = MinMaxScaler)

  return self.partial_fit(X, y)


0.80028429282160629

## 4. Refactor evaluate_model function to take classification algorithm

In [None]:
def evaluate_model_new(df,classifier,scaler = StandardScaler,seed=666):
    """Train ``classifier`` on ``df`` and return its accuracy on a held-out split.

    Pipeline: columns with at most four distinct values are treated as
    categorical (those with exactly two as already binary); non-binary
    categoricals are one-hot encoded with the first level dropped; rows are
    split 80/20; the remaining columns (everything except ``customerID`` and
    ``Churn``) are rescaled with a scaler fitted on the training rows only;
    the classifier is fitted on the training rows and scored on the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``Churn`` column.
    classifier : estimator class, estimator instance, or None
        Model to evaluate. A class is instantiated with no arguments. An
        instance is used as given and is fitted in place. ``None`` falls back
        to ``LogisticRegression(solver="liblinear")``, the model this
        function previously always used regardless of this argument.
    scaler : callable, default StandardScaler
        Called with no arguments to build the scaler. The object returned by
        its ``fit`` is used to ``transform`` both the train and test rows.
    seed : int, default 666
        Passed as ``random_state`` to ``train_test_split``.

    Returns
    -------
    The value returned by ``accuracy_score`` on the held-out 20 % of rows.
    """

    # Assign columns

    id_col=["customerID"]
    target_col=["Churn"]
    bool_col=[col for col in get_cols_list(df,2,False) if col not in target_col]
    cat_col= [col for col in get_cols_list(df,4) if col not in target_col]
    num_col=[col for col in df.columns if col not in cat_col + target_col + id_col]

    # Create dummy variables

    df=pd.get_dummies(data=df,
                           columns=[col for col in cat_col if col not in bool_col],
                           drop_first=True
                    )
    # Recompute after encoding so the new dummy columns are included
    cat_col= [col for col in get_cols_list(df,4) if col not in target_col]

    # Split df

    train,test=train_test_split(df,test_size=.2,random_state=seed)
    train_X=train[cat_col+num_col].reset_index(drop=True)
    train_Y=train[target_col].reset_index(drop=True)
    test_X=test[cat_col+num_col].reset_index(drop=True)
    test_Y=test[target_col].reset_index(drop=True)

    # Standardize (the scaler is fitted on training rows only)

    std=scaler()
    scaled_fitted=std.fit(train_X[num_col])
    scaled=scaled_fitted.transform(train_X[num_col])
    scaled=pd.DataFrame(scaled,columns=num_col)
    train_X=train_X.drop(columns=num_col)
    train_X=train_X.merge(scaled,left_index=True,right_index=True,how="left")

    # Resolve the model from the `classifier` argument instead of hard-coding it

    if classifier is None:
        model=LogisticRegression(solver="liblinear")
    elif isinstance(classifier,type):
        model=classifier()
    else:
        model=classifier
    model.fit(train_X,train_Y.values.ravel())

    # Predict on test set

    scaled_test=scaled_fitted.transform(test_X[num_col])
    scaled_test=pd.DataFrame(scaled_test,columns=num_col)
    test_X=test_X.drop(columns=num_col)
    test_X=test_X.merge(scaled_test,left_index=True,right_index=True,how="left")
    pred=model.predict(test_X)

    # Test accuracy

    return accuracy_score(test_Y,pred)