In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

%matplotlib inline

In [2]:
# load the raw customer churn data from the csv file
csv_path = "data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)

## Data Cleaning

In [3]:
# The churn prediction exercise showed 11 rows where TotalCharges holds the
# string " "; these become 0 and the whole column is then cast to float.
df["TotalCharges"] = df["TotalCharges"].replace(" ", "0").astype(float)

In [4]:
# columns that stay categorical and must not be turned into 1/0 flags
categorical = ["Contract", "InternetService"]

# every other column with exactly 3 distinct values
distinct_counts = df.nunique()
three_level_cols = distinct_counts[distinct_counts.eq(3)].index
to_bool = [name for name in three_level_cols if name not in categorical]

In [5]:
# encode the selected columns: "Yes" becomes 1, every other value becomes 0
df[to_bool] = df[to_bool].eq("Yes").astype(int)

In [6]:
# swap the "gender" column for a 1/0 column "female" (1 when gender is "Female")
is_female = df["gender"].eq("Female").astype(int)
df = df.drop(columns="gender").assign(female=is_female)

In [7]:
# remaining Yes/No columns, encoded the same way ("Yes" -> 1, anything else -> 0)
to_bool_too = ["Partner", "Dependents", "PhoneService", "Churn", "PaperlessBilling"]
df[to_bool_too] = df[to_bool_too].eq("Yes").astype(int)

In [8]:
df.dtypes

customerID           object
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService      object
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract             object
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
female                int64
dtype: object

In [9]:
df.head()

Unnamed: 0,customerID,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,female
0,7590-VHVEG,0,1,0,1,0,0,DSL,0,1,...,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,0,1
1,5575-GNVDE,0,0,0,34,1,0,DSL,1,0,...,0,0,0,One year,0,Mailed check,56.95,1889.5,0,0
2,3668-QPYBK,0,0,0,2,1,0,DSL,1,1,...,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,1,0
3,7795-CFOCW,0,0,0,45,0,0,DSL,1,0,...,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,0,0
4,9237-HQITU,0,0,0,2,1,0,Fiber optic,0,0,...,0,0,0,Month-to-month,1,Electronic check,70.7,151.65,1,1


## Create a function that we can use to test
- feature sets
- scaling methods for numerical features
- ML algorithms

In [10]:
# name of the target column, wrapped in a list
target_col = ["Churn"]
# numerical feature columns
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]

In [11]:
def evaluate_model(df, target_col, num_cols, cat_cols, bool_cols, algo, test_size=.1, scaler=None, seed=3):
    """
    Fit `algo` on a random train split of `df` and return its accuracy on the test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the target column and every feature column named below.
    target_col : list of str
        List with the name of the target column.
    num_cols : list of str
        Numerical feature columns; they are rescaled when `scaler` is given.
    cat_cols : list of str
        Categorical feature columns; one-hot encoded with the first level dropped.
    bool_cols : list of str
        Feature columns that are passed to the model unchanged.
    algo : estimator
        Object exposing `fit(X, y)` and `predict(X)`. It is fitted in place.
    test_size : float, default .1
        Forwarded to `train_test_split`.
    scaler : transformer, optional
        Object exposing `fit(X)` and `transform(X)`. It is fitted on the train
        split only and then applied to both splits. When omitted, the numerical
        columns are used as they are.
    seed : int, default 3
        Forwarded to `train_test_split` as `random_state`.

    Returns
    -------
    float
        Value returned by `accuracy_score` for the predictions on the test split.
    """
    # work on copies so the caller's lists are never modified
    num_features = list(num_cols)
    cat_features = list(cat_cols)

    # split dataset; drop=True avoids carrying the old index along as a column
    train, test = train_test_split(df, test_size=test_size, random_state=seed)
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    # apply rescaling if requested
    if scaler:
        scaled_names = ["scaled_" + col for col in num_features]

        # fit on the train split only, then transform both splits with it
        scaler_fitted = scaler.fit(train[num_features])
        train_scaled = pd.DataFrame(scaler_fitted.transform(train[num_features]),
                                    columns=scaled_names, index=train.index)
        test_scaled = pd.DataFrame(scaler_fitted.transform(test[num_features]),
                                   columns=scaled_names, index=test.index)
        train = train.join(train_scaled)
        test = test.join(test_scaled)

        # the model sees the scaled columns instead of the raw ones
        num_features = scaled_names

    # create dummies: the train split decides which dummy columns exist
    columns_before = set(train.columns)
    train_enc = pd.get_dummies(data=train, columns=cat_features, drop_first=True)
    dummy_cols = [col for col in train_enc.columns if col not in columns_before]

    # Encode every level present in test, then keep exactly the train columns.
    # This drops the same baseline level as in train, drops levels unseen in
    # train, and adds an all-zero column for levels missing from the test split.
    test_enc = pd.get_dummies(data=test, columns=cat_features)
    test_enc = test_enc.reindex(columns=train_enc.columns, fill_value=0)

    # create input sets
    feature_cols = list(bool_cols) + dummy_cols + num_features
    train_X = train_enc[feature_cols]
    train_y = train_enc[target_col]
    test_X = test_enc[feature_cols]
    test_y = test_enc[target_col]

    # train model
    model = algo
    model.fit(train_X, train_y.values.ravel())

    # predict
    predictions = model.predict(test_X)

    return accuracy_score(test_y, predictions)
    
    

In [12]:
target_col = ["Churn"]
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
cat_cols = ["InternetService", "Contract", "PaymentMethod"]

# every column with exactly 2 distinct values, except the target, is a boolean feature
level_counts = df.nunique()
two_level_cols = level_counts[level_counts.eq(2)].index
bool_cols = [name for name in two_level_cols if name not in target_col]

In [13]:
# baseline: logistic regression (liblinear solver) with the numerical
# features rescaled by StandardScaler
evaluate_model(df=df,
               target_col=target_col,
               num_cols=num_cols,
               cat_cols=cat_cols,
               bool_cols=bool_cols,
               algo=LogisticRegression(solver="liblinear"),
               scaler=StandardScaler())

0.8141843971631205

An accuracy score of about 81% with LogisticRegression, the same algorithm used in the churn prediction exercise.

#### The column arguments can also be passed to the function via dict unpacking

In [14]:
# column arguments of evaluate_model, bundled so they can be passed with **cols
cols = dict(target_col=target_col,
            num_cols=num_cols,
            cat_cols=cat_cols,
            bool_cols=bool_cols)

In [15]:
# same model and scaler as the explicit call, with the column arguments
# unpacked from the dict
evaluate_model(df=df,
               **cols,
               algo=LogisticRegression(solver="liblinear"),
               scaler=StandardScaler())

0.8141843971631205

In [16]:
### Compare with previous model with other variables

services = ["PhoneService",
            "OnlineSecurity",
            "OnlineBackup",
            "DeviceProtection",
            "TechSupport",
            "StreamingTV",
            "StreamingMovies"]

# engineered columns are added to a copy, so df itself stays untouched
df_p = df.copy()
# number of services per row: sum over the 1/0 service columns
df_p["no_of_services"] = df_p[services].sum(axis=1)
# 1 when the payment method text contains "automatic", else 0
df_p["AutomaticPayment"] = np.where(df_p["PaymentMethod"].str.contains("automatic"), 1, 0)

# variable list kept for reference only (not used below):
# variables = ["no_of_services",
#              "AutomaticPayment",
#              "MonthlyCharges",
#              "SeniorCitizen",
#              "OnlineBackup",
#              "Contract_One year",
#              "Contract_Two year"]

cols_p = dict(target_col=target_col,
              num_cols=["no_of_services", "tenure", "MonthlyCharges"],
              cat_cols=["Contract"],
              bool_cols=["AutomaticPayment", "SeniorCitizen", "OnlineBackup"])

evaluate_model(df=df_p,
               **cols_p,
               algo=LogisticRegression(solver="liblinear"),
               scaler=StandardScaler())

0.8042553191489362

#### Use MinMaxScaler()

In [17]:
from sklearn.preprocessing import MinMaxScaler

# same columns and algorithm as the StandardScaler run; only the scaler changes
evaluate_model(df=df,
               **cols,
               algo=LogisticRegression(solver="liblinear"),
               scaler=MinMaxScaler())
# for this particular train/test split, the accuracy improves by about 0.15 percentage points

0.8156028368794326

#### try DecisionTree

In [18]:
from sklearn.tree import DecisionTreeClassifier

# no scaler is passed, so the numerical features are used unscaled;
# random_state is fixed on the classifier
evaluate_model(df=df,
               **cols,
               algo=DecisionTreeClassifier(random_state=500),
               )

0.7432624113475177

In [19]:
from sklearn.ensemble import RandomForestClassifier

# no scaler is passed, so the numerical features are used unscaled;
# random_state is fixed on the classifier
evaluate_model(df=df,
               **cols,
               algo=RandomForestClassifier(random_state=500))

0.8184397163120567