In [47]:
# import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# reading dataset
churn_train = pd.read_csv('udel-churn-train.csv')
churn_test = pd.read_csv('udel-churn-test.csv')

In [3]:
# create a deepy copy
df_train = churn_train.copy()
df_train['TotalCharges']=pd.to_numeric(df_train['TotalCharges'], errors='coerce')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        5634 non-null   int64  
 1   customerID        5634 non-null   object 
 2   gender            5589 non-null   object 
 3   SeniorCitizen     5573 non-null   float64
 4   Partner           5594 non-null   object 
 5   Dependents        5594 non-null   object 
 6   tenure            5544 non-null   float64
 7   PhoneService      5550 non-null   object 
 8   MultipleLines     5580 non-null   object 
 9   InternetService   5486 non-null   object 
 10  OnlineSecurity    5564 non-null   object 
 11  OnlineBackup      5528 non-null   object 
 12  DeviceProtection  5564 non-null   object 
 13  TechSupport       5564 non-null   object 
 14  StreamingTV       5529 non-null   object 
 15  StreamingMovies   5578 non-null   object 
 16  Contract          5555 non-null   object 


# Data Preprocessing

In [4]:
# pandas-profiling packages
from pandas_profiling import ProfileReport
profile = ProfileReport(df_train,minimal=True)
profile.to_widgets()

HBox(children=(FloatProgress(value=0.0, description='variables', max=22.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…




Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(value='Number of va…

In [5]:
# first find the numerical and categorical features
df_train_num = df_train.select_dtypes(exclude='object')
df_train_cat = df_train.select_dtypes(include='object')
num_features = df_train_num.columns.tolist()
cat_features = df_train_cat.columns.tolist()

In [6]:
# remove the target variable "Churn" and the unuseful features "customerID","Unnamed: 0"
# cat_features.remove('Churn')
# cat_features.remove('customerID')
cat_features = [e for e in cat_features if e not in ('Churn','customerID')]

In [7]:
# classified some numerical variables into categorical variables
num_to_cat = []

df_train['SeniorCitizen'].value_counts()
num_to_cat.append('SeniorCitizen')

In [8]:
# change this features to categorical
num_features = [element for element in num_features if element not in num_to_cat]
cat_features += num_to_cat

# Outliers

In [9]:
# build the IQR outlier remover
# note that you have to remove the rows for both X and y 

def find_outlier_iqr(X):
    # calculate IQR
    q1 = X.quantile(0.25)
    q3 = X.quantile(0.75)
    iqr = q3 - q1
    
    outlier_index = ((X < (q1 - 1.5 * iqr)) | (X > (q3 + 1.5 * iqr))).any(axis=1)
    
    return outlier_index

In [10]:
# this shows how many outliers will be removed from the dataset
outlier_index = find_outlier_iqr(df_train[num_features])
# before remove outliers
len_df_train = len(df_train)
# remove the outliers from the dataset
df_train_iqr = df_train[~outlier_index]
# after remove outliers
len_df_train_after = len(df_train_iqr)
print(f'we will remove {(len_df_train-len_df_train_after)/len_df_train:.2%} outliers')

we will remove 0.28% outliers


# Building the Pipeline

In [24]:
# prepare the data
X = df_train_iqr.drop(['Churn'],axis=1)
y = df_train_iqr['Churn']

In [25]:
df_train_iqr.columns

Index(['Unnamed: 0', 'customerID', 'gender', 'SeniorCitizen', 'Partner',
       'Dependents', 'tenure', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges',
       'Churn'],
      dtype='object')

In [26]:
# Split the data into a training set and a test set. 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)

(4494, 21)
(1124, 21)


In [54]:
# building the pipeline
# Create the preprocessing pipeline for numerical and categorical features
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_pipeline = Pipeline(
    steps=[
        ('num_imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ]
)

cat_pipeline = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder()),
    ]
)

In [55]:
# Combine two pipelines to form the preprocessor
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_features),
        ('cat_pipeline', cat_pipeline, cat_features),
    ]
)

In [56]:
# Make a full pipeline by combining preprocessor and the model
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


tree_clf_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('tree_clf', DecisionTreeClassifier()),
    ]
)

log_reg_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('log_reg', LogisticRegression()),
    ]
)

svm_clf_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('svm_clf',SVC()),
    ]
)

# Decision Tree

In [30]:
# use grid search to find the best decision tree model
from sklearn.model_selection import GridSearchCV


# here you must use the step names as the prefix followed by two under_scores to sepecify the parameter names
# you also need to specify the "full path" of the steps
# as you can see we can even grid search parameters for preprocessing pipeline step
param_grid = [
    {
        'preprocessor__num_pipeline__num_imputer__strategy': ['mean', 'median'],
        'tree_clf__criterion': ['gini', 'entropy'], 
        'tree_clf__max_depth': [2, 3, 4, 5, 6, 7],
    }
]

# set up the grid search 
grid_search = GridSearchCV(tree_clf_model, param_grid, cv=5,
                          scoring='accuracy',
                          return_train_score=True)

In [31]:
# train the model using the full pipeline
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num_pipeline',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('num_imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                       

In [32]:
# check the best performing parameter combination
grid_search.best_params_

{'preprocessor__num_pipeline__num_imputer__strategy': 'median',
 'tree_clf__criterion': 'gini',
 'tree_clf__max_depth': 5}

In [33]:
# get the best trained model
tree_clf_best = grid_search.best_estimator_

# prediction for the test set
y_pred = tree_clf_best.predict(X_test)

In [36]:
# make predictions for kaggle submission
churn_test = pd.read_csv('udel-churn-test.csv')

y_pred_churn = tree_clf_best.predict(churn_test)

# combine id and prediction for kaggle submission
churn_submit = pd.DataFrame({
    'customerID': churn_test['customerID'],
    'Churn': y_pred_churn
})

# generate the csv
churn_submit.to_csv('churn-submit_decisiontree.csv', index=False)
print('csv saved! please submit the prediction csv to Kaggle.com')

csv saved! please submit the prediction csv to Kaggle.com


# Logistic Regression

In [49]:
# conduct cross validation with the full pipeline
from sklearn.model_selection import cross_val_score

log_reg_scores_accu = cross_val_score(log_reg_pipeline, X_train, y_train, scoring="accuracy", cv=10)
print(f'cross validation average accuracy score: {log_reg_scores_accu.mean()}')

cross validation average accuracy score: 0.8032927493194754


In [50]:
# we train the final model using ALL training data
log_reg_pipeline.fit(X_train, y_train)

# # use named_steps[] to access any step in your pipeline
print(log_reg_pipeline.named_steps['log_reg'].intercept_)
print(log_reg_pipeline.named_steps['log_reg'].coef_)

# get the prediction results from the testing set
y_pred = log_reg_pipeline.predict(X_test)

# calculate accuracy
# we have similiar score for the testing set as the cross validation score - good
from sklearn.metrics import accuracy_score

print(f'Testing set accuracy score : {accuracy_score(y_test,y_pred)}')

[-1.54258585]
[[ 3.85548276e-02 -1.24219442e+00  7.96144787e-01  4.39799911e-01
  -2.92194291e-02 -1.85655163e-04 -7.67030269e-03 -2.17347816e-02
   5.02181446e-02 -7.96232289e-02  5.60660532e-01 -5.90065616e-01
  -3.81147407e-04 -1.68925461e-01  1.39901524e-01  9.06018261e-03
   8.20767179e-02 -1.20541985e-01  2.49052250e-01 -4.20744116e-02
  -2.36382923e-01 -3.41575404e-02  2.87624508e-01 -2.82872052e-01
   1.55038853e-02 -4.20744116e-02 -2.83455801e-03  2.79750606e-01
  -4.20744116e-02 -2.67081279e-01 -1.49010552e-01  3.14763266e-01
  -1.95157799e-01  5.72547863e-02 -1.51958584e-01  6.52987137e-02
   7.69188406e-01  4.81560201e-02 -8.46749510e-01 -2.03444624e-01
   1.74039540e-01 -1.39952064e-01 -6.94813222e-02  1.92829147e-01
  -1.28008449e-02 -8.12650851e-02  5.18600008e-02]]
Testing set accuracy score : 0.791814946619217


In [53]:
# make prediction using linear regression model
y_pred_logistic = log_reg_pipeline.predict(churn_test)

# combine id and prediction for kaggle submission
churn_submit = pd.DataFrame({
    'customerID': churn_test['customerID'],
    'Churn': y_pred_logistic
})

# generate the csv
churn_submit.to_csv('churn-submit_log.csv', index=False)
print('csv saved! please submit the prediction csv to Kaggle.com')

csv saved! please submit the prediction csv to Kaggle.com


# SVM 

In [57]:
# use grid search to find the best decision tree model
from sklearn.model_selection import GridSearchCV


# here you must use the step names as the prefix followed by two under_scores to sepecify the parameter names
# you also need to specify the "full path" of the steps
# as you can see we can even grid search parameters for preprocessing pipeline step
param_grid = [
    {
        'svm_clf__kernel': ['linear','poly','rbf'],
        'svm_clf__degree': [1,2,3], 
        'svm_clf__C': [1,3,5],
    }
]

# set up the grid search 
grid_search = GridSearchCV(svm_clf_model, param_grid, cv=5,
                          scoring='accuracy',
                          return_train_score=True)

In [58]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num_pipeline',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('num_imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                       

In [59]:
grid_search.best_params_

{'svm_clf__C': 1, 'svm_clf__degree': 2, 'svm_clf__kernel': 'poly'}

In [60]:
SVM_reg_best = grid_search.best_estimator_
SVM_reg_best

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num_pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('num_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                  

In [63]:
# make predictions for kaggle submission
y_pred_churn = SVM_reg_best.predict(churn_test)

# combine id and prediction for kaggle submission
churn_submit = pd.DataFrame({
    'customerID': churn_test['customerID'],
    'Churn': y_pred_churn
})

# generate the csv
churn_submit.to_csv('churn-submit_SVM.csv', index=False)
print('csv saved! please submit the prediction csv to Kaggle.com')

csv saved! please submit the prediction csv to Kaggle.com
