# Set Up

In [1]:
#Data
import pandas as pd
import numpy as np
#Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
#Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#Selection
from sklearn.model_selection import train_test_split, GridSearchCV

def obj_to_float(x):
    """Convert ``x`` to a float, returning ``None`` when it cannot be converted.

    Parameters
    ----------
    x : object
        Value to convert (e.g. a numeric string, a number, or a blank string).

    Returns
    -------
    float or None
        ``float(x)`` on success; ``None`` if ``float`` rejects the value.
    """
    try:
        return float(x)
    # Catch only the errors float() itself raises, so that KeyboardInterrupt,
    # SystemExit and genuine programming errors are no longer swallowed.
    except (TypeError, ValueError, OverflowError):
        return None

# Load the customer table; customerID is dropped straight away.
cust_info = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv').drop(columns=['customerID'])

# Split off the target: churn is y, the remaining columns of cust_info are X.
# The labels are kept exactly as read from the CSV (no label encoding is applied).
churn = cust_info.pop('Churn')

# Coerce TotalCharges to floats; entries that cannot be parsed become missing values.
cust_info = cust_info.assign(
    TotalCharges=cust_info['TotalCharges'].apply(obj_to_float)
)

cust_info.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [2]:
# Categorical feature columns.
cat_feat = [
    'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# Numerical feature columns.
num_feat = ['TotalCharges', 'SeniorCitizen', 'MonthlyCharges', 'tenure']

# Every column must be listed as either a categorical or a numerical feature.
assert set(cust_info.columns) == set(num_feat) | set(cat_feat)

# Models

## Numerical Features

In [3]:
# Hold out 20% of the rows for evaluation. The split is seeded so that the
# scores printed below are reproducible across kernel restarts.
num_train, num_test, churn_train, churn_test = train_test_split(
    cust_info[num_feat], churn, test_size=0.2, random_state=42)

# Baseline model on the numerical features only:
# fill missing values -> standardise -> logistic regression.
num_clf=Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler()),
    ('classifier',LogisticRegression(solver='lbfgs'))
])

num_clf.fit(num_train,churn_train)
# Pipeline.score delegates to the final estimator's score method.
print('Train score: %0.5f' %num_clf.score(num_train,churn_train))
print('Test score: %0.5f' %num_clf.score(num_test,churn_test))

Train score: 0.78896
Test score: 0.79489


In [4]:
# Display the pipeline's steps by name, together with each estimator's parameters.
num_clf.named_steps


{'imputer': SimpleImputer(add_indicator=False, copy=True, fill_value=None,
               missing_values=nan, strategy='most_frequent', verbose=0),
 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False)}

## Combining Features

Sources demonstrating how to combine features:

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py

https://medium.com/bigdatarepublic/integrating-pandas-and-scikit-learn-with-pipelines-f70eb6183696
