In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

## 1. Import the data

In [None]:
# Read the Telco customer-churn CSV from the relative ../input directory
df = pd.read_csv("../input/WA_Fn-UseC_-Telco-Customer-Churn.csv")
# Show the first rows as the cell output
df.head()

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts)
df.info()

In [None]:
# Cast TotalCharges to a numeric dtype; with errors='coerce' any value that
# cannot be parsed becomes NaN instead of raising
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors = 'coerce')

In [None]:
# Summary statistics for the SeniorCitizen column
df['SeniorCitizen'].describe()

In [None]:
# Count the missing values in each column
df.isnull().sum(axis = 0)

In [None]:
# Discard every row that contains at least one missing value
df = df.dropna(axis = 0, how = 'any')

## 2. Exploratory Data Analysis

We can divide the variables into two kinds: continuous variables and discrete variables (referred to as "binary" variables in this notebook). Let's check the binary variables first.

In [None]:
# Display the column labels of the cleaned frame
df.columns

In [None]:
# Re-check the SeniorCitizen summary statistics now that rows with missing
# values have been dropped
df['SeniorCitizen'].describe()

In [None]:
# First pass at grouping the columns, purely by dtype: everything stored as
# 'object' goes to X_binary, every other column goes to X_numeric
X_numeric = [name for name, dtype in df.dtypes.items() if dtype != 'object']
X_binary = [name for name, dtype in df.dtypes.items() if dtype == 'object']

print("There are {} numeric variables with {}".format(len(X_numeric), X_numeric))
print("There are {} binary variables with {}".format(len(X_binary), X_binary))

In [None]:
# Refine the dtype-based grouping by column meaning:
#   * 'customerID' (identifier) and 'Churn' (target) are not features, so they
#     are removed from the categorical list;
#   * 'SeniorCitizen' is stored as a number but is treated as a categorical
#     flag, so it moves from the continuous list to the categorical list.
# Columns are selected by name rather than by position so that re-running this
# cell leaves both lists unchanged instead of stripping further columns.
# NOTE(review): assumes customerID/Churn were the first/last object columns and
# SeniorCitizen the first numeric column, as the positional slicing this
# replaces implied - confirm against the printed lists of the previous cell.
X_binary = [col for col in X_binary if col not in ('customerID', 'Churn')]
X_numeric = [col for col in X_numeric if col != 'SeniorCitizen']
if 'SeniorCitizen' not in X_binary:
    X_binary.append('SeniorCitizen')

print("There are {} continous variables with {}".format(len(X_numeric), X_numeric))
print("There are {} binary variables with {}".format(len(X_binary), X_binary))

In [None]:
# Distribution of the categorical variables: one count plot per variable on a
# 4 x 4 grid, coloured by churn outcome
fig, axes = plt.subplots(nrows = 4, ncols = 4, figsize = (20, 20))
# Spacing is a figure-level setting, so it is applied once rather than per panel
fig.subplots_adjust(wspace = .5, hspace = .5)

# axes.ravel() walks the grid row by row; zip stops at the shorter sequence, so
# a list with fewer than 16 variables no longer raises IndexError
for ax, col in zip(axes.ravel(), X_binary):
    sns.countplot(x = col, data = df, hue = 'Churn', dodge = False, ax = ax)
    # Title the panel that was just drawn; plt.title() would instead target the
    # figure's current axes (the last subplot created) on every iteration
    ax.set_title(str(col))

In [None]:
# Distribution of the continuous variables: one kernel density estimate per
# variable, stacked vertically
fig, axes = plt.subplots(nrows = 3, figsize = (10, 5))
# Spacing is a figure-level setting, so it is applied once rather than per panel
fig.subplots_adjust(wspace = .5, hspace = .5)

for ax, col in zip(axes, X_numeric):
    # `shade` is an on/off flag, so pass a boolean rather than a colour string.
    # NOTE(review): newer seaborn releases rename this keyword to `fill` -
    # switch once the environment's seaborn version is confirmed.
    sns.kdeplot(df[col], shade = True, ax = ax)
    ax.set_title(col)
    plt.subplots_adjust(wspace = .5, hspace = .5)

## 3. Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Encode the Churn labels as integer class codes; `y` is the target array used
# by every model further down
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(df['Churn'])

In [None]:
# Remove the identifier and the target so that only feature columns remain
df = df.drop(labels = ['customerID', 'Churn'], axis = 1)

# Replace the labels of every categorical feature with integer codes.
# The encoder is refitted per column, so after the loop it only remembers the
# classes of the last column processed.
# NOTE(review): integer codes impose an arbitrary order on any feature with
# more than two categories - consider one-hot encoding for linear models.
for col in X_binary:
    encoded = labelencoder.fit_transform(df[col])
    df[col] = encoded

df.head()

In [None]:
# Standardize the continuous features with StandardScaler.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split performed later in the notebook, so test-set statistics influence the
# scaling - consider fitting on the training rows only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[X_numeric])
df[X_numeric] = scaled_values

# X is just another name for the (now fully numeric) feature frame
X = df
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 42,
)

## 4. Modeling

In [None]:
# Candidate classifiers keyed by a short display name; each one is fitted and
# scored in the loop of the next cell.
# The randomized ensemble models get a fixed random_state (the same value used
# for the train/test split) so that the reported accuracies are reproducible
# from run to run.
model_dic = {'logistic': LogisticRegression(),
             'gaussian': GaussianNB(),
             'randomforest' : RandomForestClassifier(random_state = 42),
             'adaboost' : AdaBoostClassifier(random_state = 42),
             'gradientboost' : GradientBoostingClassifier(random_state = 42),
             'xgb' : XGBClassifier(objective = 'binary:logistic')}

In [None]:
# Collect each model's test-set predictions next to the true labels, printing
# the test accuracy of every model as it is fitted
pred_df = pd.DataFrame({'actual': y_test})

for name, estimator in model_dic.items():
    estimator.fit(X_train, y_train)
    test_pred = estimator.predict(X_test)
    pred_df[name] = test_pred
    print(name, ": ", accuracy_score(y_test, test_pred))

## 5. Modeling with Keras

In [None]:
# Number of feature columns, i.e. the width of the network's input layer
input_node = X_train.shape[-1]
input_node

In [None]:
# Build a small fully connected binary classifier:
# input (input_node features) -> 10 ReLU units -> 5 ReLU units -> 1 sigmoid unit
model = Sequential()

# `units` is the Keras 2 name for the layer width; the Keras 1 spelling
# `output_dim` is no longer accepted by current releases
model.add(Dense(units = 10, activation = 'relu', input_dim = input_node))
model.add(Dense(units = 5, activation = 'relu'))
model.add(Dense(units = 1, activation = 'sigmoid'))

# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Stop when the training loss has not improved for 5 consecutive epochs.
# The callback's default monitor is 'val_loss', which only exists when fit()
# is given validation data; this notebook's fit() call passes none, so the
# training loss is monitored explicitly.
early_stopper = EarlyStopping(monitor = 'loss', patience = 5)

# Print the layer-by-layer summary
model.summary()

In [None]:
# Train for at most 50 epochs in mini-batches of 10, with the early-stopping
# callback attached
model.fit(X_train, y_train, batch_size = 10, epochs = 50, callbacks = [early_stopper])

# The network ends in a single sigmoid unit, so predict() already yields the
# churn probability per row; the legacy predict_proba() wrapper is not
# available on current Keras models
y_pred_ann = model.predict(X_test)

In [None]:
# Turn the predicted probabilities into 0/1 class labels with a 0.5 cut-off.
# The network output has one column per output unit, so it is flattened to
# 1-D before being stored as a DataFrame column instead of relying on pandas
# to squeeze a 2-D array implicitly.
pred_df['keras'] = (y_pred_ann >= .5).astype(int).ravel()

In [None]:
# Preview the first rows of the prediction table (true labels plus one column per model)
pred_df.head()

In [None]:
# Accuracy of the Keras network on the test set
accuracy_score(y_test, pred_df['keras'])

In [None]:
# Class balance of the true labels and of every model's predictions, one count
# plot per column of pred_df on a 4 x 2 grid
fig, axs = plt.subplots(nrows = 4, ncols = 2, figsize = (15, 15))
# Spacing is a figure-level setting, so it is applied once rather than per panel
fig.subplots_adjust(wspace = .5, hspace = .5)

# axs.ravel() walks the grid row by row, matching the column order of pred_df
for ax, col in zip(axs.ravel(), pred_df.columns):
    sns.countplot(x = col, data = pred_df, ax = ax)
    # Title the panel that was just drawn; plt.title() would instead target the
    # figure's current axes (the last subplot created) on every iteration
    ax.set_title(col)

* Reference: [Kernel by Nilan](https://www.kaggle.com/nilanml/telecom-customer-churn-voting-80-1-accuracy)