In [None]:
# Libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from xgboost import XGBClassifier
import lightgbm as lgb

In [None]:
# Load the train and test CSVs (paths are relative to the notebook's working
# directory) and print the (rows, columns) shape of each frame.
train = pd.read_csv('../customer-churn-prediction-2020/train.csv')
test = pd.read_csv('../customer-churn-prediction-2020/test.csv')
print('Train shape {}'.format(train.shape))
print('Test shape {}'.format(test.shape))

In [None]:
# Show every column when a DataFrame is displayed instead of truncating wide frames
pd.set_option('display.max_columns',None)

## Data Exploration 

In [None]:
# Preview the first rows of the training data
train.head()


In [None]:
# Column dtypes and non-null counts, used here to check for missing values
train.info()

* **There are no missing values in any feature.**

In [None]:
# Summary statistics of the numerical columns
train.describe()

In [None]:
# Summary of the object-typed (string) columns
train.describe(include='O')

### Univariate Analysis

#### 1. Categorical Variables

In [None]:
# Annotate each bar of a bar/count plot with its share of the total.
def with_per(total, axis):
    """Label every bar on ``axis`` with its height as a percentage of ``total``.

    Parameters
    ----------
    total : int or float
        Denominator of the percentage (callers pass the number of rows).
    axis : matplotlib Axes-like
        Object exposing ``patches`` (the bars) and ``annotate``.

    Notes
    -----
    Nothing is annotated when ``total`` is zero, because no percentage is
    defined in that case.
    """
    if not total:
        return
    for p in axis.patches:
        height = p.get_height()
        percentage = '{:.1f}%'.format(100 * height / total)
        # Anchor at the horizontal midpoint of the bar so that ha='center'
        # really centres the label over the bar.
        x = p.get_x() + p.get_width() / 2
        # Annotate the axes that was passed in rather than a global `ax`.
        axis.annotate(percentage, (x, height), ha='center')

In [None]:
# Columns stored with object dtype are treated as the categorical variables
cat_var = [name for name, dtype in train.dtypes.items() if dtype == 'O']
print('List of categorical variables {}'.format(cat_var))

In [None]:
# One count plot per categorical variable; every bar is labelled with its
# percentage of all training rows.
total = len(train)
for feature in cat_var:
    sns.set(style='whitegrid')
    plt.figure(figsize=(20, 5))
    ax = sns.countplot(x=feature, data=train)
    with_per(total, ax)
    plt.show()

* Observations from the plots above:
* 1. 90.7% of customers do not have an international plan.
* 2. 73.8% of customers do not have a voice mail plan.
* 3. 49.6% of customers are in area code area_code_415.
* 4. Only 14.1% of customers churned.


#### 2. Numerical Variables


In [None]:
# Every column that is not object-typed is treated as a numerical feature
num_var = [name for name, dtype in train.dtypes.items() if dtype != 'O']
print('List of Numerical featues {}'.format(num_var))

In [None]:
# Density plot of each numerical feature, one figure per feature.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot / kdeplot replace it) - confirm the installed version before switching.
for feature in num_var:
    density_ax = sns.distplot(train[feature])
    density_ax.set_xlabel(feature)
    density_ax.set_ylabel('Density')
    plt.show()

* 1. number_customer_service_calls, total_intl_calls and number_vmail_messages are not normally distributed, so they can be transformed towards a normal distribution in the feature engineering step.
* 2. All the other features look approximately normally distributed.

### Bivariate Analysis

#### 1. Continuous Features

In [None]:
# Distribution of account_length drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'account_length')
grid.add_legend()
plt.title('Churn rate VS account_length')
# Render the figure explicitly, as the other distribution cells do, so the
# cell shows the plot rather than the repr of the title object.
plt.show()

* Customers with an account length between 60 and 120 have a higher churn rate.

In [None]:
# Distribution of number_vmail_messages drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'number_vmail_messages')
grid.add_legend()
plt.title('Churn rate VS number_vmail_messages')
plt.show()

* The churn rate is higher when number_vmail_messages is 0.

In [None]:
# Distribution of total_day_minutes drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'total_day_minutes')
grid.add_legend()
plt.title('Churn rate VS total day minutes')
plt.show()

* The churn rate is high when total_day_minutes lies between 210 and 300 minutes.

In [None]:
# Distribution of total_day_calls drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'total_day_calls')
grid.add_legend()
plt.title('Churn rate VS total day calls')
plt.show()

* The churn rate is high when total_day_calls lies between 85 and 115.

In [None]:
# Distribution of total_day_charge drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'total_day_charge')
grid.add_legend()
plt.title('Churn rate VS total day charge')
plt.show()

* The churn rate is high when the total day charge lies between 40 and 50.

In [None]:
# Distribution of total_eve_minutes drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'total_eve_minutes')
grid.add_legend()
plt.title('Churn rate VS total evening minutes')
plt.show()

* The churn rate is high when the total evening minutes lie between 180 and 220 minutes.

In [None]:
# Distribution of total_eve_calls drawn separately for each churn class
# (this figure uses height=5, smaller than the sibling plots)
grid = sns.FacetGrid(train, hue='churn', height=5)
grid.map(sns.distplot, 'total_eve_calls')
grid.add_legend()
plt.title('Churn rate VS total evening calls')
plt.show()

* churn rate is high when total evening calls lies between 90 to 115.

In [None]:
# Distribution of total_eve_charge drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'total_eve_charge')
grid.add_legend()
plt.title('Churn rate VS total evening charges')
plt.show()

* The churn rate is high when the total evening charge lies between 15 and 18.


In [None]:
# Distribution of total_night_minutes drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'total_night_minutes')
grid.add_legend()
plt.title('Churn rate VS total night minutes')
plt.show()

* The churn rate is high when total_night_minutes lies between 190 and 220 minutes.

In [None]:
# Distribution of total_night_calls drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'total_night_calls')
grid.add_legend()
plt.title('Churn rate VS total night calls')
plt.show()

* The churn rate is high when total_night_calls lies between 90 and 110.


In [None]:
# Distribution of total_night_charge drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'total_night_charge')
grid.add_legend()
plt.title('Churn rate VS total night charge')
plt.show()

* churn rate is high when total_night_charge lies between 7.5 to 10.

In [None]:
# Distribution of total_intl_minutes drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'total_intl_minutes')
grid.add_legend()
plt.title('Churn rate VS total international minutes')
plt.show()

* churn rate is high when total international minutes lies between 9 to 12.


In [None]:
# Distribution of total_intl_calls drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'total_intl_calls')
grid.add_legend()
plt.title('Churn rate VS total international calls')
plt.show()

* churn rate is high when total international calls is 1.

In [None]:
# Distribution of total_intl_charge drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'total_intl_charge')
grid.add_legend()
plt.title('Churn rate VS total international charge')
plt.show()

* churn rate is high when total international charge is 2.5 to 3.

In [None]:
# Distribution of number_customer_service_calls drawn separately for each churn class
grid = sns.FacetGrid(train, hue='churn', height=7)
grid.map(sns.distplot, 'number_customer_service_calls')
grid.add_legend()
plt.title('Churn rate VS Number of customer service calls')
plt.show()

* churn rate is high when number of customer service calls is 1.

### Outlier Detection


In [None]:
# Box plot of every numerical feature, grouped by churn class, to look for outliers
for feature in num_var:
    if feature == 'churn':
        # churn is the grouping variable, not a feature to plot
        continue
    sns.boxplot(data=train, x='churn', y=feature)
    plt.title(feature)
    plt.show()

* Every feature has outliers, so they need to be handled.
* Outliers can still carry useful information.
* So instead of dropping those rows, we replace the outlying values with a meaningful value: the median of the feature.

#### Removing the outliers

In [None]:
# Replace IQR outliers with the column median (rows are kept, not dropped).
def remove_outliers(train, labels):
    """Return a copy of ``train`` whose outliers in ``labels`` are set to the median.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of its column. Outliers are replaced
    by the column median; no rows are removed.

    Parameters
    ----------
    train : pandas.DataFrame
        Input frame. It is not modified.
    labels : iterable of str
        Names of the numeric columns to process.

    Returns
    -------
    pandas.DataFrame
        A new frame with the outliers of the listed columns replaced.
    """
    # Work on a copy so the caller's frame is never mutated behind its back.
    cleaned = train.copy()
    for label in labels:
        column = cleaned[label]
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        upper_bound = q3 + 1.5 * iqr
        lower_bound = q1 - 1.5 * iqr
        # Take the median once, from the untouched column, so both tails are
        # replaced with the same value.
        median = column.median()
        is_outlier = (column < lower_bound) | (column > upper_bound)
        cleaned[label] = column.mask(is_outlier, median)

    return cleaned

In [None]:
# Replace the IQR outliers of every numerical column with that column's median
train = remove_outliers(train, num_var)

* After replacing the outliers, we re-draw the box plots to check the result.

In [None]:
# Re-draw the per-feature box plots (grouped by churn) after outlier replacement
for feature in num_var:
    if feature == 'churn':
        continue
    sns.boxplot(data=train, x='churn', y=feature)
    plt.title(feature)
    plt.show()

### Handling the Categorical Variable

* The state feature has 51 different categories. One-hot encoding it would create 51 new features, which can lead to overfitting, so hashing encoding is used for the state feature instead.


In [None]:
# Hash-encode the `state` column: the encoder is fitted on the training frame
# and the same fitted encoder is then applied to the test frame.
hash_state = ce.HashingEncoder(cols = 'state')
train = hash_state.fit_transform(train)
test = hash_state.transform(test)
# Preview the encoded training frame
train.head()

In [None]:
# Preview the test frame after hashing `state`
test.head()

In [None]:
# Encode the yes/no columns as 1/0.
# The result is assigned back to the frame: `frame.column.replace(...,
# inplace=True)` acts on an intermediate Series, which pandas warns about and
# which no longer updates the frame once Copy-on-Write is enabled.
yes_no_codes = {'no': 0, 'yes': 1}
for column in ['international_plan', 'voice_mail_plan', 'churn']:
    # astype(int) guarantees an integer column regardless of how replace()
    # handles the object dtype; it also keeps the cell safe to re-run.
    train[column] = train[column].replace(yes_no_codes).astype(int)
# The test frame has no churn column to encode.
for column in ['international_plan', 'voice_mail_plan']:
    test[column] = test[column].replace(yes_no_codes).astype(int)
train.head()

In [None]:
# One-hot encode area_code with an encoder fitted on the training frame
onehot_area = OneHotEncoder().fit(train[['area_code']])
area_columns = onehot_area.categories_[0]

# Train: add one indicator column per category, then drop the source column
encoded_values = onehot_area.transform(train[['area_code']])
train[area_columns] = encoded_values.toarray()
train = train.drop(columns='area_code')

# Test: same fitted encoder, same indicator columns
encoded_values = onehot_area.transform(test[['area_code']])
test[area_columns] = encoded_values.toarray()
test = test.drop(columns='area_code')

In [None]:
# Preview the training frame after all categorical encodings
train.head()

In [None]:
# Preview the test frame after all categorical encodings
test.head()

### Handling the Imbalanced dataset

In [None]:
# Count of rows per churn class, to show the class imbalance
sns.countplot(x = 'churn', data = train)
plt.show()

* 0 represents no churn and 1 represents churn. There is a large difference between the class counts, so the dataset needs to be balanced.
* We use upsampling (SMOTE) to balance the training data.

In [None]:
# Separate the target from the features as NumPy arrays
y = train['churn'].values
x = train.drop(columns='churn').values
# Keep the submission ids aside; `id` is not a model feature
id_submission = test['id']
test = test.drop(columns='id')
# Hold out 30% of the labelled rows for validation (fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

In [None]:
# Class counts in the training split before resampling
print('Before upsampling count of label 0 {}'.format((y_train == 0).sum()))
print('Before upsampling count of label 1 {}'.format((y_train == 1).sum()))

# Synthetic Minority Over-sampling Technique: x_train_s / y_train_s hold the
# resampled training split
sm = SMOTE(sampling_strategy=1, random_state=1)
x_train_s, y_train_s = sm.fit_resample(x_train, y_train.ravel())

# Class counts after resampling
print('After upsampling count of label 0 {}'.format((y_train_s == 0).sum()))
print('After upsampling count of label 1 {}'.format((y_train_s == 1).sum()))

* After applying the upsampling technique, both classes have the same number of samples.

### Scaling the dataset

In [None]:
# Fit the min-max scaler on the training split only, then apply that same
# scaling to the training split, the hold-out split and the submission features
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
test = scaler.transform(test)

### Building the model

#### Support Vector Classification

In [None]:
# Fit on the SMOTE-balanced training set (x_train_s, y_train_s) instead of the
# imbalanced split. The balanced rows are passed through the already-fitted
# scaler so they are on the same feature scale as x_test.
x_train_bal = scaler.transform(x_train_s)
svc = SVC(kernel='rbf', decision_function_shape='ovr')
svc.fit(x_train_bal, y_train_s)

# Evaluate on the untouched hold-out split
y_pred = svc.predict(x_test)
print('Accuracy: ')
print(accuracy_score(y_test, y_pred))
print('Classification report: ')
print(classification_report(y_test, y_pred))
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print('Cohen kappa score: ')
print(cohen_kappa_score(y_test, y_pred))

#### Random Forest Classifier

In [None]:
# Fit on the SMOTE-balanced training set (x_train_s, y_train_s) instead of the
# imbalanced split. The balanced rows are passed through the already-fitted
# scaler so they are on the same feature scale as x_test.
x_train_bal = scaler.transform(x_train_s)
# A fixed seed makes the forest, and everything predicted from it, reproducible.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(x_train_bal, y_train_s)

# Evaluate on the untouched hold-out split
y_pred = rfc.predict(x_test)
print('Accuracy: ')
print(accuracy_score(y_test, y_pred))
print('Classification report: ')
print(classification_report(y_test, y_pred))
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print('Cohen kappa score: ')
print(cohen_kappa_score(y_test, y_pred))

### XGBClassifier

In [None]:
# Fit on the SMOTE-balanced training set (x_train_s, y_train_s) instead of the
# imbalanced split. The balanced rows are passed through the already-fitted
# scaler so they are on the same feature scale as x_test.
x_train_bal = scaler.transform(x_train_s)
clf = XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.7,
                    subsample=0.8, nthread=10, learning_rate=0.01)
clf.fit(x_train_bal, y_train_s)

# Evaluate on the untouched hold-out split
y_pred = clf.predict(x_test)
print('Accuracy: ')
print(accuracy_score(y_test, y_pred))
print('Classification report: ')
print(classification_report(y_test, y_pred))
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
print('Cohen kappa score: ')
print(cohen_kappa_score(y_test, y_pred))

Finally, add a classic LightGBM classifier. It could be improved further with hyperparameter optimization.

In [None]:
# LightGBM baseline, with is_unbalance=True set for the imbalanced target
model = lgb.LGBMClassifier(is_unbalance=True)
# Train on every labelled row (`x`, `y`). The features go through the fitted
# scaler because the submission features in `test` were scaled the same way.
# (The previous call referenced `X`, which is never defined in this notebook.)
model.fit(scaler.transform(x), y)

In [None]:
# `test` is a scaled NumPy array by now and its ids live in `id_submission`,
# so the predictions go into a separate frame instead of a new `test` column.
# `test` itself is left untouched for the cells that still predict from it.
lgbm_submission = pd.DataFrame({'id': id_submission, 'churn': model.predict(test)})
lgbm_submission['churn'] = lgbm_submission['churn'].apply(
    lambda label: 'yes' if label == 1 else 'no'
)

In [None]:
# Write the LightGBM submission file. `test` is a NumPy array without an `id`
# column at this point, so the frame is built from `id_submission` and fresh
# predictions, with labels mapped back to 'yes' / 'no'.
pd.DataFrame({
    'id': id_submission,
    'churn': np.where(model.predict(test) == 1, 'yes', 'no'),
}).to_csv('submission.csv', index=False)

In [53]:
# Predict churn for the submission rows with the random forest
y_pred_sub = rfc.predict(test)


In [54]:
# Pair each submission id with its predicted label and preview the last rows
submit = pd.DataFrame({'id':id_submission, 'churn':y_pred_sub})
submit.tail()

Unnamed: 0,id,churn
745,746,0
746,747,0
747,748,0
748,749,0
749,750,0


In [None]:
# Replace 0 with 'no' and 1 with 'yes'.
# The result is assigned back because `submit.churn.replace(..., inplace=True)`
# acts on an intermediate Series and does not reliably update the frame.
submit['churn'] = submit['churn'].replace({0: 'no', 1: 'yes'})

In [None]:
# Display the final submission frame
submit