# 1. Importing Dataset and Data Pre-Processing

In [1]:
# Import library for reading the dataset
import pandas as pd

## Loading the dataset. Make sure that the dataset file is in the same directory as the notebook

In [2]:
# Importing the dataset and store it in variable named dataset
dataset = pd.read_csv('customer-churn-data.csv')
# here 'customer-churn-data.csv' is the dataset file name given as argument to read_csv function

## A peek at the top/head of the dataset

In [3]:
# Show the first rows to get a feel for the columns and their values
dataset.head()
# pandas displays a null or missing value as NaN; none of the rows shown here happens to contain one

Unnamed: 0,Gender,Age,Payment Method,Churn,LastTransaction
0,male,64.0,credit card,loyal,98
1,male,35.0,cheque,churn,118
2,female,25.0,credit card,loyal,107
3,male,39.0,credit card,loyal,90
4,female,28.0,cheque,churn,189


## How to check whether the dataset has missing values????

In [4]:
# isnull().any() will tell, column by column, whether the dataset has any null values
dataset.isnull().any()
# Only the Age column has null values

Gender             False
Age                 True
Payment Method     False
Churn              False
LastTransaction    False
dtype: bool

## How to know how many missing values there are in each column of the dataset?

In [5]:
# isnull().sum() will tell the number of null values in each column
dataset.isnull().sum()
# Age is the only column with missing values (3 of them)

Gender             0
Age                3
Payment Method     0
Churn              0
LastTransaction    0
dtype: int64

## Observe the missing values in the Age column

In [6]:
# Display the frame before imputation; row 7 shows an empty (NaN) Age
dataset

Unnamed: 0,Gender,Age,Payment Method,Churn,LastTransaction
0,male,64.0,credit card,loyal,98
1,male,35.0,cheque,churn,118
2,female,25.0,credit card,loyal,107
3,male,39.0,credit card,loyal,90
4,female,28.0,cheque,churn,189
5,female,21.0,credit card,loyal,102
6,male,48.0,credit card,loyal,141
7,female,,credit card,churn,153
8,male,36.0,credit card,loyal,46
9,male,22.0,credit card,loyal,51


# 2. Taking care of missing data
## Using imputer class that is used for handling missing data

In [7]:
#import required library
# SimpleImputer supersedes the Imputer class that scikit-learn 0.22 removed
from sklearn.impute import SimpleImputer
try:
    # legacy class; only importable on scikit-learn releases older than 0.22
    from sklearn.preprocessing import Imputer
except ImportError:
    pass

## Create imputer object: Arguments of the imputer
### 1. missing_values = NaN                (as we saw earlier, a missing value is stored as NaN in the dataset variable)
### 2. strategy = 'mean'                   (each missing value is replaced with the mean of its column)
### 3. column-wise imputation              (the mean is computed down each column, i.e. along axis 0; the legacy `Imputer` class took this as `axis=0`, while `SimpleImputer` always works per column)

In [8]:
#imputer object creation
# `Imputer` was removed in scikit-learn 0.22; `SimpleImputer` is its replacement.
#   missing_values: NaN is how pandas stores the empty Age cells (the string 'NaN' is not accepted)
#   strategy='mean': fill each gap with the mean of the observed values in that column
# SimpleImputer always imputes column-wise, so the old `axis = 0` argument no longer exists.
imputer = SimpleImputer(missing_values = float('nan'), strategy = 'mean')

## Apply imputer object's fit function to replace the missing values
## Arguments of fit function: columns containing missing values (only Age in our example)

In [9]:
#apply imputer object's fit function
# The double brackets pass Age as a one-column DataFrame (2-D input);
# the object handed back by fit() is rebound to `imputer`
imputer = imputer.fit(dataset[['Age']])

## Replace the old column containing missing values with the same column free of missing values, using the imputer object's transform function

In [10]:
# Overwrite Age in place with the imputed column returned by transform()
dataset[['Age']] = imputer.transform(dataset[['Age']])

## Now check the dataset for missing values

In [11]:
# Re-run the null check after imputation; every column should now report False
dataset.isnull().any()

Gender             False
Age                False
Payment Method     False
Churn              False
LastTransaction    False
dtype: bool

## False indicates no missing values. Now the dataset is free of missing values!

In [12]:
# Display the frame after imputation; row 7's Age, previously NaN, now holds the column mean (45.562011)
dataset

Unnamed: 0,Gender,Age,Payment Method,Churn,LastTransaction
0,male,64.000000,credit card,loyal,98
1,male,35.000000,cheque,churn,118
2,female,25.000000,credit card,loyal,107
3,male,39.000000,credit card,loyal,90
4,female,28.000000,cheque,churn,189
5,female,21.000000,credit card,loyal,102
6,male,48.000000,credit card,loyal,141
7,female,45.562011,credit card,churn,153
8,male,36.000000,credit card,loyal,46
9,male,22.000000,credit card,loyal,51


# 3. Separating the independent and dependent (target) features
## iloc[ :, [ cols ] ] (: indicates all rows, [cols] the required column numbers)
## Independent features stored in NumPy array X


In [13]:
# Feature matrix: columns 0, 1, 2 and 4 (Gender, Age, Payment Method, LastTransaction);
# column 3 (Churn) is left out because it is the target
X=dataset.iloc[:,[0,1,2,4]].values

In [14]:
# Mixing text and numeric columns makes this an object-dtype array
X

array([['male', 64.0, 'credit card', 98],
       ['male', 35.0, 'cheque', 118],
       ['female', 25.0, 'credit card', 107],
       ...,
       ['male', 84.0, 'credit card', 124],
       ['male', 19.0, 'credit card', 49],
       ['female', 24.0, 'credit card', 25]], dtype=object)

## Dependent (target) feature stored in NumPy array y

In [15]:
# Target: column 3 (Churn). Selecting with a list ([3]) keeps the result 2-D,
# i.e. a single column with one row per sample
y=dataset.iloc[:,[3]].values

In [16]:
# y is a single-column 2-D array holding the 'loyal' / 'churn' labels
y

array([['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['churn'],
       ['churn'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['churn'],
       ['churn'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['l

# 4. Categorical encoding

In [17]:
# Import required library
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

## 0th column of X has Gender as non ordinal categorical variable

In [18]:
# Gender column before encoding: the strings 'male' / 'female'
X[:,0]

array(['male', 'male', 'female', 'male', 'female', 'female', 'male',
       'female', 'male', 'male', 'male', 'male', 'female', 'female',
       'male', 'male', 'female', 'female', 'female', 'female', 'male',
       'female', 'male', 'female', 'female', 'female', 'male', 'female',
       'male', 'male', 'male', 'female', 'female', 'female', 'male',
       'female', 'female', 'male', 'female', 'female', 'female', 'male',
       'male', 'male', 'female', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'female', 'male', 'male', 'male', 'male', 'female',
       'female', 'male', 'male', 'female', 'female', 'male', 'male',
       'male', 'male', 'male', 'male', 'female', 'female', 'female',
       'male', 'female', 'male', 'male', 'female', 'female', 'male',
       'female', 'male', 'male', 'female', 'female', 'female', 'male',
       'female', 'male', 'male', 'female', 'male', 'male', 'female',
       'female', 'female', 'female', 'male', 'female', 'male', 'male',
       'fe

## first label encode 0th column of X

In [19]:
#label encoding column 0 (Gender) in X
# fit_transform learns the distinct labels and returns an integer code for each entry
labelencoder_X1= LabelEncoder()
X[:, 0] = labelencoder_X1.fit_transform(X[:, 0])

In [20]:
# Gender column after encoding: 'female' -> 0, 'male' -> 1
X[:,0]

array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,

In [21]:
# Column 0 is numeric now; column 2 (Payment Method) is still text
X

array([[1, 64.0, 'credit card', 98],
       [1, 35.0, 'cheque', 118],
       [0, 25.0, 'credit card', 107],
       ...,
       [1, 84.0, 'credit card', 124],
       [1, 19.0, 'credit card', 49],
       [0, 24.0, 'credit card', 25]], dtype=object)

## Payment Method is another categorical variable, in column 2

In [22]:
# label encoding column 2 (Payment Method) in X
# NOTE: integer codes suggest an ordering that payment methods do not have;
# one-hot encode this column if the downstream model is sensitive to that
labelencoder_X2 = LabelEncoder()
X[:, 2] = labelencoder_X2.fit_transform(X[:, 2])

In [23]:
# Payment Method after encoding: codes 0, 1 and 2 ('cheque' -> 1, 'credit card' -> 2)
X[:, 2]

array([2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2,
       2, 2, 2, 2, 0, 0, 1, 2, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 1, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 0, 0,
       2, 2, 1, 2, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 2, 2, 0, 2, 2, 2, 0, 2,
       2, 1, 2, 0, 0, 2, 0, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 1, 2, 0,
       2, 2, 2, 2, 1, 0, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2,
       2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 1, 2, 0, 0, 2, 0, 0, 2, 0,
       2, 2, 1, 2, 1, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 0, 0,
       2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 1, 0, 2,
       2, 1, 2, 2, 2, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 1, 2, 0, 0, 2, 2, 2,
       2, 2, 2, 0, 1, 2, 2, 2, 0, 2, 0, 2, 1, 2, 0, 1, 2, 0, 2, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 1,
       2, 0, 2, 2, 0, 2, 0, 1, 2, 0, 2, 2, 2, 2, 0,

In [24]:
# Every column of X now holds numbers, although the array dtype is still object
X

array([[1, 64.0, 2, 98],
       [1, 35.0, 1, 118],
       [0, 25.0, 2, 107],
       ...,
       [1, 84.0, 2, 124],
       [1, 19.0, 2, 49],
       [0, 24.0, 2, 25]], dtype=object)

## one hot encoding is left to your preference

## Categorical encoding for the target variable

In [25]:
# Target labels before encoding
y

array([['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['churn'],
       ['churn'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['churn'],
       ['churn'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['churn'],
       ['loyal'],
       ['loyal'],
       ['l

In [26]:
# Encoding the Dependent Variable
# `y` was sliced as a single-column 2-D array; LabelEncoder expects a 1-D array and
# raises a DataConversionWarning otherwise, so flatten it explicitly with ravel().
# The encoded result is the same 1-D integer array as before.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y.ravel())

  y = column_or_1d(y, warn=True)


## If a DataConversionWarning appears, it can be ignored: it only reports that a column vector was flattened to a 1-D array

In [27]:
# Encoded target: 'churn' -> 0, 'loyal' -> 1
y

array([1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,

# 5. Splitting the data into test and train set

In [28]:
# Splitting the dataset into the Training set and Test set
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# train_test_split lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split



## Using train_test_split function
## Arguments:
### 1. X (independent features)
### 2. y (dependent/target feature)
### 3. test_size (fraction of the rows placed in the test set)
### 4. random_state (any fixed integer, e.g. 0, selects the same random rows each time you run the program; None gives a different random split on every run)

## The function performs the split and returns 4 NumPy arrays: X_train, X_test, y_train, y_test

In [29]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [30]:
# Training features (unscaled)
X_train

array([[1, 25.0, 2, 67],
       [1, 81.0, 0, 148],
       [1, 35.0, 1, 118],
       ...,
       [1, 20.0, 0, 26],
       [1, 63.0, 2, 116],
       [1, 58.0, 2, 66]], dtype=object)

In [31]:
# Test features (unscaled)
X_test

array([[0, 61.0, 2, 147],
       [1, 35.0, 2, 69],
       [0, 19.0, 2, 68],
       [0, 18.0, 0, 173],
       [0, 51.0, 2, 179],
       [1, 41.0, 2, 96],
       [1, 18.0, 0, 88],
       [1, 57.0, 0, 136],
       [0, 64.0, 2, 160],
       [1, 62.0, 2, 82],
       [1, 52.0, 1, 166],
       [0, 39.0, 2, 157],
       [1, 40.0, 2, 108],
       [0, 33.0, 2, 162],
       [0, 69.0, 1, 93],
       [0, 59.0, 2, 108],
       [0, 44.0, 0, 118],
       [0, 29.0, 2, 33],
       [0, 84.0, 2, 114],
       [0, 59.0, 0, 119],
       [1, 45.0, 2, 55],
       [1, 42.0, 2, 46],
       [1, 61.0, 2, 94],
       [1, 31.0, 0, 99],
       [1, 34.0, 2, 110],
       [1, 19.0, 2, 19],
       [1, 76.0, 2, 105],
       [1, 23.0, 2, 99],
       [1, 36.0, 2, 107],
       [1, 65.0, 2, 111],
       [0, 52.0, 0, 98],
       [1, 56.0, 2, 43],
       [0, 30.0, 2, 60],
       [0, 39.0, 2, 109],
       [0, 45.0, 2, 154],
       [1, 46.0, 0, 192],
       [1, 37.0, 0, 122],
       [0, 35.0, 2, 28],
       [0, 39.0, 2, 75],
    

In [32]:
# Training labels, returned by the same split call as X_train
y_train

array([1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,

In [33]:
# Test labels, returned by the same split call as X_test
y_test

array([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1], dtype=int64)

## Total samples in dataset

In [34]:
# Row count of the full feature matrix
len(X)

898

## Number of samples in Train set

In [35]:
# Row count of the training split
len(X_train)

718

## Number of samples in Test set

In [36]:
# Row count of the test split; together with the training split it adds up to len(X)
len(X_test)

180

# 6. Feature Scaling (using standardization)

In [37]:
#import required library
from sklearn.preprocessing import StandardScaler

## Scaling the independent features

In [38]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then applied unchanged to both splits, so nothing
# about the test rows influences the scaling.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)



## After Scaling

In [39]:
# Training features after standardisation (now a float array)
X_train

array([[ 0.88911399, -1.06781313,  0.73979254, -1.02802635],
       [ 0.88911399,  1.91672513, -1.46881886,  0.8061851 ],
       [ 0.88911399, -0.53485987, -0.36451316,  0.12684753],
       ...,
       [ 0.88911399, -1.33428976, -1.46881886, -1.95645437],
       [ 0.88911399,  0.95740926,  0.73979254,  0.08155836],
       [ 0.88911399,  0.69093263,  0.73979254, -1.05067094]])

In [40]:
# Test features after applying the scaler that was fitted on the training split
X_test

array([[-1.12471518e+00,  8.50818610e-01,  7.39792537e-01,
         7.83540516e-01],
       [ 8.89113992e-01, -5.34859871e-01,  7.39792537e-01,
        -9.82737179e-01],
       [-1.12471518e+00, -1.38758509e+00,  7.39792537e-01,
        -1.00538176e+00],
       [-1.12471518e+00, -1.44088042e+00, -1.46881886e+00,
         1.37229975e+00],
       [-1.12471518e+00,  3.17865348e-01,  7.39792537e-01,
         1.50816726e+00],
       [ 8.89113992e-01, -2.15087914e-01,  7.39792537e-01,
        -3.71333362e-01],
       [ 8.89113992e-01, -1.44088042e+00, -1.46881886e+00,
        -5.52490048e-01],
       [ 8.89113992e-01,  6.37637305e-01, -1.46881886e+00,
         5.34450072e-01],
       [-1.12471518e+00,  1.01070459e+00,  7.39792537e-01,
         1.07792013e+00],
       [ 8.89113992e-01,  9.04113936e-01,  7.39792537e-01,
        -6.88357563e-01],
       [ 8.89113992e-01,  3.71160674e-01, -3.64513163e-01,
         1.21378765e+00],
       [-1.12471518e+00, -3.21678567e-01,  7.39792537e-01,
      

## No need for Scaling the target feature

# Data Pre-Processing Completed !