In [1]:
# Import required python libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the Leads dataset from 'Leads.csv' (path is relative to the working directory)
leads = pd.read_csv('Leads.csv')

# Display the first five rows of the dataset
leads.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [3]:
# Shape of the dataset: 9240 rows and 37 columns
leads.shape

(9240, 37)

In [4]:
# Print the names of all 37 columns
leads.columns

Index(['Prospect ID', 'Lead Number', 'Lead Origin', 'Lead Source',
       'Do Not Email', 'Do Not Call', 'Converted', 'TotalVisits',
       'Total Time Spent on Website', 'Page Views Per Visit', 'Last Activity',
       'Country', 'Specialization', 'How did you hear about X Education',
       'What is your current occupation',
       'What matters most to you in choosing a course', 'Search', 'Magazine',
       'Newspaper Article', 'X Education Forums', 'Newspaper',
       'Digital Advertisement', 'Through Recommendations',
       'Receive More Updates About Our Courses', 'Tags', 'Lead Quality',
       'Update me on Supply Chain Content', 'Get updates on DM Content',
       'Lead Profile', 'City', 'Asymmetrique Activity Index',
       'Asymmetrique Profile Index', 'Asymmetrique Activity Score',
       'Asymmetrique Profile Score',
       'I agree to pay the amount through cheque',
       'A free copy of Mastering The Interview', 'Last Notable Activity'],
      dtype='object')

In [5]:
# Check the dtype and non-null count of every column to spot missing values
leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
Prospect ID                                      9240 non-null object
Lead Number                                      9240 non-null int64
Lead Origin                                      9240 non-null object
Lead Source                                      9204 non-null object
Do Not Email                                     9240 non-null object
Do Not Call                                      9240 non-null object
Converted                                        9240 non-null int64
TotalVisits                                      9103 non-null float64
Total Time Spent on Website                      9240 non-null int64
Page Views Per Visit                             9103 non-null float64
Last Activity                                    9137 non-null object
Country                                          6779 non-null object
Specialization                                   7802 

Data Cleaning and Preparation

In [6]:
# Count the missing values in each column
leads.isnull().sum()

Prospect ID                                         0
Lead Number                                         0
Lead Origin                                         0
Lead Source                                        36
Do Not Email                                        0
Do Not Call                                         0
Converted                                           0
TotalVisits                                       137
Total Time Spent on Website                         0
Page Views Per Visit                              137
Last Activity                                     103
Country                                          2461
Specialization                                   1438
How did you hear about X Education               2207
What is your current occupation                  2690
What matters most to you in choosing a course    2709
Search                                              0
Magazine                                            0
Newspaper Article           

From the results above, it is clear that columns with a very high number of null values are not useful, so let's drop every column whose null count exceeds 3000 (roughly a third of the 9240 rows) and see what remains.

In [7]:
# Drop every column that has more than 3000 missing values.
# The columns are selected in one vectorized step and removed through the
# `columns=` keyword: pandas 2.0 no longer accepts the axis as a positional
# argument (`drop(label, 1)` raises TypeError there).
null_counts = leads.isnull().sum()
sparse_columns = null_counts[null_counts > 3000].index
leads.drop(columns=sparse_columns, inplace=True)

In [8]:
# Re-check the missing-value count per column after dropping the sparse columns
leads.isnull().sum()

Prospect ID                                         0
Lead Number                                         0
Lead Origin                                         0
Lead Source                                        36
Do Not Email                                        0
Do Not Call                                         0
Converted                                           0
TotalVisits                                       137
Total Time Spent on Website                         0
Page Views Per Visit                              137
Last Activity                                     103
Country                                          2461
Specialization                                   1438
How did you hear about X Education               2207
What is your current occupation                  2690
What matters most to you in choosing a course    2709
Search                                              0
Magazine                                            0
Newspaper Article           

In [9]:
# Remove the 'City' and 'Country' columns from the dataset
leads.drop(columns=['City', 'Country'], inplace=True)

In [10]:
# leads.drop(['Country'], axis = 1, inplace = True)

In [11]:
# Percentage of missing values in each column, rounded to 2 decimals
null_share = leads.isnull().sum() / len(leads.index)
(100 * null_share).round(2)

Prospect ID                                       0.00
Lead Number                                       0.00
Lead Origin                                       0.00
Lead Source                                       0.39
Do Not Email                                      0.00
Do Not Call                                       0.00
Converted                                         0.00
TotalVisits                                       1.48
Total Time Spent on Website                       0.00
Page Views Per Visit                              1.48
Last Activity                                     1.11
Specialization                                   15.56
How did you hear about X Education               23.89
What is your current occupation                  29.11
What matters most to you in choosing a course    29.32
Search                                            0.00
Magazine                                          0.00
Newspaper Article                                 0.00
X Educatio

In [12]:
# Print the frequency of every level for each column, one column at a time,
# with a separator line after each column

for col_name in leads.columns:
    level_counts = leads[col_name].astype('category').value_counts()
    print(level_counts)
    print('___________________________________________________')

fffb0e5e-9f92-4017-9f42-781a69da4154    1
56453aec-3f7b-4f30-870c-8f966d393100    1
53ac14bd-2bb2-4315-a21c-94562d1b6b2d    1
53aabd84-5dcc-4299-bbe3-62f3764b07b1    1
539ffa32-1be7-4fe1-b04c-faf1bab763cf    1
539eb309-df36-4a89-ac58-6d3651393910    1
5398e7ff-74db-4074-89fb-4fd9a603f521    1
53953744-234a-4cb9-9af4-bcc47eb472f4    1
539366d9-f633-455a-99e4-dbc5907db28e    1
5390c5fe-b12c-4f6e-ae92-908672abb0a1    1
5379ee79-64b7-44f8-8c56-0e1ca2d5b887    1
537963c8-22d9-459d-8aae-ddac40580ffb    1
53744d5a-0483-42c0-80b0-8990a4d2356d    1
53715ab1-2106-4c4e-8493-81cc465eb9ce    1
536cdc6b-f4c1-449d-bfd8-9ef0ac912dbb    1
53690d88-52f0-4ce5-b6b8-a13570a6db35    1
5363bd79-576c-48ed-83e4-024c81ea00c5    1
53c4e210-3344-4737-813f-74ef9a747ab6    1
53dbb914-71e7-458a-9749-cfb4d655eac2    1
53dd16bd-8201-448d-8e20-97de1cf44a7f    1
541325bd-15bb-4b52-8ad9-3fdf3cb1dd55    1
5434ccf3-9de6-4c72-8dd6-66c2829d0ee2    1
542a0891-2e52-40ba-ab42-e468b9636322    1
54238b21-65ce-4304-98c6-0f8a6b9671

Three columns (Lead Profile, How did you hear about X Education, and Specialization) contain the level 'Select'. 'Select' is the most common value in each of them, and it simply means that the student did not choose an option for that field, so these values can be treated as missing. Let's count the 'Select' values in these three columns.

In [13]:
# Frequency of each level of 'Lead Profile' (including the placeholder level 'Select')
leads['Lead Profile'].astype('category').value_counts()

Select                         4146
Potential Lead                 1613
Other Leads                     487
Student of SomeSchool           241
Lateral Student                  24
Dual Specialization Student      20
Name: Lead Profile, dtype: int64

In [14]:
# Frequency of each level of 'How did you hear about X Education' (including 'Select')
leads['How did you hear about X Education'].value_counts()

Select                   5043
Online Search             808
Word Of Mouth             348
Student of SomeSchool     310
Other                     186
Multiple Sources          152
Advertisements             70
Social Media               67
Email                      26
SMS                        23
Name: How did you hear about X Education, dtype: int64

In [15]:
# Frequency of each level of 'Specialization' (including 'Select')
leads['Specialization'].value_counts()

Select                               1942
Finance Management                    976
Human Resource Management             848
Marketing Management                  838
Operations Management                 503
Business Administration               403
IT Projects Management                366
Supply Chain Management               349
Banking, Investment And Insurance     338
Media and Advertising                 203
Travel and Tourism                    203
International Business                178
Healthcare Management                 159
Hospitality Management                114
E-COMMERCE                            112
Retail Management                     100
Rural and Agribusiness                 73
E-Business                             57
Services Excellence                    40
Name: Specialization, dtype: int64

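In both Lead Profile and How did you hear about X Education, 'Select' is by far the most frequent value; once 'Select' is counted as missing together with the existing nulls, more than 70% of the 9240 values in each column are missing (about 74% and 78% respectively). So let's drop these 2 columns.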

Let's also drop the columns where the value is 'No'. These columns are: Do Not Call, Search, Magazine, Newspaper Article, X Education Forums, Newspaper, Digital Advertisement, Through Recommendations, Receive More Updates About Our Courses, Update me on Supply Chain Content, Get updates on DM Content, I agree to pay the amount through cheque.

Also, let's drop the column "What matters most to you in choosing a course": of its 3 categories, "Better Career Prospects" was selected 6528 times, while the other two categories were selected only 1 time and 2 times.

Let's drop all of the columns mentioned above, as they will not help the analysis.

In [16]:
# Columns to remove: the 'No'-valued flag columns, the single-category column
# 'What matters most to you in choosing a course', and the two 'Select'-heavy columns
cols_to_drop = [
    'Do Not Call', 'Search', 'Magazine', 'Newspaper Article', 'X Education Forums', 'Newspaper',
    'Digital Advertisement', 'Through Recommendations', 'Receive More Updates About Our Courses',
    'Update me on Supply Chain Content', 'Get updates on DM Content',
    'I agree to pay the amount through cheque', 'What matters most to you in choosing a course',
    'Lead Profile', 'How did you hear about X Education',
]
leads.drop(columns=cols_to_drop, inplace=True)

In [17]:
# Re-check the missing-value count per column after dropping the uninformative columns
leads.isnull().sum()

Prospect ID                                  0
Lead Number                                  0
Lead Origin                                  0
Lead Source                                 36
Do Not Email                                 0
Converted                                    0
TotalVisits                                137
Total Time Spent on Website                  0
Page Views Per Visit                       137
Last Activity                              103
Specialization                            1438
What is your current occupation           2690
A free copy of Mastering The Interview       0
Last Notable Activity                        0
dtype: int64

In [18]:
# Keep only the rows where 'What is your current occupation' is present
has_occupation = leads['What is your current occupation'].notnull()
leads = leads[has_occupation]

In [19]:
# Re-check the missing-value count per column after removing rows with a null occupation
leads.isnull().sum()

Prospect ID                                 0
Lead Number                                 0
Lead Origin                                 0
Lead Source                                36
Do Not Email                                0
Converted                                   0
TotalVisits                               130
Total Time Spent on Website                 0
Page Views Per Visit                      130
Last Activity                             103
Specialization                             18
What is your current occupation             0
A free copy of Mastering The Interview      0
Last Notable Activity                       0
dtype: int64

Now the dataset has very few null values left; the null counts of the other columns also decreased after removing the rows where 'What is your current occupation' was null. Now let's remove the rows with null values in the columns 'Lead Source', 'TotalVisits', 'Page Views Per Visit', 'Last Activity' and 'Specialization', one column at a time, starting with the highest null count, i.e. 'TotalVisits' with 130.

In [20]:
# Keep only the rows where 'TotalVisits' is present
leads = leads[leads['TotalVisits'].notnull()]

In [21]:
# Re-check the missing-value count per column after removing rows with null 'TotalVisits'
leads.isnull().sum()

Prospect ID                                0
Lead Number                                0
Lead Origin                                0
Lead Source                               29
Do Not Email                               0
Converted                                  0
TotalVisits                                0
Total Time Spent on Website                0
Page Views Per Visit                       0
Last Activity                              0
Specialization                            18
What is your current occupation            0
A free copy of Mastering The Interview     0
Last Notable Activity                      0
dtype: int64

In [22]:
# Keep only the rows where 'Lead Source' is present, then re-check the null counts
has_lead_source = leads['Lead Source'].notnull()
leads = leads[has_lead_source]
leads.isnull().sum()

Prospect ID                                0
Lead Number                                0
Lead Origin                                0
Lead Source                                0
Do Not Email                               0
Converted                                  0
TotalVisits                                0
Total Time Spent on Website                0
Page Views Per Visit                       0
Last Activity                              0
Specialization                            18
What is your current occupation            0
A free copy of Mastering The Interview     0
Last Notable Activity                      0
dtype: int64

In [23]:
# Keep only the rows where 'Specialization' is present, then re-check the null counts
has_specialization = leads['Specialization'].notnull()
leads = leads[has_specialization]
leads.isnull().sum()

Prospect ID                               0
Lead Number                               0
Lead Origin                               0
Lead Source                               0
Do Not Email                              0
Converted                                 0
TotalVisits                               0
Total Time Spent on Website               0
Page Views Per Visit                      0
Last Activity                             0
Specialization                            0
What is your current occupation           0
A free copy of Mastering The Interview    0
Last Notable Activity                     0
dtype: int64

At last we have a dataset with zero null values. Let's proceed with further analysis.

In [24]:
# Number of rows left after cleaning, and the fraction of the original 9240 rows retained
rows_left = len(leads.index)
print(rows_left)
print(rows_left / 9240)

6373
0.6897186147186147


In [25]:
# First five rows of the dataset after the column and row removals above
leads.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Specialization,What is your current occupation,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,0,0.0,0,0.0,Page Visited on Website,Select,Unemployed,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,0,5.0,674,2.5,Email Opened,Select,Unemployed,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,1,2.0,1532,2.0,Email Opened,Business Administration,Student,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,0,1.0,305,1.0,Unreachable,Media and Advertising,Unemployed,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,1,2.0,1428,1.0,Converted to Lead,Select,Unemployed,No,Modified


In [26]:
# Drop the two identifier columns and preview the result.
# `columns=` replaces the positional axis (`drop(labels, 1)`), which pandas 2.0
# rejects with a TypeError. The result is reassigned instead of using
# inplace=True because `leads` is a filtered frame at this point.
leads = leads.drop(columns=['Prospect ID', 'Lead Number'])
leads.head()

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Specialization,What is your current occupation,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,No,0,0.0,0,0.0,Page Visited on Website,Select,Unemployed,No,Modified
1,API,Organic Search,No,0,5.0,674,2.5,Email Opened,Select,Unemployed,No,Email Opened
2,Landing Page Submission,Direct Traffic,No,1,2.0,1532,2.0,Email Opened,Business Administration,Student,Yes,Email Opened
3,Landing Page Submission,Direct Traffic,No,0,1.0,305,1.0,Unreachable,Media and Advertising,Unemployed,No,Modified
4,Landing Page Submission,Google,No,1,2.0,1428,1.0,Converted to Lead,Select,Unemployed,No,Modified


Let's create dummy variables for the categorical variables. First, let's identify the categorical variables.

In [27]:
# Select the columns whose dtype is 'object' and list their names

is_object = leads.dtypes == 'object'
obj = leads.loc[:, is_object]
obj.columns

Index(['Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
       'Specialization', 'What is your current occupation',
       'A free copy of Mastering The Interview', 'Last Notable Activity'],
      dtype='object')

In [28]:
# One-hot encode the listed categorical columns ('Specialization' is not included here).
# drop_first=True leaves out the first level of every encoded column.
categorical_cols = [
    'Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
    'What is your current occupation', 'A free copy of Mastering The Interview',
    'Last Notable Activity',
]
dummy_var = pd.get_dummies(leads[categorical_cols], drop_first=True)

# Append the dummy columns to the master dataframe
leads = pd.concat([leads, dummy_var], axis=1)

In [29]:
# Create dummy variables for 'Specialization' on its own: the column contains the
# placeholder level 'Select', so that level is dropped explicitly by name.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument (`drop(labels, 1)` raises TypeError there).

dummy_spl = pd.get_dummies(leads['Specialization'], prefix='Specialization')
dummy_spl = dummy_spl.drop(columns=['Specialization_Select'])
leads = pd.concat([leads, dummy_spl], axis=1)

In [30]:
# Drop the original categorical columns now that dummy variables exist for them.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.

leads = leads.drop(columns=['Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
                            'Specialization', 'What is your current occupation',
                            'A free copy of Mastering The Interview', 'Last Notable Activity'])

In [31]:
# First five rows of the dataset after dummy encoding
leads.head()

Unnamed: 0,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Source_Direct Traffic,Lead Source_Facebook,Lead Source_Google,...,Specialization_IT Projects Management,Specialization_International Business,Specialization_Marketing Management,Specialization_Media and Advertising,Specialization_Operations Management,Specialization_Retail Management,Specialization_Rural and Agribusiness,Specialization_Services Excellence,Specialization_Supply Chain Management,Specialization_Travel and Tourism
0,0,0.0,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,5.0,674,2.5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,2.0,1532,2.0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1.0,305,1.0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,2.0,1428,1.0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Split the dataset into Train and Test sets
# Import the required library

from sklearn.model_selection import train_test_split

In [33]:
# Store the feature variables in X: every column except the target 'Converted'.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.

X = leads.drop(columns=['Converted'])
X.head()

Unnamed: 0,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Source_Direct Traffic,Lead Source_Facebook,Lead Source_Google,Lead Source_Live Chat,...,Specialization_IT Projects Management,Specialization_International Business,Specialization_Marketing Management,Specialization_Media and Advertising,Specialization_Operations Management,Specialization_Retail Management,Specialization_Rural and Agribusiness,Specialization_Services Excellence,Specialization_Supply Chain Management,Specialization_Travel and Tourism
0,0.0,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,674,2.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,1532,2.0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,305,1.0,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,2.0,1428,1.0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Store the target variable 'Converted' in y and preview it
y = leads['Converted']
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Converted, dtype: int64

In [35]:
# Split features and target into 70% train / 30% test with a fixed seed (random_state=100)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [36]:
# Rescaling the numeric variables present in the dataset.

# Import MinMax scaler
from sklearn.preprocessing import MinMaxScaler

In [37]:
# Scale the three numeric columns of X_train with MinMaxScaler.
# The scaler is fitted on the training data only.

scaler = MinMaxScaler()

num_cols = ['TotalVisits', 'Page Views Per Visit', 'Total Time Spent on Website']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

X_train.head()

Unnamed: 0,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Source_Direct Traffic,Lead Source_Facebook,Lead Source_Google,Lead Source_Live Chat,...,Specialization_IT Projects Management,Specialization_International Business,Specialization_Marketing Management,Specialization_Media and Advertising,Specialization_Operations Management,Specialization_Retail Management,Specialization_Rural and Agribusiness,Specialization_Services Excellence,Specialization_Supply Chain Management,Specialization_Travel and Tourism
8003,0.015936,0.029489,0.125,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
218,0.015936,0.082306,0.25,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4171,0.023904,0.034331,0.375,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4037,0.0,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3660,0.0,0.0,0.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Correlation between the variables of the full `leads` frame
leads.corr()

Unnamed: 0,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Source_Direct Traffic,Lead Source_Facebook,Lead Source_Google,...,Specialization_IT Projects Management,Specialization_International Business,Specialization_Marketing Management,Specialization_Media and Advertising,Specialization_Operations Management,Specialization_Retail Management,Specialization_Rural and Agribusiness,Specialization_Services Excellence,Specialization_Supply Chain Management,Specialization_Travel and Tourism
Converted,1.000000,0.005651,0.313338,-0.063362,-0.117563,0.288666,-0.019269,-0.133600,-0.021207,0.020205,...,-0.005689,-0.024789,0.049520,-0.000862,0.031349,-0.018603,0.006964,-0.005142,0.005785,-0.011762
TotalVisits,0.005651,1.000000,0.202551,0.489039,0.267954,-0.208375,-0.043000,0.075252,-0.042052,0.085306,...,0.025182,0.028630,-0.000493,0.038725,0.008929,0.014223,0.068015,0.015114,0.063383,0.064384
Total Time Spent on Website,0.313338,0.202551,1.000000,0.303870,0.275606,-0.249493,-0.061429,0.114088,-0.060945,0.227496,...,0.025526,0.011056,0.052437,0.043356,0.050860,0.024919,0.018767,0.003203,0.045386,0.037867
Page Views Per Visit,-0.063362,0.489039,0.303870,1.000000,0.458168,-0.340185,-0.065739,0.109785,-0.062896,0.183735,...,0.062421,0.057990,0.017799,0.063772,0.030364,0.026099,0.027465,0.015230,0.052972,0.111284
Lead Origin_Landing Page Submission,-0.117563,0.267954,0.275606,0.458168,1.000000,-0.363764,-0.074917,0.508857,-0.071507,0.067225,...,0.133206,0.089105,0.084975,0.093730,0.095849,0.070983,0.050077,0.039433,0.111610,0.094875
Lead Origin_Lead Add Form,0.288666,-0.208375,-0.249493,-0.340185,-0.363764,1.000000,-0.020659,-0.204332,-0.021040,-0.216777,...,-0.038283,-0.035452,0.022421,-0.044041,-0.021911,-0.025339,-0.018872,-0.011155,-0.035065,-0.045397
Lead Origin_Lead Import,-0.019269,-0.043000,-0.061429,-0.065739,-0.074917,-0.020659,1.000000,-0.042082,0.981903,-0.044885,...,-0.013931,0.007085,-0.022226,0.004894,-0.006609,-0.007261,-0.006251,-0.004093,-0.001963,-0.010092
Lead Source_Direct Traffic,-0.133600,0.075252,0.114088,0.109785,0.508857,-0.204332,-0.042082,1.000000,-0.042857,-0.443951,...,0.098312,0.042974,0.053192,0.005889,0.050362,0.022168,0.021596,0.053189,0.093536,0.002757
Lead Source_Facebook,-0.021207,-0.042052,-0.060945,-0.062896,-0.071507,-0.021040,0.981903,-0.042857,1.000000,-0.045713,...,-0.014187,0.006607,-0.022636,0.004424,-0.007097,-0.007395,-0.006366,-0.004169,-0.002431,-0.010278
Lead Source_Google,0.020205,0.085306,0.227496,0.183735,0.067225,-0.216777,-0.044885,-0.443951,-0.045713,1.000000,...,0.009316,-0.006288,0.042857,-0.003722,0.001890,0.021190,-0.037642,-0.027058,-0.027074,-0.053104


In [39]:
# # For Visualisation
# import matplotlib.pyplot as plt
# import seaborn as sns
# %matplotlib inline
# plt.figure(figsize = (20,10))        
# sns.heatmap(leads.corr(),annot = True)

Building Models using RFE

In [40]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Create a LogisticRegression estimator with default settings; it is passed to RFE below
log_reg = LogisticRegression()

In [41]:
# Import RFE
from sklearn.feature_selection import RFE

# Select 20 variables with recursive feature elimination.
# Both arguments are passed by keyword: in scikit-learn >= 1.0
# `n_features_to_select` is keyword-only, so `RFE(log_reg, 20)` raises TypeError.
rfe = RFE(estimator=log_reg, n_features_to_select=20)
rfe = rfe.fit(X_train, y_train)

In [42]:
# List each feature with its RFE selection flag (support_) and its rank (ranking_);
# selected features have support True and rank 1

list(zip(X_train.columns, rfe.support_, rfe.ranking_))

[('TotalVisits', True, 1),
 ('Total Time Spent on Website', True, 1),
 ('Page Views Per Visit', False, 2),
 ('Lead Origin_Landing Page Submission', False, 6),
 ('Lead Origin_Lead Add Form', True, 1),
 ('Lead Origin_Lead Import', False, 51),
 ('Lead Source_Direct Traffic', False, 18),
 ('Lead Source_Facebook', False, 46),
 ('Lead Source_Google', False, 31),
 ('Lead Source_Live Chat', False, 37),
 ('Lead Source_Olark Chat', True, 1),
 ('Lead Source_Organic Search', False, 30),
 ('Lead Source_Pay per Click Ads', False, 36),
 ('Lead Source_Press_Release', False, 47),
 ('Lead Source_Reference', True, 1),
 ('Lead Source_Referral Sites', False, 32),
 ('Lead Source_Social Media', False, 52),
 ('Lead Source_WeLearn', False, 33),
 ('Lead Source_Welingak Website', True, 1),
 ('Lead Source_bing', False, 27),
 ('Lead Source_testone', False, 29),
 ('Do Not Email_Yes', True, 1),
 ('Last Activity_Converted to Lead', False, 3),
 ('Last Activity_Email Bounced', True, 1),
 ('Last Activity_Email Link Clic

In [43]:
# Store the names of the columns selected by RFE (those where rfe.support_ is True) in rfe_col

rfe_col = X_train.columns[rfe.support_]

In [44]:
# Restrict the training features to the columns selected by RFE
X_train = X_train[rfe_col]

In [45]:
# Import statsmodels

import statsmodels.api as sm

In [46]:
# Fit a logistic regression (GLM with a Binomial family) on X_train after adding a constant, and output the summary

X_train_sm = sm.add_constant(X_train)  # adds the intercept column ('const' in the summary)
log_mod = sm.GLM(y_train, X_train_sm, family = sm.families.Binomial())
reg_res = log_mod.fit()
reg_res.summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,4461
Model:,GLM,Df Residuals:,4440
Model Family:,Binomial,Df Model:,20
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2019.4
Date:,"Fri, 15 Nov 2019",Deviance:,4038.7
Time:,16:09:24,Pearson chi2:,4.69e+03
No. Iterations:,22,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.7083,0.607,-1.166,0.244,-1.899,0.482
TotalVisits,8.6822,2.585,3.359,0.001,3.615,13.749
Total Time Spent on Website,4.4003,0.187,23.550,0.000,4.034,4.766
Lead Origin_Lead Add Form,2.9442,1.210,2.433,0.015,0.572,5.316
Lead Source_Olark Chat,1.5531,0.127,12.219,0.000,1.304,1.802
Lead Source_Reference,1.2610,1.233,1.023,0.306,-1.156,3.678
Lead Source_Welingak Website,3.3115,1.573,2.106,0.035,0.229,6.394
Do Not Email_Yes,-1.4072,0.223,-6.310,0.000,-1.844,-0.970
Last Activity_Email Bounced,-1.0803,0.662,-1.631,0.103,-2.378,0.218


In [47]:
# Import 'variance_inflation_factor'

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [48]:
# Tabulate the variance inflation factor of every predictor in X_train,
# rounded to two decimals and ordered from largest to smallest
# NOTE(review): X_train has no intercept column here (the constant is only added
# to X_train_sm), so these look like uncentered VIFs — TODO confirm this is intended

vif = (
    pd.DataFrame({
        'Features': X_train.columns,
        'VIF': [variance_inflation_factor(X_train.values, idx)
                for idx in range(X_train.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

Unnamed: 0,Features,VIF
2,Lead Origin_Lead Add Form,84.21
4,Lead Source_Reference,65.21
5,Lead Source_Welingak Website,20.03
13,What is your current occupation_Unemployed,4.63
8,Last Activity_Had a Phone Conversation,2.45
16,Last Notable Activity_Had a Phone Conversation,2.45
1,Total Time Spent on Website,2.4
7,Last Activity_Email Bounced,2.01
17,Last Notable Activity_Modified,1.77
10,Last Activity_SMS Sent,1.74


Let's drop the variable `Lead Source_Reference`, since its VIF is high (65.21) and its p-value is also high (0.306).

In [49]:
# Remove the 'Lead Source_Reference' dummy from the training predictors
X_train.drop(columns='Lead Source_Reference', inplace=True)

# Re-estimate the binomial GLM on the reduced predictor set (intercept added)
log_mod1 = sm.GLM(
    endog=y_train,
    exog=sm.add_constant(X_train),
    family=sm.families.Binomial(),
)
log_mod1.fit().summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,4461
Model:,GLM,Df Residuals:,4441
Model Family:,Binomial,Df Model:,19
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2019.8
Date:,"Fri, 15 Nov 2019",Deviance:,4039.6
Time:,16:09:25,Pearson chi2:,4.67e+03
No. Iterations:,22,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.7076,0.607,-1.165,0.244,-1.898,0.483
TotalVisits,8.6793,2.585,3.357,0.001,3.612,13.746
Total Time Spent on Website,4.4003,0.187,23.549,0.000,4.034,4.766
Lead Origin_Lead Add Form,4.1668,0.260,16.019,0.000,3.657,4.677
Lead Source_Olark Chat,1.5529,0.127,12.217,0.000,1.304,1.802
Lead Source_Welingak Website,2.0887,1.037,2.014,0.044,0.056,4.121
Do Not Email_Yes,-1.4049,0.223,-6.306,0.000,-1.842,-0.968
Last Activity_Email Bounced,-1.0819,0.662,-1.634,0.102,-2.380,0.216
Last Activity_Had a Phone Conversation,1.4853,0.984,1.510,0.131,-0.443,3.413


In [50]:
# Recompute the VIF table for the predictors still in X_train, largest first
vif = (
    pd.DataFrame({
        'Features': X_train.columns,
        'VIF': [variance_inflation_factor(X_train.values, idx)
                for idx in range(X_train.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

Unnamed: 0,Features,VIF
12,What is your current occupation_Unemployed,4.63
15,Last Notable Activity_Had a Phone Conversation,2.45
7,Last Activity_Had a Phone Conversation,2.45
1,Total Time Spent on Website,2.39
6,Last Activity_Email Bounced,2.01
16,Last Notable Activity_Modified,1.77
9,Last Activity_SMS Sent,1.74
2,Lead Origin_Lead Add Form,1.71
5,Do Not Email_Yes,1.65
13,What is your current occupation_Working Profes...,1.65


In [51]:
# Remove 'Last Notable Activity_Had a Phone Conversation': its reported p-value is 0.999
X_train.drop(columns='Last Notable Activity_Had a Phone Conversation', inplace=True)

In [52]:
# Re-estimate the binomial GLM on the reduced predictor set (intercept added)
log_mod2 = sm.GLM(
    endog=y_train,
    exog=sm.add_constant(X_train),
    family=sm.families.Binomial(),
)
log_mod2.fit().summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,4461
Model:,GLM,Df Residuals:,4442
Model Family:,Binomial,Df Model:,18
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2021.6
Date:,"Fri, 15 Nov 2019",Deviance:,4043.2
Time:,16:09:25,Pearson chi2:,4.67e+03
No. Iterations:,21,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.7046,0.607,-1.160,0.246,-1.895,0.486
TotalVisits,8.7256,2.586,3.374,0.001,3.658,13.794
Total Time Spent on Website,4.3956,0.187,23.529,0.000,4.029,4.762
Lead Origin_Lead Add Form,4.1678,0.260,16.019,0.000,3.658,4.678
Lead Source_Olark Chat,1.5524,0.127,12.212,0.000,1.303,1.802
Lead Source_Welingak Website,2.0879,1.037,2.013,0.044,0.055,4.121
Do Not Email_Yes,-1.4054,0.223,-6.309,0.000,-1.842,-0.969
Last Activity_Email Bounced,-1.0741,0.662,-1.622,0.105,-2.372,0.224
Last Activity_Had a Phone Conversation,2.6597,0.801,3.320,0.001,1.089,4.230


In [53]:
# Remove the 'What is your current occupation_Housewife' dummy from the training predictors
X_train.drop(columns='What is your current occupation_Housewife', inplace=True)

In [54]:
# Re-estimate the binomial GLM on the reduced predictor set (intercept added)
log_mod3 = sm.GLM(
    endog=y_train,
    exog=sm.add_constant(X_train),
    family=sm.families.Binomial(),
)
log_mod3.fit().summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,4461
Model:,GLM,Df Residuals:,4443
Model Family:,Binomial,Df Model:,17
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2023.9
Date:,"Fri, 15 Nov 2019",Deviance:,4047.7
Time:,16:09:25,Pearson chi2:,4.68e+03
No. Iterations:,7,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1300,0.558,-0.233,0.816,-1.224,0.964
TotalVisits,8.5476,2.574,3.321,0.001,3.503,13.592
Total Time Spent on Website,4.3937,0.187,23.527,0.000,4.028,4.760
Lead Origin_Lead Add Form,4.1644,0.260,16.013,0.000,3.655,4.674
Lead Source_Olark Chat,1.5476,0.127,12.185,0.000,1.299,1.797
Lead Source_Welingak Website,2.0878,1.037,2.013,0.044,0.055,4.121
Do Not Email_Yes,-1.4095,0.223,-6.317,0.000,-1.847,-0.972
Last Activity_Email Bounced,-1.0726,0.662,-1.620,0.105,-2.370,0.225
Last Activity_Had a Phone Conversation,2.6604,0.801,3.321,0.001,1.090,4.230


In [55]:
# Recompute the VIF table for the predictors still in X_train, largest first
vif = (
    pd.DataFrame({
        'Features': X_train.columns,
        'VIF': [variance_inflation_factor(X_train.values, idx)
                for idx in range(X_train.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

Unnamed: 0,Features,VIF
11,What is your current occupation_Unemployed,4.61
1,Total Time Spent on Website,2.38
6,Last Activity_Email Bounced,2.01
14,Last Notable Activity_Modified,1.75
9,Last Activity_SMS Sent,1.74
2,Lead Origin_Lead Add Form,1.7
5,Do Not Email_Yes,1.65
0,TotalVisits,1.64
12,What is your current occupation_Working Profes...,1.64
3,Lead Source_Olark Chat,1.53


In [56]:
# Remove 'Last Activity_Email Bounced' (p-value 0.105 in the preceding fit)
X_train.drop(columns='Last Activity_Email Bounced', inplace=True)

In [57]:
# Re-estimate the binomial GLM on the reduced predictor set (intercept added)
log_mod4 = sm.GLM(
    endog=y_train,
    exog=sm.add_constant(X_train),
    family=sm.families.Binomial(),
)
log_mod4.fit().summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,4461
Model:,GLM,Df Residuals:,4444
Model Family:,Binomial,Df Model:,16
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2025.5
Date:,"Fri, 15 Nov 2019",Deviance:,4051.0
Time:,16:09:25,Pearson chi2:,4.66e+03
No. Iterations:,7,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1211,0.558,-0.217,0.828,-1.216,0.973
TotalVisits,8.8196,2.572,3.429,0.001,3.778,13.861
Total Time Spent on Website,4.3927,0.187,23.534,0.000,4.027,4.758
Lead Origin_Lead Add Form,4.1835,0.260,16.074,0.000,3.673,4.694
Lead Source_Olark Chat,1.5462,0.127,12.180,0.000,1.297,1.795
Lead Source_Welingak Website,2.0933,1.038,2.018,0.044,0.060,4.127
Do Not Email_Yes,-1.5399,0.211,-7.295,0.000,-1.954,-1.126
Last Activity_Had a Phone Conversation,2.6677,0.801,3.328,0.001,1.097,4.239
Last Activity_Olark Chat Conversation,-0.6246,0.191,-3.264,0.001,-1.000,-0.250


In [58]:
# Remove the 'What is your current occupation_Working Professional' dummy from the training predictors
X_train.drop(columns='What is your current occupation_Working Professional', inplace=True)

In [59]:
# Re-estimate the binomial GLM on the reduced predictor set (intercept added)
log_mod5 = sm.GLM(
    endog=y_train,
    exog=sm.add_constant(X_train),
    family=sm.families.Binomial(),
)
log_mod5.fit().summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,4461
Model:,GLM,Df Residuals:,4445
Model Family:,Binomial,Df Model:,15
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2026.2
Date:,"Fri, 15 Nov 2019",Deviance:,4052.3
Time:,16:09:25,Pearson chi2:,4.63e+03
No. Iterations:,7,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.5173,0.201,2.575,0.010,0.124,0.911
TotalVisits,8.7289,2.567,3.400,0.001,3.697,13.761
Total Time Spent on Website,4.3928,0.187,23.532,0.000,4.027,4.759
Lead Origin_Lead Add Form,4.1835,0.260,16.077,0.000,3.673,4.694
Lead Source_Olark Chat,1.5457,0.127,12.181,0.000,1.297,1.794
Lead Source_Welingak Website,2.0916,1.038,2.016,0.044,0.058,4.125
Do Not Email_Yes,-1.5394,0.211,-7.291,0.000,-1.953,-1.126
Last Activity_Had a Phone Conversation,2.6716,0.801,3.334,0.001,1.101,4.242
Last Activity_Olark Chat Conversation,-0.6206,0.191,-3.246,0.001,-0.995,-0.246


In [60]:
# Remove the 'Last Notable Activity_Email Bounced' dummy from the training predictors
X_train.drop(columns='Last Notable Activity_Email Bounced', inplace=True)

In [61]:
# Re-estimate the binomial GLM on the reduced predictor set (intercept added)
log_mod6 = sm.GLM(
    endog=y_train,
    exog=sm.add_constant(X_train),
    family=sm.families.Binomial(),
)
log_mod6.fit().summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,4461
Model:,GLM,Df Residuals:,4446
Model Family:,Binomial,Df Model:,14
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2026.6
Date:,"Fri, 15 Nov 2019",Deviance:,4053.1
Time:,16:09:25,Pearson chi2:,4.63e+03
No. Iterations:,7,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.5243,0.201,2.609,0.009,0.130,0.918
TotalVisits,8.6504,2.570,3.366,0.001,3.613,13.688
Total Time Spent on Website,4.3937,0.187,23.540,0.000,4.028,4.759
Lead Origin_Lead Add Form,4.1775,0.260,16.068,0.000,3.668,4.687
Lead Source_Olark Chat,1.5459,0.127,12.183,0.000,1.297,1.795
Lead Source_Welingak Website,2.0860,1.037,2.011,0.044,0.053,4.119
Do Not Email_Yes,-1.4684,0.194,-7.567,0.000,-1.849,-1.088
Last Activity_Had a Phone Conversation,2.6713,0.801,3.334,0.001,1.101,4.242
Last Activity_Olark Chat Conversation,-0.6211,0.191,-3.249,0.001,-0.996,-0.246


In [62]:
# Recompute the VIF table for the predictors still in X_train, largest first
vif = (
    pd.DataFrame({
        'Features': X_train.columns,
        'VIF': [variance_inflation_factor(X_train.values, idx)
                for idx in range(X_train.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

Unnamed: 0,Features,VIF
10,What is your current occupation_Unemployed,3.37
1,Total Time Spent on Website,2.0
11,Last Notable Activity_Modified,1.61
8,Last Activity_SMS Sent,1.59
0,TotalVisits,1.54
2,Lead Origin_Lead Add Form,1.46
3,Lead Source_Olark Chat,1.43
4,Lead Source_Welingak Website,1.31
7,Last Activity_Olark Chat Conversation,1.29
5,Do Not Email_Yes,1.09
