# **Lead Scoring Case Study**

In [141]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

## *Step 1: Data cleaning and preparation*

- Handling categorical variables
    - Mapping categorical variables to integers
    - Dummy variable creation
- Handling outliers & missing values

### Data Importing

In [142]:
# import Leads data
leads = pd.read_csv('data/Leads.csv')
leads.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [143]:
leads.shape

(9240, 37)

In [144]:
# check columns type
leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

In [145]:
# Replace the long survey-style headers with short snake_case names
leads = leads.rename(
    mapper={
        'How did you hear about X Education': 'source',
        'What is your current occupation': 'occupation',
        'What matters most to you in choosing a course': 'course_preference',
        'Total Time Spent on Website': 'total_time_on_website',
        'Receive More Updates About Our Courses': 'receive_updates',
        'Update me on Supply Chain Content': 'supply_chain_updates',
        'Get updates on DM Content': 'dm_content_updates',
        'I agree to pay the amount through cheque': 'cheque_payment_agreement',
        'A free copy of Mastering The Interview': 'free_copy_mastering_interview',
    },
    axis="columns",
)

### 1. Checking column with only one value (Unique value column)

In [146]:
# Names of the columns whose values collapse to a single distinct entry
# (Series.unique() counts NaN as an entry of its own).
uni_column = [col for col in leads.columns if leads[col].unique().size == 1]

In [147]:
# All columns with only one value.
uni_column

['Magazine',
 'receive_updates',
 'supply_chain_updates',
 'dm_content_updates',
 'cheque_payment_agreement']

- Take a look from above, there are some columns that contain only one value. (Single Unique Value)  
- Columns: **Magazine, receive_updates, supply_chain_updates, dm_content_updates, cheque_payment_agreement** 
 
=> Drop all these columns 

In [148]:
leads.drop(uni_column, axis=1, inplace= True)

In [149]:
leads.shape

(9240, 32)

### 1.1. Refine the dataset

In [150]:
# Check the values in the categorical variables
# Each value_counts() below is a bare expression inside the loop body; it is rendered
# only because ast_node_interactivity is set to "all" in the setup cell.
# NOTE(review): with the default IPython setting this loop would presumably print
# nothing — confirm before changing that option.
for i in leads.select_dtypes(include="object").columns:
    leads[i].value_counts(normalize=True)

Prospect ID
571b5c8e-a5b2-4d57-8574-f2ffb06fdeff    0.000108
7927b2df-8bba-4d29-b9a2-b6e0beafe620    0.000108
2a272436-5132-4136-86fa-dcc88c88f482    0.000108
8cc8c611-a219-4f35-ad23-fdfd2656bd8a    0.000108
0cc2df48-7cf4-4e39-9de9-19797f9b38cc    0.000108
                                          ...   
8bf76a52-2478-476b-8618-1688e07874ad    0.000108
9bc8ce93-6144-49e0-9f9d-080fc980f83c    0.000108
2a369e35-ca95-4ca9-9e4f-9d27175aa320    0.000108
af465dfc-7204-4130-9e05-33231863c4b5    0.000108
cfa0128c-a0da-4656-9d47-0aa4e67bf690    0.000108
Name: proportion, Length: 9240, dtype: float64

Lead Origin
Landing Page Submission    0.528788
API                        0.387446
Lead Add Form              0.077706
Lead Import                0.005952
Quick Add Form             0.000108
Name: proportion, dtype: float64

Lead Source
Google               0.311604
Direct Traffic       0.276293
Olark Chat           0.190678
Organic Search       0.125380
Reference            0.058018
Welingak Website     0.015428
Referral Sites       0.013581
Facebook             0.005976
bing                 0.000652
google               0.000543
Click2call           0.000435
Live Chat            0.000217
Social Media         0.000217
Press_Release        0.000217
Pay per Click Ads    0.000109
blog                 0.000109
WeLearn              0.000109
welearnblog_Home     0.000109
youtubechannel       0.000109
testone              0.000109
NC_EDM               0.000109
Name: proportion, dtype: float64

Do Not Email
No     0.920563
Yes    0.079437
Name: proportion, dtype: float64

Do Not Call
No     0.999784
Yes    0.000216
Name: proportion, dtype: float64

Last Activity
Email Opened                    0.376163
SMS Sent                        0.300427
Olark Chat Conversation         0.106490
Page Visited on Website         0.070045
Converted to Lead               0.046843
Email Bounced                   0.035679
Email Link Clicked              0.029222
Form Submitted on Website       0.012696
Unreachable                     0.010178
Unsubscribed                    0.006676
Had a Phone Conversation        0.003283
Approached upfront              0.000985
View in browser link Clicked    0.000657
Email Marked Spam               0.000219
Email Received                  0.000219
Resubscribed to emails          0.000109
Visited Booth in Tradeshow      0.000109
Name: proportion, dtype: float64

Country
India                   0.957663
United States           0.010178
United Arab Emirates    0.007818
Singapore               0.003540
Saudi Arabia            0.003098
United Kingdom          0.002213
Australia               0.001918
Qatar                   0.001475
Hong Kong               0.001033
Bahrain                 0.001033
Oman                    0.000885
France                  0.000885
unknown                 0.000738
Kuwait                  0.000590
South Africa            0.000590
Nigeria                 0.000590
Germany                 0.000590
Canada                  0.000590
Sweden                  0.000443
Italy                   0.000295
Belgium                 0.000295
Ghana                   0.000295
Philippines             0.000295
Netherlands             0.000295
China                   0.000295
Asia/Pacific Region     0.000295
Bangladesh              0.000295
Uganda                  0.000295
Russia                  0.000148
Sri Lanka               0.000148
Ta

Specialization
Select                               0.248911
Finance Management                   0.125096
Human Resource Management            0.108690
Marketing Management                 0.107408
Operations Management                0.064471
Business Administration              0.051653
IT Projects Management               0.046911
Supply Chain Management              0.044732
Banking, Investment And Insurance    0.043322
Media and Advertising                0.026019
Travel and Tourism                   0.026019
International Business               0.022815
Healthcare Management                0.020379
Hospitality Management               0.014612
E-COMMERCE                           0.014355
Retail Management                    0.012817
Rural and Agribusiness               0.009357
E-Business                           0.007306
Services Excellence                  0.005127
Name: proportion, dtype: float64

source
Select                   0.717048
Online Search            0.114887
Word Of Mouth            0.049481
Student of SomeSchool    0.044078
Other                    0.026447
Multiple Sources         0.021612
Advertisements           0.009953
Social Media             0.009527
Email                    0.003697
SMS                      0.003270
Name: proportion, dtype: float64

occupation
Unemployed              0.854962
Working Professional    0.107786
Student                 0.032061
Other                   0.002443
Housewife               0.001527
Businessman             0.001221
Name: proportion, dtype: float64

course_preference
Better Career Prospects      0.999541
Flexibility & Convenience    0.000306
Other                        0.000153
Name: proportion, dtype: float64

Search
No     0.998485
Yes    0.001515
Name: proportion, dtype: float64

Newspaper Article
No     0.999784
Yes    0.000216
Name: proportion, dtype: float64

X Education Forums
No     0.999892
Yes    0.000108
Name: proportion, dtype: float64

Newspaper
No     0.999892
Yes    0.000108
Name: proportion, dtype: float64

Digital Advertisement
No     0.999567
Yes    0.000433
Name: proportion, dtype: float64

Through Recommendations
No     0.999242
Yes    0.000758
Name: proportion, dtype: float64

Tags
Will revert after reading the email                  0.351962
Ringing                                              0.204349
Interested in other courses                          0.087141
Already a student                                    0.078988
Closed by Horizzon                                   0.060812
switched off                                         0.040768
Busy                                                 0.031595
Lost to EINS                                         0.029727
Not doing further education                          0.024631
Interested  in full time MBA                         0.019874
Graduation in progress                               0.018855
invalid number                                       0.014099
Diploma holder (Not Eligible)                        0.010702
wrong number given                                   0.007984
opp hangup                                           0.005606
number not provided                                  0.004586
in 

Lead Quality
Might be             0.348759
Not Sure             0.244131
High in Relevance    0.142410
Worst                0.134362
Low in Relevance     0.130338
Name: proportion, dtype: float64

Lead Profile
Select                         0.634819
Potential Lead                 0.246976
Other Leads                    0.074567
Student of SomeSchool          0.036901
Lateral Student                0.003675
Dual Specialization Student    0.003062
Name: proportion, dtype: float64

City
Mumbai                         0.412020
Select                         0.287596
Thane & Outskirts              0.096164
Other Cities                   0.087724
Other Cities of Maharashtra    0.058440
Other Metro Cities             0.048593
Tier II Cities                 0.009463
Name: proportion, dtype: float64

Asymmetrique Activity Index
02.Medium    0.764436
01.High      0.163481
03.Low       0.072083
Name: proportion, dtype: float64

Asymmetrique Profile Index
02.Medium    0.555157
01.High      0.438670
03.Low       0.006173
Name: proportion, dtype: float64

free_copy_mastering_interview
No     0.687446
Yes    0.312554
Name: proportion, dtype: float64

Last Notable Activity
Modified                        0.368723
Email Opened                    0.305952
SMS Sent                        0.235065
Page Visited on Website         0.034416
Olark Chat Conversation         0.019805
Email Link Clicked              0.018723
Email Bounced                   0.006494
Unsubscribed                    0.005087
Unreachable                     0.003463
Had a Phone Conversation        0.001515
Email Marked Spam               0.000216
Approached upfront              0.000108
Resubscribed to emails          0.000108
View in browser link Clicked    0.000108
Form Submitted on Website       0.000108
Email Received                  0.000108
Name: proportion, dtype: float64

"Lead Source" columns is quite messy and need to be cleaned.

In [151]:
# Normalise "Lead Source" in one pass:
#   1. fold everything to lowercase (the raw data mixes e.g. "Google" and "google"),
#   2. collapse the different spellings of the same source into one label.
leads['Lead Source'] = (
    leads['Lead Source']
    .str.lower()
    .replace(r'welearn.*', 'welearn', regex=True)
    .replace(r'refer.*', 'referral', regex=True)
)

Many of the categorical variables have a level called 'Select' which needs to be handled because it is as good as a null value

In [152]:
# treat "Select" as NULL
leads.replace('Select', np.nan, inplace=True)

The "Lead Profile" column is labeled with either "Potential Lead" or meaningless labels such as "Other". We can take advantage of this "Potential Lead" label!

In [153]:
# 1 when the profile is exactly "Potential Lead", 0 for every other label (missing included)
leads['Is_Potential_Lead'] = leads['Lead Profile'].eq("Potential Lead").astype(int)
leads['Is_Potential_Lead'].value_counts(normalize=True)

Is_Potential_Lead
0    0.825433
1    0.174567
Name: proportion, dtype: float64

In [154]:
# column "Country" is highly skewed toward "India", and column "Prospect ID" is not needed
leads.drop(columns=["Country", "Prospect ID"], inplace=True)

### 2. Handling with missing values

In [155]:
# Percentage of missing entries per column
missing_rate = leads.isna().sum().mul(100).div(leads.shape[0])

# Show only the columns that actually have gaps, worst first
missing_rate.loc[missing_rate.ne(0)].sort_values(ascending=False)

source                         78.463203
Lead Profile                   74.188312
Lead Quality                   51.590909
Asymmetrique Profile Index     45.649351
Asymmetrique Activity Index    45.649351
Asymmetrique Activity Score    45.649351
Asymmetrique Profile Score     45.649351
City                           39.707792
Specialization                 36.580087
Tags                           36.287879
course_preference              29.318182
occupation                     29.112554
TotalVisits                     1.482684
Page Views Per Visit            1.482684
Last Activity                   1.114719
Lead Source                     0.389610
dtype: float64

#### Check high missing counts columns

In [156]:
# 'source' is missing for the large majority of rows, so it is removed entirely
leads = leads.drop(columns=['source'])

In [157]:
# "Lead Profile" has already been condensed into Is_Potential_Lead, so the raw column can go
leads = leads.drop(columns=['Lead Profile'])

#### Check "Lead quality" column with ~50% missing values

In [158]:
leads["Lead Quality"].value_counts(normalize=True)

Lead Quality
Might be             0.348759
Not Sure             0.244131
High in Relevance    0.142410
Worst                0.134362
Low in Relevance     0.130338
Name: proportion, dtype: float64

- The levels in this column are not clearly defined. One would expect an ordered scale such as **best**, **normal**, **worst** or **high**, **average**, **low**, but here the labels are inconsistent (e.g. "Might be", "Not Sure").  
=> This column will be dropped because of its high share of missing values and its low quality.

In [159]:
leads = leads.drop(columns=["Lead Quality"])

#### Index and score columns with almost 50% missing value

In [160]:
# The four Asymmetrique columns: two numeric scores and their two categorical indexes
asym_columns = [
    "Asymmetrique Profile Score",
    "Asymmetrique Activity Score",
    "Asymmetrique Profile Index",
    "Asymmetrique Activity Index",
]

# Show the relative frequency of every level in each of them
for col in asym_columns:
    leads[col].value_counts(normalize=True)

Asymmetrique Profile Score
15.0    0.350259
18.0    0.213262
16.0    0.119275
17.0    0.115293
20.0    0.061330
19.0    0.048785
14.0    0.045002
13.0    0.040621
12.0    0.004381
11.0    0.001792
Name: proportion, dtype: float64

Asymmetrique Activity Score
14.0    0.352648
15.0    0.257467
13.0    0.154321
16.0    0.092991
17.0    0.069494
12.0    0.039028
11.0    0.018917
10.0    0.011350
9.0     0.001792
18.0    0.000996
8.0     0.000796
7.0     0.000199
Name: proportion, dtype: float64

Asymmetrique Profile Index
02.Medium    0.555157
01.High      0.438670
03.Low       0.006173
Name: proportion, dtype: float64

Asymmetrique Activity Index
02.Medium    0.764436
01.High      0.163481
03.Low       0.072083
Name: proportion, dtype: float64

- In "Asymmetrique Profile Score" and "Asymmetrique Activity Score" columns, the difference between all values is not prominent.  
=> Imputing missing values with the median values.

In [161]:
# Fill the gaps in both score columns with that column's own median.
# The result is assigned back to the frame on purpose: calling
# fillna(..., inplace=True) on leads[col] is a chained in-place operation that only
# updates a temporary object once pandas Copy-on-Write is active, leaving `leads` unchanged.
for score_col in ("Asymmetrique Activity Score", "Asymmetrique Profile Score"):
    leads[score_col] = leads[score_col].fillna(leads[score_col].median())

- "Asymmetrique Profile Index" and "Asymmetrique Activity Index" columns are categorical and the proportions of values that occupy the highest part are over 50%.  
=> Replace missing values with the mode of the column.

In [162]:
# Fill the gaps in both categorical index columns with that column's most frequent level.
# Assigned back rather than using fillna(..., inplace=True) on leads[col], which is a
# chained in-place call and does not modify `leads` under pandas Copy-on-Write.
for index_col in ("Asymmetrique Profile Index", "Asymmetrique Activity Index"):
    leads[index_col] = leads[index_col].fillna(leads[index_col].mode()[0])

#### All columns with moderate missing rates (roughly 29%-40%)

In [163]:
moderate_missing = ["City", "Specialization", "Tags", "course_preference", "occupation"]

In [173]:
# Relative frequency of every level in the moderately-missing columns.
# NOTE(review): the saved output of this cell already lists 'unknown' and 'Other'
# levels, which are only created by cells further down (the fillna('unknown') cell and
# handle_redundancies), and its execution count (173) is out of sequence. It appears to
# have been re-run after those cells — re-run the notebook top to bottom to refresh it.
for i in moderate_missing:
    leads[i].value_counts(normalize=True)

City
unknown              0.393983
Mumbai               0.350121
Other Cities         0.165638
Thane & Outskirts    0.082103
Tier II Cities       0.008155
Name: proportion, dtype: float64

Specialization
unknown                      0.361693
Other                        0.294688
Finance Management           0.105687
Human Resource Management    0.092242
Marketing Management         0.090699
Operations Management        0.054992
Name: proportion, dtype: float64

Tags
unknown                                0.366652
Will revert after reading the email    0.222173
Other                                  0.173022
Ringing                                0.130813
Interested in other courses            0.056094
Already a student                      0.051245
Name: proportion, dtype: float64

course_preference
Better Career Prospects      0.702006
unknown                      0.297774
Flexibility & Convenience    0.000110
Other                        0.000110
Name: proportion, dtype: float64

occupation
Unemployed              0.603482
unknown                 0.295680
Working Professional    0.074609
Student                 0.022702
Other                   0.001653
Housewife               0.000992
Businessman             0.000882
Name: proportion, dtype: float64

The amount of null values (originally recorded as "Select") is too high for us to remove the affected rows (it would cost a lot of data), and we cannot impute a median for categorical columns. So let's convert them to "unknown" for now, and keep in mind to remove them in our future models.

In [165]:
# replace missing values with "unknown"
leads[moderate_missing] = leads[moderate_missing].fillna('unknown')

In [166]:
# too much "Other" labels in City variable: fold the two extra "Other ..." levels into
# "Other Cities".
# Assigned back rather than using replace(..., inplace=True) on leads["City"], which is a
# chained in-place call and does not modify `leads` under pandas Copy-on-Write.
leads["City"] = leads["City"].replace({
    "Other Cities of Maharashtra": "Other Cities",
    "Other Metro Cities": "Other Cities",
})

#### Columns with Low Missing Rates (<2%)

In [167]:
# Removing rows with missing values
leads.dropna(subset=["TotalVisits", "Page Views Per Visit", "Last Activity", "Lead Source"], inplace=True)

### 3. Handling redundancy in categorical columns

In [176]:
# check the values of categorical variables
# NOTE(review): the saved output below already shows the grouped 'Other' level for
# "Lead Source", "Last Activity" and "Last Notable Activity", which is produced by
# handle_redundancies in the cells that follow; this cell (execution count 176) appears
# to have been re-run after them. On a fresh top-to-bottom run the output here should
# still list the ungrouped levels — confirm by re-running.
for i in leads.select_dtypes(include="object").columns:
    leads[i].value_counts(normalize=True)

Lead Origin
Landing Page Submission    0.538351
API                        0.394313
Lead Add Form              0.064029
Lead Import                0.003306
Name: proportion, dtype: float64

Lead Source
google            0.316619
direct traffic    0.280251
olark chat        0.193189
organic search    0.127177
referral          0.062596
Other             0.020168
Name: proportion, dtype: float64

Last Activity
Email Opened               0.378223
SMS Sent                   0.299317
Other                      0.144809
Olark Chat Conversation    0.107119
Page Visited on Website    0.070531
Name: proportion, dtype: float64

Specialization
unknown                      0.361693
Other                        0.294688
Finance Management           0.105687
Human Resource Management    0.092242
Marketing Management         0.090699
Operations Management        0.054992
Name: proportion, dtype: float64

occupation
Unemployed              0.603482
unknown                 0.295680
Working Professional    0.074609
Student                 0.022702
Other                   0.001653
Housewife               0.000992
Businessman             0.000882
Name: proportion, dtype: float64

course_preference
Better Career Prospects      0.702006
unknown                      0.297774
Flexibility & Convenience    0.000110
Other                        0.000110
Name: proportion, dtype: float64

Tags
unknown                                0.366652
Will revert after reading the email    0.222173
Other                                  0.173022
Ringing                                0.130813
Interested in other courses            0.056094
Already a student                      0.051245
Name: proportion, dtype: float64

City
unknown              0.393983
Mumbai               0.350121
Other Cities         0.165638
Thane & Outskirts    0.082103
Tier II Cities       0.008155
Name: proportion, dtype: float64

Asymmetrique Activity Index
02.Medium    0.869738
01.High      0.090368
03.Low       0.039894
Name: proportion, dtype: float64

Asymmetrique Profile Index
02.Medium    0.759202
01.High      0.237382
03.Low       0.003416
Name: proportion, dtype: float64

Last Notable Activity
Modified        0.360040
Email Opened    0.311109
SMS Sent        0.237161
Other           0.091691
Name: proportion, dtype: float64

- There are some categorical columns with redundant values: **"Lead Source", "Last Activity", "Specialization", "Tags", "Last Notable Activity"**

In [169]:
def handle_redundancies(column, threshold=0.05):
    """Collapse the rare levels of ``leads[column]`` into a single 'Other' level.

    Operates on the notebook-global ``leads`` DataFrame and overwrites the column.

    Parameters
    ----------
    column : str
        Name of the categorical column in ``leads`` to clean up.
    threshold : float, default 0.05
        A level is kept only when its share of the non-missing values is strictly
        greater than this value; every other entry (missing ones included) becomes
        'Other'.

    Returns
    -------
    None
    """
    proportions = leads[column].value_counts(normalize=True)

    # Levels frequent enough to keep under their own name
    significant_categories = proportions[proportions > threshold].index

    # Assign the result back to the frame instead of calling
    # leads[column].replace(..., inplace=True): that chained in-place call only
    # changes a temporary object once pandas Copy-on-Write is active.
    keep_mask = leads[column].isin(significant_categories)
    leads[column] = leads[column].where(keep_mask, 'Other')

In [170]:
reduntant_col = ["Lead Source", "Last Activity", "Specialization", "Tags", "Last Notable Activity"]

In [171]:
for i in reduntant_col:
    handle_redundancies(i)

### 4. Converting some binary variables (Yes/No) to 0/1

In [172]:
# Every column with exactly two distinct values is treated as a binary flag
bi_cols = [col for col in leads.columns if leads[col].unique().size == 2]
# 'Converted' and 'Is_Potential_Lead' are already stored as 1/0, so they need no mapping
for already_encoded in ('Converted', 'Is_Potential_Lead'):
    bi_cols.remove(already_encoded)
bi_cols

['Do Not Email',
 'Do Not Call',
 'Search',
 'Newspaper Article',
 'X Education Forums',
 'Newspaper',
 'Digital Advertisement',
 'Through Recommendations',
 'free_copy_mastering_interview']

In [175]:
# Defining the map function
def binary_map(x):
    """Encode a Yes/No Series as 1/0.

    Parameters
    ----------
    x : pandas.Series
        Values are expected to be 'Yes' / 'No' (or already 1 / 0).

    Returns
    -------
    pandas.Series
        1 for 'Yes', 0 for 'No'; any other value becomes NaN.

    Notes
    -----
    1 and 0 map to themselves so that re-running the encoding cell on columns that
    were already converted does not silently turn them into NaN.
    """
    return x.map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

# Applying the function to the binary columns
leads[bi_cols] = leads[bi_cols].apply(binary_map)

In [177]:
# Both "Asymmetrique ... Index" columns hold the same three labels; encode them as integers
index_mapping = {'01.High': 1, '02.Medium': 2, '03.Low': 3}

for index_col in ('Asymmetrique Activity Index', 'Asymmetrique Profile Index'):
    leads[index_col] = leads[index_col].map(index_mapping)

In [178]:
leads['Asymmetrique Activity Index'].value_counts()
leads['Asymmetrique Profile Index'].value_counts()

Asymmetrique Activity Index
2    7892
1     820
3     362
Name: count, dtype: int64

Asymmetrique Profile Index
2    6889
1    2154
3      31
Name: count, dtype: int64

## *Step 2: EDA*

### 5. Visualization

#### 5.1. Univariate analysis

In [None]:
leads["Lead Origin"].value_counts(normalize=True).plot.pie(autopct='%1.2f%%')
plt.show()

=> The origin identifier mainly is from "Landing Page Submission" and "API"

In [None]:
# One pie chart per Asymmetrique index column, each with its own title
for index_col in ("Asymmetrique Activity Index", "Asymmetrique Profile Index"):
    leads[index_col].value_counts(normalize=True).plot.pie(autopct='%1.2f%%')
    plt.title(f"The proportion of {index_col}")
    plt.show()

In [None]:
leads["Lead Source"].value_counts(normalize=True).plot.barh()
plt.show()

=> Google accounts for the highest rate among all sources of the lead

In [None]:
leads["Last Activity"].value_counts(normalize=True).plot.barh()
plt.show()

##### Visualizing the binary columns

In [None]:
# 3x3 grid of pie charts, one panel per binary column
plt.figure(figsize=(20, 12))
for panel, flag_col in enumerate(bi_cols, start=1):
    plt.subplot(3, 3, panel)
    shares = leads[flag_col].value_counts(normalize=True)
    plt.pie(shares, labels=shares.index, autopct='%1.1f%%')
    plt.title(flag_col, color='blue')
plt.show()

- Almost all customers are fine with being contacted about the course: over 90% answered "No" to "Do Not Email", and nearly 100% answered "No" to "Do Not Call".
- However, most advertisement channels do not appear to be effective, because almost 100% of customers said "No" to having seen an ad on each of them. 
Only in the "free_copy_mastering_interview" field do 31.8% of customers want to receive a free copy of 'Mastering the Interview'.

#### 5.2. Bivariate analysis

In [None]:
leads.columns

In [None]:
leads.groupby('occupation')["total_time_on_website"].mean().plot.bar()
plt.show()

- People who spent the most time on website are businessman or working Professional. 

=> These groups of people are potential for this course.

In [None]:
leads.groupby('Specialization')["TotalVisits"].mean().plot.barh()
plt.show()

Customers whose specialization is "Travel and Tourism", "Supply Chain Management" or "Retail Management" look promising for this course because of their high average number of visits.

In [None]:
leads.groupby(['City'])['Page Views Per Visit'].aggregate(['mean','median']).plot.bar()
plt.show()

The chart shows the relationship between mean and median values across different city categories:

- Mumbai and Tier II Cities have equal mean and median values, indicating symmetric data distributions.
- Other Cities has a slightly higher mean than median, suggesting a right-skewed distribution.
- Thane & Outskirts also shows similar mean and median values, indicating a symmetric distribution.

#### 5.3. Multivariate analysis

In [None]:
leads.dtypes

In [593]:
# Numeric (float64 / int64) columns that take more than three distinct values
num_col = [
    col
    for col in leads.columns
    if leads[col].unique().dtype in ("float64", "int64") and leads[col].unique().size > 3
]


In [None]:
num_col

In [None]:
sns.pairplot(leads[num_col])
plt.show()

In [None]:
# Pairwise correlation between the columns collected in num_col, drawn as an annotated heatmap.
# NOTE(review): num_col only keeps numeric columns with more than 3 distinct values, so a
# two-valued column such as "Converted" is presumably not part of this matrix — confirm
# against the interpretation written below the chart.
days_corr=leads[num_col].corr()
sns.heatmap(days_corr, annot=True, cmap='Reds')
plt.title("Correlation between all numerical columns")

- Converted shows a moderate positive correlation with total_time_on_website (0.36).
- Converted also has a moderate positive correlation with TotalVisits (0.32).
- TotalVisits and Page Views Per Visit have a moderate positive correlation (0.51).

In [None]:
# "Lead Profile" was removed from `leads` during cleaning, so pivoting on it directly
# raises a KeyError. Re-read just that column from the raw file, apply the same
# 'Select' -> NaN treatment, and align it on the row index (the cleaning steps above
# never reset the index, so labels still match the raw file).
lead_profile = (
    pd.read_csv('data/Leads.csv', usecols=['Lead Profile'])['Lead Profile']
    .replace('Select', np.nan)
    .reindex(leads.index)
)

# Mean of the 0/1 free-copy flag for every (Lead Profile, City) pair
res = pd.pivot_table(
    data=leads.assign(**{'Lead Profile': lead_profile}),
    index="Lead Profile",
    columns="City",
    values="free_copy_mastering_interview",
)
sns.heatmap(res, annot= True, cmap="Blues", center= 0.1)
plt.title("Lead Profile vs. City through free_copy_mastering_interview")
plt.show()

- Student of SomeSchool has high average free-copy rates across all city categories (the cells are mean values of the 0/1 `free_copy_mastering_interview` flag, not correlations), especially in Mumbai (0.79) and Thane & Outskirts (0.64).
- Potential Lead also shows relatively high rates in Thane & Outskirts (0.78) and Tier II Cities (0.78).

=> This indicates that certain lead profiles, particularly "Student of SomeSchool," are more likely to be associated with receiving a free copy in various city categories.

### 6. Dummy variables

In [598]:
# Dummy columns contains binary columns
# Take a copy: a plain `dummy_cols = bi_cols` makes both names point at the same list,
# so the appends in the next cell would also grow bi_cols.
dummy_cols = list(bi_cols)

In [599]:
# The two integer-coded index columns are dummy-encoded together with the binary flags
dummy_cols.extend(["Asymmetrique Activity Index", "Asymmetrique Profile Index"])

In [None]:
dummy_cols

- When creating dummy variables, the `drop_first=True` parameter is often used to avoid the **dummy variable trap** and to simplify the resulting dataset.

In [601]:
# Get the dummy variables for categorical variables.
# Every column listed in dummy_cols is one-hot encoded; drop_first=True drops the first
# level of each column, and .astype(int) stores the indicators as 0/1 integers.
# NOTE(review): the Yes/No columns were already mapped to 1/0 earlier, so for those this
# presumably just yields a single "<col>_1" indicator each — confirm that is intended.
status = pd.get_dummies(leads[dummy_cols], columns=dummy_cols, drop_first = True).astype(int)

In [None]:
status.head()

In [603]:
leads = pd.concat([leads, status], axis = 1)

In [604]:
leads.drop(dummy_cols, axis=1, inplace=True)

In [None]:
leads.head()

In [None]:
leads.info()

In [None]:
leads.head()

In [None]:
for i in leads.columns:
    leads[i].value_counts()

In [None]:
leads.columns

=> These two columns 'Prospect ID', 'Lead Number': all values are unique, so creating dummy variables is not necessary
So I drop both.

In [610]:
# 'Prospect ID' was already removed during cleaning, so dropping it again raises a
# KeyError; errors='ignore' skips labels that are no longer present and keeps this
# cell safe to re-run.
leads = leads.drop(columns=['Prospect ID', 'Lead Number'], errors='ignore')

There are a lot of values in all these columns "Country", "Specialization", "Tags", so creating dummy is not effective for all these columns
=> Dropping all these columns

In [611]:
# "Country" was already removed during cleaning, so dropping it again raises a KeyError;
# errors='ignore' skips labels that are no longer present and keeps this cell safe to re-run.
leads = leads.drop(columns=["Country", "Specialization", "Tags"], errors='ignore')

In [612]:
# Total Visits
# Group by the binned values.
# include_lowest=True closes the first bin on the left so rows with 0 visits fall into
# "Very Low"; with pd.cut's default left-open bins they would become NaN.
leads['TotalVisits_group'] = pd.cut(
    leads['TotalVisits'],
    bins=[0, 10, 30, 100, 300],
    labels=["Very Low", "Low", "Moderate", "High"],
    include_lowest=True,
)
leads.drop(['TotalVisits'], axis=1, inplace=True)

In [613]:
# total_time_on_website
# Group by the binned values.
# include_lowest=True closes the first bin on the left so rows with 0 time on the
# website fall into 'Very Low'; with pd.cut's default left-open bins they would become NaN.
leads['total_time_group'] = pd.cut(
    leads['total_time_on_website'],
    bins=[0, 60, 120, 300, float('inf')],
    labels=['Very Low', 'Low', 'Moderate', 'High'],
    include_lowest=True,
)
leads.drop(['total_time_on_website'], axis=1, inplace=True)

In [614]:
# Page Views Per Visit
# Group by the binned values.
# include_lowest=True closes the first bin on the left so rows with 0 page views fall
# into 'Very Low'; with pd.cut's default left-open bins they would become NaN.
leads['Views_per_visit_group'] = pd.cut(
    leads["Page Views Per Visit"],
    bins=[0, 2, 4, 6, float('inf')],
    labels=['Very Low', 'Low', 'Moderate', 'High'],
    include_lowest=True,
)
leads.drop(["Page Views Per Visit"], axis=1, inplace=True)

In [615]:
# Asymmetrique Activity Score
# Bucket the score into four ordered bands, then discard the raw column
leads['Asymmetrique Activity Score Group'] = pd.cut(
    leads["Asymmetrique Activity Score"],
    bins=[0, 8, 11, 14, np.inf],
    labels=['Very Low', 'Low', 'Moderate', 'High'],
)
leads = leads.drop(columns=["Asymmetrique Activity Score"])

In [616]:
# Asymmetrique Profile Score
# Bucket the score into four ordered bands, then discard the raw column
leads['Asymmetrique Profile Score Group'] = pd.cut(
    leads["Asymmetrique Profile Score"],
    bins=[0, 11, 14, 17, np.inf],
    labels=['Very Low', 'Low', 'Moderate', 'High'],
)
leads = leads.drop(columns=["Asymmetrique Profile Score"])

In [None]:
for i in leads.columns:
    leads[i].value_counts()

In [618]:
# Columns that still hold labels rather than numbers and therefore need dummy encoding.
# "category" is included on purpose: the *_group columns created with pd.cut above have
# categorical dtype, so a check for "object" alone skips them and leaves them unencoded.
object_cols = leads.select_dtypes(include=["object", "category"]).columns.tolist()

In [None]:
object_cols

In [620]:
# Get the dummy variables for categorical variables.
status = pd.get_dummies(leads[object_cols], columns=object_cols, drop_first = True).astype(int)

In [621]:
leads = pd.concat([leads, status], axis = 1)
leads.drop(object_cols, axis=1, inplace=True)

In [None]:
leads.head()

## *Step 3: Test-train split and scaling*

## *Step 4: Model Building*

**Methods on Model Building:**
- Feature elimination based on correlations
- Feature selection using Recursive Feature Elimination
- Manual feature elimination (using p-values and VIFs)

## *Step 5: Model Evaluation*

**Metrics for Model Evaluation:**
- Accuracy
- Sensitivity and Specificity
- Optimal cut-off using ROC curve
- Precision and Recall

## *Step 6: Predictions on the test set*