<a href="https://colab.research.google.com/github/jmohsbeck1/jpmc_mle/blob/machine_learning_bootcamp/Data_Leads_Lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Leads Data ML model lab
# Machine learning bootcamp

In [1]:
import pandas as pd
import numpy as np
import math

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import display 

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
!pip install kaggle

import os

os.environ['KAGGLE_USERNAME'] = 'johnmohsbeck'
os.environ['KAGGLE_KEY'] = 'd1f230cf1aba75ec936a726ca195c7a1'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import kaggle

kaggle.api.dataset_download_files('ashydv/leads-dataset', path='./data', unzip=True)

## Read the Dataset

https://www.kaggle.com/datasets/ashydv/leads-dataset

In [4]:
df = pd.read_csv('./data/Leads.csv')

In [5]:
df.shape

(9240, 37)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

## Transpose the Dataset to Make it Wide (not long)

In [7]:
df.head().T

Unnamed: 0,0,1,2,3,4
Prospect ID,7927b2df-8bba-4d29-b9a2-b6e0beafe620,2a272436-5132-4136-86fa-dcc88c88f482,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,3256f628-e534-4826-9d63-4a8b88782852
Lead Number,660737,660728,660727,660719,660681
Lead Origin,API,API,Landing Page Submission,Landing Page Submission,Landing Page Submission
Lead Source,Olark Chat,Organic Search,Direct Traffic,Direct Traffic,Google
Do Not Email,No,No,No,No,No
Do Not Call,No,No,No,No,No
Converted,0,0,1,0,1
TotalVisits,0.0,5.0,2.0,1.0,2.0
Total Time Spent on Website,0,674,1532,305,1428
Page Views Per Visit,0.0,2.5,2.0,1.0,1.0


Data Types

Column Names and Naming Conventions

In [8]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

string_columns = list(df.dtypes[df.dtypes == 'object'].index)

In [9]:
for col in string_columns:
 df[col] = df[col].str.lower().str.replace(' ', '_')

In [10]:
df.dtypes

prospect_id                                       object
lead_number                                        int64
lead_origin                                       object
lead_source                                       object
do_not_email                                      object
do_not_call                                       object
converted                                          int64
totalvisits                                      float64
total_time_spent_on_website                        int64
page_views_per_visit                             float64
last_activity                                     object
country                                           object
specialization                                    object
how_did_you_hear_about_x_education                object
what_is_your_current_occupation                   object
what_matters_most_to_you_in_choosing_a_course     object
search                                            object
magazine                       

In [11]:
df.nunique()

prospect_id                                      9240
lead_number                                      9240
lead_origin                                         5
lead_source                                        20
do_not_email                                        2
do_not_call                                         2
converted                                           2
totalvisits                                        41
total_time_spent_on_website                      1731
page_views_per_visit                              114
last_activity                                      17
country                                            38
specialization                                     19
how_did_you_hear_about_x_education                 10
what_is_your_current_occupation                     6
what_matters_most_to_you_in_choosing_a_course       3
search                                              2
magazine                                            1
newspaper_article           

Drop unnecessary features

In [12]:
df.drop(['prospect_id','lead_number'], axis=1, inplace=True)

In [13]:
df.drop(['magazine','receive_more_updates_about_our_courses', 'update_me_on_supply_chain_content', 'get_updates_on_dm_content', 'i_agree_to_pay_the_amount_through_cheque' ], axis=1, inplace=True)

In [14]:
df.drop(['asymmetrique_activity_index', 'asymmetrique_profile_index', 'asymmetrique_activity_score', 'asymmetrique_profile_score'], axis=1, inplace=True)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 26 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   lead_origin                                    9240 non-null   object 
 1   lead_source                                    9204 non-null   object 
 2   do_not_email                                   9240 non-null   object 
 3   do_not_call                                    9240 non-null   object 
 4   converted                                      9240 non-null   int64  
 5   totalvisits                                    9103 non-null   float64
 6   total_time_spent_on_website                    9240 non-null   int64  
 7   page_views_per_visit                           9103 non-null   float64
 8   last_activity                                  9137 non-null   object 
 9   country                                        6779 

Check for duplicates

In [16]:
duplicates = df.duplicated()

In [17]:
duplicates.sum()

1465

In [18]:
df.drop_duplicates(keep='first', inplace=True)

In [19]:
df.shape

(7775, 26)

Check for missing values

In [20]:
df.isna().sum()

lead_origin                                         0
lead_source                                        29
do_not_email                                        0
do_not_call                                         0
converted                                           0
totalvisits                                       136
total_time_spent_on_website                         0
page_views_per_visit                              136
last_activity                                     102
country                                           998
specialization                                    655
how_did_you_hear_about_x_education               1423
what_is_your_current_occupation                  1906
what_matters_most_to_you_in_choosing_a_course    1925
search                                              0
newspaper_article                                   0
x_education_forums                                  0
newspaper                                           0
digital_advertisement       

In [21]:
df.dropna(inplace=True)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3474 entries, 2 to 9239
Data columns (total 26 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   lead_origin                                    3474 non-null   object 
 1   lead_source                                    3474 non-null   object 
 2   do_not_email                                   3474 non-null   object 
 3   do_not_call                                    3474 non-null   object 
 4   converted                                      3474 non-null   int64  
 5   totalvisits                                    3474 non-null   float64
 6   total_time_spent_on_website                    3474 non-null   int64  
 7   page_views_per_visit                           3474 non-null   float64
 8   last_activity                                  3474 non-null   object 
 9   country                                        3474 

In [23]:
df.nunique()

lead_origin                                         3
lead_source                                        12
do_not_email                                        2
do_not_call                                         2
converted                                           2
totalvisits                                        34
total_time_spent_on_website                      1465
page_views_per_visit                               91
last_activity                                      16
country                                            28
specialization                                     19
how_did_you_hear_about_x_education                 10
what_is_your_current_occupation                     6
what_matters_most_to_you_in_choosing_a_course       2
search                                              2
newspaper_article                                   2
x_education_forums                                  1
newspaper                                           2
digital_advertisement       

In [24]:
df.drop(['x_education_forums'], axis=1, inplace=True)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3474 entries, 2 to 9239
Data columns (total 25 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   lead_origin                                    3474 non-null   object 
 1   lead_source                                    3474 non-null   object 
 2   do_not_email                                   3474 non-null   object 
 3   do_not_call                                    3474 non-null   object 
 4   converted                                      3474 non-null   int64  
 5   totalvisits                                    3474 non-null   float64
 6   total_time_spent_on_website                    3474 non-null   int64  
 7   page_views_per_visit                           3474 non-null   float64
 8   last_activity                                  3474 non-null   object 
 9   country                                        3474 

In [25]:
df.to_csv("data_leads.csv")

################################################################################


Split the Data for Testing and Training

In [27]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

Train, Test, Validate

In [28]:
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

y_train = df_train.converted.values
y_val = df_val.converted.values

del df_train['converted']
del df_val['converted']

Exploratory Data Analysis

In [29]:
df_train_full.isnull().sum()

lead_origin                                      0
lead_source                                      0
do_not_email                                     0
do_not_call                                      0
converted                                        0
totalvisits                                      0
total_time_spent_on_website                      0
page_views_per_visit                             0
last_activity                                    0
country                                          0
specialization                                   0
how_did_you_hear_about_x_education               0
what_is_your_current_occupation                  0
what_matters_most_to_you_in_choosing_a_course    0
search                                           0
newspaper_article                                0
newspaper                                        0
digital_advertisement                            0
through_recommendations                          0
tags                           

Validate the Distribution of the Target Variable

In [30]:
df_train_full.converted.value_counts()

1    1456
0    1323
Name: converted, dtype: int64

In [31]:
stopped = 1456/(1456 + 1323)
print("percentage of customers STOPPED using the services: ", round(stopped, 5))

percentage of customers STOPPED using the services:  0.52393


Compute the MEAN of the Target Variable

In [32]:
global_mean = df_train_full.converted.mean()
round(global_mean, 3)


0.524

## We have an Balanced Dataset

# Categorical & Numerical Columns Require Different Treatments

## categorical:  which will contain the names of categorical variables
## numerical: will have the names of numerical variables

In [None]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
 'phoneservice', 'multiplelines', 'internetservice',
 'onlinesecurity', 'onlinebackup', 'deviceprotection',
 'techsupport', 'streamingtv', 'streamingmovies',
 'contract', 'paperlessbilling', 'paymentmethod']

numerical = ['converted', 'totalvisits', 'total_time_spent_on_website', 'page_views_per_visit']

# Categorical Data

In [None]:
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

# Numerical Data

## Get the Descriptive statistics for each column (Univariate Analysis)

In [None]:
df_train_full[numerical].describe()

Unnamed: 0,tenure,monthlycharges,totalcharges
count,5634.0,5634.0,5634.0
mean,32.277955,64.779127,2277.423953
std,24.555211,30.104993,2266.412636
min,0.0,18.25,0.0
25%,9.0,35.4,389.1375
50%,29.0,70.375,1391.0
75%,55.0,89.85,3787.5
max,72.0,118.65,8684.8


# Correlations

In [None]:
df_train_full.corr()

  df_train_full.corr()


Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn
seniorcitizen,1.0,0.023443,0.225234,0.110459,0.141966
tenure,0.023443,1.0,0.251072,0.828268,-0.351885
monthlycharges,0.225234,0.251072,1.0,0.650913,0.196805
totalcharges,0.110459,0.828268,0.650913,1.0,-0.196353
churn,0.141966,-0.351885,0.196805,-0.196353,1.0


# Feature Importance

## Feature Importance Based on Gender

In [None]:
female_mean = df_train_full[df_train_full.gender == 'female'].churn.mean()

male_mean = df_train_full[df_train_full.gender == 'male'].churn.mean()

## Feature Importance based on Partner

In [None]:
partner_yes = df_train_full[df_train_full.partner == 'yes'].churn.mean()
print('partner == yes:', round(partner_yes, 3))

partner_no = df_train_full[df_train_full.partner == 'no'].churn.mean()
print('partner == no:', round(partner_no, 3))

partner == yes: 0.205
partner == no: 0.33


## Risk Ratio

In [None]:
# risk = group rate / global_rate

# For “gender == female”, for example, the risk of churning is 1.02:
# risk = 27.7% / 27% = 1.02

## Compute Risk Ratio

In [None]:
global_mean = df_train_full.churn.mean()
print('global_mean: ', round(global_mean, 5))

df_group = df_train_full.groupby(by='gender').churn.agg(['mean'])

df_group['diff'] = df_group['mean'] - global_mean

df_group['risk'] = df_group['mean'] / global_mean

df_group


global_mean:  0.26997


Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


## Risk Ratio for ALL Categorical Variables

## Churn Analysis 

## Mutual Information : Categorical

In [None]:
def calculate_mi(series):
 return mutual_info_score(series, df_train_full.churn)

In [None]:
df_mi = df_train_full[categorical].apply(calculate_mi)

df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')

df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


# Correlation Coefficient

In [None]:
df_train_full[numerical].corrwith(df_train_full.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

# Feature Engineering

## Transform all categorical variables to numeric forms

## One Hot Encoding

## DictVectorizer

In [None]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')

## Dictionary Vectorizer

In [None]:
dv = DictVectorizer(sparse=False)

dv.fit(train_dict)

X_train = dv.transform(train_dict)

## Peek at the Vectorized Data

In [None]:
X_train[0]

dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

# Machine learning

## Predictive Analytics from the clean Telco Dataset

## ML for Classification


## Linear Regression from Scratch in Python

In [None]:
def linear_regression(xi):
  result = bias
  for j in range(n):
    result = result + xi[j] * w[j]
  return result

## Logistic Regression from Scratch using Python

In [None]:
def logistic_regression(xi):
  score = bias
  for j in range(n):
    score = score + xi[j] * w[j]
    prob = sigmoid(score)
  return prob

def sigmoid(score):
  return 1 / (1 + math.exp(-score))

## Training the Logistic Regression Model

In [None]:
model = LogisticRegression(solver='liblinear', random_state=1)

model.fit(X_train, y_train)

## One Hot Encoding

In [None]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')

X_val = dv.transform(val_dict)

y_pred = model.predict_proba(X_val)

In [None]:
y_pred = model.predict_proba(X_val)[:, 1]

y_pred >= 0.5

array([False, False, False, ..., False,  True, False])

## Introducing Accuracy

In [None]:
churn = y_pred >= 0.5

(y_val == churn).mean() #Quality Measure called ACCURACY

0.8016129032258065

In [None]:
print(y_val)

print(churn)



[0 1 0 ... 0 0 0]
[False False False ... False  True False]


## Model Interpretation

## Coefficients

In [None]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=month-to-month': 0.563,
 'contract=one_year': -0.086,
 'contract=two_year': -0.599,
 'dependents=no': -0.03,
 'dependents=yes': -0.092,
 'deviceprotection=no': 0.1,
 'deviceprotection=no_internet_service': -0.116,
 'deviceprotection=yes': -0.106,
 'gender=female': -0.027,
 'gender=male': -0.095,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.116,
 'monthlycharges': 0.001,
 'multiplelines=no': -0.168,
 'multiplelines=no_phone_service': 0.127,
 'multiplelines=yes': -0.081,
 'onlinebackup=no': 0.136,
 'onlinebackup=no_internet_service': -0.116,
 'onlinebackup=yes': -0.142,
 'onlinesecurity=no': 0.258,
 'onlinesecurity=no_internet_service': -0.116,
 'onlinesecurity=yes': -0.264,
 'paperlessbilling=no': -0.213,
 'paperlessbilling=yes': 0.091,
 'partner=no': -0.048,
 'partner=yes': -0.074,
 'paymentmethod=bank_transfer_(automatic)': -0.027,
 'paymentmethod=credit_card_(automatic)': -0.136,
 'paymentmethod=electronic_check': 0.175,


## Prepare a Small Subset to Break Down the Categoricals

In [None]:
small_subset = ['contract', 'tenure', 'totalcharges']

train_dict_small = df_train[small_subset].to_dict(orient='records')

dv_small = DictVectorizer(sparse=False)

dv_small.fit(train_dict_small)

X_small_train = dv_small.transform(train_dict_small)

dv_small.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'tenure', 'totalcharges'], dtype=object)

## Train the Small Subset

In [None]:
model_small = LogisticRegression(solver='liblinear', random_state=1)

model_small.fit(X_small_train, y_train)

model_small.intercept_[0] #Check the bias

dict(zip(dv_small.get_feature_names_out(), model_small.coef_[0].round(3))) #Check the other weights

{'contract=month-to-month': 0.91,
 'contract=one_year': -0.144,
 'contract=two_year': -1.404,
 'tenure': -0.097,
 'totalcharges': 0.001}

## Understanding The Importance of Categories

In [None]:
dict(zip(dv_small.get_feature_names_out(), model_small.coef_[0].round(3)))

{'contract=month-to-month': 0.91,
 'contract=one_year': -0.144,
 'contract=two_year': -1.404,
 'tenure': -0.097,
 'totalcharges': 0.001}

## Using the Churn Model

In [None]:
customer = {
 'customerid': '8879-zkjof',
 'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'no',
 'dependents': 'no',
 'tenure': 41,
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'no',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'one_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'bank_transfer_(automatic)',
 'monthlycharges': 79.85,
 'totalcharges': 3320.75,
}


# Vectorized Input

In [None]:
X_test = dv.transform([customer])

print(X_test)

[[0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00
  0.00000e+00 7.98500e+01 1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00
  1.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 4.10000e+01 3.32075e+03]]


## Put the Matrix into the Trained Model

In [None]:
model.predict_proba(X_test)

array([[0.92667889, 0.07332111]])

In [None]:
model.predict_proba(X_test)[0, 1]

0.07332111084949638

## Take a look at another customer

In [None]:
customer = {
 'gender': 'female',
 'seniorcitizen': 1,
 'partner': 'no',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'no',
 'onlinebackup': 'no',
'deviceprotection': 'no',
 'techsupport': 'no',
 'streamingtv': 'yes',
 'streamingmovies': 'no',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'tenure': 1,
 'monthlycharges': 85.7,
 'totalcharges': 85.7
}

## Let’s Make a Prediction

In [None]:
X_test = dv.transform([customer])

model.predict_proba(X_test)[0, 1]

0.8321656556055403