In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [23]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

In [24]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [25]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [26]:
##Data preparation
##Check if the missing values are presented in the features.
##If there are missing values:
##For caterogiral features, replace them with 'NA'
##For numerical features, replace with with 0.0

In [27]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [28]:
df.lead_source = df.lead_source.fillna('NA')
df.industry = df.industry.fillna('NA')
df.employment_status = df.employment_status.fillna('NA')
df.location = df.location.fillna('NA')
df.annual_income = df.annual_income.fillna(0.0)

In [29]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [30]:
##Question 1
# What is the most frequent observation (mode) for the column industry?
df.industry.mode()

0    retail
Name: industry, dtype: object

In [31]:
##Question 2
# Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

#What are the two features that have the biggest correlation?

#interaction_count and lead_score
#number_of_courses_viewed and lead_score
#number_of_courses_viewed and interaction_count
#annual_income and interaction_count

In [32]:
corr = df.corr(numeric_only=True).unstack().reset_index().rename(columns={0:"corr"})
corr = corr[corr['corr'] != 1].copy()
corr.sort_values(by='corr', ascending=False)

Unnamed: 0,level_0,level_1,corr
4,number_of_courses_viewed,converted,0.435914
20,converted,number_of_courses_viewed,0.435914
14,interaction_count,converted,0.374573
22,converted,interaction_count,0.374573
23,converted,lead_score,0.193673
19,lead_score,converted,0.193673
21,converted,annual_income,0.053131
9,annual_income,converted,0.053131
7,annual_income,interaction_count,0.027036
11,interaction_count,annual_income,0.027036


In [33]:
#annual_income and interaction_count

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [35]:
#Split the data
#Split your data in train/val/test sets with 60%/20%/20% distribution.
#Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
#Make sure that the target value y is not in your dataframe.

In [37]:
df['converted'].unique()

array([1, 0])

In [38]:
seed = 42
df_X, df_y = df.drop(['converted'], axis=1).copy(), df['converted'].copy()

X_train, X_val, y_train, y_val = train_test_split(df_X, df_y, train_size=0.6, random_state=seed)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, train_size=0.5, random_state=seed)

In [39]:
#Question 3
#Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
#Round the scores to 2 decimals using round(score, 2).
#Which of these variables has the biggest mutual information score?

#industry
#location
#lead_source
#employment_status

In [40]:
def MI_score(X, y):
 return mutual_info_score(X, y_train)

X_train.select_dtypes('object').apply(lambda x: round(MI_score(x, y_train), 2)).sort_values(ascending=False).to_frame(name='MI')

Unnamed: 0,MI
lead_source,0.03
industry,0.02
employment_status,0.02
location,0.0


In [41]:
#Question 4
#Now let's train a logistic regression.
#Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
#Fit the model on the training dataset.
#To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
#model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
#Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [42]:
#One Hot Encoding
train_dicts = X_train.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')
test_dicts = X_test.to_dict(orient='records')

dv = DictVectorizer(sparse=False).set_output(transform='pandas').fit(train_dicts)

X_train = dv.transform(train_dicts)
X_val = dv.transform(val_dicts)
X_test = dv.transform(test_dicts)

X_train.head(5)

Unnamed: 0,annual_income,employment_status=NA,employment_status=employed,employment_status=self_employed,employment_status=student,employment_status=unemployed,industry=NA,industry=education,industry=finance,industry=healthcare,...,lead_source=social_media,location=NA,location=africa,location=asia,location=australia,location=europe,location=middle_east,location=north_america,location=south_america,number_of_courses_viewed
0,61705.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,55199.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,40841.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,28242.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,64775.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [43]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=seed)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [44]:
y_pred_val = model.predict_proba(X_val)[:, 1] >= 0.5
acc = (y_val == y_pred_val).mean()
round(acc, 2)

np.float64(0.74)

In [45]:
#Question 5
#Let's find the least useful feature using the feature elimination technique.
#Train a model using the same features and parameters as in Q4 (without rounding).
#Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
#For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
#Which of following feature has the smallest difference?

#'industry'
#'employment_status'
#'lead_score'

In [46]:
features_list = ["industry", "employment_status", "lead_score"]
acc_features = []
for feature in features_list:
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=seed)
    mask = ~X_train.columns.str.contains(feature)
    model.fit(X_train.loc[:, mask], y_train)
    y_pred_val = model.predict_proba(X_val.loc[:, mask])[:, 1] >= 0.5
    acc_features.append((y_val == y_pred_val).mean())

pd.DataFrame(data={'accuracy':acc-acc_features}, index=[f"without_{feature}" for feature in features_list]).sort_values(by='accuracy', ascending=False)

Unnamed: 0,accuracy
without_industry,0.0
without_lead_score,0.0
without_employment_status,-0.003425


In [47]:
##Question 6
#Now let's train a regularized logistic regression.
#Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
#Train models using all the features as in Q4.
#Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
#Which of these C leads to the best accuracy on the validation set?

#0.01
#0.1
#1
#10
#100

In [48]:
C_list = [0.01, 0.1, 1, 10, 100]
acc_C = []
for C in C_list:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=seed)
    model.fit(X_train, y_train)
    y_pred_val = model.predict_proba(X_val)[:, 1] >= 0.5
    acc_C.append(round((y_val == y_pred_val).mean(), 3))

pd.DataFrame(data={'C':C_list, 'accuracy':acc_C}).sort_values(by='accuracy', ascending=False)

Unnamed: 0,C,accuracy
0,0.01,0.743
1,0.1,0.743
2,1.0,0.743
3,10.0,0.743
4,100.0,0.743
