In [3]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
df = pd.read_csv('../data/course_lead_scoring.csv')

In [5]:
df.shape

(1462, 9)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [7]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [50]:
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [8]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [9]:
df.columns = df.columns.str.replace(' ','_').str.lower()

In [32]:
df_data = df.copy()

# Data preparation

In [85]:
for column in df_data.columns:
    if df_data[column].dtype == 'object':
      df_data[column] = df_data[column].fillna('NA')
    else :
      df_data[column] = df_data[column].fillna(0)

In [87]:
df_data.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

# Question 1

In [92]:
df_data['industry'].mode()

0    retail
Name: industry, dtype: object

In [93]:
df_data.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [94]:
df_data[['number_of_courses_viewed','annual_income','interaction_count','lead_score']].head(1)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
0,1,79450.0,4,0.94


# Question 2

In [95]:
df_data[['number_of_courses_viewed','annual_income','interaction_count','lead_score']].corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [47]:
from sklearn.metrics import mutual_info_score

In [58]:
def mutual_info_converted_score(serie):
    return mutual_info_score(serie,df_full_train.converted)

In [59]:
mi = df_full_train[['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score']].apply(mutual_info_converted_score)

  type_label = type_of_target(labels_true)


## Split the data

In [39]:
from sklearn.model_selection import train_test_split

In [41]:
df_full_train, df_test  = train_test_split(df_data,test_size=0.2,random_state=42)

In [42]:
df_train, df_val  = train_test_split(df_full_train,test_size=0.25,random_state=42)

In [43]:
len(df_train),len(df_val),len(df_test)

(876, 293, 293)

In [44]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [45]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [46]:
df_full_train.reset_index(drop=True)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,social_media,manufacturing,2,44403.0,self_employed,australia,1,0.71,0
1,events,retail,3,38048.0,student,north_america,6,0.97,1
2,social_media,education,2,71399.0,,europe,1,0.51,1
3,referral,education,2,47912.0,employed,australia,1,0.04,0
4,paid_ads,healthcare,1,34806.0,employed,europe,4,0.32,1
...,...,...,...,...,...,...,...,...,...
1164,events,manufacturing,1,57039.0,employed,south_america,2,0.30,0
1165,events,healthcare,2,56185.0,student,south_america,2,0.44,0
1166,paid_ads,manufacturing,1,56402.0,student,north_america,1,0.02,0
1167,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1


# Question 3

In [66]:
mi.round(2).sort_values(ascending=False)

annual_income               0.58
number_of_courses_viewed    0.12
interaction_count           0.08
lead_score                  0.07
lead_source                 0.03
industry                    0.01
employment_status           0.01
location                    0.00
dtype: float64

# Question 4

In [96]:
selected_columns = ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score']

In [97]:
from sklearn.feature_extraction import DictVectorizer

In [98]:
train_dicts = df_train[selected_columns].to_dict(orient='records')

In [99]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [100]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [101]:
X_train = dv.transform(train_dicts)

In [109]:
X_train

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 31))

In [102]:
from sklearn.linear_model import LogisticRegression

In [103]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [110]:
val_dicts = df_val[selected_columns].to_dict(orient='records')
#dv.fit(val_dicts)

In [111]:
X_val = dv.transform(val_dicts)

In [112]:
y_val_pred = model.predict(X_val)

In [114]:
model.intercept_[0].round(2)

np.float64(-0.07)

In [115]:
model.coef_[0].round(3)

array([-0.   , -0.015,  0.034,  0.003,  0.012, -0.102, -0.025,  0.049,
       -0.02 , -0.013, -0.003, -0.009, -0.032, -0.016,  0.312,  0.051,
        0.02 , -0.012, -0.012, -0.115,  0.08 , -0.03 ,  0.004, -0.011,
       -0.011, -0.006,  0.008,  0.006, -0.033, -0.025,  0.454])

In [116]:
model.predict_proba(X_val)[:,1]

array([0.61192003, 0.80002017, 0.53025988, 0.47134953, 0.57070183,
       0.44216912, 0.87161025, 0.84908203, 0.83304609, 0.61488559,
       0.54959148, 0.78180657, 0.69052348, 0.7703925 , 0.52650214,
       0.91728307, 0.53164244, 0.42109689, 0.30122867, 0.84899269,
       0.79513351, 0.73690363, 0.4451502 , 0.6485605 , 0.41743871,
       0.75413381, 0.90188557, 0.33864797, 0.43167007, 0.9681668 ,
       0.92037597, 0.37450253, 0.65229794, 0.90674059, 0.75179866,
       0.64204894, 0.82261517, 0.83404381, 0.65924804, 0.30960411,
       0.78973206, 0.35512648, 0.96527974, 0.63416602, 0.51261901,
       0.53222517, 0.82304982, 0.74423093, 0.73464818, 0.68960249,
       0.46953006, 0.84555566, 0.55629442, 0.92648205, 0.65283852,
       0.61506755, 0.6381533 , 0.28274461, 0.48022496, 0.57888151,
       0.35474335, 0.62169125, 0.38926373, 0.6116316 , 0.85325459,
       0.75439787, 0.89203927, 0.71942039, 0.95400251, 0.89236892,
       0.75276706, 0.33823598, 0.61404266, 0.51608065, 0.64097

In [117]:
all_features_accuracy = model.score(X_val,y_val)

In [133]:
all_features_accuracy

0.6996587030716723

In [140]:
(y_val == y_val_pred).mean().round(4)

np.float64(0.6997)

# Question 5

In [141]:
differences = dict()
total_features = selected_columns
for feature in total_features:
    #print(feature)
    
    df_train_copy = df_train[selected_columns].copy()
    del df_train_copy[feature]
    
    df_val_copy = df_val[selected_columns].copy()
    del df_val_copy[feature]
    
    dv = DictVectorizer(sparse=False)
    train_dict = df_train_copy.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val_copy.to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    #print(dv.get_feature_names())
    model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
    model.fit(X_train, y_train)
    
    #y_pred = model.predict_proba(X_val)[:, 1]
    #above_average_decision = (y_pred >= 0.5)
    #no_feature_accuracy = (above_average_val == above_average_decision).mean()
    no_feature_accuracy = model.score(X_val, y_val)
    differences[feature] = (all_features_accuracy - no_feature_accuracy)
    
for key, value in differences.items():
    print(f"{key}:{value}")

lead_source:-0.0034129692832765013
industry:0.0
number_of_courses_viewed:0.14334470989761094
annual_income:-0.15358361774744034
employment_status:0.0034129692832763903
location:-0.010238907849829393
interaction_count:0.14334470989761094
lead_score:-0.0068259385665528916


# Question 6

In [142]:
dv = DictVectorizer(sparse=False)

In [143]:
train_dicts = df_train[selected_columns].to_dict(orient='records')
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

In [144]:
val_dicts = df_val[selected_columns].to_dict(orient='records')
dv.fit(val_dicts)
X_val = dv.transform(val_dicts)

In [145]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [146]:
results = dict()
best_rmse = None
best_rmse_arg = None

for a in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=a, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    raw_rmse = rmse(y_val, y_pred)
    if best_rmse is None:
        best_rmse = raw_rmse
        best_rmse_arg = a
    elif raw_rmse < best_rmse:
        best_rmse = raw_rmse
        best_rmse_arg = a
    results[a] = round(raw_rmse, 3)

print(f"[ANSWER-6] Best RMSE is {round(best_rmse, 3)} for r value: {best_rmse_arg}")

[ANSWER-6] Best RMSE is 0.548 for r value: 0.01
