In [2]:
# Download the dataset into data/ (run once; `-P` sets the target directory):
# !wget -P data https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

In [4]:
import pandas as pd

In [33]:
# Read the lead-scoring CSV from the local data/ directory and preview the first rows.
csv_path = "data/course_lead_scoring.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [34]:
# Count the missing entries in every column.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [35]:
# Column groups used as model features: categorical ones are one-hot encoded
# later on, numerical ones are used as-is.
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [36]:
# Replace missing values: the "NA" label for categorical columns, 0 for numerical ones.
df[categorical] = df[categorical].fillna("NA")
df[numerical] = df[numerical].fillna(0)

In [37]:
# Q1: frequency of every industry value (sorted descending, so the first row is the mode)
industry_counts = df["industry"].value_counts()
industry_counts

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [38]:
# Q2: pairwise correlation matrix of the numerical features, used to spot
# the pair with the largest correlation.
cor = df.loc[:, numerical].corr()
cor

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [49]:
# Split the data 60/20/20 into train / validation / test, stratified on the target.
from sklearn.model_selection import train_test_split

# First hold out 20% of all rows as the test set.
df_train, df_test = train_test_split(
    df, test_size=0.2, stratify=df["converted"], random_state=42
)

# Then take 25% of the remaining 80% (i.e. 20% of the total) for validation.
df_trn, df_val = train_test_split(
    df_train, test_size=0.25, stratify=df_train["converted"], random_state=42
)

# drop=True discards the shuffled original row labels instead of inserting them
# as a spurious "index" column into every split. Rebinding (rather than
# inplace=True) keeps the cell free of in-place mutation.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_trn = df_trn.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [50]:
# Q3: mutual information between each categorical feature and the target,
# computed on the training part only.
from sklearn.metrics import mutual_info_score

target = df_trn["converted"]
for feature in categorical:
    mi = mutual_info_score(df_trn[feature], target)
    print(feature, round(mi, 2))

lead_source 0.03
industry 0.01
employment_status 0.01
location 0.0


In [56]:
# Q4: one-hot encode the categorical features and fit a logistic regression.
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Fit the encoder on the training part only so the same fitted encoder can be
# reused to transform other splits.
ohe = OneHotEncoder(drop="first", handle_unknown="ignore")
ohe.fit(df_trn[categorical])
col_cat = list(ohe.get_feature_names_out())

# Carry over df_trn's index: pd.concat(axis=1) aligns rows on index labels, so
# this keeps the encoded block lined up with the numerical block regardless of
# how df_trn happens to be indexed.
X_trn_cat = pd.DataFrame(
    ohe.transform(df_trn[categorical]).toarray(),
    columns=col_cat,
    index=df_trn.index,
)

X_trn_num = df_trn[numerical]

X_trn = pd.concat([X_trn_num, X_trn_cat], axis=1)
y_trn = df_trn["converted"]

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_trn, y_trn)

In [64]:
from sklearn.metrics import accuracy_score

# Encode the validation part with the encoder that was fitted on the training data.
# index=df_val.index keeps the encoded rows aligned with the numerical columns
# when they are concatenated column-wise (pd.concat aligns on index labels).
X_val_cat = pd.DataFrame(
    ohe.transform(df_val[categorical]).toarray(),
    columns=col_cat,
    index=df_val.index,
)
X_val_num = df_val[numerical]
X_val = pd.concat([X_val_num, X_val_cat], axis=1)
y_val = df_val["converted"]

# Validation accuracy of the model trained on all features.
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print(accuracy)

0.7303754266211604


In [66]:
# Q5: feature elimination — retrain without one feature at a time and compare
# the validation accuracy against the all-features baseline (`accuracy`).
opt_cols = ['industry', 'employment_status', 'lead_score']

for opt in opt_cols:
    # Keep everything except the feature itself and its one-hot columns
    # ("<feature>_<category>"). Exact/prefix matching avoids dropping unrelated
    # columns whose names merely contain the feature name somewhere.
    cols_use = [
        col for col in X_trn.columns
        if col != opt and not col.startswith(opt + "_")
    ]

    # Fit a fresh estimator with the baseline hyperparameters instead of
    # refitting the shared `model`, so `model` stays the all-features model and
    # the result does not depend on whatever `model` is currently bound to.
    reduced_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    reduced_model.fit(X_trn[cols_use], y_trn)

    X_v = X_val[cols_use]
    print(X_v.shape)
    y_pred = reduced_model.predict(X_v)
    acc = accuracy_score(y_val, y_pred)
    print(opt, abs(acc - accuracy))

(293, 20)
industry 0.0
(293, 23)
employment_status 0.0034129692832765013
(293, 26)
lead_score 0.0034129692832765013


In [69]:
# Q6: sweep the inverse regularisation strength C and report the validation
# accuracy (rounded to 3 decimals) for each value.
c_opts = [0.01, 0.1, 1, 10, 100]
for c in c_opts:
    # fit() returns the estimator itself, so construction and training chain.
    model = LogisticRegression(
        C=c,
        solver='liblinear',
        max_iter=1000,
        random_state=42,
    ).fit(X_trn, y_trn)

    y_val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    print(c, round(acc, 3))

0.01 0.734
0.1 0.73
1 0.73
10 0.73
100 0.73
