# 03 CLASSIFICATION

## 0. Get data & imports

In [238]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-13 21:13:16--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-13 21:13:16 (58.7 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [239]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [240]:
df = pd.read_csv('course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


## 1. Data Preparation

In [241]:
for col in df.columns:
    print(col)
    print(df[col].unique()[:10])
    print(df[col].nunique())
    print()

lead_source
['paid_ads' 'social_media' 'events' 'referral' 'organic_search' nan]
5

industry
[nan 'retail' 'healthcare' 'education' 'manufacturing' 'technology'
 'other' 'finance']
7

number_of_courses_viewed
[1 5 2 3 0 4 6 8 7 9]
10

annual_income
[79450. 46992. 78796. 83843. 85012. 59904. 51283. 62975. 38648. 59866.]
1267

employment_status
['unemployed' 'employed' nan 'self_employed' 'student']
4

location
['south_america' 'australia' 'europe' 'africa' 'middle_east' nan
 'north_america' 'asia']
7

interaction_count
[4 1 3 6 2 0 5 7 9 8]
12

lead_score
[0.94 0.8  0.69 0.87 0.62 0.83 0.57 0.86 0.43 0.92]
101

converted
[1 0]
2



* All the column names are prepared
* All the categorical values are prepared

In [242]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

* lead_source -> categorical
* industry -> categorical
* number_of_courses_viewed -> numerical
* annual_income -> numerical
* employment_status -> categorical
* location -> categorical
* interaction_count -> numerical
* lead_score -> numerical
* converted -> target

In [243]:
numerical = ['number_of_courses_viewed', 
             'annual_income', 
             'interaction_count', 
             'lead_score'
            ]

In [244]:
categorical = ['lead_source',
               'industry',
               'employment_status',
               'location',
              ]

In [245]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [246]:
df.annual_income = df.annual_income.fillna(0)

In [247]:
df.lead_source = df.lead_source.fillna('NA')
df.industry = df.industry.fillna('NA')
df.employment_status = df.employment_status.fillna('NA')
df.location = df.location.fillna('NA')

In [248]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

## Question 1

In [249]:
df.industry.nunique()

8

In [250]:
df.industry.unique()

array(['NA', 'retail', 'healthcare', 'education', 'manufacturing',
       'technology', 'other', 'finance'], dtype=object)

In [251]:
df.industry.mode()

0    retail
Name: industry, dtype: object

## Question 2

In [252]:
df[numerical].corrwith(df.converted).abs()

number_of_courses_viewed    0.435914
annual_income               0.053131
interaction_count           0.374573
lead_score                  0.193673
dtype: float64

In [253]:
numerical_2 = ['interaction_count',
               'number_of_courses_viewed',
            ]
df[numerical_2].corrwith(df.lead_score).abs()

interaction_count           0.009888
number_of_courses_viewed    0.004879
dtype: float64

In [254]:
numerical_3 = ['number_of_courses_viewed',
               'annual_income',
            ]
df[numerical_3].corrwith(df.interaction_count).abs()

number_of_courses_viewed    0.023565
annual_income               0.027036
dtype: float64

annual_income and interaction_count have biggest correlation (0.027036)

## Split the data

In [255]:
from sklearn.model_selection import train_test_split

In [256]:
ramdom_seed = 42

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=ramdom_seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=ramdom_seed)

In [257]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [258]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [259]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

## Question 3

In [260]:
from sklearn.metrics import mutual_info_score

In [261]:
def mutual_info_churn_score(series):
    return round(mutual_info_score(series, y_train), 2)

In [262]:
mi = df_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

## Question 4

In [263]:
from sklearn.feature_extraction import DictVectorizer

Hot Encoding

In [264]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

Logistic regression

In [265]:
from sklearn.linear_model import LogisticRegression

In [266]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [267]:
#model.coef_[0] = model.coef_[0].round(3)
y_pred = model.predict_proba(X_val)[:, 1]

In [268]:
converted_decision = (y_pred >= 0.5)

In [269]:
accuracy = round((y_val == converted_decision).mean(), 2)
accuracy

np.float64(0.7)

## Question 5

Train a model using the same features and parameters as in Q4 (without rounding).

In [270]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [271]:
y_pred = model.predict_proba(X_val)[:, 1]
converted_decision = (y_pred >= 0.5)
accuracy = (y_val == converted_decision).mean()
accuracy

np.float64(0.6996587030716723)

In [272]:
def model_calculation(features):
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[features].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    converted_decision = (y_pred >= 0.5)
    accuracy_minus_feature = (y_val == converted_decision).mean()
    return accuracy_minus_feature

In [273]:
original_features = categorical + numerical

arrays_of_features = [original_features[:i] + original_features[i+1:] for i in range(len(original_features))]

deleted_feature = [original_features[i] for i in range(len(original_features))]

In [274]:
accuracies = []
for i in arrays_of_features:
    accuracy_deleted_feature = model_calculation(i)
    accuracies.append(accuracy_deleted_feature)
    print(accuracy_deleted_feature)

0.7030716723549488
0.6996587030716723
0.6962457337883959
0.7098976109215017
0.5563139931740614
0.8532423208191127
0.5563139931740614
0.7064846416382252


In [275]:
df_accuracies = pd.DataFrame({
    "deleted_feature": deleted_feature,
    "accuracies": accuracies,
    "accuracy_delta": abs(accuracy - accuracies),
})

print(df_accuracies)

            deleted_feature  accuracies  accuracy_delta
0               lead_source    0.703072        0.003413
1                  industry    0.699659        0.000000
2         employment_status    0.696246        0.003413
3                  location    0.709898        0.010239
4  number_of_courses_viewed    0.556314        0.143345
5             annual_income    0.853242        0.153584
6         interaction_count    0.556314        0.143345
7                lead_score    0.706485        0.006826


In [276]:
df_accuracies.accuracy_delta.min()

np.float64(0.0)

In [277]:
df_accuracies["accuracy_delta"].sort_values()

1    0.000000
2    0.003413
0    0.003413
7    0.006826
3    0.010239
4    0.143345
6    0.143345
5    0.153584
Name: accuracy_delta, dtype: float64

## Question 6

In [278]:
def model_calculation_c(c):
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[original_features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[original_features].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    converted_decision = (y_pred >= 0.5)
    accuracy_c = (y_val == converted_decision).mean()
    accuracy_c_rounded = round(accuracy_c, 3)
    return accuracy_c

In [279]:
parameter_c = [0.01, 0.1, 1, 10, 100]

In [280]:
accuracies_c = []
for i in parameter_c:
    accuracy_different_c = model_calculation_c(i)
    accuracies_c.append(accuracy_different_c)
    print(accuracy_different_c)

0.6996587030716723
0.6996587030716723
0.6996587030716723
0.6996587030716723
0.6996587030716723


In [281]:
df_accuracies_c = pd.DataFrame({
    "parameter_c": parameter_c,
    "accuracies_c": accuracies_c,
    "accuracy_c_delta": abs(accuracy - accuracies_c),
})

print(df_accuracies_c)

   parameter_c  accuracies_c  accuracy_c_delta
0         0.01      0.699659               0.0
1         0.10      0.699659               0.0
2         1.00      0.699659               0.0
3        10.00      0.699659               0.0
4       100.00      0.699659               0.0
