In [46]:
import pandas as pd
import numpy as np

In [350]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-14 01:55:32--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-14 01:55:32 (11.6 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [351]:
# Load the lead-scoring CSV from the working directory into a DataFrame.
data = pd.read_csv('course_lead_scoring.csv')

In [352]:
# `data` is already a DataFrame, and re-wrapping it in pd.DataFrame() does not copy by
# default. Take an explicit copy so the cleaning steps applied to `df` leave `data` raw.
df = data.copy()

In [353]:
# Missing values per column, before any filling.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [354]:
# Show the dtype of every column.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [355]:
# List the column names of the frame.
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [356]:
# Text-valued feature columns.
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]

In [357]:
# Numeric feature columns (the target `converted` is deliberately not listed).
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [358]:
# Missing categorical values become an explicit 'NA' category; missing income becomes 0.0.
# One vectorised assignment over the `categorical` list replaces four copy-pasted cells.
df[categorical] = df[categorical].fillna('NA')
df['annual_income'] = df['annual_income'].fillna(0.0)

In [363]:
# Re-check missing values per column after filling; every count should now be 0.
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [407]:
# Tally leads per industry (the 'NA' bucket holds rows whose industry was missing).
# value_counts() sorts descending, so the most frequent industry is listed first.
# Descriptive names are used so the builtins `dict` and `sum` are no longer overwritten.
industry_counts = df['industry'].value_counts()
industry_counts.to_dict(), int(industry_counts.sum())

({'NA': 134,
  'retail': 203,
  'healthcare': 187,
  'education': 187,
  'manufacturing': 174,
  'technology': 179,
  'other': 198,
  'finance': 200},
 1462)

#### ANSWER 1: retail

In [365]:
# Pairwise correlations (pandas default: Pearson) between the numerical features.
# Reuses the `numerical` list instead of repeating the same four column names.
corr_df = df[numerical]
corr_df.corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


#### ANSWER 2: interaction_count and annual_income (correlation ≈ 0.027, the largest off-diagonal value)

In [366]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set (seeded so the split is reproducible).
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
# 25% of the remaining 80% is another 20% of the full data: a 60/20/20 train/val/test split.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [367]:
# Give each split a fresh 0..n-1 index, discarding the shuffled original row labels.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [368]:
# Pull the target column out of each split as a NumPy array.
y_train = df_train['converted'].to_numpy()
y_val = df_val['converted'].to_numpy()
y_test = df_test['converted'].to_numpy()

In [369]:
# Remove the target from the feature frames so it cannot be used as an input feature.
df_train = df_train.drop(columns='converted')
df_val = df_val.drop(columns='converted')
df_test = df_test.drop(columns='converted')

In [370]:
# Confirm which columns remain in the training feature frame.
df_train.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score'],
      dtype='object')

industry<br>
location<br>
lead_source<br>
employment_status

In [371]:
from sklearn.metrics import mutual_info_score

In [372]:
# Mutual information between `industry` and the target.
# NOTE(review): scored on df_full_train (train + validation rows), not df_train — confirm this is intended.
mutual_info_score(df_full_train['industry'], df_full_train['converted'])

0.008173022583466888

In [373]:
# Mutual information between `location` and the target.
mutual_info_score(df_full_train['location'], df_full_train['converted'])

0.0012115327428980638

In [374]:
# Mutual information between `lead_source` and the target.
mutual_info_score(df_full_train['lead_source'], df_full_train['converted'])

0.024561797800259202

In [375]:
# Mutual information between `employment_status` and the target.
mutual_info_score(df_full_train['employment_status'], df_full_train['converted'])

0.012690204266619348

#### ANSWER 3: lead_source (highest mutual information with converted, ≈ 0.025)

In [376]:
#model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [377]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [378]:
# Total number of distinct values across the categorical columns, counted on the full
# frame after the 'NA' fill: the number of one-hot columns to expect if every value
# also occurs in the training split.
df[categorical].nunique().sum()

np.int64(27)

In [379]:
# Number of numerical feature columns.
len(numerical)

4

In [380]:
# One dict per training row (column name -> value), the input format DictVectorizer consumes.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

In [381]:
# sparse=False makes the vectorizer return a dense NumPy array rather than a SciPy sparse matrix.
dv = DictVectorizer(sparse=False)

In [382]:
# Learn the feature-to-column mapping from the training rows and encode them in one step.
X_train = dv.fit_transform(train_dicts)

In [383]:
# Same row-dict representation for the validation split.
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [384]:
# Encode validation rows with the mapping already learned from the training split.
# fit_transform() here would re-fit the vectorizer on validation data, so the column
# layout could silently differ from the one the model is trained on (for example when
# a category value is missing from the validation rows).
X_val = dv.transform(val_dicts)

In [385]:
# (rows, encoded feature columns) of the training matrix.
X_train.shape

(876, 31)

In [386]:
# (rows, encoded feature columns) of the validation matrix; the column count must match X_train.
X_val.shape

(293, 31)

In [387]:
# Names of the encoded columns, in the column order of the matrices produced by `dv`.
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [439]:
# NOTE(review): C=100 differs from the C=1.0 in the commented-out LogisticRegression line
# earlier in the notebook; this looks like the last value left over from a manual sweep
# over C — confirm which C the reported answers were produced with.
model = LogisticRegression(solver='liblinear', C=100, max_iter=1000, random_state=42)

In [440]:
# Fit the logistic regression on the encoded training matrix and its `converted` labels.
model.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,100
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [441]:
# Soft predictions: probability of the positive class for every validation row.
y_pred = model.predict_proba(X_val)[:, 1]

# Rows whose true label is 1; computed once because it does not depend on the threshold.
actual_positive = (y_val == 1)
n_positive = int(actual_positive.sum())

# For each decision threshold 0.50, 0.51, ..., 0.89 record the share of truly positive
# validation rows that are still predicted positive at that threshold.
scores = []
for pct in range(50, 90):
    threshold = pct / 100
    predicted_positive = (y_pred >= threshold)
    hits = int((predicted_positive & actual_positive).sum())
    # Guard against a validation split without any positive rows (would divide by zero).
    share = hits / n_positive if n_positive else float('nan')
    scores.append((threshold, share))

In [442]:
# Hard decisions at the 0.5 probability threshold.
convert_decision = (y_pred >= .5)

# Side-by-side frame: predicted probability, thresholded prediction, true label.
df_PredVsVal = pd.DataFrame({
    'pred': y_pred,
    'prediction': convert_decision.astype(int),
    'val': y_val,
})

In [443]:
# Flag each validation row whose thresholded prediction equals the true label;
# the mean of those flags is the accuracy.
df_PredVsVal['correct'] = df_PredVsVal['prediction'].eq(df_PredVsVal['val'])
df_PredVsVal['correct'].mean()

np.float64(0.6996587030716723)

In [444]:
# Cross-check of the accuracy without the helper frame: share of rows where the
# thresholded prediction matches the label.
(convert_decision == y_val).mean()

np.float64(0.6996587030716723)

#### ANSWER 4: 0.70 Accuracy

Accuracy with all features = 0.6996587 <br>
Accuracy without lead_score = 0.7030717 <br>
Accuracy without employment_status = 0.7030717 <br>
Accuracy without industry = 0.6996587

#### ANSWER 5: industry yields the smallest difference in accuracy

C = 0.01 -> 0.6996587 <br>
C = 0.1  -> 0.6996587 <br>
C = 1    -> 0.6996587 <br>
C = 10   -> 0.6996587 <br>
C = 100  -> 0.6996587