In [58]:
import pandas as pd
# Source CSV for the lead-scoring data. It is read directly from GitHub, so this
# cell needs network access.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(DATA_URL)

# Preview the first five rows, transposed so each column name becomes a row.
# Only the last expression of a cell is displayed, so a separate bare
# `df.head()` before this line would be computed and thrown away.
df.head().T

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [59]:
# List the dtype pandas inferred for each column of df when reading the CSV.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [60]:
# Normalise the column labels to lower case.
df.columns = df.columns.str.lower()

# Names of the columns stored with the generic `object` dtype, in column order.
categorical_columns = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]

In [61]:
# Replace missing values column by column: object columns get the placeholder
# string 'NA', every other column gets 0.0.
for column in df.columns:
    fill_value = 'NA' if df[column].dtype == 'object' else 0.0
    df[column] = df[column].fillna(fill_value)

# Any categorical column whose values are exactly {'yes', 'no'} is converted
# to a 1/0 integer flag.
for column in categorical_columns:
    observed_values = set(df[column].unique())
    if observed_values == {'yes', 'no'}:
        df[column] = df[column].eq('yes').astype(int)

In [62]:
# Rows where `industry` is still null. The fill step above already replaced
# missing object values with 'NA', which is why the recorded output is empty.
df.loc[df['industry'].isna(), ['industry']]

Unnamed: 0,industry


In [63]:
# Most frequent value of `industry`. mode() returns a Series (ties would give
# several rows); the 'NA' placeholder inserted earlier counts as a value too.
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [64]:
# Frequency of each `industry` value, most common first. Rows that were missing
# show up under the 'NA' placeholder rather than being dropped.
df['industry'].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [65]:
# Numeric feature columns (the target `converted` is deliberately not listed).
numerical = [
    'interaction_count',
    'lead_score',
    'number_of_courses_viewed',
    'annual_income',
]

# Pairwise correlation matrix of the numeric features (pandas default method).
df.loc[:, numerical].corr()

Unnamed: 0,interaction_count,lead_score,number_of_courses_viewed,annual_income
interaction_count,1.0,0.009888,-0.023565,0.027036
lead_score,0.009888,1.0,-0.004879,0.01561
number_of_courses_viewed,-0.023565,-0.004879,1.0,0.00977
annual_income,0.027036,0.01561,0.00977,1.0


In [66]:
# Feature pairs to compare, in the order they are bound to r1..r4.
corr_pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

# One correlation coefficient per pair, printed on a single line.
r1, r2, r3, r4 = [df[left].corr(df[right]) for left, right in corr_pairs]
print(r1, r2, r3, r4)

0.009888182496913115 -0.00487899835468123 -0.023565222882888086 0.02703647240481435


In [67]:
from sklearn.model_selection import train_test_split
TARGET = 'converted'

# Two-stage split: hold out 20% for test, then take 25% of the remainder for
# validation, which gives a 60/20/20 train/val/test partition.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Drop the shuffled original index so each partition is numbered from 0.
df_train, df_val, df_test = [
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
]

# Pull the target out of each feature frame; pop() returns the column and
# removes it from the frame in one step.
y_train = df_train.pop(TARGET).values
y_val = df_val.pop(TARGET).values
y_test = df_test.pop(TARGET).values

print(len(df_train), len(df_val), len(df_test))

876 293 293


In [68]:
from sklearn.metrics import mutual_info_score
# Object-dtype columns of the training frame are the categorical features.
categorical = df_train.select_dtypes(include='object').columns


def mi_with_y(series):
    """Return the mutual information score between `series` and `y_train`."""
    return mutual_info_score(series, y_train)


# Score every categorical column against the training target, then rank the
# scores from highest to lowest, rounded to two decimals.
mi_scores = df_train[categorical].apply(mi_with_y)
mi = mi_scores.sort_values(ascending=False).round(2)

print(mi)



lead_source          0.04
employment_status    0.01
industry             0.01
location             0.00
dtype: float64


In [74]:
from sklearn.feature_extraction import DictVectorizer

# Split the feature columns by dtype and keep them as plain lists so they can
# be concatenated.
categorical = df_train.select_dtypes(include='object').columns.tolist()
numerical = df_train.select_dtypes(exclude='object').columns.tolist()
feature_columns = categorical + numerical

# DictVectorizer consumes one {column: value} dict per row.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Learn the feature mapping from the training rows only, then reuse the same
# mapping for the validation rows.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

X_train.shape

(876, 31)

In [95]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline model: logistic regression on all features.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predicted probability of the positive class (column 1) for each validation row.
y_pred_proba = model.predict_proba(X_val)[:, 1]

# Threshold the probabilities computed above at 0.5 rather than calling
# predict_proba a second time. A bare `y_pred_proba[:5]` in the middle of the
# cell is never displayed, so it is not kept.
y_pred_base = (y_pred_proba >= 0.5).astype(int)
acc_base = accuracy_score(y_val, y_pred_base)
print('baseline accuracy:', round(acc_base, 3))

baseline accuracy: 0.7


In [96]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Leave-one-feature-out check: retrain the same model without one feature at a
# time and compare its validation accuracy with the baseline (`acc_base`).
excluded_features = ['industry', 'employment_status', 'lead_score']
all_features = categorical + numerical

for f in excluded_features:
    print(f)

    cols = [name for name in all_features if name != f]

    # A fresh vectorizer per run, because the feature set changes each time.
    dv_f = DictVectorizer(sparse=False)
    X_train_f = dv_f.fit_transform(df_train[cols].to_dict(orient='records'))
    X_val_f = dv_f.transform(df_val[cols].to_dict(orient='records'))

    model_f = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_f.fit(X_train_f, y_train)

    val_scores_f = model_f.predict_proba(X_val_f)[:, 1]
    acc_f = accuracy_score(y_val, (val_scores_f >= 0.5).astype(int))

    # Positive diff: accuracy dropped without the feature; negative: it rose.
    diff = acc_base - acc_f
    print('accuracy:', round(acc_f, 3), 'diff:', round(diff, 3))
    print()


industry
accuracy: 0.7 diff: 0.0

employment_status
accuracy: 0.696 diff: 0.003

lead_score
accuracy: 0.706 diff: -0.007



In [101]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Sweep the inverse regularisation strength C and report validation accuracy.
# NOTE: this rebinds `model`, `y_pred` and `acc` on every iteration, so after
# the loop they refer to the last C value tried.
for C in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(
        solver='liblinear', C=C, max_iter=1000, random_state=42
    ).fit(X_train, y_train)

    val_scores = model.predict_proba(X_val)[:, 1]
    y_pred = (val_scores >= 0.5).astype(int)
    acc = accuracy_score(y_val, y_pred)

    print(f"C={C}  accuracy={round(acc, 3)}")

C=0.01  accuracy=0.7
C=0.1  accuracy=0.7
C=1  accuracy=0.7
C=10  accuracy=0.7
C=100  accuracy=0.7
