In [10]:
import pandas as pd

# Load the CSV
df = pd.read_csv('course_lead_scoring.csv')
print(df.columns)
# Inspect
df['industry'].mode()[0]



Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')


'retail'

In [6]:
corr = df.corr(numeric_only=True)
corr


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.031551,-0.023565,-0.004879,0.435914
annual_income,0.031551,1.0,0.048618,0.005334,0.078256
interaction_count,-0.023565,0.048618,1.0,0.009888,0.374573
lead_score,-0.004879,0.005334,0.009888,1.0,0.193673
converted,0.435914,0.078256,0.374573,0.193673,1.0


In [7]:
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]
for a, b in pairs:
    print(f'{a} & {b}: {corr.loc[a, b]:.3f}')


interaction_count & lead_score: 0.010
number_of_courses_viewed & lead_score: -0.005
number_of_courses_viewed & interaction_count: -0.024
annual_income & interaction_count: 0.049


In [17]:
from sklearn.model_selection import train_test_split

# Preprocess once so later cells don't need to drop/impute repeatedly
# Work on a copy to keep original raw df intact
df_prep = df.copy()

# Define categorical and numeric columns we will impute
cat_cols = ['lead_source', 'industry', 'employment_status', 'location']
num_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Drop rows with missing target (can't train without target)
before_rows = len(df_prep)
df_prep = df_prep.dropna(subset=['converted'])
dropped_target = before_rows - len(df_prep)
print(f"Dropped {dropped_target} rows with missing 'converted' target")

# Fill categorical NaNs with a placeholder
for c in cat_cols:
    if c in df_prep.columns:
        df_prep[c] = df_prep[c].fillna('missing')

# Fill numeric NaNs with median (computed on the whole preprocessed set)
for n in num_cols:
    if n in df_prep.columns:
        med = df_prep[n].median()
        df_prep[n] = df_prep[n].fillna(med)

# Now perform the 60/20/20 split on the preprocessed dataframe
# This ensures downstream cells can assume missing values are handled
df_full_train, df_test = train_test_split(df_prep, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

len(df_train), len(df_val), len(df_test)


Dropped 0 rows with missing 'converted' target


(876, 293, 293)

In [18]:
from sklearn.metrics import mutual_info_score

# Mutual information for categorical features (preprocessing already filled NaNs)
categorical = ['lead_source', 'industry', 'employment_status', 'location']

for col in categorical:
    score = mutual_info_score(df_train['converted'], df_train[col])
    print(col, round(score, 2))


lead_source 0.04
industry 0.01
employment_status 0.01
location 0.0


In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define features (all except target)
features = [
    'lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
    'employment_status', 'location', 'interaction_count', 'lead_score'
]

# Prepare training and validation dictionaries (preprocessing already applied)
train_dicts = df_train[features].to_dict(orient='records')
val_dicts = df_val[features].to_dict(orient='records')

# Vectorize
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Targets
y_train = df_train['converted']
y_val = df_val['converted']

# Train model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print('Validation accuracy:', round(acc, 2))


Dropped 0 rows from train (missing target), 0 from val
No NaNs in X_train/X_val — safe to fit
Validation accuracy: 0.75
Validation accuracy: 0.75


In [19]:
base_acc = accuracy_score(y_val, y_pred)

for f in features:
    sub_features = [x for x in features if x != f]
    train_dicts = df_train[sub_features].to_dict(orient='records')
    val_dicts = df_val[sub_features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)
    model.fit(X_train, y_train)
    acc = accuracy_score(y_val, model.predict(X_val))
    print(f"{f}: {base_acc - acc:.4f}")


lead_source: -0.0102
industry: 0.0000
number_of_courses_viewed: 0.1911
annual_income: -0.1058
employment_status: -0.0068
location: 0.0000
interaction_count: 0.1911
lead_score: -0.0034


In [20]:
for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    acc = accuracy_score(y_val, model.predict(X_val))
    print(c, round(acc, 3))


0.01 0.754
0.1 0.744
1 0.751
10 0.751
100 0.751
