# Lead Scoring Homework - Course Lead Scoring Dataset

This notebook solves all 6 questions from the homework assignment.

**Dataset:** course_lead_scoring.csv

**Download dataset:**
```bash
wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
```

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, mutual_info_score
import warnings
warnings.filterwarnings('ignore')


In [11]:
# Load the lead-scoring CSV from the current working directory.
data_path = 'course_lead_scoring.csv'
df = pd.read_csv(data_path)

In [12]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [13]:
# Partition the column names by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
object_frame = df.select_dtypes(include=['object'])
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
categorical_cols = list(object_frame.columns)
numerical_cols = list(numeric_frame.columns)


In [14]:
# Keep the target column out of both feature-name lists, whichever one it
# happened to land in.
for column_list in (categorical_cols, numerical_cols):
    if 'converted' in column_list:
        column_list.remove('converted')


In [15]:
# Impute missing values: 'NA' for categorical columns, 0.0 for numerical ones.
fill_values = {col: 'NA' for col in categorical_cols}
fill_values.update({col: 0.0 for col in numerical_cols})
df = df.fillna(value=fill_values)

In [16]:
# Q1: most frequent value of `industry`. This runs after imputation, so the
# 'NA' placeholder competes with the real categories.
industry_modes = df['industry'].mode()
industry_mode = industry_modes.iloc[0]
print(f"Q1 Answer - Mode of industry: {industry_mode}")


Q1 Answer - Mode of industry: retail


In [17]:
# Q2: the feature pairs whose correlation will be compared.
pairs_to_check = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

# Pairwise correlation between all numerical feature columns.
corr_matrix = df.loc[:, numerical_cols].corr()

In [18]:
# Running maximum for the Q2 search: start below any absolute correlation
# value and with no winning pair yet.
max_corr, max_pair = -1, None

In [19]:
# Scan the candidate pairs and keep the one with the largest absolute
# correlation. Pairs naming a column that is not numerical are skipped.
for pair in pairs_to_check:
    if not all(name in numerical_cols for name in pair):
        continue
    strength = abs(corr_matrix.loc[pair[0], pair[1]])
    if strength > max_corr:
        max_corr, max_pair = strength, pair

print(f"Q2 Answer - Pair with biggest correlation: {max_pair[0]} and {max_pair[1]}")


Q2 Answer - Pair with biggest correlation: annual_income and interaction_count


In [20]:
# Separate the target from the features ahead of the 60% / 20% / 20%
# train / validation / test split.
y = df['converted']
X = df.drop(columns=['converted'])

In [21]:
# Stage 1 of the split: keep 60% of the rows for training and hold out the
# remaining 40% for the validation/test stage.
first_split = train_test_split(X, y, random_state=42, test_size=0.4)
X_train, X_temp, y_train, y_temp = first_split

In [22]:
# Stage 2 of the split: halve the held-out 40% into validation and test
# (20% of the full data each).
second_split = train_test_split(X_temp, y_temp, random_state=42, test_size=0.5)
X_val, X_test, y_val, y_test = second_split

In [23]:
# Object-dtype columns of the training frame are the categorical features.
object_columns = X_train.select_dtypes(include=['object']).columns
categorical_features = list(object_columns)


In [24]:
# Q3: mutual information between the target and each categorical feature,
# computed on the training split only.
#
# `mi_scores` keeps the raw (unrounded) scores so that any ranking is decided
# at full precision: rounding first can collapse distinct scores into a tie,
# after which dictionary order, not the data, picks the "biggest" variable.
# `mi_scores_rounded` holds the 2-decimal values for reporting.
mi_scores = {}
for col in categorical_features:
    mi_scores[col] = mutual_info_score(y_train, X_train[col])

mi_scores_rounded = {col: round(score, 2) for col, score in mi_scores.items()}


In [25]:
# Pick the feature whose MI score is highest (first one wins on a tie).
top_feature, _top_score = max(mi_scores.items(), key=lambda item: item[1])
max_mi_var = top_feature
print(f"Q3 Answer - Variable with biggest MI score: {max_mi_var}")


Q3 Answer - Variable with biggest MI score: lead_source


In [26]:
# Turn each row of the train/validation frames into a {column: value} dict.
train_dicts, val_dicts = (
    frame.to_dict(orient='records') for frame in (X_train, X_val)
)


In [27]:
# Learn the feature mapping from the training rows only, then apply that
# same mapping to both splits.
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train_encoded, X_val_encoded = (
    dv.transform(rows) for rows in (train_dicts, val_dicts)
)


In [28]:
# Baseline logistic regression (liblinear solver, C=1.0) fitted on the
# encoded training split.
lr_settings = dict(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model = LogisticRegression(**lr_settings)
model.fit(X_train_encoded, y_train)


In [29]:
# Score the baseline model on the validation split. The unrounded value is
# kept alongside the 2-decimal one used for reporting.
y_val_pred = model.predict(X_val_encoded)
baseline_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
accuracy_rounded = round(baseline_accuracy, ndigits=2)


In [30]:
# Report the rounded validation accuracy of the baseline model.
q4_message = f"Q4 Answer - Validation Accuracy: {accuracy_rounded}"
print(q4_message)


Q4 Answer - Validation Accuracy: 0.74


In [31]:
# Q5 bookkeeping: every training column is a candidate for removal, and the
# accuracy differences are keyed by feature name.
feature_differences = dict()
all_features = list(X_train.columns)


In [32]:
def _accuracy_without(feature):
    """Return the validation accuracy of a model trained with `feature` dropped."""
    reduced_train = X_train.drop(columns=[feature])
    reduced_val = X_val.drop(columns=[feature])

    # A fresh vectorizer is fitted each time, on the reduced training rows.
    encoder = DictVectorizer(sparse=False)
    train_matrix = encoder.fit_transform(reduced_train.to_dict(orient='records'))
    val_matrix = encoder.transform(reduced_val.to_dict(orient='records'))

    # Same hyper-parameters as the baseline model, so only the feature set differs.
    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(train_matrix, y_train)
    return accuracy_score(y_val, clf.predict(val_matrix))


# Leave-one-feature-out: a positive difference means that dropping the
# feature lowered validation accuracy relative to the baseline.
for feature in all_features:
    feature_differences[feature] = baseline_accuracy - _accuracy_without(feature)


In [33]:
# The feature whose removal changes accuracy the least, in absolute terms
# (first one wins on a tie).
least_impact = min(feature_differences.items(), key=lambda item: abs(item[1]))
min_diff_feature = least_impact[0]
print(f"Q5 Answer - Feature with smallest difference: {min_diff_feature}")


Q5 Answer - Feature with smallest difference: industry


In [34]:
# Q6: validation accuracy per C is collected here, for the grid of
# regularisation strengths listed below.
c_results = dict()
C_values = [
    0.01,
    0.1,
    1,
    10,
    100,
]

In [35]:
# Sweep the regularisation strength: fit one model per C on the encoded
# training split and record its validation accuracy, rounded to 3 decimals.
for c_value in C_values:
    candidate = LogisticRegression(
        solver='liblinear', C=c_value, max_iter=1000, random_state=42
    ).fit(X_train_encoded, y_train)
    val_accuracy = accuracy_score(y_val, candidate.predict(X_val_encoded))
    c_results[c_value] = round(val_accuracy, 3)


In [36]:
# Highest (rounded) validation accuracy; ties go to the smallest C.
best_accuracy = max(c_results.values())
best_c = min(c for c in c_results if c_results[c] == best_accuracy)


In [37]:
# Report the selected regularisation strength.
q6_message = f"Q6 Answer - Best C value: {best_c}"
print(q6_message)


Q6 Answer - Best C value: 0.01
