# Lead Scoring Homework - Course Lead Scoring Dataset

This notebook solves all 6 questions from the homework assignment.

**Dataset:** course_lead_scoring.csv

**Download dataset:**
```bash
wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
```

In [None]:
# Lead Scoring Homework
# Dataset: course_lead_scoring.csv
#
# Flat script that loads the lead-scoring CSV, fills missing values, and
# prints the answers to the six homework questions (mode, correlation,
# mutual information, logistic-regression accuracy, feature elimination,
# and regularization strength).
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mutual_info_score
from sklearn.model_selection import train_test_split

# Blanket suppression kept from the original notebook; note that this also
# hides any warnings emitted by pandas / scikit-learn during the run.
warnings.filterwarnings('ignore')

# Single seed shared by both data splits and every model fit below.
RANDOM_STATE = 42


def _print_banner(title):
    """Print a section title framed by two 60-character '=' rules."""
    print("\n" + "="*60)
    print(title)
    print("="*60)


def _encode_features(train_df, val_df):
    """One-hot encode two feature frames with a DictVectorizer.

    The vectorizer is fitted on ``train_df`` only and then applied to
    ``val_df``.

    Returns:
        (vectorizer, train_matrix, val_matrix)
    """
    vectorizer = DictVectorizer(sparse=False)
    train_matrix = vectorizer.fit_transform(train_df.to_dict(orient='records'))
    val_matrix = vectorizer.transform(val_df.to_dict(orient='records'))
    return vectorizer, train_matrix, val_matrix


def _fit_and_score(train_matrix, train_target, val_matrix, val_target, C=1.0):
    """Fit a liblinear logistic regression and score it on the validation data.

    Args:
        train_matrix: encoded training features.
        train_target: training labels.
        val_matrix: encoded validation features.
        val_target: validation labels.
        C: value passed to LogisticRegression's ``C`` parameter.

    Returns:
        (fitted_model, validation_predictions, validation_accuracy)
    """
    classifier = LogisticRegression(
        solver='liblinear', C=C, max_iter=1000, random_state=RANDOM_STATE
    )
    classifier.fit(train_matrix, train_target)
    predictions = classifier.predict(val_matrix)
    return classifier, predictions, accuracy_score(val_target, predictions)


# Load the dataset
# Download: wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
df = pd.read_csv('course_lead_scoring.csv')

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

# ============================================================
# DATA PREPARATION
# ============================================================
_print_banner("DATA PREPARATION")

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# The target must not be treated as a feature, whichever dtype it has.
if 'converted' in categorical_cols:
    categorical_cols.remove('converted')
if 'converted' in numerical_cols:
    numerical_cols.remove('converted')

print(f"\nCategorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

# Handle missing values
# For categorical features: replace with 'NA'
for col in categorical_cols:
    df[col] = df[col].fillna('NA')

# For numerical features: replace with 0.0
for col in numerical_cols:
    df[col] = df[col].fillna(0.0)

print("\nMissing values after handling:")
print(df.isnull().sum().sum(), "missing values")

# ============================================================
# QUESTION 1: Mode of 'industry' column
# ============================================================
_print_banner("QUESTION 1: Most frequent observation (mode) for 'industry'")

# Computed after the fill above, so 'NA' competes as a regular category.
industry_mode = df['industry'].mode()[0]
print(f"\nMode of 'industry': {industry_mode}")

# Show value counts
print("\nIndustry value counts:")
print(df['industry'].value_counts())

# ============================================================
# QUESTION 2: Correlation between numerical features
# ============================================================
_print_banner("QUESTION 2: Correlation matrix for numerical features")

# Create correlation matrix for numerical features
corr_matrix = df[numerical_cols].corr()
print("\nCorrelation Matrix:")
print(corr_matrix)

# Check specific pairs mentioned in the question
pairs_to_check = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

# Look each pair up once; pairs whose columns are not numerical are skipped.
pair_correlations = {
    (feat1, feat2): corr_matrix.loc[feat1, feat2]
    for feat1, feat2 in pairs_to_check
    if feat1 in numerical_cols and feat2 in numerical_cols
}

print("\nCorrelations for specified pairs:")
for (feat1, feat2), corr_value in pair_correlations.items():
    print(f"{feat1} <-> {feat2}: {corr_value:.4f}")

# Find the pair with biggest correlation (by absolute value)
max_corr = -1
max_pair = None
for pair, corr_value in pair_correlations.items():
    if abs(corr_value) > max_corr:
        max_corr = abs(corr_value)
        max_pair = pair

# Fail with a clear message instead of a TypeError on max_pair[0] below.
if max_pair is None:
    raise ValueError(
        "None of the requested feature pairs could be compared; "
        f"numerical columns found: {numerical_cols}"
    )

print(f"\nPair with biggest correlation: {max_pair[0]} and {max_pair[1]} ({max_corr:.4f})")

# ============================================================
# SPLIT THE DATA
# ============================================================
_print_banner("DATA SPLITTING: 60% train / 20% val / 20% test")

# Prepare features and target
X = df.drop('converted', axis=1)
y = df['converted']

# First split: 60% train, 40% temp (which will be split into val and test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE
)

# Second split: split temp into 50/50 for val and test (20% each of original)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE
)

print(f"\nTrain set size: {len(X_train)} ({len(X_train)/len(X)*100:.1f}%)")
print(f"Validation set size: {len(X_val)} ({len(X_val)/len(X)*100:.1f}%)")
print(f"Test set size: {len(X_test)} ({len(X_test)/len(X)*100:.1f}%)")

# ============================================================
# QUESTION 3: Mutual Information Score
# ============================================================
_print_banner("QUESTION 3: Mutual Information Score for categorical variables")

# Get categorical columns from training data
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
print(f"\nCategorical features: {categorical_features}")

# Calculate mutual information for each categorical variable.
# The unrounded scores are kept separately: picking the winner from values
# already rounded to 2 decimals lets near-equal scores tie, in which case
# column order (not the score) would decide the answer.
mi_scores_raw = {}
mi_scores = {}
for col in categorical_features:
    mi_scores_raw[col] = mutual_info_score(y_train, X_train[col])
    mi_scores[col] = round(mi_scores_raw[col], 2)
    print(f"{col}: {mi_scores[col]}")

# Find the variable with biggest MI score (compared on unrounded values)
max_mi_var = max(mi_scores_raw, key=mi_scores_raw.get)
print(f"\nVariable with biggest mutual information score: {max_mi_var} ({mi_scores[max_mi_var]})")

# ============================================================
# QUESTION 4: Logistic Regression with One-Hot Encoding
# ============================================================
_print_banner("QUESTION 4: Logistic Regression with One-Hot Encoding")

# Prepare data using DictVectorizer for one-hot encoding
train_dicts = X_train.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train_encoded = dv.fit_transform(train_dicts)
X_val_encoded = dv.transform(val_dicts)

print(f"\nOriginal features: {X_train.shape[1]}")
print(f"After one-hot encoding: {X_train_encoded.shape[1]}")

# Train logistic regression and calculate accuracy on validation set
model, y_val_pred, accuracy = _fit_and_score(
    X_train_encoded, y_train, X_val_encoded, y_val, C=1.0
)
accuracy_rounded = round(accuracy, 2)

print(f"\nValidation Accuracy: {accuracy:.4f}")
print(f"Validation Accuracy (rounded to 2 decimals): {accuracy_rounded}")

# ============================================================
# QUESTION 5: Feature Elimination
# ============================================================
_print_banner("QUESTION 5: Feature Elimination - Finding least useful feature")

baseline_accuracy = accuracy
print(f"Baseline accuracy (all features): {baseline_accuracy:.4f}")
print("\nTesting feature elimination:")

# Store differences
feature_differences = {}

# Test each feature: retrain without it and compare against the baseline
all_features = X_train.columns.tolist()
for feature in all_features:
    _, train_matrix_without, val_matrix_without = _encode_features(
        X_train.drop(feature, axis=1), X_val.drop(feature, axis=1)
    )
    _, _, accuracy_without = _fit_and_score(
        train_matrix_without, y_train, val_matrix_without, y_val, C=1.0
    )

    # Calculate difference (baseline - without_feature)
    difference = baseline_accuracy - accuracy_without
    feature_differences[feature] = difference

    print(f"{feature}: accuracy without = {accuracy_without:.4f}, difference = {difference:.4f}")

# Find feature with smallest difference (smallest absolute change in accuracy)
min_diff_feature = min(feature_differences, key=lambda k: abs(feature_differences[k]))
print(f"\nFeature with smallest difference: {min_diff_feature} (difference: {feature_differences[min_diff_feature]:.4f})")

# ============================================================
# QUESTION 6: Regularized Logistic Regression
# ============================================================
_print_banner("QUESTION 6: Regularized Logistic Regression with different C values")

C_values = [0.01, 0.1, 1, 10, 100]
results = {}

print("\nTesting different C values:")
for C in C_values:
    # Train a model with this C value on the full encoded feature set
    _, _, accuracy_c = _fit_and_score(
        X_train_encoded, y_train, X_val_encoded, y_val, C=C
    )
    accuracy_c_rounded = round(accuracy_c, 3)

    results[C] = accuracy_c_rounded
    print(f"C = {C:6.2f}: Validation Accuracy = {accuracy_c_rounded}")

# Find best C (smallest C with best accuracy, compared at 3 decimals)
best_accuracy = max(results.values())
best_C = min([c for c, acc in results.items() if acc == best_accuracy])
print(f"\nBest C value: {best_C} with accuracy: {best_accuracy}")

# ============================================================
# SUMMARY OF ANSWERS
# ============================================================
_print_banner("SUMMARY OF ANSWERS")

print(f"\nQuestion 1: Mode of 'industry' = {industry_mode}")
print(f"Question 2: Pair with biggest correlation = {max_pair[0]} and {max_pair[1]}")
print(f"Question 3: Variable with biggest MI score = {max_mi_var}")
print(f"Question 4: Validation accuracy = {accuracy_rounded}")
print(f"Question 5: Feature with smallest difference = {min_diff_feature}")
print(f"Question 6: Best C value = {best_C}")

_print_banner("HOMEWORK COMPLETE!")