In [4]:
# Data handling (pandas / numpy) plus the scikit-learn helpers used for the
# train/validation/test split and for ROC AUC evaluation.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by the libraries imported above;
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

In [5]:
# Read the course lead scoring dataset from the CSV in the working directory
df = pd.read_csv('course_lead_scoring.csv')

# Print the (rows, columns) shape, then let the notebook render the first rows
# as a table via the bare last expression.
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
df.head(n=5)


Dataset shape: (1462, 9)

First few rows:


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [6]:
# Data preparation: report how many values are missing, per column and overall
missing_per_column = df.isnull().sum()
print("Missing values in each column:")
print(missing_per_column)
print(f"\nTotal missing values: {missing_per_column.sum()}")

# Partition the columns by dtype: object columns are treated as categorical,
# every numeric column (this includes the `converted` target) as numerical.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)

print(f"\nCategorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

# Imputation rule: 'NA' for categorical features, 0.0 for numerical features.
# Passing a column -> value mapping fills each column independently and returns
# a new frame, so the raw `df` stays untouched.
fill_values = {
    **dict.fromkeys(categorical_cols, 'NA'),
    **dict.fromkeys(numerical_cols, 0.0),
}
df_clean = df.fillna(value=fill_values)

print("\nAfter filling missing values:")
print("Missing values in each column:")
print(df_clean.isnull().sum())


Missing values in each column:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

Total missing values: 606

Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']

After filling missing values:
Missing values in each column:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [7]:
# Split the data into train/validation/test with a 60%/20%/20% distribution.
# Both splits are seeded with random_state=1 so the partition is reproducible.

# Stage 1: hold out 40% of the rows; the remaining 60% becomes the training set.
df_train_full, df_temp = train_test_split(df_clean, test_size=0.4, random_state=1)

# Stage 2: cut the 40% hold-out in half -> 20% validation and 20% test.
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=1)

# Report the size of every partition and its share of the cleaned dataset.
n_rows = len(df_clean)
partitions = [("Training", df_train_full), ("Validation", df_val), ("Test", df_test)]
for label, part in partitions:
    print(f"{label} set size: {len(part)} ({len(part)/n_rows*100:.1f}%)")

n_assigned = sum(len(part) for _, part in partitions)
print(f"Total: {n_assigned}")

# Verify the split: every row must land in exactly one partition.
assert n_assigned == n_rows


Training set size: 877 (60.0%)
Validation set size: 292 (20.0%)
Test set size: 293 (20.0%)
Total: 1462


In [8]:
# Question 1: ROC AUC feature importance
# Each numerical variable is used directly as the prediction score and compared
# against the `converted` target on the training dataset.
# A variable whose AUC is below 0.5 is negated ("-" in front) and re-scored.

numerical_vars = ['lead_score', 'number_of_courses_viewed', 'interaction_count', 'annual_income']
y_train = df_train_full['converted']

# variable name -> AUC (after negation where that was needed)
auc_scores = {}

print("ROC AUC scores for each numerical variable:")
print("=" * 50)

for var in numerical_vars:
    raw_auc = roc_auc_score(y_train, df_train_full[var])
    needs_inversion = raw_auc < 0.5

    if needs_inversion:
        # Flip the sign of the variable and score it again.
        inverted_auc = roc_auc_score(y_train, -df_train_full[var])
        auc_scores[var] = inverted_auc
        print(f"{var}: AUC = {raw_auc:.4f} -> Inverted AUC = {inverted_auc:.4f}")
    else:
        auc_scores[var] = raw_auc
        print(f"{var}: AUC = {raw_auc:.4f}")

print("\n" + "=" * 50)


ROC AUC scores for each numerical variable:
lead_score: AUC = 0.6111
number_of_courses_viewed: AUC = 0.7652
interaction_count: AUC = 0.7272
annual_income: AUC = 0.5446



In [13]:
# Report every stored AUC, then name the variable with the largest one
print("Final AUC scores (after inversion if needed):")
print("-" * 40)

for var, auc in auc_scores.items():
    print(f"{var}: {auc:.4f}")

print("\n" + "-" * 40)

# Pick the (variable, AUC) pair with the largest AUC; on a tie the pair that
# was inserted first wins.
highest_var, highest_auc = max(auc_scores.items(), key=lambda item: item[1])

print(f"\nAnswer: {highest_var} has the highest AUC = {highest_auc:.4f}")



Final AUC scores (after inversion if needed):
----------------------------------------
lead_score: 0.6111
number_of_courses_viewed: 0.7652
interaction_count: 0.7272
annual_income: 0.5446

----------------------------------------

Answer: number_of_courses_viewed has the highest AUC = 0.7652


In [10]:
# Question 2: Training the model
# One-hot encode the records with DictVectorizer; the logistic regression
# imported here is used by the cells that follow.

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Input columns (everything except the `converted` target)
features = [
    'lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
    'employment_status', 'location', 'interaction_count', 'lead_score',
]

# Targets for the training and validation partitions
y_train = df_train_full['converted']
y_val = df_val['converted']

# DictVectorizer consumes one {column: value} dict per row
train_dict = df_train_full[features].to_dict(orient='records')
val_dict = df_val[features].to_dict(orient='records')

# Learn the feature layout from the training rows only, then encode both
# partitions with that same layout.
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)
X_val = dv.transform(val_dict)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Feature names: {dv.feature_names_[:10]}...")  # Show first 10 features


Training set shape: (877, 31)
Validation set shape: (292, 31)
Feature names: ['annual_income', 'employment_status=NA', 'employment_status=employed', 'employment_status=self_employed', 'employment_status=student', 'employment_status=unemployed', 'industry=NA', 'industry=education', 'industry=finance', 'industry=healthcare']...


In [12]:
# Fit the logistic regression on the encoded training matrix
# (`fit` returns the estimator itself, so it can be chained onto the constructor)
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=1,
).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class
validation_probabilities = model.predict_proba(X_val)
y_pred = validation_probabilities[:, 1]

# Score the validation predictions
val_auc = roc_auc_score(y_val, y_pred)

print(f"Validation AUC: {val_auc:.3f}")



Validation AUC: 0.794


In [15]:
# Question 3: Precision and Recall
# Sweep the decision threshold from 0.0 to 1.0 in steps of 0.01 and record the
# precision and recall of the validation predictions at every threshold, so the
# point where the two curves intersect can be located afterwards.

from sklearn.metrics import precision_score, recall_score

# Candidate thresholds: 0.0 to 1.0 with step 0.01
thresholds = np.arange(0.0, 1.01, 0.01)

# One hard 0/1 prediction vector per threshold
binary_predictions = [(y_pred >= cutoff).astype(int) for cutoff in thresholds]

# zero_division=0 makes an undefined metric count as 0 instead of warning
precisions = [
    precision_score(y_val, predicted, zero_division=0)
    for predicted in binary_predictions
]
recalls = [
    recall_score(y_val, predicted, zero_division=0)
    for predicted in binary_predictions
]

print(f"Calculated precision and recall for {len(thresholds)} thresholds")
print(f"Threshold range: {thresholds[0]} to {thresholds[-1]} with step {thresholds[1] - thresholds[0]}")


Calculated precision and recall for 101 thresholds
Threshold range: 0.0 to 1.0 with step 0.01


In [None]:
# Find where the precision and recall curves intersect, i.e. the threshold at
# which the absolute gap between the two metrics is smallest.
precision_arr = np.asarray(precisions, dtype=float)
recall_arr = np.asarray(recalls, dtype=float)
differences = np.abs(precision_arr - recall_arr)

# A threshold at which precision and recall are both 0 has a gap of exactly 0
# without being a real crossing (nothing useful is predicted positive there and
# both metrics bottom out at 0). Mask those points so argmin cannot prefer them
# over the genuine intersection. If every point is degenerate, index 0 is used.
is_degenerate = (precision_arr == 0) & (recall_arr == 0)
intersection_idx = int(np.argmin(np.where(is_degenerate, np.inf, differences)))
intersection_threshold = thresholds[intersection_idx]
intersection_precision = precisions[intersection_idx]
intersection_recall = recalls[intersection_idx]

print("\nIntersection point:")
print(f"Threshold: {intersection_threshold:.3f}")
print(f"Precision: {intersection_precision:.3f}")
print(f"Recall: {intersection_recall:.3f}")

# Display the five thresholds on either side of the intersection point
# (clipped to the ends of the threshold grid).
print("\nSample values around intersection:")
start_idx = max(0, intersection_idx - 5)
end_idx = min(len(thresholds), intersection_idx + 6)
for i in range(start_idx, end_idx):
    print(f"Threshold {thresholds[i]:.2f}: Precision={precisions[i]:.3f}, Recall={recalls[i]:.3f}, Diff={differences[i]:.3f}")

print(f"\nAnswer: Precision and recall curves intersect at threshold {intersection_threshold:.3f}")





Intersection point:
Threshold: 0.590
Precision: 0.807
Recall: 0.807

Sample values around intersection:
Threshold 0.54: Precision=0.782, Recall=0.880, Diff=0.098
Threshold 0.55: Precision=0.789, Recall=0.875, Diff=0.086
Threshold 0.56: Precision=0.799, Recall=0.870, Diff=0.071
Threshold 0.57: Precision=0.807, Recall=0.849, Diff=0.042
Threshold 0.58: Precision=0.805, Recall=0.818, Diff=0.013
Threshold 0.59: Precision=0.807, Recall=0.807, Diff=0.000
Threshold 0.60: Precision=0.811, Recall=0.802, Diff=0.008
Threshold 0.61: Precision=0.820, Recall=0.781, Diff=0.038
Threshold 0.62: Precision=0.830, Recall=0.760, Diff=0.069
Threshold 0.63: Precision=0.822, Recall=0.724, Diff=0.099
Threshold 0.64: Precision=0.823, Recall=0.703, Diff=0.120

Closest option to 0.590: 0.545

Answer: Precision and recall curves intersect at threshold 0.590


In [17]:
# Question 4: F1 Score
# Score the validation predictions with F1 at every threshold from 0.0 to 1.0
# (increment 0.01) and locate the threshold where F1 peaks.
# F1 = 2 * P * R / (P + R) where P is precision and R is recall

from sklearn.metrics import f1_score

# F1 of the hard 0/1 predictions obtained at each threshold
f1_scores = [
    f1_score(y_val, (y_pred >= cutoff).astype(int), zero_division=0)
    for cutoff in thresholds
]

# Position, threshold and value of the best F1 (first one wins on a tie)
max_f1_idx = np.argmax(f1_scores)
max_f1_threshold = thresholds[max_f1_idx]
max_f1_score = f1_scores[max_f1_idx]

print(f"Maximum F1 score: {max_f1_score:.3f}")
print(f"Threshold at maximum F1: {max_f1_threshold:.3f}")

# Show the five thresholds on either side of the maximum, clipped to the grid
print(f"\nSample values around maximum F1:")
window = slice(max(0, max_f1_idx - 5), min(len(thresholds), max_f1_idx + 6))
for cutoff, score in zip(thresholds[window], f1_scores[window]):
    print(f"Threshold {cutoff:.2f}: F1={score:.3f}")

# Map the result onto the nearest of the offered answer options
options = [0.14, 0.34, 0.54, 0.74]
closest_option = min(options, key=lambda candidate: abs(candidate - max_f1_threshold))
print(f"\nClosest option to {max_f1_threshold:.3f}: {closest_option}")

print(f"\nAnswer: F1 score is maximal at threshold {max_f1_threshold:.3f}")


Maximum F1 score: 0.848
Threshold at maximum F1: 0.470

Sample values around maximum F1:
Threshold 0.42: F1=0.831
Threshold 0.43: F1=0.831
Threshold 0.44: F1=0.836
Threshold 0.45: F1=0.840
Threshold 0.46: F1=0.843
Threshold 0.47: F1=0.848
Threshold 0.48: F1=0.846
Threshold 0.49: F1=0.837
Threshold 0.50: F1=0.837
Threshold 0.51: F1=0.841
Threshold 0.52: F1=0.834

Closest option to 0.470: 0.54

Answer: F1 score is maximal at threshold 0.470


In [19]:
# Question 5: 5-Fold Cross-Validation
# Use KFold class from Scikit-Learn to evaluate our model on 5 different folds
# KFold(n_splits=5, shuffle=True, random_state=1)
# Calculate standard deviation of AUC scores across different folds

from sklearn.model_selection import KFold

# Shuffled 5-way splitter, seeded so the folds are reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Use the same features as before
features = ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
           'employment_status', 'location', 'interaction_count', 'lead_score']

# Target column for all training rows; sliced per fold below
y_train_full = df_train_full['converted']

# AUC of every fold, in fold order
auc_scores_cv = []

print("5-Fold Cross-Validation Results:")
print("=" * 40)

for fold_idx, (train_idx, val_idx) in enumerate(kf.split(df_train_full)):
    # Positional split of the training frame for this fold
    df_fold_train = df_train_full.iloc[train_idx]
    df_fold_val = df_train_full.iloc[val_idx]

    # Fit the one-hot encoder on this fold's training rows only, so the
    # held-out rows cannot influence the preprocessing. Feature names that
    # occur only in the held-out rows are ignored by `transform`.
    dv_cv = DictVectorizer(sparse=False)
    X_train_fold = dv_cv.fit_transform(df_fold_train[features].to_dict(orient='records'))
    X_val_fold = dv_cv.transform(df_fold_val[features].to_dict(orient='records'))

    # Matching target slices
    y_train_fold = y_train_full.iloc[train_idx]
    y_val_fold = y_train_full.iloc[val_idx]

    # Train the model on the fold's training part with the specified parameters
    model_fold = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=1)
    model_fold.fit(X_train_fold, y_train_fold)

    # Probability of the positive class for the held-out rows
    y_pred_fold = model_fold.predict_proba(X_val_fold)[:, 1]

    # AUC on the held-out rows
    auc_fold = roc_auc_score(y_val_fold, y_pred_fold)
    auc_scores_cv.append(auc_fold)

    print(f"Fold {fold_idx + 1}: AUC = {auc_fold:.4f}")

print("=" * 40)


5-Fold Cross-Validation Results:
Fold 1: AUC = 0.8117
Fold 2: AUC = 0.8232
Fold 3: AUC = 0.8364
Fold 4: AUC = 0.8392
Fold 5: AUC = 0.8283


In [20]:
# Summarise the per-fold AUC scores: mean and (population) standard deviation
fold_auc_array = np.asarray(auc_scores_cv)
mean_auc = fold_auc_array.mean()
std_auc = fold_auc_array.std()

formatted_scores = [format(score, '.4f') for score in auc_scores_cv]

print(f"Cross-Validation Statistics:")
print(f"Mean AUC: {mean_auc:.4f}")
print(f"Standard Deviation: {std_auc:.4f}")
print(f"AUC Scores: {formatted_scores}")

# Map the standard deviation onto the nearest of the offered answer options
options = [0.0001, 0.006, 0.06, 0.36]
closest_option = min(options, key=lambda candidate: abs(candidate - std_auc))
print(f"\nClosest option to {std_auc:.4f}: {closest_option}")

print(f"\nAnswer: The standard deviation of AUC scores across different folds is {std_auc:.4f}")


Cross-Validation Statistics:
Mean AUC: 0.8278
Standard Deviation: 0.0098
AUC Scores: ['0.8117', '0.8232', '0.8364', '0.8392', '0.8283']

Closest option to 0.0098: 0.006

Answer: The standard deviation of AUC scores across different folds is 0.0098


In [21]:
# Question 6: Hyperparameter Tuning
# Use 5-Fold cross-validation to find the best parameter C
# Iterate over C values: [0.000001, 0.001, 1]
# Find which C leads to the best mean score

# C values to test
C_values = [0.000001, 0.001, 1]

# Initialize KFold with the same parameters as previously
kf_tuning = KFold(n_splits=5, shuffle=True, random_state=1)

# Prepare features (same as before)
features = ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
           'employment_status', 'location', 'interaction_count', 'lead_score']

# Target column for all training rows; sliced per fold below
y_train_full = df_train_full['converted']

# Encode every fold once up front: neither the fold membership nor the encoded
# matrices depend on C, so they are shared by all candidate values.
# Each entry is (X_train, y_train, X_val, y_val) for one fold.
fold_matrices = []
for train_idx, val_idx in kf_tuning.split(df_train_full):
    df_fold_train = df_train_full.iloc[train_idx]
    df_fold_val = df_train_full.iloc[val_idx]

    # Fit the one-hot encoder on this fold's training rows only, so the
    # held-out rows cannot influence the preprocessing. Feature names that
    # occur only in the held-out rows are ignored by `transform`.
    dv_tuning = DictVectorizer(sparse=False)
    X_train_fold = dv_tuning.fit_transform(df_fold_train[features].to_dict(orient='records'))
    X_val_fold = dv_tuning.transform(df_fold_val[features].to_dict(orient='records'))

    fold_matrices.append(
        (X_train_fold, y_train_full.iloc[train_idx], X_val_fold, y_train_full.iloc[val_idx])
    )

# Dictionary to store results for each C value:
# C -> {'mean': mean AUC, 'std': std of AUC, 'scores': per-fold AUC list}
results = {}

print("Hyperparameter Tuning Results:")
print("=" * 50)

# Iterate over C values
for C in C_values:
    # Kept under its own name so the scores of the previous cross-validation
    # cell (`auc_scores_cv`) are not overwritten by this sweep.
    fold_aucs = []

    print(f"\nTesting C = {C}")
    print("-" * 30)

    for fold_idx, (X_train_fold, y_train_fold, X_val_fold, y_val_fold) in enumerate(fold_matrices):
        # Train the model with the current C value
        model_fold = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=1)
        model_fold.fit(X_train_fold, y_train_fold)

        # Probability of the positive class for the held-out rows
        y_pred_fold = model_fold.predict_proba(X_val_fold)[:, 1]

        # AUC on the held-out rows
        auc_fold = roc_auc_score(y_val_fold, y_pred_fold)
        fold_aucs.append(auc_fold)

        print(f"  Fold {fold_idx + 1}: AUC = {auc_fold:.4f}")

    # Mean and standard deviation of the fold AUCs for this C value
    mean_for_c = np.mean(fold_aucs)
    std_for_c = np.std(fold_aucs)

    # Store results
    results[C] = {
        'mean': mean_for_c,
        'std': std_for_c,
        'scores': fold_aucs
    }

    print(f"  Mean AUC: {mean_for_c:.3f}")
    print(f"  Std AUC:  {std_for_c:.3f}")

print("\n" + "=" * 50)


Hyperparameter Tuning Results:

Testing C = 1e-06
------------------------------
  Fold 1: AUC = 0.5231
  Fold 2: AUC = 0.6092
  Fold 3: AUC = 0.5775
  Fold 4: AUC = 0.4885
  Fold 5: AUC = 0.5435
  Mean AUC: 0.548
  Std AUC:  0.042

Testing C = 0.001
------------------------------
  Fold 1: AUC = 0.8440
  Fold 2: AUC = 0.8817
  Fold 3: AUC = 0.8817
  Fold 4: AUC = 0.8875
  Fold 5: AUC = 0.8667
  Mean AUC: 0.872
  Std AUC:  0.016

Testing C = 1
------------------------------
  Fold 1: AUC = 0.8117
  Fold 2: AUC = 0.8232
  Fold 3: AUC = 0.8364
  Fold 4: AUC = 0.8392
  Fold 5: AUC = 0.8283
  Mean AUC: 0.828
  Std AUC:  0.010



In [22]:
# Choose the best C from the tuning results, applying these rules in order:
# 1. Highest mean score
# 2. On a tie in the mean, the lowest std
# 3. On a further tie, the smallest C
# Two values count as tied when they differ by less than 1e-6.

print("Summary of Results:")
print("-" * 30)

# Display all results
for C in C_values:
    mean_auc, std_auc = results[C]['mean'], results[C]['std']
    print(f"C = {C}: Mean = {mean_auc:.3f}, Std = {std_auc:.3f}")

print("\nFinding the best C parameter:")
print("-" * 30)

# Rule 1: highest mean and every C that reaches it
best_mean = max(results[c]['mean'] for c in C_values)
print(f"Best mean score: {best_mean:.3f}")

best_mean_candidates = [
    c for c in C_values
    if abs(results[c]['mean'] - best_mean) < 1e-6
]

if len(best_mean_candidates) <= 1:
    # A single winner on the mean alone
    best_C = best_mean_candidates[0]
    print(f"Selected C with best mean score: {best_C}")
else:
    print(f"Tie in mean scores for C values: {best_mean_candidates}")

    # Rule 2: lowest std among the C values tied on the mean
    best_std = min(results[c]['std'] for c in best_mean_candidates)
    print(f"Best std among tied means: {best_std:.3f}")

    best_std_candidates = [
        c for c in best_mean_candidates
        if abs(results[c]['std'] - best_std) < 1e-6
    ]

    if len(best_std_candidates) <= 1:
        best_C = best_std_candidates[0]
        print(f"Selected C with best mean and lowest std: {best_C}")
    else:
        # Rule 3: still tied, so take the smallest C
        print(f"Tie in std scores for C values: {best_std_candidates}")
        best_C = min(best_std_candidates)
        print(f"Selecting smallest C among tied std scores: {best_C}")

# Display final result
best_mean_auc = results[best_C]['mean']
best_std_auc = results[best_C]['std']

print(f"\nBest C parameter: {best_C}")
print(f"Best mean AUC: {best_mean_auc:.3f}")
print(f"Best std AUC: {best_std_auc:.3f}")

# Answer options offered for this question
options = [0.000001, 0.001, 1]
print(f"\nAnswer: The best C parameter is {best_C}")


Summary of Results:
------------------------------
C = 1e-06: Mean = 0.548, Std = 0.042
C = 0.001: Mean = 0.872, Std = 0.016
C = 1: Mean = 0.828, Std = 0.010

Finding the best C parameter:
------------------------------
Best mean score: 0.872
Selected C with best mean score: 0.001

Best C parameter: 0.001
Best mean AUC: 0.872
Best std AUC: 0.016

Answer: The best C parameter is 0.001
