In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
import numpy as np


In [22]:
df = pd.read_csv("./course_lead_scoring.csv")

df.sample(5)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
300,paid_ads,other,3,76502.0,student,south_america,2,0.42,1
1442,paid_ads,healthcare,1,62536.0,,asia,3,0.34,0
1334,social_media,other,2,44395.0,,south_america,6,0.63,1
1289,social_media,finance,3,57588.0,employed,australia,6,0.45,1
982,events,education,2,,self_employed,north_america,4,0.43,1


In [24]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [101]:
categorical_features = ['lead_source', 'industry', 'employment_status', 'location']
numerical_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [26]:
df[numerical_features] = df[numerical_features].fillna(0)
df[categorical_features] = df[categorical_features].fillna('NA')

## Q1
Most Frequent Observation Mode for column industry

In [27]:
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [28]:
df.describe(include='all')

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
count,1462,1462,1462.0,1462.0,1462,1462,1462.0,1462.0,1462.0
unique,6,8,,,5,8,,,
top,organic_search,retail,,,self_employed,north_america,,,
freq,282,203,,,352,225,,,
mean,,,2.031464,52472.172367,,,2.976744,0.506108,0.619015
std,,,1.449717,24254.34703,,,1.681564,0.288465,0.485795
min,,,0.0,0.0,,,0.0,0.0,0.0
25%,,,1.0,44097.25,,,2.0,0.2625,0.0
50%,,,2.0,57449.5,,,3.0,0.51,1.0
75%,,,3.0,68241.0,,,4.0,0.75,1.0


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1462 non-null   object 
 1   industry                  1462 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1462 non-null   float64
 4   employment_status         1462 non-null   object 
 5   location                  1462 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


## Q2 
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

In [30]:
df[[i for i in df.columns if (df[i].dtype in ['int64', 'float64'])]].corr().round(2)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.01,-0.02,-0.0,0.44
annual_income,0.01,1.0,0.03,0.02,0.05
interaction_count,-0.02,0.03,1.0,0.01,0.37
lead_score,-0.0,0.02,0.01,1.0,0.19
converted,0.44,0.05,0.37,0.19,1.0


In [31]:
X = df.drop('converted', axis=1)  # Replace 'target_column' with your actual target column name
y = df['converted']

# split for train
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# split for val and test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [32]:
X_test

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
1291,events,manufacturing,2,64134.0,student,north_america,3,0.93
282,referral,healthcare,0,75911.0,,middle_east,4,0.17
836,social_media,manufacturing,3,70900.0,student,europe,4,0.38
594,paid_ads,technology,3,0.0,student,middle_east,4,0.10
1377,referral,,0,48225.0,unemployed,africa,3,0.40
...,...,...,...,...,...,...,...,...
847,organic_search,healthcare,2,65227.0,,south_america,4,0.41
900,,,4,50215.0,employed,australia,1,0.48
1042,referral,technology,3,49937.0,self_employed,asia,4,0.19
486,referral,manufacturing,4,68335.0,student,europe,1,0.07


## Q3

- Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).

Which of these variables has the biggest mutual information score?

In [33]:
# First, identify categorical columns in the training set
categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# Create a copy of training data for processing
X_train_encoded = X_train.copy()

# Encode categorical variables for mutual information calculation
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train[col].astype(str))
    label_encoders[col] = le

# Calculate mutual information scores for categorical variables
mi_scores = mutual_info_classif(X_train_encoded[categorical_columns], y_train, random_state=42)

# Create a dictionary with column names and their MI scores
mi_dict = {}
for i, col in enumerate(categorical_columns):
    mi_dict[col] = round(mi_scores[i], 2)

print("Mutual Information Scores:")
for col, score in mi_dict.items():
    print(f"{col}: {score}")

# Find the variable with the highest MI score
max_mi_var = max(mi_dict, key=mi_dict.get)
print(f"\nVariable with highest mutual information score: {max_mi_var} ({mi_dict[max_mi_var]})")

Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Mutual Information Scores:
lead_source: 0.04
industry: 0.03
employment_status: 0.02
location: 0.02

Variable with highest mutual information score: lead_source (0.04)


# Question 4

- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.


In [34]:
X_train.sample(10)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
657,events,finance,0,49936.0,employed,south_america,0,0.2
207,events,finance,2,51245.0,employed,middle_east,2,0.53
555,paid_ads,,1,72926.0,self_employed,south_america,2,0.71
77,referral,manufacturing,5,39588.0,self_employed,africa,3,0.02
1159,organic_search,retail,1,40440.0,student,australia,0,0.38
1070,referral,,3,64895.0,employed,middle_east,2,0.39
228,events,retail,3,45170.0,student,middle_east,3,0.38
540,social_media,technology,1,66963.0,employed,north_america,0,0.71
511,,manufacturing,2,47243.0,self_employed,asia,2,0.41
121,events,healthcare,4,82646.0,self_employed,australia,3,0.35


In [57]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [63]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ], remainder='passthrough'
)
preprocessor.set_output(transform='pandas')

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [65]:
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

In [66]:
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
])

In [67]:
model_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [69]:
y_val_pred = model_pipeline.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.2f}")

Validation Accuracy: 0.74


# Question 5
- Let's find the least useful feature using the feature elimination technique.
- Train a model using the same features and parameters as in Q4 (without rounding).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [111]:
# Question 5: Feature Elimination to find least useful feature

# First, get baseline accuracy with all features
baseline_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ], remainder='passthrough'
)
baseline_preprocessor.set_output(transform='pandas')

baseline_model = Pipeline(steps=[
    ('preprocessor', baseline_preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
])

baseline_model.fit(X_train, y_train)
y_val_pred_baseline = baseline_model.predict(X_val)
baseline_accuracy = accuracy_score(y_val, y_val_pred_baseline)
print(f"Baseline accuracy (all features): {baseline_accuracy:.6f}")

# Get all feature names
all_features = categorical_features
print(f"All features to test: {all_features}")

# Dictionary to store results
feature_elimination_results = {}

print("\n" + "="*60)
print("TESTING FEATURE ELIMINATION:")
print("="*60)

# Test removing each feature one by one
for feature_to_remove in all_features:
    print(f"\nRemoving feature: {feature_to_remove}")
    
    # Create lists of remaining features
    remaining_categorical = [f for f in categorical_features if f != feature_to_remove]
    remaining_numerical = [f for f in numerical_features if f != feature_to_remove]
    remaining_all = remaining_categorical + remaining_numerical
    
    # Create preprocessor for remaining features
    if remaining_categorical:
        temp_preprocessor = ColumnTransformer(
            transformers=[
                ('cat', categorical_transformer, remaining_categorical)
            ], remainder='passthrough'
        )
    else:
        # If no categorical features remain, just pass through numerical
        temp_preprocessor = ColumnTransformer(
            transformers=[], remainder='passthrough'
        )
    
    temp_preprocessor.set_output(transform='pandas')
    
    # Select remaining features
    X_train_reduced = X_train[remaining_all]
    X_val_reduced = X_val[remaining_all]
    
    # Create and train model without this feature
    temp_model = Pipeline(steps=[
        ('preprocessor', temp_preprocessor),
        ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ])
    
    temp_model.fit(X_train_reduced, y_train)
    y_val_pred_temp = temp_model.predict(X_val_reduced)
    temp_accuracy = accuracy_score(y_val, y_val_pred_temp)
    
    # Calculate accuracy difference
    accuracy_difference = baseline_accuracy - temp_accuracy
    
    feature_elimination_results[feature_to_remove] = {
        'accuracy_without': temp_accuracy,
        'accuracy_difference': accuracy_difference
    }
    
    print(f"  Accuracy without {feature_to_remove}: {temp_accuracy:.6f}")
    print(f"  Difference from baseline: {accuracy_difference:.6f}")

print("\n" + "="*60)
print("FEATURE ELIMINATION SUMMARY:")
print("="*60)

# Sort by accuracy difference (ascending - least useful first)
sorted_results = sorted(feature_elimination_results.items(), key=lambda x: x[1]['accuracy_difference'])

for feature, results in sorted_results:
    print(f"{feature:25} | Difference: {results['accuracy_difference']:+.6f} | Accuracy: {results['accuracy_without']:.6f}")

# Identify least useful feature
least_useful_feature = sorted_results[0][0]
smallest_difference = sorted_results[0][1]['accuracy_difference']

print(f"\n🎯 LEAST USEFUL FEATURE: {least_useful_feature}")
print(f"   Removing it causes accuracy change of: {smallest_difference:+.6f}")
print(f"   (Smallest change = least impact = least useful)")

# Also show most useful feature for comparison
most_useful_feature = sorted_results[-1][0]
largest_difference = sorted_results[-1][1]['accuracy_difference']
print(f"\n🏆 MOST USEFUL FEATURE: {most_useful_feature}")
print(f"   Removing it causes accuracy change of: {largest_difference:+.6f}")

Baseline accuracy (all features): 0.743151
All features to test: ['lead_source', 'industry', 'employment_status', 'location']

TESTING FEATURE ELIMINATION:

Removing feature: lead_source
  Accuracy without lead_source: 0.729452
  Difference from baseline: 0.013699

Removing feature: industry
  Accuracy without industry: 0.743151
  Difference from baseline: 0.000000

Removing feature: employment_status
  Accuracy without employment_status: 0.746575
  Difference from baseline: -0.003425

Removing feature: location
  Accuracy without location: 0.743151
  Difference from baseline: 0.000000

FEATURE ELIMINATION SUMMARY:
employment_status         | Difference: -0.003425 | Accuracy: 0.746575
industry                  | Difference: +0.000000 | Accuracy: 0.743151
location                  | Difference: +0.000000 | Accuracy: 0.743151
lead_source               | Difference: +0.013699 | Accuracy: 0.729452

🎯 LEAST USEFUL FEATURE: employment_status
   Removing it causes accuracy change of: -0.00342

# Question 6
- Now let's train a regularized logistic regression.
- Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
- Train models using all the features as in Q4.
- Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

In [93]:
for i in [0.01, 0.1, 1, 10, 100]:
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(penalty='l1', solver='liblinear', C=i, random_state=42))
    ])
    
    model_pipeline.fit(X_train, y_train)
    
    y_val_pred = model_pipeline.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Validation Accuracy with C={i}: {val_accuracy:.3f}")

Validation Accuracy with C=0.01: 0.723
Validation Accuracy with C=0.1: 0.856
Validation Accuracy with C=1: 0.846
Validation Accuracy with C=10: 0.856
Validation Accuracy with C=100: 0.856


In [94]:
for i in [0.01, 0.1, 1, 10, 100]:
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(penalty='l1', solver='liblinear', C=i, random_state=42))
    ])
    
    model_pipeline.fit(X_train, y_train)
    
    y_test_pred = model_pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"testidation Accuracy with C={i}: {test_accuracy:.3f}")

testidation Accuracy with C=0.01: 0.686
testidation Accuracy with C=0.1: 0.805
testidation Accuracy with C=1: 0.802
testidation Accuracy with C=10: 0.805
testidation Accuracy with C=100: 0.809
