In [1]:
import pandas as pd
import numpy as np

# Read the lead-scoring CSV from the working directory; the bare name on the
# last line makes the notebook render the frame.
df = pd.read_csv(filepath_or_buffer="course_lead_scoring.csv")
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [2]:
# Feature-name lists used to slice the raw frame into two sub-frames.
categorical_features = [
    "lead_source",
    "industry",
    "employment_status",
    "location",
]
numerical_features = [
    "number_of_courses_viewed",
    "annual_income",
    "interaction_count",
    "lead_score",
]

df_categorical = df.loc[:, categorical_features]
df_numerical = df.loc[:, numerical_features]

Check whether there are missing values in the features.

If there are missing values:

For categorical features, replace them with 'NA'

For numerical features, replace them with 0.0

In [3]:
# Missing categories become the literal label "NA"; missing numbers become 0.0.
df_categorical = df_categorical.fillna(value="NA")
df_numerical = df_numerical.fillna(value=0.0)

In [4]:
# Print the per-column count of remaining nulls, numerical frame first.
for frame in (df_numerical, df_categorical):
    print(frame.isnull().sum())

number_of_courses_viewed    0
annual_income               0
interaction_count           0
lead_score                  0
dtype: int64
lead_source          0
industry             0
employment_status    0
location             0
dtype: int64


In [5]:
# Reassemble one frame: filled categoricals, filled numericals, then the
# `converted` column taken from the raw frame.
frames_to_join = [df_categorical, df_numerical, df["converted"]]
df_non_null = pd.concat(frames_to_join, axis="columns")
df_non_null

Unnamed: 0,lead_source,industry,employment_status,location,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
0,paid_ads,,unemployed,south_america,1,79450.0,4,0.94,1
1,social_media,retail,employed,south_america,1,46992.0,1,0.80,0
2,events,healthcare,unemployed,australia,5,78796.0,3,0.69,1
3,paid_ads,retail,,australia,2,83843.0,1,0.87,0
4,referral,education,self_employed,europe,3,85012.0,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,self_employed,north_america,1,0.0,4,0.53,1
1458,referral,technology,student,europe,3,65259.0,2,0.24,1
1459,paid_ads,technology,student,north_america,1,45688.0,3,0.02,1
1460,referral,,self_employed,north_america,5,71016.0,0,0.25,1


# Question 1

In [21]:
# Most frequent value of `industry` after missing values were filled.
# The original referenced `df_non`, which is never defined in this notebook
# and raises NameError on a fresh kernel; the filled frame is `df_non_null`.
# `.mode()` returns a Series because ties are possible.
mode_industry = df_non_null["industry"].mode()
mode_industry

0    retail
Name: industry, dtype: object


# Question 2

In [7]:
# Pairwise correlation matrix of the numerical features (Pearson is the
# pandas default, spelled out here for the reader).
df_numerical.corr(method="pearson")

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


interaction_count and lead_score 0.009888

number_of_courses_viewed and lead_score -0.004879

number_of_courses_viewed and interaction_count -0.023565

annual_income and interaction_count 0.027036	

In [8]:
from sklearn.model_selection import train_test_split
# Separate the target column from the predictors.
y = df_non_null['converted']
X = df_non_null.drop(columns=['converted'])

# First cut: 60% for training, 40% held out.
df_train, df_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Second cut: halve the held-out part into validation and test (20% each).
df_val, df_test, y_val, y_test = train_test_split(df_temp, y_temp, test_size=0.5, random_state=42)

# Report each split's size; the feature and target counts should match.
for label, features, target in (
    ("Train size:", df_train, y_train),
    ("Validation size:", df_val, y_val),
    ("Test size:", df_test, y_test),
):
    print(label, len(features), len(target))

Train size: 877 877
Validation size: 292 292
Test size: 293 293


# Question 3

In [9]:
from sklearn.metrics import mutual_info_score

def mutual_info_churn_score(series):
    """Return the mutual information between ``series`` and the training target ``y_train``."""
    score = mutual_info_score(series, y_train)
    return score

# Score every categorical training column, round to two decimals, and show
# the columns from most to least informative.
mi = df_train[categorical_features].apply(mutual_info_churn_score).round(2)
mi.sort_values(ascending=False)

lead_source          0.03
industry             0.02
employment_status    0.02
location             0.00
dtype: float64

# Question 4

In [10]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)

# One dict per row for each split.
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse it for validation.
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [11]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with a fixed seed.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)

# Train on the vectorized training rows; as the last expression, the fitted
# estimator is displayed by the notebook.
model.fit(X_train, y_train)

In [12]:
from sklearn.metrics import accuracy_score

# Score the baseline model on the validation rows.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)

# Keep the unrounded value too: the feature-elimination cell compares against it.
val_accuracy_rounded = round(val_accuracy, 2)

print("Validation Accuracy:", val_accuracy)
print("Rounded Validation Accuracy:", val_accuracy_rounded)

Validation Accuracy: 0.7431506849315068
Rounded Validation Accuracy: 0.74


# Question 5


In [13]:
# Feature elimination: retrain the baseline configuration with one feature
# removed at a time and compare validation accuracy to the full-feature
# baseline stored in `val_accuracy`.
drop_features = [['industry'], ['employment_status'], ['lead_score']]
for feature in drop_features:

    # Remove the feature from the splits actually used here (train and validation).
    df_train_drop_feature = df_train.drop(columns=feature)
    df_val_drop_feature = df_val.drop(columns=feature)

    # Loop-local names are used for the vectorizer, model and predictions so the
    # notebook-level `dv`, `model` and `y_val_pred` (which belong to the
    # full-feature X_train / X_val) are not silently overwritten.
    dv_drop_feature = DictVectorizer(sparse=False)

    train_dict_drop_feature = df_train_drop_feature.to_dict(orient='records')
    X_train_drop_feature = dv_drop_feature.fit_transform(train_dict_drop_feature)

    val_dict_drop_feature = df_val_drop_feature.to_dict(orient='records')
    X_val_drop_feature = dv_drop_feature.transform(val_dict_drop_feature)

    # Same hyperparameters as the baseline model so only the feature set differs.
    model_drop_feature = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

    # Fit the model on training data
    model_drop_feature.fit(X_train_drop_feature, y_train)

    # Make predictions on validation set
    y_val_pred_drop_feature = model_drop_feature.predict(X_val_drop_feature)

    # Calculate accuracy and report the absolute change versus the baseline
    val_accuracy_drop = accuracy_score(y_val, y_val_pred_drop_feature)
    print(f"Eliminated feature: {feature}")
    print("Validation Accuracy:", val_accuracy_drop)
    print("Difference:", abs(val_accuracy-val_accuracy_drop))
    print("")
    print("")


Eliminated feature: ['industry']
Validation Accuracy: 0.7431506849315068
Difference: 0.0


Eliminated feature: ['employment_status']
Validation Accuracy: 0.7465753424657534
Difference: 0.003424657534246589


Eliminated feature: ['lead_score']
Validation Accuracy: 0.7431506849315068
Difference: 0.0




# Question 6

In [14]:
# Candidate values for the C parameter of LogisticRegression.
C_values = [0.01, 0.1, 1, 10, 100]

# Collected (C, rounded validation accuracy) pairs, in sweep order.
results = []

for C in C_values:
    # Build and train in one expression; `fit` returns the estimator itself.
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42).fit(X_train, y_train)

    # Validation accuracy for this C, rounded to three decimals.
    y_val_pred = model.predict(X_val)
    acc_rounded = round(accuracy_score(y_val, y_val_pred), 3)

    results.append((C, acc_rounded))

# One summary line per C value.
for C, acc in results:
    print(f"C = {C}: Validation Accuracy = {acc}")

C = 0.01: Validation Accuracy = 0.743
C = 0.1: Validation Accuracy = 0.743
C = 1: Validation Accuracy = 0.743
C = 10: Validation Accuracy = 0.743
C = 100: Validation Accuracy = 0.743
