In [1]:
import pandas as pd

# Data preparation: load the raw lead-scoring dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='course_lead_scoring.csv')

In [2]:
# Show the dtype of every column (rendered as the cell output).
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [3]:
# Feature groups used throughout the notebook (the target `converted` is in neither).
numerical_column = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]
categorical_column = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]

# Check for missing values: numerical group first, then the categorical group.
for column_group in (numerical_column, categorical_column):
    print(df[column_group].isnull().sum())

number_of_courses_viewed      0
annual_income               181
interaction_count             0
lead_score                    0
dtype: int64
lead_source          128
industry             134
employment_status    100
location              63
dtype: int64


In [4]:
# Per-column replacement values: 0.0 for numerical features, 'NA' for categorical ones.
fill_dict = {
    **dict.fromkeys(numerical_column, 0.0),
    **dict.fromkeys(categorical_column, 'NA'),
}

# Apply every replacement in one pass, then display the remaining null count per column.
df = df.fillna(value=fill_dict)
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [5]:
# 1. Most frequent value(s) of `industry`, followed by the full frequency table.
industry_values = df['industry']
print(industry_values.mode())
print(industry_values.value_counts())

0    retail
Name: industry, dtype: object
industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64


In [6]:
# 2. Correlation of each numerical feature with the target, strongest positive first.
target = df['converted']
feature_target_corr = df[numerical_column].corrwith(target)
feature_target_corr.sort_values(ascending=False)

number_of_courses_viewed    0.435914
interaction_count           0.374573
lead_score                  0.193673
annual_income               0.053131
dtype: float64

In [7]:
# Validation framework: 60% train / 20% validation / 20% test.
from sklearn.model_selection import train_test_split

# Hold out 20% for test first; 25% of the remaining 80% is 20% of the full data.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Replace the shuffled row labels of each split with a fresh 0..n-1 index.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

# Separate the target from the features: pop() returns the column and
# removes it from the frame, so the feature frames no longer contain it.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

In [8]:
# 3. Mutual information between each categorical feature and the target (training split).
from sklearn.metrics import mutual_info_score


def mutual_info_converted_score(feature):
    """Return the mutual information between `feature` and `y_train`, rounded to 2 decimals."""
    raw_score = mutual_info_score(feature, y_train)
    return round(raw_score, 2)


# One score per categorical column, displayed as the cell output.
df_train[categorical_column].apply(mutual_info_converted_score)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [9]:
# One-hot encode the categorical features and assemble the model matrices.
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)

# --- training split -------------------------------------------------------
# The vectorizer is fitted here, and ONLY here: the one-hot vocabulary
# (set and order of output columns) must come from the training data.
train_dicts_categorical = df_train[categorical_column].to_dict(orient="records")
X_train_one_hot = dv.fit_transform(train_dicts_categorical)

# wrap the encoded array in a DataFrame with readable "<feature>=<value>" column names
X_train_categorical = pd.DataFrame(X_train_one_hot, columns=dv.get_feature_names_out())
# numerical features are used as-is; reset the index so the concat below aligns row by row
X_train_numerical = df_train[numerical_column].reset_index(drop=True)

# final training matrix: numerical columns followed by the one-hot columns
X_train = pd.concat([X_train_numerical, X_train_categorical], axis=1)

# --- validation split -----------------------------------------------------
# transform() only -- never fit_transform(). Re-fitting on validation data
# would rebuild the vocabulary from the validation rows, so a category that
# is missing from (or exclusive to) this split would change the number or
# order of columns and silently break alignment with the fitted model.
val_dicts_categorical = df_val[categorical_column].to_dict(orient="records")
X_val_one_hot = dv.transform(val_dicts_categorical)

X_val_categorical = pd.DataFrame(X_val_one_hot, columns=dv.get_feature_names_out())
X_val_numerical = df_val[numerical_column].reset_index(drop=True)

X_val = pd.concat([X_val_numerical, X_val_categorical], axis=1)

# sanity check: both matrices must expose exactly the same columns in the same order
assert list(X_val.columns) == list(X_train.columns), "train/val feature columns are misaligned"


In [10]:
# Fit the baseline logistic regression on all features.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    solver='liblinear', C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# predict_proba returns one column per class:
#   column 0 -> probability of class 0 (not converted)
#   column 1 -> probability of class 1 (converted)
y_val_pred = model.predict_proba(X_val)[:, 1]
converted_decision = y_val_pred >= 0.5

# 4. Validation accuracy: share of rows whose thresholded prediction equals the label.
accuracy = (converted_decision.astype('int') == y_val).mean()
print(accuracy, round(accuracy, 2), sep='\n')

0.6996587030716723
0.7


In [12]:
# 5. Feature elimination: drop one feature at a time and measure how much the
#    validation accuracy moves relative to the all-features baseline.

# Refit the all-features baseline inside this cell instead of reusing the
# global `converted_decision`: that name is reassigned by the C-tuning cell,
# so the reference accuracy would otherwise depend on cell execution order.
baseline_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
baseline_model.fit(X_train, y_train)
baseline_decision = (baseline_model.predict_proba(X_val)[:, 1] >= 0.5)
original_accuracy = (y_val == baseline_decision.astype('int')).mean()
print(f"Original accuracy: {original_accuracy}")

# feature name -> (baseline accuracy - accuracy without that feature)
accuracy_differences = {}

# features to test excluding
features_to_test = ['industry', 'employment_status', 'lead_score']

for feature_to_exclude in features_to_test:
    if feature_to_exclude in X_train.columns:
        # numerical feature: it is a single column of the matrix
        related_columns = [feature_to_exclude]
    else:
        # categorical feature: it was expanded into "<feature>=<value>" one-hot columns
        related_columns = [col for col in X_train.columns if col.startswith(feature_to_exclude + '=')]

    if not related_columns:
        print(f"Warning: Feature '{feature_to_exclude}' not found in dataset")
        continue

    # remove every column that belongs to this feature from both matrices
    X_train_reduced = X_train.drop(columns=related_columns)
    X_val_reduced = X_val.drop(columns=related_columns)

    # train logistic regression with the same parameters as the baseline
    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    # threshold the positive-class probability at 0.5
    y_val_pred_reduced = model_reduced.predict_proba(X_val_reduced)[:, 1]
    converted_decision_reduced = (y_val_pred_reduced >= 0.5)

    accuracy_reduced = (y_val == converted_decision_reduced.astype('int')).mean()

    # positive difference: removing the feature hurt; negative: removing it helped
    difference = original_accuracy - accuracy_reduced
    accuracy_differences[feature_to_exclude] = difference

    print(f"Without '{feature_to_exclude}': accuracy = {accuracy_reduced:.6f}, difference = {difference:.6f}")

# the least useful feature is the one whose removal changes accuracy the least
# (compared by absolute value, so the sign of the change does not matter)
smallest_diff_feature = min(accuracy_differences.items(), key=lambda x: abs(x[1]))
print(f"\nThe feature with the smallest difference is '{smallest_diff_feature[0]}' with difference = {smallest_diff_feature[1]:.6f}")

# print the answer
print(f"\nANSWER: {smallest_diff_feature[0]}")

Original accuracy: 0.6996587030716723
Without 'industry': accuracy = 0.699659, difference = 0.000000
Without 'employment_status': accuracy = 0.696246, difference = 0.003413
Without 'lead_score': accuracy = 0.706485, difference = -0.006826

The feature with the smallest difference is 'industry' with difference = 0.000000

ANSWER: industry


In [18]:
# 6. Regularization sweep: validation accuracy for several values of C.
c_values = [0.01, 0.1, 1, 10, 100]

# C -> validation accuracy rounded to 3 decimals
accuracies = {}

print("Training regularized logistic regression with different C values:")
print("C\t\tAccuracy")

for c in c_values:
    # same configuration as the baseline model; only C varies
    model = LogisticRegression(
        solver='liblinear', C=c, max_iter=1000, random_state=42
    ).fit(X_train, y_train)

    # threshold the positive-class probability at 0.5
    y_val_pred = model.predict_proba(X_val)[:, 1]
    converted_decision = y_val_pred >= 0.5

    # the rounded value is stored for comparison; the unrounded value is printed
    accuracy = (converted_decision.astype('int') == y_val).mean()
    accuracy_rounded = round(accuracy, 3)
    accuracies[c] = accuracy_rounded

    print(f"{c}\t\t{accuracy}")

# Pick the (C, accuracy) pair with the highest rounded accuracy; on ties
# max() keeps the first one seen, i.e. the earliest entry of c_values.
best_c = max(accuracies.items(), key=lambda pair: pair[1])
print(f"\nBest C value: {best_c[0]} with accuracy: {best_c[1]}")
# import numpy as np
# print("Debugging the model predictions:")

# for c in [0.01, 0.1, 1, 10, 100]:
#     # Train model
#     model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
#     model.fit(X_train, y_train)
    
#     # Get predictions and probabilities
#     y_val_pred_proba = model.predict_proba(X_val)[:, 1]
#     y_val_pred = model.predict(X_val)
    
#     # Check if predictions are all the same class
#     unique_predictions = np.unique(y_val_pred)
#     proba_range = y_val_pred_proba.max() - y_val_pred_proba.min()
    
#     print(f"\nC = {c}:")
#     print(f"  Unique predictions: {unique_predictions}")
#     print(f"  Probability range: {proba_range:.6f}")
#     print(f"  Mean probability: {y_val_pred_proba.mean():.6f}")
    
#     # Calculate accuracy using different methods to verify
#     accuracy1 = (y_val == y_val_pred).mean()
#     accuracy2 = (y_val == (y_val_pred_proba >= 0.5).astype(int)).mean()
#     print(f"  Accuracy (predict): {accuracy1}")
#     print(f"  Accuracy (proba>=0.5): {accuracy2}")

# # Let's also check the data characteristics
# print("\n" + "="*50)
# print("Data diagnostics:")
# print(f"Training set shape: {X_train.shape}")
# print(f"Validation set shape: {X_val.shape}")
# print(f"y_train value counts:\n{y_train.value_counts()}")
# print(f"y_val value counts:\n{y_val.value_counts()}")

# # Check for constant predictions
# print(f"\nBaseline accuracy (predicting majority class): {max(y_val.value_counts(normalize=True))}")

# # Check if features are scaled properly (regularization is sensitive to scale)
# print(f"\nFeature ranges (min, max):")
# for col in X_train.columns[:5]:  # Check first 5 features
#     print(f"  {col}: [{X_train[col].min():.3f}, {X_train[col].max():.3f}]")

# # Let's try with feature scaling
# from sklearn.preprocessing import StandardScaler

# print("\n" + "="*50)
# print("Trying with feature scaling:")
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_val_scaled = scaler.transform(X_val)

# for c in [0.01, 0.1, 1, 10, 100]:
#     model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
#     model.fit(X_train_scaled, y_train)
    
#     y_val_pred_proba = model.predict_proba(X_val_scaled)[:, 1]
#     converted_decision = (y_val_pred_proba >= 0.5)
#     accuracy = (y_val == converted_decision.astype('int')).mean()
    
#     print(f"C = {c}: accuracy = {accuracy:.6f}")

# # Let's also check the coefficients to see if regularization is working
# print("\n" + "="*50)
# print("Checking coefficients for different C values:")

# for c in [0.01, 0.1, 1, 10, 100]:
#     model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
#     model.fit(X_train, y_train)
    
#     coef_sum = np.sum(np.abs(model.coef_))
#     print(f"C = {c}: sum of absolute coefficients = {coef_sum:.6f}")

Training regularized logistic regression with different C values:
C		Accuracy
0.01		0.6996587030716723
0.1		0.6996587030716723
1		0.6996587030716723
10		0.6996587030716723
100		0.6996587030716723

Best C value: 0.01 with accuracy: 0.7
