In [39]:
from IPython.display import HTML

In [40]:
# @title


# Project overview banner, rendered as raw HTML.
# IPython's HTML display object does not interpret markdown, so the headings
# are written as real <h1>/<h2>/<h3> elements; markdown '#' markers inside
# this string would be shown as literal text.
html_content = """
<h1>Lead Scoring Project</h1>

<div style="background-color: #f0f8ff; padding: 15px; border-left: 5px solid #007bff; margin: 10px 0;">
<p><strong>Project Overview:</strong> This project focuses on lead scoring, which is the process of assigning a numerical score to each lead to determine their likelihood of becoming a paying customer. The goal is to identify the most promising leads for follow-up, optimizing marketing and sales efforts.</p>
</div>

<h2>Breakdown of Steps</h2>

<h3><span style="color: #28a745;">Data Loading and Initial Exploration</span></h3>
<ul>
  <li>Loaded a dataset containing information about potential leads, including their source, industry, viewing habits, income, employment status, location, and interaction count, along with a target variable indicating whether they converted (<code>'converted'</code>).</li>
  <li>Displayed the first few rows to get a glimpse of the data.</li>
</ul>

<h3><span style="color: #28a745;">Data Preparation</span></h3>
<ul>
  <li>Handled missing values by replacing them with <code>'NA'</code> for categorical features and <code>0.0</code> for numerical features.</li>
</ul>

<h3><span style="color: #28a745;">Exploratory Data Analysis (EDA)</span></h3>
<ul>
  <li>Performed basic EDA, including:
    <ul>
      <li>Finding the most frequent observation for the <code>'industry'</code> column.</li>
      <li>Calculating the correlation matrix for numerical features.</li>
      <li>Calculating mutual information scores between categorical features and the target variable to understand their relationship.</li>
    </ul>
  </li>
</ul>

<h3><span style="color: #28a745;">Data Splitting</span></h3>
<ul>
  <li>Split the data into training (60%), validation (20%), and test (20%) sets to prepare for model development and evaluation.</li>
</ul>

<h3><span style="color: #28a745;">Feature Engineering</span></h3>
<ul>
  <li>Applied one-hot encoding to the categorical features to convert them into a numerical format suitable for the logistic regression model.</li>
</ul>

<h3><span style="color: #28a745;">Model Training and Evaluation</span></h3>
<ul>
  <li>Trained a baseline logistic regression model using all features on the training set and evaluated its accuracy on the validation set.</li>
  <li>Performed a feature elimination analysis by removing specific features (<code>'industry'</code>, <code>'employment_status'</code>, <code>'lead_score'</code>) one by one, retraining the model, and observing the change in accuracy to understand the importance of each feature.</li>
  <li>Trained regularized logistic regression models with different values of the regularization parameter (<code>C</code>) to find the optimal value that provides the best accuracy on the validation set.</li>
</ul>

<div style="background-color: #fff3cd; padding: 10px; border: 1px solid #ffeaa7; margin-top: 20px;">
<p><em>Overall, this project aims to build a predictive model to score leads based on their characteristics and interactions, ultimately improving the efficiency of lead conversion.</em></p>
</div>
"""

# Last expression of the cell, so the notebook renders the HTML object.
HTML(html_content)

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries imported successfully!")

Libraries imported successfully!


In [9]:
# Source CSV for the lead-scoring dataset (fetched over HTTP on every run)
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

# Read the raw data and preview the first five rows
df = pd.read_csv(filepath_or_buffer=url)
display(df.head(n=5))

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [10]:
# Per-column count of missing entries in the raw data
df.isna().sum()

Unnamed: 0,0
lead_source,128
industry,134
number_of_courses_viewed,0
annual_income,181
employment_status,100
location,63
interaction_count,0
lead_score,0
converted,0


In [11]:
# Snapshot of the per-column null counts before any imputation
print("Missing values before handling:")
display(df.isnull().sum())

# Partition the columns by dtype: object (text) columns vs. numeric columns
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(include=np.number).columns

# Impute each group with its sentinel: 'NA' for categorical gaps,
# 0.0 for numerical gaps. Note this mutates `df` in place.
for cols, fill_value in ((categorical_cols, 'NA'), (numerical_cols, 0.0)):
    df[cols] = df[cols].fillna(fill_value)

# Same summary again to confirm that no nulls remain
print("\nMissing values after handling:")
display(df.isnull().sum())

Missing values before handling:


Unnamed: 0,0
lead_source,128
industry,134
number_of_courses_viewed,0
annual_income,181
employment_status,100
location,63
interaction_count,0
lead_score,0
converted,0



Missing values after handling:


Unnamed: 0,0
lead_source,0
industry,0
number_of_courses_viewed,0
annual_income,0
employment_status,0
location,0
interaction_count,0
lead_score,0
converted,0


In [12]:
# mode() returns every value that shares the top frequency; keep the first one
industry_modes = df['industry'].mode()
most_frequent_industry = industry_modes.iloc[0]
print(f"The most frequent observation for the 'industry' column is: {most_frequent_industry}")

The most frequent observation for the 'industry' column is: retail


In [13]:
# Pairwise correlation of every numeric column in `df`
numerical_cols = df.select_dtypes(include=np.number).columns
correlation_matrix = df[numerical_cols].corr()

print("Correlation Matrix for Numerical Features:")
display(correlation_matrix)

# Feature pairs whose correlation we want to compare
specified_pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

# Look each pair up once and reuse the values for both the report and the ranking
pair_correlations = {
    (col1, col2): correlation_matrix.loc[col1, col2]
    for col1, col2 in specified_pairs
}

print("\nCorrelation values for specified pairs:")
for (col1, col2), correlation_value in pair_correlations.items():
    print(f"{col1} and {col2}: {correlation_value:.4f}")

# Rank by absolute correlation. NaN entries (e.g. from a constant column) are
# dropped first because they cannot be ordered; max() keeps the first pair on ties.
defined_correlations = {
    pair: value for pair, value in pair_correlations.items() if pd.notna(value)
}

if defined_correlations:
    most_correlated_pair = max(defined_correlations, key=lambda pair: abs(defined_correlations[pair]))
    max_correlation = defined_correlations[most_correlated_pair]
    print(f"\nThe two features that have the biggest correlation among the specified pairs are: {most_correlated_pair[0]} and {most_correlated_pair[1]} with a correlation of {max_correlation:.4f}")
else:
    # Previously this case left most_correlated_pair as None and the summary
    # print crashed with a TypeError; report it explicitly instead.
    most_correlated_pair = None
    max_correlation = float('nan')
    print("\nNone of the specified pairs has a defined correlation.")

Correlation Matrix for Numerical Features:


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0



Correlation values for specified pairs:
interaction_count and lead_score: 0.0099
number_of_courses_viewed and lead_score: -0.0049
number_of_courses_viewed and interaction_count: -0.0236
annual_income and interaction_count: 0.0270

The two features that have the biggest correlation among the specified pairs are: annual_income and interaction_count with a correlation of 0.0270


In [17]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns
target = 'converted'
target_values = df[target]
features = df.drop(columns=[target])

# First cut: 60% train, 40% held out
features_train, features_temp, target_train, target_temp = train_test_split(
    features, target_values, test_size=0.4, random_state=42
)

# Second cut: halve the held-out part into validation and test (20% of the total each)
features_val, features_test, target_val, target_test = train_test_split(
    features_temp, target_temp, test_size=0.5, random_state=42
)

# Row counts of the three partitions
print("Train set size:", features_train.shape[0])
print("Validation set size:", features_val.shape[0])
print("Test set size:", features_test.shape[0])



Train set size: 877
Validation set size: 292
Test set size: 293


In [18]:
# Summarise the imputed frame: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1462 non-null   object 
 1   industry                  1462 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1462 non-null   float64
 4   employment_status         1462 non-null   object 
 5   location                  1462 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [16]:
from sklearn.metrics import mutual_info_score as mis

In [24]:
# Text (object-dtype) columns of the training features
categorical_cols = features_train.select_dtypes(include='object').columns


def calculate_mi(series):
    """Return the mutual information between `series` and the training target.

    Relies on the notebook-level `target_train`, so it is only meaningful for
    columns taken from `features_train`.
    """
    return mis(series, target_train)


# One score per categorical column (the function is applied column-wise)
mi_scores = features_train[categorical_cols].apply(calculate_mi, axis=0)



In [25]:
# Keep two decimals for reporting
mi_scores_rounded = mi_scores.round(decimals=2)

print("Mutual Information Scores for Categorical Features (Training Set):")
display(mi_scores_rounded)



Mutual Information Scores for Categorical Features (Training Set):


Unnamed: 0,0
lead_source,0.03
industry,0.02
employment_status,0.02
location,0.0


Apply one-hot encoding to the categorical features in the training, validation, and test sets.


In [27]:
from sklearn.preprocessing import OneHotEncoder

# NOTE: this cell rebinds features_train / features_val / features_test to their
# encoded versions, so it is meant to run once per kernel session; on a second
# run there would be no object columns left to encode.
categorical_cols = features_train.select_dtypes(include='object').columns

# Categories are learned from the training split only; values that appear only
# in validation/test are encoded as all-zero rows (handle_unknown='ignore').
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Learn the categories and encode the training split, then encode the other splits
features_train_encoded = encoder.fit_transform(features_train[categorical_cols])
features_val_encoded = encoder.transform(features_val[categorical_cols])
features_test_encoded = encoder.transform(features_test[categorical_cols])

# Names of the generated indicator columns
encoded_col_names = encoder.get_feature_names_out(categorical_cols)


def _as_indicator_frame(encoded_array, source_frame):
    """Wrap an encoded array in a DataFrame that keeps `source_frame`'s row index."""
    return pd.DataFrame(encoded_array, columns=encoded_col_names, index=source_frame.index)


features_train_encoded_df = _as_indicator_frame(features_train_encoded, features_train)
features_val_encoded_df = _as_indicator_frame(features_val_encoded, features_val)
features_test_encoded_df = _as_indicator_frame(features_test_encoded, features_test)

# Replace the raw categorical columns with their indicator columns
# (numeric columns first, indicator columns appended on the right)
features_train = pd.concat(
    [features_train.drop(columns=categorical_cols), features_train_encoded_df], axis=1
)
features_val = pd.concat(
    [features_val.drop(columns=categorical_cols), features_val_encoded_df], axis=1
)
features_test = pd.concat(
    [features_test.drop(columns=categorical_cols), features_test_encoded_df], axis=1
)

# Preview each encoded split
display(features_train.head())
display(features_val.head())
display(features_test.head())

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,lead_source_NA,lead_source_events,lead_source_organic_search,lead_source_paid_ads,lead_source_referral,lead_source_social_media,...,employment_status_student,employment_status_unemployed,location_NA,location_africa,location_asia,location_australia,location_europe,location_middle_east,location_north_america,location_south_america
442,1,61705.0,4,0.65,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
319,1,55199.0,4,0.09,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
767,1,40841.0,4,0.61,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
756,1,28242.0,3,0.84,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
424,0,64775.0,3,0.7,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,lead_source_NA,lead_source_events,lead_source_organic_search,lead_source_paid_ads,lead_source_referral,lead_source_social_media,...,employment_status_student,employment_status_unemployed,location_NA,location_africa,location_asia,location_australia,location_europe,location_middle_east,location_north_america,location_south_america
886,1,63127.0,6,0.7,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
619,6,75389.0,2,0.04,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
548,2,66519.0,4,0.33,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1046,3,60910.0,3,0.32,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
618,0,63425.0,2,0.4,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,lead_source_NA,lead_source_events,lead_source_organic_search,lead_source_paid_ads,lead_source_referral,lead_source_social_media,...,employment_status_student,employment_status_unemployed,location_NA,location_africa,location_asia,location_australia,location_europe,location_middle_east,location_north_america,location_south_america
1291,2,64134.0,3,0.93,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
282,0,75911.0,4,0.17,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
836,3,70900.0,4,0.38,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
594,3,0.0,4,0.1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1377,0,48225.0,3,0.4,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0



Train a logistic regression model on the one-hot encoded training data.



In [28]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver, C=1.0 and a fixed seed
model = LogisticRegression(C=1.0, solver='liblinear', random_state=42)

# Train on the encoded training split (last expression, so the fitted estimator is displayed)
model.fit(X=features_train, y=target_train)

## Predict on validation set



In [29]:
# Class predictions of the fitted model for the validation features
target_pred = model.predict(features_val)


Calculate the accuracy of the model on the validation set and round it to 2 decimal places.



In [30]:
from sklearn.metrics import accuracy_score

# Validation accuracy of the predictions, kept at two-decimal precision
accuracy = round(accuracy_score(target_val, target_pred), 2)

print(f"Accuracy on the validation set: {accuracy:.2f}")

Accuracy on the validation set: 0.74



Find the least useful feature among 'industry', 'employment_status', and 'lead_score' by training a logistic regression model, excluding each of these features one by one, and comparing the accuracy on the validation set to the baseline accuracy obtained with all features. Report the feature with the smallest difference in accuracy.


Retrain the logistic regression model with the same features and parameters as the model trained above, and evaluate its accuracy on the validation set. This unrounded value is the baseline accuracy that each reduced model is compared against.


In [31]:
# Refit the reference model (same solver, C and seed as the model trained above)
model = LogisticRegression(C=1.0, solver='liblinear', random_state=42).fit(
    features_train, target_train
)

# Score it on the validation split; the value is stored unrounded so that
# later comparisons against it are not distorted by rounding
target_pred = model.predict(features_val)
baseline_accuracy = accuracy_score(target_val, target_pred)

print(f"Baseline Accuracy on the validation set: {baseline_accuracy:.2f}")

Baseline Accuracy on the validation set: 0.74


## Define features to exclude

The features to be excluded one at a time are `'industry'`, `'employment_status'`, and `'lead_score'`. The next cell stores their names in a list so the elimination loop can iterate over them.



In [32]:
# Original (pre-encoding) feature names to remove one at a time in the
# feature-elimination experiment
features_to_exclude = ['industry', 'employment_status', 'lead_score']
print("Features to exclude list created:")
display(features_to_exclude)

Features to exclude list created:


['industry', 'employment_status', 'lead_score']

## Iterate and evaluate


In [34]:
# Feature elimination: drop one original feature at a time, retrain with the
# baseline hyperparameters, and record baseline_accuracy - accuracy_without_feature.
# A positive value means accuracy fell when the feature was removed; a negative
# value means accuracy rose.
accuracy_differences = {}

features_to_exclude = ['industry', 'employment_status', 'lead_score']

for feature in features_to_exclude:
    print(f"Training model excluding: {feature}")

    # Resolve the original feature name to the columns that represent it in
    # the encoded frame, using the actual column set rather than a hard-coded
    # list of names (so any feature of the dataset can be evaluated).
    if feature in features_train.columns:
        # The feature survived encoding as a single column (numerical feature)
        cols_to_drop = [feature]
    else:
        # Categorical features were expanded into '<feature>_<category>' columns.
        # Caveat: this is a prefix match, so it assumes no other feature name
        # starts with '<feature>_'.
        cols_to_drop = [col for col in features_train.columns if col.startswith(f'{feature}_')]

    if not cols_to_drop:
        # Nothing in the encoded frame corresponds to this name
        print(f"Warning: Feature '{feature}' not recognized and will be skipped.")
        continue

    # Training and validation sets without the feature's columns
    features_train_modified = features_train.drop(columns=cols_to_drop)
    features_val_modified = features_val.drop(columns=cols_to_drop)

    # Same hyperparameters as the baseline so only the feature set differs
    model_modified = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
    model_modified.fit(features_train_modified, target_train)

    # Validation accuracy without the feature
    target_pred_modified = model_modified.predict(features_val_modified)
    accuracy_modified = accuracy_score(target_val, target_pred_modified)

    # Change relative to the unrounded baseline
    diff = baseline_accuracy - accuracy_modified
    accuracy_differences[feature] = diff

print("\nAccuracy differences compared to baseline:")
display(accuracy_differences)

Training model excluding: industry
Training model excluding: employment_status
Training model excluding: lead_score

Accuracy differences compared to baseline:


{'industry': 0.0,
 'employment_status': -0.003424657534246589,
 'lead_score': 0.0}

## Identify least useful feature



In [35]:
# The least useful feature is the one whose removal changes validation accuracy
# the least, i.e. the smallest absolute difference from the baseline.
smallest_abs_difference = min(abs(diff) for diff in accuracy_differences.values())

# Several features can share that minimum. Collect all of them (with a tiny
# tolerance for floating-point noise) so a tie is reported instead of being
# resolved silently by dictionary order.
tied_features = [
    name
    for name, diff in accuracy_differences.items()
    if abs(abs(diff) - smallest_abs_difference) < 1e-12
]

# Keep the first tied feature as the reported answer (dictionary order)
least_useful_feature = tied_features[0]
smallest_accuracy_difference = accuracy_differences[least_useful_feature]

print(f"The least useful feature among the excluded ones is: '{least_useful_feature}'")
print(f"The smallest accuracy difference (compared to baseline) is: {smallest_accuracy_difference:.4f}")

if len(tied_features) > 1:
    print(f"Note: {tied_features} are tied for the smallest absolute difference; the first one is reported.")

The least useful feature among the excluded ones is: 'industry'
The smallest accuracy difference (compared to baseline) is: 0.0000



Train regularized logistic regression models with C values [0.01, 0.1, 1, 10, 100] using the training data, evaluate their accuracy on the validation set, and identify the C value that yields the best accuracy.

## Define C values


In [36]:
# Candidate values for LogisticRegression's C parameter, listed in ascending order
c_values = [0.01, 0.1, 1.0, 10.0, 100.0]
print("C values to be tested:")
display(c_values)

C values to be tested:


[0.01, 0.1, 1.0, 10.0, 100.0]

## Iterate and train models



In [37]:
# Validation accuracy (3 decimals) keyed by the C value that produced it
results = {}

for c in c_values:
    print(f"Training model with C={c}")

    # Same solver and seed for every run; only C varies
    model = LogisticRegression(C=c, solver='liblinear', random_state=42)
    model.fit(features_train, target_train)

    # Score the fitted model on the validation split
    target_pred = model.predict(features_val)
    accuracy = round(accuracy_score(target_val, target_pred), 3)

    # Record the rounded accuracy for this C
    results[c] = accuracy

print("\nAccuracy for each C value on the validation set:")
display(results)

Training model with C=0.01
Training model with C=0.1
Training model with C=1.0
Training model with C=10.0
Training model with C=100.0

Accuracy for each C value on the validation set:


{0.01: 0.743, 0.1: 0.743, 1.0: 0.743, 10.0: 0.743, 100.0: 0.743}

## Identify the best C



In [38]:
# Highest validation accuracy reached by any C
best_accuracy = max(results.values())

# Several C values can reach the same (rounded) accuracy. Break the tie
# explicitly by choosing the smallest such C rather than relying on the
# insertion order of `results`.
best_c = min(c for c, acc in results.items() if acc == best_accuracy)

# Print the best C value and its corresponding accuracy
print(f"The C value that resulted in the highest accuracy is: {best_c}")
print(f"The highest accuracy on the validation set is: {best_accuracy:.3f}")

The C value that resulted in the highest accuracy is: 0.01
The highest accuracy on the validation set is: 0.743
