# Data Preparation

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Get the data

In [2]:
# Source of the course lead-scoring dataset (CSV hosted on GitHub).
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
# Show the dtype of every column; later cells pick the categorical (object)
# and numerical (int64/float64) columns based on these dtypes.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

# Data preparation: handling missing values

In [4]:
# Count the missing values in each column before any imputation.
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [6]:
# Categorical (object-dtype) features: replace missing values with the string 'NA'.
object_frame = df.select_dtypes(include='object')
categorical = object_frame.columns
df[categorical] = object_frame.fillna('NA')


In [7]:
# Numerical (int64/float64) features: replace missing values with 0.0.
# Note: `numerical` also contains the target column `converted`, since it is int64.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
numerical = numeric_frame.columns
df[numerical] = numeric_frame.fillna(0.0)

In [8]:
# Re-count missing values after imputation; every column should now report 0.
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

# Question 1


In [9]:
# Question 1: the most frequent observation (mode) of the `industry` column.
# mode() returns a Series of all modal values; take the first one.
industry_modes = df['industry'].mode()
industry_modes.iloc[0]

'retail'

# Question 2

In [12]:
# Correlation matrix: the correlation coefficient between every pair of the
# numerical columns selected in `numerical`.
numeric_view = df.loc[:, numerical]
corr_matrix = numeric_view.corr()
# Render with a cool-to-warm colour gradient so strong correlations stand out.
corr_matrix.style.background_gradient(cmap='coolwarm')

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [16]:
# Candidate feature pairs to compare, as (row label, column label) of corr_matrix.
candidate_pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]

# Look up the correlation of each candidate pair, keyed by a readable label.
relevant_pairs = {
    f"{first} and {second}": corr_matrix.loc[first, second]
    for first, second in candidate_pairs
}

# Rank by absolute correlation so strong negative correlations count as well.
sorted_pairs = sorted(relevant_pairs.items(), key=lambda item: abs(item[1]), reverse=True)
highest_correlation_pair = sorted_pairs[0]
best_label, best_value = highest_correlation_pair

print("The two features with the biggest correlation are:", best_label)
print("Correlation value:", best_value)

The two features with the biggest correlation are: annual_income and interaction_count
Correlation value: 0.027036472404814396


In [14]:
# Flatten the correlation matrix into one value per unordered pair of columns.
# `corr_matrix_unstacked` was previously displayed without being defined in any
# cell, which fails on a fresh kernel; build it here from `corr_matrix`.
# k=1 keeps only the cells strictly above the diagonal, so self-correlations
# and mirrored duplicates are masked to NaN.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
# dropna() removes the masked cells regardless of whether stack() drops NaN itself.
corr_matrix_unstacked = corr_matrix.where(upper_triangle).stack().dropna()
corr_matrix_unstacked

number_of_courses_viewed  annual_income        0.009770
                          interaction_count   -0.023565
                          lead_score          -0.004879
                          converted            0.435914
annual_income             interaction_count    0.027036
                          lead_score           0.015610
                          converted            0.053131
interaction_count         lead_score           0.009888
                          converted            0.374573
lead_score                converted            0.193673
dtype: float64

# Split the data

In [17]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the rows as the test set; the remaining 80% is the full train set.
# NOTE(review): this split uses random_state=1 while every other seed in the
# notebook is 42 — confirm that the mismatch is intended.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

# 0.25 of the 80% full train set is 20% of all rows, giving 60% train / 20% validation.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Give each part a fresh 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# pop() removes the target column from the frame and returns it, so the
# feature frames no longer contain `converted` afterwards.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

# Verify the sizes
for label, part in (("Train", df_train), ("Validation", df_val), ("Test", df_test)):
    print(f"{label} size: {len(part)}")

Train size: 876
Validation size: 293
Test size: 293


# Question 3

In [25]:
from sklearn.metrics import mutual_info_score

# Categorical (object-dtype) columns of the training set.
categorical_columns = df_train.select_dtypes(include=['object']).columns

# Mutual information between each categorical column and the training target.
mi_scores = {
    name: mutual_info_score(df_train[name], y_train)
    for name in categorical_columns
}

# Report the scores from highest to lowest.
sorted_mi_scores = sorted(mi_scores.items(), key=lambda pair: pair[1], reverse=True)
for col, score in sorted_mi_scores:
    print(f"Mutual Information between {col} and target: {round(score,3)}")

Mutual Information between lead_source and target: 0.029
Mutual Information between employment_status and target: 0.014
Mutual Information between industry and target: 0.01
Mutual Information between location and target: 0.001


# Question 4

In [36]:
from sklearn.feature_extraction import DictVectorizer
# Turn each row into a {column: value} record for the vectorizer.
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

# Fit the vectorizer on the training records only, then apply the same
# fitted mapping to the validation records.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Question 4: logistic regression on all features; fit() returns the estimator itself.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Accuracy of the predicted classes on the validation set.
y_pred = model.predict(X_val)
accuracy_score(y_val, y_pred)

0.7303754266211604

In [39]:
# List the feature columns left in the training frame (the target `converted`
# was removed from it when the data was split).
df_train.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score'],
      dtype='object')

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Question 5: feature elimination. Retrain the model with one feature left out
# at a time and compare its validation accuracy with the all-features baseline.

# Features to eliminate, one at a time.
features = ['industry', 'employment_status', 'lead_score']


def validation_accuracy(train_frame, val_frame, train_target, val_target):
    """Return the validation accuracy of a logistic regression fitted on ``train_frame``.

    A new DictVectorizer and a new LogisticRegression are created on every
    call, so the notebook-level ``dv``, ``model``, ``X_train`` and ``X_val``
    are never refitted or overwritten by the elimination runs.

    Parameters
    ----------
    train_frame, val_frame : DataFrame
        Feature frames; both must contain the same columns.
    train_target, val_target : array-like
        Target values aligned with the rows of the corresponding frame.

    Returns
    -------
    The value returned by ``accuracy_score`` for the validation predictions.
    """
    vectorizer = DictVectorizer(sparse=False)
    # Fit the encoding on the training rows only, then reuse it for validation.
    X_fit = vectorizer.fit_transform(train_frame.to_dict(orient='records'))
    X_eval = vectorizer.transform(val_frame.to_dict(orient='records'))

    classifier = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    classifier.fit(X_fit, train_target)
    return accuracy_score(val_target, classifier.predict(X_eval))


# Baseline: all features.
baseline_accuracy = validation_accuracy(df_train, df_val, y_train, y_val)

# Perform feature elimination
accuracy_differences = {}
for feature in features:
    accuracy = validation_accuracy(
        df_train.drop(columns=[feature]),
        df_val.drop(columns=[feature]),
        y_train,
        y_val,
    )
    # Signed difference: a negative value means accuracy went UP without the feature.
    accuracy_differences[feature] = baseline_accuracy - accuracy

# Find the feature with the smallest (signed) difference
least_useful_feature = min(accuracy_differences, key=accuracy_differences.get)
print("Feature with the smallest difference:", least_useful_feature)
print("Accuracy differences:", accuracy_differences)

Feature with the smallest difference: industry
Accuracy differences: {'industry': -0.010238907849829393, 'employment_status': 0.0, 'lead_score': 0.0}


# Question 6

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Rebuild the full-feature design matrices: fit the vectorizer on the training
# records and apply the same mapping to the validation records.
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# Values of the regularization parameter C to try.
C_values = [0.01, 0.1, 1, 10, 100]

# Train one model per C and report its validation accuracy.
for C in C_values:
    model = LogisticRegression(
        solver='liblinear',
        C=C,
        max_iter=1000,
        random_state=42,
    ).fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"C={C}, Validation Accuracy={round(accuracy, 4)}")

C=0.01, Validation Accuracy=0.7304
C=0.1, Validation Accuracy=0.7304
C=1, Validation Accuracy=0.7304
C=10, Validation Accuracy=0.7304
C=100, Validation Accuracy=0.7304
