In [2]:
import numpy as np
import pandas as pd

df = pd.read_csv('lead_scoring.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [3]:
# Check missing values in the dataset
print("Missing values per column:")
print(df.isnull().sum())
print("\nTotal missing values:", df.isnull().sum().sum())


Missing values per column:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

Total missing values: 606


In [4]:
# Handle missing values as per instructions
# For categorical features, replace with 'NA'
# For numerical features, replace with 0.0

# Identify categorical and numerical columns
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

print("Categorical columns:", list(categorical_columns))
print("Numerical columns:", list(numerical_columns))

# Replace missing values
for col in categorical_columns:
    df[col] = df[col].fillna('NA')

for col in numerical_columns:
    df[col] = df[col].fillna(0.0)

print("\nAfter handling missing values:")
print("Missing values per column:")
print(df.isnull().sum())


Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']

After handling missing values:
Missing values per column:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [5]:
# Question 1: What is the most frequent observation (mode) for the column industry?
print("Question 1: Most frequent observation for 'industry' column")
print("=" * 60)

# Get the value counts for the industry column
industry_counts = df['industry'].value_counts()
print("Industry value counts:")
print(industry_counts)
print()

# Get the mode (most frequent value)
mode_industry = df['industry'].mode()[0]
print(f"Most frequent observation (mode): {mode_industry}")
print(f"Count: {industry_counts.iloc[0]}")
print(f"Percentage: {(industry_counts.iloc[0] / len(df)) * 100:.2f}%")


Question 1: Most frequent observation for 'industry' column
Industry value counts:
industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

Most frequent observation (mode): retail
Count: 203
Percentage: 13.89%


In [6]:
# Question 2: Create correlation matrix for numerical features
print("Question 2: Correlation matrix for numerical features")
print("=" * 60)

# Get numerical columns (excluding 'converted' as it's our target variable)
numerical_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Create correlation matrix
correlation_matrix = df[numerical_features].corr()
print("Correlation matrix:")
print(correlation_matrix)
print()

# Find the pair with the highest correlation (excluding diagonal)
# Create a mask to exclude diagonal elements (correlation with itself = 1.0)
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)
correlation_values = correlation_matrix.where(mask).stack()

# Find the maximum correlation
max_corr = correlation_values.max()
max_corr_pair = correlation_values.idxmax()

print(f"Highest correlation: {max_corr:.4f}")
print(f"Between features: {max_corr_pair[0]} and {max_corr_pair[1]}")
print()

# Show all correlations for the specified pairs
print("Correlations for the specified pairs:")
print(f"interaction_count and lead_score: {correlation_matrix.loc['interaction_count', 'lead_score']:.4f}")
print(f"number_of_courses_viewed and lead_score: {correlation_matrix.loc['number_of_courses_viewed', 'lead_score']:.4f}")
print(f"number_of_courses_viewed and interaction_count: {correlation_matrix.loc['number_of_courses_viewed', 'interaction_count']:.4f}")
print(f"annual_income and interaction_count: {correlation_matrix.loc['annual_income', 'interaction_count']:.4f}")


Question 2: Correlation matrix for numerical features
Correlation matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  

Highest correlation: 0.0270
Between features: annual_income and interaction_count

Correlations for the specified pairs:
interaction_count and lead_score: 0.0099
number_of_courses_viewed and lead_score: -0.0049
number_of_courses_viewed and interaction_count: -0.0236
annual_income and inte

In [8]:
# Split the data into train/val/test sets (60%/20%/20%)
from sklearn.model_selection import train_test_split

# Set the target variable
y = df['converted']
X = df.drop('converted', axis=1)

print("Question 3: Data splitting and mutual information")
print("=" * 60)

# First split: 60% train, 40% temp (which will be split into 20% val, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Second split: split the 40% temp into 20% val and 20% test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set size: {len(X_train)} ({len(X_train)/len(df)*100:.1f}%)")
print(f"Validation set size: {len(X_val)} ({len(X_val)/len(df)*100:.1f}%)")
print(f"Test set size: {len(X_test)} ({len(X_test)/len(df)*100:.1f}%)")
print(f"Total: {len(X_train) + len(X_val) + len(X_test)}")
print()

# Verify target variable is not in the dataframe
print("Columns in X_train:", list(X_train.columns))
print("Target variable 'converted' removed:", 'converted' not in X_train.columns)


Question 3: Data splitting and mutual information
Training set size: 877 (60.0%)
Validation set size: 292 (20.0%)
Test set size: 293 (20.0%)
Total: 1462

Columns in X_train: ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score']
Target variable 'converted' removed: True


In [21]:
# Calculate mutual information scores between y and categorical variables (FIXED VERSION)
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

# Get categorical columns from the training set
categorical_columns = X_train.select_dtypes(include=['object']).columns
print("Categorical columns:", list(categorical_columns))
print()

# Encode categorical variables for mutual information calculation
X_train_encoded = X_train.copy()
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train[col].astype(str))
    label_encoders[col] = le

print("Sample of encoded data:")
print(X_train_encoded[categorical_columns].head())
print()

# Calculate mutual information scores
mi_scores = mutual_info_classif(X_train_encoded[categorical_columns], y_train, random_state=42)

# Create a dictionary to store the results
mi_results = dict(zip(categorical_columns, mi_scores))

print("Mutual Information Scores (rounded to 2 decimals):")
for feature, score in mi_results.items():
    print(f"{feature}: {round(score, 2)}")

print()

# Find the feature with the highest mutual information score
max_mi_feature = max(mi_results, key=mi_results.get)
max_mi_score = mi_results[max_mi_feature]

print(f"Feature with highest mutual information score: {max_mi_feature}")
print(f"Score: {round(max_mi_score, 2)}")


Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']

Sample of encoded data:
     lead_source  industry  employment_status  location
442            4         2                  4         7
319            4         1                  1         7
767            4         6                  2         1
756            4         5                  1         5
424            1         6                  2         7

Mutual Information Scores (rounded to 2 decimals):
lead_source: 0.04
industry: 0.03
employment_status: 0.02
location: 0.02

Feature with highest mutual information score: lead_source
Score: 0.04


In [23]:
# Question 4: Train logistic regression with one-hot encoding
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import pandas as pd

print("Question 4: Logistic Regression with One-Hot Encoding")
print("=" * 60)

# Apply one-hot encoding to categorical variables
# Get categorical columns
categorical_columns = X_train.select_dtypes(include=['object']).columns
numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns

print("Categorical columns:", list(categorical_columns))
print("Numerical columns:", list(numerical_columns))
print()

# Initialize OneHotEncoder
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Fit and transform categorical features on training set
X_train_cat_encoded = ohe.fit_transform(X_train[categorical_columns])
X_val_cat_encoded = ohe.transform(X_val[categorical_columns])

# Get feature names for encoded categorical variables
cat_feature_names = ohe.get_feature_names_out(categorical_columns)

# Combine numerical and encoded categorical features
X_train_processed = pd.DataFrame(
    data=np.column_stack([X_train[numerical_columns].values, X_train_cat_encoded]),
    columns=list(numerical_columns) + list(cat_feature_names)
)

X_val_processed = pd.DataFrame(
    data=np.column_stack([X_val[numerical_columns].values, X_val_cat_encoded]),
    columns=list(numerical_columns) + list(cat_feature_names)
)

print(f"Training set shape after encoding: {X_train_processed.shape}")
print(f"Validation set shape after encoding: {X_val_processed.shape}")
print()

# Train logistic regression model with specified parameters
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_processed, y_train)

# Make predictions on validation set
y_val_pred = model.predict(X_val_processed)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_val_pred)
accuracy_rounded = round(accuracy, 2)

print(f"Validation accuracy: {accuracy:.4f}")
print(f"Validation accuracy (rounded to 2 decimal places): {accuracy_rounded}")
print()



Question 4: Logistic Regression with One-Hot Encoding
Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

Training set shape after encoding: (877, 27)
Validation set shape after encoding: (292, 27)

Validation accuracy: 0.7432
Validation accuracy (rounded to 2 decimal places): 0.74



In [24]:
# Question 5: Feature elimination to find the least useful feature
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import pandas as pd

print("Question 5: Feature Elimination")
print("=" * 60)

# First, train the baseline model with all features (same as Q4)
print("1. Training baseline model with all features...")

# Apply one-hot encoding to categorical variables (same as Q4)
categorical_columns = X_train.select_dtypes(include=['object']).columns
numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns

# Initialize OneHotEncoder
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Fit and transform categorical features
X_train_cat_encoded = ohe.fit_transform(X_train[categorical_columns])
X_val_cat_encoded = ohe.transform(X_val[categorical_columns])

# Get feature names
cat_feature_names = ohe.get_feature_names_out(categorical_columns)

# Combine numerical and encoded categorical features
X_train_all_features = pd.DataFrame(
    data=np.column_stack([X_train[numerical_columns].values, X_train_cat_encoded]),
    columns=list(numerical_columns) + list(cat_feature_names)
)

X_val_all_features = pd.DataFrame(
    data=np.column_stack([X_val[numerical_columns].values, X_val_cat_encoded]),
    columns=list(numerical_columns) + list(cat_feature_names)
)

# Train baseline model
baseline_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
baseline_model.fit(X_train_all_features, y_train)
baseline_pred = baseline_model.predict(X_val_all_features)
baseline_accuracy = accuracy_score(y_val, baseline_pred)

print(f"Baseline accuracy (all features): {baseline_accuracy:.4f}")
print()

# Now test each feature individually by removing it
print("2. Testing each feature by removing it...")
print()

feature_differences = {}

# Test removing 'industry' (categorical feature)
print("Testing without 'industry':")
industry_features = [col for col in X_train_all_features.columns if 'industry' in col]
X_train_no_industry = X_train_all_features.drop(columns=industry_features)
X_val_no_industry = X_val_all_features.drop(columns=industry_features)

model_no_industry = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_no_industry.fit(X_train_no_industry, y_train)
pred_no_industry = model_no_industry.predict(X_val_no_industry)
accuracy_no_industry = accuracy_score(y_val, pred_no_industry)

difference_industry = baseline_accuracy - accuracy_no_industry
feature_differences['industry'] = difference_industry

print(f"  Accuracy without industry: {accuracy_no_industry:.4f}")
print(f"  Difference: {difference_industry:.4f}")
print()

# Test removing 'employment_status' (categorical feature)
print("Testing without 'employment_status':")
employment_features = [col for col in X_train_all_features.columns if 'employment_status' in col]
X_train_no_employment = X_train_all_features.drop(columns=employment_features)
X_val_no_employment = X_val_all_features.drop(columns=employment_features)

model_no_employment = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_no_employment.fit(X_train_no_employment, y_train)
pred_no_employment = model_no_employment.predict(X_val_no_employment)
accuracy_no_employment = accuracy_score(y_val, pred_no_employment)

difference_employment = baseline_accuracy - accuracy_no_employment
feature_differences['employment_status'] = difference_employment

print(f"  Accuracy without employment_status: {accuracy_no_employment:.4f}")
print(f"  Difference: {difference_employment:.4f}")
print()

# Test removing 'lead_score' (numerical feature)
print("Testing without 'lead_score':")
X_train_no_lead_score = X_train_all_features.drop(columns=['lead_score'])
X_val_no_lead_score = X_val_all_features.drop(columns=['lead_score'])

model_no_lead_score = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_no_lead_score.fit(X_train_no_lead_score, y_train)
pred_no_lead_score = model_no_lead_score.predict(X_val_no_lead_score)
accuracy_no_lead_score = accuracy_score(y_val, pred_no_lead_score)

difference_lead_score = baseline_accuracy - accuracy_no_lead_score
feature_differences['lead_score'] = difference_lead_score

print(f"  Accuracy without lead_score: {accuracy_no_lead_score:.4f}")
print(f"  Difference: {difference_lead_score:.4f}")
print()

# Find the feature with the smallest difference (least useful)
print("3. Summary of differences:")
print("=" * 40)
for feature, diff in feature_differences.items():
    print(f"{feature}: {diff:.4f}")

min_diff_feature = min(feature_differences, key=lambda x: abs(feature_differences[x]))
min_diff_value = feature_differences[min_diff_feature]

print()
print(f"Feature with smallest difference (least useful): {min_diff_feature}")
print(f"Smallest difference: {min_diff_value:.4f}")
print()



Question 5: Feature Elimination
1. Training baseline model with all features...
Baseline accuracy (all features): 0.7432

2. Testing each feature by removing it...

Testing without 'industry':
  Accuracy without industry: 0.7432
  Difference: 0.0000

Testing without 'employment_status':
  Accuracy without employment_status: 0.7466
  Difference: -0.0034

Testing without 'lead_score':
  Accuracy without lead_score: 0.7432
  Difference: 0.0000

3. Summary of differences:
industry: 0.0000
employment_status: -0.0034
lead_score: 0.0000

Feature with smallest difference (least useful): industry
Smallest difference: 0.0000



In [26]:
# Question 6: Regularized logistic regression with different C values
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import pandas as pd

print("Question 6: Regularized Logistic Regression")
print("=" * 60)

# Use the same feature preparation as Q4 (all features)
print("1. Preparing features (same as Q4)...")

# Apply one-hot encoding to categorical variables
categorical_columns = X_train.select_dtypes(include=['object']).columns
numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns

# Initialize OneHotEncoder
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Fit and transform categorical features
X_train_cat_encoded = ohe.fit_transform(X_train[categorical_columns])
X_val_cat_encoded = ohe.transform(X_val[categorical_columns])

# Get feature names
cat_feature_names = ohe.get_feature_names_out(categorical_columns)

# Combine numerical and encoded categorical features
X_train_processed = pd.DataFrame(
    data=np.column_stack([X_train[numerical_columns].values, X_train_cat_encoded]),
    columns=list(numerical_columns) + list(cat_feature_names)
)

X_val_processed = pd.DataFrame(
    data=np.column_stack([X_val[numerical_columns].values, X_val_cat_encoded]),
    columns=list(numerical_columns) + list(cat_feature_names)
)

print(f"Training set shape: {X_train_processed.shape}")
print(f"Validation set shape: {X_val_processed.shape}")
print()

# Test different C values
print("2. Testing different C values...")
print()

C_values = [0.01, 0.1, 1, 10, 100]
results = {}

for C in C_values:
    print(f"Training model with C = {C}")
    
    # Train logistic regression with current C value
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_processed, y_train)
    
    # Make predictions on validation set
    y_val_pred = model.predict(X_val_processed)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_val_pred)
    accuracy_rounded = round(accuracy, 3)
    
    results[C] = accuracy_rounded
    
    print(f"  Validation accuracy: {accuracy:.6f}")
    print(f"  Validation accuracy (rounded to 3 decimals): {accuracy_rounded}")
    print()

# Find the best C value
print("3. Summary of results:")
print("=" * 40)
for C, acc in results.items():
    print(f"C = {C}: {acc}")

# Find the C with the best accuracy
best_accuracy = max(results.values())
best_C_values = [C for C, acc in results.items() if acc == best_accuracy]

print()
print(f"Best accuracy: {best_accuracy}")
print(f"C values with best accuracy: {best_C_values}")

# If multiple C values have the same best accuracy, select the smallest one
best_C = min(best_C_values)

print(f"Selected C (smallest among best): {best_C}")
print()


Question 6: Regularized Logistic Regression
1. Preparing features (same as Q4)...
Training set shape: (877, 27)
Validation set shape: (292, 27)

2. Testing different C values...

Training model with C = 0.01
  Validation accuracy: 0.739726
  Validation accuracy (rounded to 3 decimals): 0.74

Training model with C = 0.1
  Validation accuracy: 0.743151
  Validation accuracy (rounded to 3 decimals): 0.743

Training model with C = 1
  Validation accuracy: 0.743151
  Validation accuracy (rounded to 3 decimals): 0.743

Training model with C = 10
  Validation accuracy: 0.743151
  Validation accuracy (rounded to 3 decimals): 0.743

Training model with C = 100
  Validation accuracy: 0.743151
  Validation accuracy (rounded to 3 decimals): 0.743

3. Summary of results:
C = 0.01: 0.74
C = 0.1: 0.743
C = 1: 0.743
C = 10: 0.743
C = 100: 0.743

Best accuracy: 0.743
C values with best accuracy: [0.1, 1, 10, 100]
Selected C (smallest among best): 0.1

