In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mutual_info_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline



In [28]:
# Load the raw lead-scoring table; the context manager closes the handle once
# pandas has finished parsing it.
with open('course_lead_scoring.txt', mode='r') as file:
    df = pd.read_csv(filepath_or_buffer=file)


In [29]:
# Count missing values per column in the raw frame, before any imputation.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [30]:
def handle_null_values(df):
    """
    Return a copy of ``df`` with missing values replaced by dtype-specific placeholders.

    Columns selected by ``select_dtypes(include=['object'])`` have their nulls
    replaced with the string 'NA'; columns selected by
    ``select_dtypes(include=[np.number])`` have their nulls replaced with 0.
    Columns matching neither selector are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The filled copy.
    """
    filled = df.copy()

    # (dtype selector, placeholder) pairs, applied in order: text first, then numbers.
    fill_plan = (
        (['object'], 'NA'),
        ([np.number], 0),
    )
    for dtype_selector, placeholder in fill_plan:
        cols = filled.select_dtypes(include=dtype_selector).columns
        filled[cols] = filled[cols].fillna(placeholder)

    return filled


# Impute into a new frame (handle_null_values works on a copy, so df keeps its
# nulls) and confirm that no missing values remain.
result_df = handle_null_values(df)
result_df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [31]:
# Frequency of each industry level (including the 'NA' placeholder), most common first.
result_df['industry'].value_counts().sort_values(ascending=False)

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [32]:
# Pairwise correlations between all numeric columns (the target is included).
numerical_df = result_df.select_dtypes(include=[np.number])
correlation_matrix = numerical_df.corr()

# Blank out the diagonal (each column's correlation with itself) so it cannot
# win the search below. DataFrame.mask returns a new frame instead of writing
# through `.values`, which is not guaranteed to be a writable view of the frame
# (it is read-only under pandas Copy-on-Write).
diagonal = np.eye(len(correlation_matrix), dtype=bool)
correlation_matrix = correlation_matrix.mask(diagonal)

print("Correlation Matrix:")
print(correlation_matrix)
print("\n" + "="*50 + "\n")

abs_correlations = correlation_matrix.abs()
max_corr = abs_correlations.max().max()

# Find the location of the maximum absolute correlation (NaN cells are skipped)
max_corr_idx = abs_correlations.stack().idxmax()
feature1, feature2 = max_corr_idx

# Get the actual correlation value (not absolute)
actual_corr = correlation_matrix.loc[feature1, feature2]

print(f"'{feature1}' and '{feature2}'", actual_corr)


Correlation Matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                       NaN       0.009770   
annual_income                             0.009770            NaN   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                       NaN    0.009888   0.374573  
lead_score                         0.009888         NaN   0.193673  
converted                          0.374573    0.193673        NaN  


'number_of_courses_viewed' and 'converted' 0.4359136580211793


In [39]:
# Separate the predictors from the 'converted' target column.
y = result_df['converted']
X = result_df.drop(columns=['converted'])

# First split: hold out 20% of the rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second split: 25% of the remaining 80% becomes validation,
# giving 60% train / 20% validation / 20% test overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

# Mutual information between each object-dtype feature and the target,
# computed on the training rows only and rounded to two decimals.
categorical_cols = X_train.select_dtypes(include=['object']).columns
mi_results = {
    col: round(mutual_info_score(X_train[col], y_train), 2)
    for col in categorical_cols
}

# Report features from the highest to the lowest (rounded) score.
ranked = sorted(mi_results.items(), key=lambda item: item[1], reverse=True)
for feature, score in ranked:
    print(f"{feature}: {score}")

lead_source: 0.04
industry: 0.01
employment_status: 0.01
location: 0.0


In [43]:


def simple_logistic_regression(X_train, X_val, y_train, y_val):
    """
    Fit a logistic regression on one-hot encoded features and report validation accuracy.

    Categorical columns are expanded with ``pandas.get_dummies``. The training
    frame alone decides the feature space (including which level of each
    categorical column is dropped as the baseline); the validation frame is
    projected onto exactly those columns.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Training and validation features (passed to ``pd.get_dummies``).
    y_train, y_val : array-like
        Training and validation targets.

    Returns
    -------
    tuple
        ``(model, accuracy)`` where ``model`` is the fitted ``LogisticRegression``
        and ``accuracy`` is the validation accuracy rounded to two decimals.

    Notes
    -----
    NOTE(review): numeric columns are handed to the solver unscaled; consider
    standardising them if the fit looks dominated by a large-magnitude feature.
    """
    # One-hot encode the training data; drop_first removes one baseline level
    # per categorical column.
    X_train_encoded = pd.get_dummies(X_train, drop_first=True)

    # Encode every validation level and then keep only the training columns.
    # Dropping the first level independently on the validation frame could
    # remove a *different* level than on the training frame whenever the two
    # frames do not contain the same set of levels, which would silently encode
    # those rows as the baseline. Levels unseen in training are discarded and
    # training levels absent from validation become all-zero columns.
    X_val_encoded = pd.get_dummies(X_val).reindex(
        columns=X_train_encoded.columns, fill_value=0
    )

    # Train model
    model = LogisticRegression(
        solver='liblinear',
        C=1.0,
        max_iter=1000,
        random_state=42
    )
    model.fit(X_train_encoded, y_train)

    # Predict and calculate accuracy
    y_val_pred = model.predict(X_val_encoded)
    accuracy = round(accuracy_score(y_val, y_val_pred), 2)

    print(f"Validation Accuracy (rounded to 2 decimals): {accuracy}")

    return model, accuracy

# Fit the baseline model on the training split and score it on the validation split.
model, accuracy = simple_logistic_regression(X_train, X_val, y_train, y_val)

Validation Accuracy (rounded to 2 decimals): 0.7


In [47]:
def simple_feature_elimination(X_train, X_val, y_train, y_val):
    """
    Measure how validation accuracy changes when each feature is left out.

    A baseline logistic regression is fitted on all columns of ``X_train``.
    Then, for every original column in turn, the model is refitted without
    that column and the accuracy difference ``baseline - reduced`` is recorded.
    A positive difference means the model got worse without the feature.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Training and validation features (passed to ``pd.get_dummies``).
    y_train, y_val : array-like
        Training and validation targets.

    Returns
    -------
    dict
        Maps each column name of ``X_train`` to the unrounded accuracy
        difference ``original_accuracy - accuracy_without_that_column``.
    """
    def _validation_accuracy(train_frame, val_frame):
        # Encode, fit on train_frame, and return unrounded accuracy on val_frame.
        train_encoded = pd.get_dummies(train_frame, drop_first=True)
        # Project the validation encoding onto the training columns rather than
        # dropping the first level independently: the two frames could otherwise
        # drop different baseline levels when their sets of levels differ.
        val_encoded = pd.get_dummies(val_frame).reindex(
            columns=train_encoded.columns, fill_value=0
        )
        clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
        clf.fit(train_encoded, y_train)
        return accuracy_score(y_val, clf.predict(val_encoded))

    # Baseline: all features present.
    original_accuracy = _validation_accuracy(X_train, X_val)
    print(f"Original accuracy: {original_accuracy:.4f}")

    # Leave-one-feature-out: the column is removed before encoding, so all
    # dummy columns of a categorical feature disappear together.
    accuracy_differences = {}
    for feature in X_train.columns:
        accuracy_reduced = _validation_accuracy(
            X_train.drop(columns=[feature]),
            X_val.drop(columns=[feature]),
        )
        accuracy_diff = original_accuracy - accuracy_reduced
        accuracy_differences[feature] = accuracy_diff

        print(f"Without '{feature}': {accuracy_reduced:.4f} (diff: {accuracy_diff:+.4f})")

    return accuracy_differences

# Run leave-one-feature-out elimination on the train/validation split;
# the result maps each feature to (baseline accuracy - accuracy without it).
accuracy_differences = simple_feature_elimination(X_train, X_val, y_train, y_val)


Original accuracy: 0.6997
Without 'lead_source': 0.7031 (diff: -0.0034)
Without 'industry': 0.6997 (diff: +0.0000)
Without 'number_of_courses_viewed': 0.5563 (diff: +0.1433)
Without 'annual_income': 0.8532 (diff: -0.1536)
Without 'employment_status': 0.6962 (diff: +0.0034)
Without 'location': 0.7099 (diff: -0.0102)
Without 'interaction_count': 0.5563 (diff: +0.1433)
Without 'lead_score': 0.7065 (diff: -0.0068)


In [61]:


def simple_logistic_regression(X_train, X_val, y_train, y_val, C_values=(0.01, 0.1, 1, 10, 100)):
    """
    Fit one logistic regression per regularisation strength and collect validation accuracy.

    Categorical columns are expanded with ``pandas.get_dummies``. The training
    frame alone decides the feature space; the validation frame is projected
    onto exactly those columns.

    NOTE(review): this redefines the ``simple_logistic_regression`` declared
    earlier in the notebook, which returned ``(model, accuracy)``. After this
    cell runs, the name refers to this dict-returning version.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Training and validation features (passed to ``pd.get_dummies``).
    y_train, y_val : array-like
        Training and validation targets.
    C_values : iterable of float, optional
        Inverse regularisation strengths to try. The default is an immutable
        tuple so that it cannot be modified between calls.

    Returns
    -------
    dict
        Maps each ``C`` to a dict with keys ``'model'`` (the fitted pipeline),
        ``'accuracy'`` (unrounded validation accuracy), ``'accuracy_rounded'``
        (rounded to three decimals) and ``'coefficients'`` (always ``None``).
    """
    # One-hot encode the training data; drop_first removes one baseline level
    # per categorical column.
    X_train_encoded = pd.get_dummies(X_train, drop_first=True)

    # Encode every validation level and keep only the training columns, so both
    # frames share the same baseline level even if their sets of levels differ.
    X_val_encoded = pd.get_dummies(X_val).reindex(
        columns=X_train_encoded.columns, fill_value=0
    )

    results = {}

    for C in C_values:
        print(f"Training model with C = {C}...")

        # Single-step pipeline wrapping the classifier for the current C value
        model = Pipeline([
            ('classifier', LogisticRegression(
                solver='liblinear',
                C=C,  # Current regularization parameter
                max_iter=1000,
                random_state=42
            ))
        ])

        model.fit(X_train_encoded, y_train)
        y_val_pred = model.predict(X_val_encoded)

        val_accuracy = accuracy_score(y_val, y_val_pred)
        val_accuracy_rounded = round(val_accuracy, 3)

        # Store results
        results[C] = {
            'model': model,
            'accuracy': val_accuracy,
            'accuracy_rounded': val_accuracy_rounded,
            'coefficients': None  # placeholder; never populated by this function
        }

    return results


# Sweep the regularisation strength; the returned dict (keyed by C) is shown as
# the cell output. Note the function returns a dict here, not (model, accuracy).
simple_logistic_regression(X_train, X_val, y_train, y_val, C_values=[0.01, 0.1, 1, 10, 100])

Training model with C = 0.01...
Training model with C = 0.1...
Training model with C = 1...
Training model with C = 10...
Training model with C = 100...


{0.01: {'model': Pipeline(steps=[('classifier',
                   LogisticRegression(C=0.01, max_iter=1000, random_state=42,
                                      solver='liblinear'))]),
  'accuracy': 0.6996587030716723,
  'accuracy_rounded': 0.7,
  'coefficients': None},
 0.1: {'model': Pipeline(steps=[('classifier',
                   LogisticRegression(C=0.1, max_iter=1000, random_state=42,
                                      solver='liblinear'))]),
  'accuracy': 0.6996587030716723,
  'accuracy_rounded': 0.7,
  'coefficients': None},
 1: {'model': Pipeline(steps=[('classifier',
                   LogisticRegression(C=1, max_iter=1000, random_state=42,
                                      solver='liblinear'))]),
  'accuracy': 0.6996587030716723,
  'accuracy_rounded': 0.7,
  'coefficients': None},
 10: {'model': Pipeline(steps=[('classifier',
                   LogisticRegression(C=10, max_iter=1000, random_state=42,
                                      solver='liblinear'))]),
  

In [54]:
# Inspect the imputed frame (the display is truncated to the first and last rows).
result_df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1
