In [2]:
import pandas as pd

# ============================================================
# 1) Entfernen der definierten Spalten
# ============================================================

DROP_COLUMNS = [
    "Asymmetrique Activity Index",
    "Asymmetrique Profile Index",
    "Asymmetrique Activity Score",
    "Asymmetrique Profile Score",
    "I agree to pay the amount through cheque",
    "A free copy of Mastering The Interview"
]

# Nur existierende Spalten droppen (Sicherheitscheck)
existing_drop = [c for c in DROP_COLUMNS if c in df.columns]
df_reduced = df.drop(columns=existing_drop).copy()

print("Remaining columns:", len(df_reduced.columns))


# ============================================================
# 2) NaNs in den ersten 3 Spalten prüfen
# ============================================================

first_three_cols = df_reduced.columns[:3]
print("\nFirst three columns:", list(first_three_cols))

for col in first_three_cols:
    missing_count = df_reduced[col].isna().sum()
    missing_pct = missing_count / len(df_reduced) * 100
    unique_values = df_reduced[col].nunique(dropna=False)

    print(f"\nColumn: {col}")
    print("Missing count:", missing_count)
    print("Missing %:", round(missing_pct, 4))
    print("Unique values:", unique_values)

NameError: name 'df' is not defined

In [None]:
cols_to_check = df_reduced.columns[3:6]
print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*60)
    print(f"Column: {col}")
    
    # Missing values
    missing_count = df_reduced[col].isna().sum()
    missing_pct = missing_count / len(df_reduced) * 100
    
    print("Missing count:", missing_count)
    print("Missing %:", round(missing_pct, 4))
    
    # Unique values
    unique_values = df_reduced[col].nunique(dropna=False)
    print("Number of unique values (incl NaN):", unique_values)
    
    # Value counts (inkl NaN)
    print("\nTop value counts:")
    print(df_reduced[col].value_counts(dropna=False).head(15))
    
    # Check for empty strings
    empty_string_count = (df_reduced[col] == "").sum()
    if empty_string_count > 0:
        print("Empty string count:", empty_string_count)
    
    # Check for whitespace-only entries
    whitespace_count = (df_reduced[col].astype(str).str.strip() == "").sum()
    if whitespace_count > 0:
        print("Whitespace-only entries:", whitespace_count)

Columns under inspection: ['Lead Source', 'Do Not Email', 'Do Not Call']

Column: Lead Source
Missing count: 36
Missing %: 0.3896
Number of unique values (incl NaN): 22

Top value counts:
Lead Source
Google              2868
Direct Traffic      2543
Olark Chat          1755
Organic Search      1154
Reference            534
Welingak Website     142
Referral Sites       125
Facebook              55
NaN                   36
bing                   6
google                 5
Click2call             4
Live Chat              2
Social Media           2
Press_Release          2
Name: count, dtype: int64

Column: Do Not Email
Missing count: 0
Missing %: 0.0
Number of unique values (incl NaN): 2

Top value counts:
Do Not Email
No     8506
Yes     734
Name: count, dtype: int64

Column: Do Not Call
Missing count: 0
Missing %: 0.0
Number of unique values (incl NaN): 2

Top value counts:
Do Not Call
No     9238
Yes       2
Name: count, dtype: int64


In [None]:
cols_to_check = df_reduced.columns[6:10]
print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*70)
    print(f"Column: {col}")
    
    # --- Missing check ---
    nan_mask = df_reduced[col].isna()
    n_nans = nan_mask.sum()
    
    if n_nans == 0:
        print("No NaNs found.")
    else:
        print(f"NaNs found: {n_nans}")
        display(
            df_reduced.loc[nan_mask,
                           ["Prospect ID", "Lead Number", "Lead Origin", col]]
            .head(20)
        )
    
    # --- Basic stats for numeric columns ---
    if pd.api.types.is_numeric_dtype(df_reduced[col]):
        print("\nBasic statistics:")
        print(df_reduced[col].describe())
        
        print("\nTop 10 values:")
        print(df_reduced[col].value_counts(dropna=False).head(10))
    
    # --- Unique values for non-numeric ---
    else:
        print("\nValue counts:")
        print(df_reduced[col].value_counts(dropna=False).head(15))

Columns under inspection: ['Converted', 'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']

Column: Converted
No NaNs found.

Basic statistics:
count    9240.000000
mean        0.385390
std         0.486714
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: Converted, dtype: float64

Top 10 values:
Converted
0    5679
1    3561
Name: count, dtype: int64

Column: TotalVisits
NaNs found: 137


Unnamed: 0,Prospect ID,Lead Number,Lead Origin,TotalVisits
77,895d4905-f534-4f18-915b-8d239a72b5dc,659722,Lead Add Form,
79,3a0ce10f-d2c1-4213-a2bc-4f97bcd29699,659710,Lead Add Form,
81,277ad6a6-4565-4a18-a1ff-e46e03f22663,659705,Lead Add Form,
88,68f496c2-0073-470f-9c3c-7fb48f060ce5,659631,Lead Add Form,
120,144807db-2895-4002-b52e-3eda79c22395,659283,Lead Add Form,
133,63ebde80-a465-4cdc-ab5a-5e880a7138b0,659158,Lead Add Form,
134,0298b9a5-fedb-408b-a284-2d357583600f,659153,Lead Add Form,
177,3b74e995-4407-44de-9e59-622afb514261,658648,Lead Add Form,
179,1730b5e8-e435-41c6-9082-b9c98976bd16,658627,Lead Add Form,
180,db2dc4b5-f603-4818-9b0c-0435923a4cd8,658623,Lead Add Form,



Basic statistics:
count    9103.000000
mean        3.445238
std         4.854853
min         0.000000
25%         1.000000
50%         3.000000
75%         5.000000
max       251.000000
Name: TotalVisits, dtype: float64

Top 10 values:
TotalVisits
0.0    2189
2.0    1680
3.0    1306
4.0    1120
5.0     783
6.0     466
1.0     395
7.0     309
8.0     224
9.0     164
Name: count, dtype: int64

Column: Total Time Spent on Website
No NaNs found.

Basic statistics:
count    9240.000000
mean      487.698268
std       548.021466
min         0.000000
25%        12.000000
50%       248.000000
75%       936.000000
max      2272.000000
Name: Total Time Spent on Website, dtype: float64

Top 10 values:
Total Time Spent on Website
0      2193
60       19
74       18
127      18
75       18
234      17
62       17
157      17
87       17
32       17
Name: count, dtype: int64

Column: Page Views Per Visit
NaNs found: 137


Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Page Views Per Visit
77,895d4905-f534-4f18-915b-8d239a72b5dc,659722,Lead Add Form,
79,3a0ce10f-d2c1-4213-a2bc-4f97bcd29699,659710,Lead Add Form,
81,277ad6a6-4565-4a18-a1ff-e46e03f22663,659705,Lead Add Form,
88,68f496c2-0073-470f-9c3c-7fb48f060ce5,659631,Lead Add Form,
120,144807db-2895-4002-b52e-3eda79c22395,659283,Lead Add Form,
133,63ebde80-a465-4cdc-ab5a-5e880a7138b0,659158,Lead Add Form,
134,0298b9a5-fedb-408b-a284-2d357583600f,659153,Lead Add Form,
177,3b74e995-4407-44de-9e59-622afb514261,658648,Lead Add Form,
179,1730b5e8-e435-41c6-9082-b9c98976bd16,658627,Lead Add Form,
180,db2dc4b5-f603-4818-9b0c-0435923a4cd8,658623,Lead Add Form,



Basic statistics:
count    9103.000000
mean        2.362820
std         2.161418
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max        55.000000
Name: Page Views Per Visit, dtype: float64

Top 10 values:
Page Views Per Visit
0.0    2189
2.0    1795
3.0    1196
4.0     896
1.0     651
5.0     517
1.5     306
6.0     244
2.5     241
NaN     137
Name: count, dtype: int64


In [None]:
print("="*80)
print("ANALYSIS: TotalVisits Missing Values")
print("="*80)

# Maske für Missing
missing_mask = df_reduced["TotalVisits"].isna()

n_missing = missing_mask.sum()
n_total = len(df_reduced)

print(f"\nTotal rows: {n_total}")
print(f"Missing TotalVisits: {n_missing}")
print(f"Percentage missing: {round(n_missing / n_total * 100, 4)} %")

# ------------------------------------------------------------
# 1️⃣ In welchen Lead Origins treten sie auf?
# ------------------------------------------------------------

print("\nLead Origin distribution for missing TotalVisits:")
print("-"*60)
print(df_reduced.loc[missing_mask, "Lead Origin"].value_counts())

# ------------------------------------------------------------
# 2️⃣ Conversion Rate Vergleich
# ------------------------------------------------------------

print("\nConversion comparison:")
print("-"*60)

conversion_missing = df_reduced.loc[missing_mask, "Converted"].mean()
conversion_not_missing = df_reduced.loc[~missing_mask, "Converted"].mean()
conversion_overall = df_reduced["Converted"].mean()

print(f"Overall conversion rate: {round(conversion_overall, 4)}")
print(f"Conversion (Missing TotalVisits): {round(conversion_missing, 4)}")
print(f"Conversion (Not Missing TotalVisits): {round(conversion_not_missing, 4)}")

# ------------------------------------------------------------
# 3️⃣ Beispielhafte Zeilen anzeigen
# ------------------------------------------------------------

print("\nSample rows with missing TotalVisits:")
print("-"*60)

display(
    df_reduced.loc[missing_mask,
                   ["Prospect ID", "Lead Number", "Lead Origin", 
                    "TotalVisits", "Page Views Per Visit", "Converted"]]
    .head(10)
)

print("\nAnalysis block complete.")

ANALYSIS: TotalVisits Missing Values

Total rows: 9240
Missing TotalVisits: 137
Percentage missing: 1.4827 %

Lead Origin distribution for missing TotalVisits:
------------------------------------------------------------
Lead Origin
Lead Add Form     110
Lead Import        24
API                 2
Quick Add Form      1
Name: count, dtype: int64

Conversion comparison:
------------------------------------------------------------
Overall conversion rate: 0.3854
Conversion (Missing TotalVisits): 0.7299
Conversion (Not Missing TotalVisits): 0.3802

Sample rows with missing TotalVisits:
------------------------------------------------------------


Unnamed: 0,Prospect ID,Lead Number,Lead Origin,TotalVisits,Page Views Per Visit,Converted
77,895d4905-f534-4f18-915b-8d239a72b5dc,659722,Lead Add Form,,,1
79,3a0ce10f-d2c1-4213-a2bc-4f97bcd29699,659710,Lead Add Form,,,1
81,277ad6a6-4565-4a18-a1ff-e46e03f22663,659705,Lead Add Form,,,1
88,68f496c2-0073-470f-9c3c-7fb48f060ce5,659631,Lead Add Form,,,1
120,144807db-2895-4002-b52e-3eda79c22395,659283,Lead Add Form,,,1
133,63ebde80-a465-4cdc-ab5a-5e880a7138b0,659158,Lead Add Form,,,1
134,0298b9a5-fedb-408b-a284-2d357583600f,659153,Lead Add Form,,,1
177,3b74e995-4407-44de-9e59-622afb514261,658648,Lead Add Form,,,1
179,1730b5e8-e435-41c6-9082-b9c98976bd16,658627,Lead Add Form,,,1
180,db2dc4b5-f603-4818-9b0c-0435923a4cd8,658623,Lead Add Form,,,1



Analysis block complete.


In [None]:
print("="*80)
print("ANALYSIS: Conversion Rate by Lead Origin")
print("="*80)

# Gesamt-Conversion
overall_conversion = df_reduced["Converted"].mean()
print(f"\nOverall Conversion Rate: {round(overall_conversion, 4)}")

print("\nConversion Rate by Lead Origin:")
print("-"*60)

origin_summary = (
    df_reduced
    .groupby("Lead Origin")["Converted"]
    .agg(["count", "mean"])
    .sort_values("mean", ascending=False)
)

print(origin_summary)

print("\nDetailed Interpretation:")
print("-"*60)

for origin, row in origin_summary.iterrows():
    print(f"{origin}:")
    print(f"  Number of Leads: {row['count']}")
    print(f"  Conversion Rate: {round(row['mean'], 4)}")
    print()

print("="*80)
print("Analysis complete.")

ANALYSIS: Conversion Rate by Lead Origin

Overall Conversion Rate: 0.3854

Conversion Rate by Lead Origin:
------------------------------------------------------------
                         count      mean
Lead Origin                             
Quick Add Form               1  1.000000
Lead Add Form              718  0.924791
Landing Page Submission   4886  0.361850
API                       3580  0.311453
Lead Import                 55  0.236364

Detailed Interpretation:
------------------------------------------------------------
Quick Add Form:
  Number of Leads: 1.0
  Conversion Rate: 1.0

Lead Add Form:
  Number of Leads: 718.0
  Conversion Rate: 0.9248

Landing Page Submission:
  Number of Leads: 4886.0
  Conversion Rate: 0.3619

API:
  Number of Leads: 3580.0
  Conversion Rate: 0.3115

Lead Import:
  Number of Leads: 55.0
  Conversion Rate: 0.2364

Analysis complete.


In [None]:
cols_to_check = df_reduced.columns[10:16]
print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*80)
    print(f"Column: {col}")
    
    # ---- Missing ----
    nan_count = df_reduced[col].isna().sum()
    nan_pct = nan_count / len(df_reduced) * 100
    
    print(f"NaN count: {nan_count}")
    print(f"NaN %: {round(nan_pct, 4)}")
    
    # ---- Unique values ----
    unique_vals = df_reduced[col].nunique(dropna=False)
    print(f"Unique values (incl NaN): {unique_vals}")
    
    # ---- Top values ----
    print("\nTop value counts:")
    print(df_reduced[col].value_counts(dropna=False).head(15))
    
    # ---- Pseudo-Missing Checks ----
    if df_reduced[col].dtype == "object":
        select_count = df_reduced[col].astype(str).str.contains("Select", case=False, na=False).sum()
        not_provided_count = df_reduced[col].astype(str).str.contains("Not Provided", case=False, na=False).sum()
        empty_count = (df_reduced[col] == "").sum()
        
        if select_count > 0:
            print(f"'Select' occurrences: {select_count}")
        if not_provided_count > 0:
            print(f"'Not Provided' occurrences: {not_provided_count}")
        if empty_count > 0:
            print(f"Empty string occurrences: {empty_count}")

Columns under inspection: ['Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'What matters most to you in choosing a course']

Column: Last Activity
NaN count: 103
NaN %: 1.1147
Unique values (incl NaN): 18

Top value counts:
Last Activity
Email Opened                    3437
SMS Sent                        2745
Olark Chat Conversation          973
Page Visited on Website          640
Converted to Lead                428
Email Bounced                    326
Email Link Clicked               267
Form Submitted on Website        116
NaN                              103
Unreachable                       93
Unsubscribed                      61
Had a Phone Conversation          30
Approached upfront                 9
View in browser link Clicked       6
Email Marked Spam                  2
Name: count, dtype: int64

Column: Country
NaN count: 2461
NaN %: 26.6342
Unique values (incl NaN): 39

Top value counts:
Country
India 

In [None]:
cols_to_check = df_reduced.columns[16:22]
print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*80)
    print(f"Column: {col}")
    
    # ---- Missing ----
    nan_count = df_reduced[col].isna().sum()
    nan_pct = nan_count / len(df_reduced) * 100
    
    print(f"NaN count: {nan_count}")
    print(f"NaN %: {round(nan_pct, 4)}")
    
    # ---- Unique values ----
    unique_vals = df_reduced[col].nunique(dropna=False)
    print(f"Unique values (incl NaN): {unique_vals}")
    
    # ---- Value distribution ----
    print("\nValue counts:")
    print(df_reduced[col].value_counts(dropna=False))
    
    # ---- Check for inconsistent labels ----
    if df_reduced[col].dtype == "object":
        print("\nUnique raw labels:")
        print(df_reduced[col].unique())

Columns under inspection: ['Search', 'Magazine', 'Newspaper Article', 'X Education Forums', 'Newspaper', 'Digital Advertisement']

Column: Search
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 2

Value counts:
Search
No     9226
Yes      14
Name: count, dtype: int64

Unique raw labels:
['No' 'Yes']

Column: Magazine
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 1

Value counts:
Magazine
No    9240
Name: count, dtype: int64

Unique raw labels:
['No']

Column: Newspaper Article
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 2

Value counts:
Newspaper Article
No     9238
Yes       2
Name: count, dtype: int64

Unique raw labels:
['No' 'Yes']

Column: X Education Forums
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 2

Value counts:
X Education Forums
No     9239
Yes       1
Name: count, dtype: int64

Unique raw labels:
['No' 'Yes']

Column: Newspaper
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 2

Value counts:
Newspaper
No     9239
Yes       1
Name: count, dtype: int64



In [None]:
cols_to_check = df_reduced.columns[22:28]
print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*80)
    print(f"Column: {col}")
    
    # ---- Missing ----
    nan_count = df_reduced[col].isna().sum()
    nan_pct = nan_count / len(df_reduced) * 100
    
    print(f"NaN count: {nan_count}")
    print(f"NaN %: {round(nan_pct, 4)}")
    
    # ---- Unique values ----
    unique_vals = df_reduced[col].nunique(dropna=False)
    print(f"Unique values (incl NaN): {unique_vals}")
    
    # ---- Value distribution ----
    print("\nValue counts:")
    print(df_reduced[col].value_counts(dropna=False).head(20))
    
    # ---- Check for pseudo-missing ----
    if df_reduced[col].dtype == "object":
        select_count = df_reduced[col].astype(str).str.contains("Select", case=False, na=False).sum()
        not_provided_count = df_reduced[col].astype(str).str.contains("Not Provided", case=False, na=False).sum()
        
        if select_count > 0:
            print(f"'Select' occurrences: {select_count}")
        if not_provided_count > 0:
            print(f"'Not Provided' occurrences: {not_provided_count}")
            

Columns under inspection: ['Through Recommendations', 'Receive More Updates About Our Courses', 'Tags', 'Lead Quality', 'Update me on Supply Chain Content', 'Get updates on DM Content']

Column: Through Recommendations
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 2

Value counts:
Through Recommendations
No     9233
Yes       7
Name: count, dtype: int64

Column: Receive More Updates About Our Courses
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 1

Value counts:
Receive More Updates About Our Courses
No    9240
Name: count, dtype: int64

Column: Tags
NaN count: 3353
NaN %: 36.2879
Unique values (incl NaN): 27

Value counts:
Tags
NaN                                                  3353
Will revert after reading the email                  2072
Ringing                                              1203
Interested in other courses                           513
Already a student                                     465
Closed by Horizzon                                    358
switche

In [None]:
start_index = 28  # letzte geprüfte Position anpassen falls nötig
cols_to_check = df_reduced.columns[start_index:start_index+6]

print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*80)
    print(f"Column: {col}")
    
    # ---- Missing ----
    nan_count = df_reduced[col].isna().sum()
    nan_pct = nan_count / len(df_reduced) * 100
    
    print(f"NaN count: {nan_count}")
    print(f"NaN %: {round(nan_pct, 4)}")
    
    # ---- Unique values ----
    unique_vals = df_reduced[col].nunique(dropna=False)
    print(f"Unique values (incl NaN): {unique_vals}")
    
    # ---- Value distribution ----
    print("\nTop value counts:")
    print(df_reduced[col].value_counts(dropna=False).head(15))
    
    # ---- Pseudo-Missing ----
    if df_reduced[col].dtype == "object":
        select_count = df_reduced[col].astype(str).str.contains("Select", case=False, na=False).sum()
        not_provided_count = df_reduced[col].astype(str).str.contains("Not Provided", case=False, na=False).sum()
        
        if select_count > 0:
            print(f"'Select' occurrences: {select_count}")
        if not_provided_count > 0:
            print(f"'Not Provided' occurrences: {not_provided_count}")

Columns under inspection: ['Lead Profile', 'City', 'Last Notable Activity']

Column: Lead Profile
NaN count: 2709
NaN %: 29.3182
Unique values (incl NaN): 7

Top value counts:
Lead Profile
Select                         4146
NaN                            2709
Potential Lead                 1613
Other Leads                     487
Student of SomeSchool           241
Lateral Student                  24
Dual Specialization Student      20
Name: count, dtype: int64
'Select' occurrences: 4146

Column: City
NaN count: 1420
NaN %: 15.368
Unique values (incl NaN): 8

Top value counts:
City
Mumbai                         3222
Select                         2249
NaN                            1420
Thane & Outskirts               752
Other Cities                    686
Other Cities of Maharashtra     457
Other Metro Cities              380
Tier II Cities                   74
Name: count, dtype: int64
'Select' occurrences: 2249

Column: Last Notable Activity
NaN count: 0
NaN %: 0.0
Unique values 

In [None]:
import pandas as pd
import numpy as np

print("="*80)
print("CLEANING: Converting pseudo-missing values to NaN")
print("="*80)

df_clean = df_reduced.copy()

# Define pseudo-missing tokens
PSEUDO_TOKENS = ["Select", "Not Provided", "", " "]

# Replace across entire dataframe
for col in df_clean.columns:
    if df_clean[col].dtype == "object":
        df_clean[col] = df_clean[col].replace(PSEUDO_TOKENS, pd.NA)

print("Pseudo-missing values replaced with NaN.")

# ------------------------------------------------------------
# Check how many NaNs we now have per column
# ------------------------------------------------------------

missing_summary = (
    df_clean.isna()
    .sum()
    .sort_values(ascending=False)
)

print("\nUpdated missing summary (top 15 columns):")
print(missing_summary.head(15))

print("\nCleaning complete.")

CLEANING: Converting pseudo-missing values to NaN
Pseudo-missing values replaced with NaN.

Updated missing summary (top 15 columns):
How did you hear about X Education               7250
Lead Profile                                     6855
Lead Quality                                     4767
City                                             3669
Specialization                                   3380
Tags                                             3353
What matters most to you in choosing a course    2709
What is your current occupation                  2690
Country                                          2461
TotalVisits                                       137
Page Views Per Visit                              137
Last Activity                                     103
Lead Source                                        36
Converted                                           0
Do Not Call                                         0
dtype: int64

Cleaning complete.


In [None]:
print("="*80)
print("Creating tracking vs non-tracking datasets")
print("="*80)

tracking_missing_mask = df_clean["TotalVisits"].isna()

df_no_tracking = df_clean.loc[tracking_missing_mask].copy()
df_with_tracking = df_clean.loc[~tracking_missing_mask].copy()

print(f"Total rows: {len(df_clean)}")
print(f"Rows WITHOUT tracking (NaN TotalVisits): {len(df_no_tracking)}")
print(f"Rows WITH tracking: {len(df_with_tracking)}")

print("\nConversion comparison:")
print(f"No tracking conversion: {round(df_no_tracking['Converted'].mean(), 4)}")
print(f"With tracking conversion: {round(df_with_tracking['Converted'].mean(), 4)}")

Creating tracking vs non-tracking datasets
Total rows: 9240
Rows WITHOUT tracking (NaN TotalVisits): 137
Rows WITH tracking: 9103

Conversion comparison:
No tracking conversion: 0.7299
With tracking conversion: 0.3802


In [None]:
df_clean.to_csv("../data/processed/lead_scoring_cleaned.csv", index=False)
df_with_tracking.to_csv("../data/processed/lead_scoring_with_tracking.csv", index=False)
df_no_tracking.to_csv("../data/processed/lead_scoring_no_tracking.csv", index=False)

print("Processed datasets saved.")

Processed datasets saved.


In [None]:
df_clean["Lead Source"].unique()

array(['Olark Chat', 'Organic Search', 'Direct Traffic', 'Google',
       'Referral Sites', 'Welingak Website', 'Reference', 'google',
       'Facebook', nan, 'blog', 'Pay per Click Ads', 'bing',
       'Social Media', 'WeLearn', 'Click2call', 'Live Chat',
       'welearnblog_Home', 'youtubechannel', 'testone', 'Press_Release',
       'NC_EDM'], dtype=object)

In [None]:
df_clean["Lead Source"] = df_clean["Lead Source"].str.strip().str.title()
df_clean["Country"] = df_clean["Country"].replace("unknown", pd.NA)
df_clean[["TotalVisits", 
          "Total Time Spent on Website", 
          "Page Views Per Visit"]].describe()
df_clean.dtypes
df_clean["Converted"].value_counts(normalize=True)
print("="*100)
print("FINAL DATA AUDIT")
print("="*100)

# ------------------------------------------------------------
# 1️⃣ Shape
# ------------------------------------------------------------
print("\n1) DATASET SHAPE")
print("-"*50)
print("Rows:", df_clean.shape[0])
print("Columns:", df_clean.shape[1])


# ------------------------------------------------------------
# 2️⃣ Missing Overview
# ------------------------------------------------------------
print("\n2) MISSING SUMMARY (Top 15)")
print("-"*50)

missing_summary = (
    df_clean.isna()
    .sum()
    .sort_values(ascending=False)
)

print(missing_summary.head(15))


# ------------------------------------------------------------
# 3️⃣ Data Types
# ------------------------------------------------------------
print("\n3) DATA TYPES")
print("-"*50)
print(df_clean.dtypes)


# ------------------------------------------------------------
# 4️⃣ Numeric Summary
# ------------------------------------------------------------
print("\n4) NUMERIC SUMMARY")
print("-"*50)

numeric_cols = df_clean.select_dtypes(include=["int64", "float64"]).columns
print(df_clean[numeric_cols].describe())


# ------------------------------------------------------------
# 5️⃣ Target Distribution
# ------------------------------------------------------------
print("\n5) TARGET DISTRIBUTION")
print("-"*50)

print(df_clean["Converted"].value_counts())
print("\nTarget ratio:")
print(df_clean["Converted"].value_counts(normalize=True))


# ------------------------------------------------------------
# 6️⃣ Check for negative values in numeric columns
# ------------------------------------------------------------
print("\n6) NEGATIVE VALUE CHECK")
print("-"*50)

for col in numeric_cols:
    if (df_clean[col] < 0).any():
        print(f"Negative values found in {col}")
    else:
        print(f"{col}: OK (no negatives)")


# ------------------------------------------------------------
# 7️⃣ Lead Source normalization check
# ------------------------------------------------------------
print("\n7) LEAD SOURCE UNIQUE VALUES")
print("-"*50)

print(df_clean["Lead Source"].unique())


print("\nAUDIT COMPLETE")
print("="*100)

FINAL DATA AUDIT

1) DATASET SHAPE
--------------------------------------------------
Rows: 9240
Columns: 31

2) MISSING SUMMARY (Top 15)
--------------------------------------------------
How did you hear about X Education               7250
Lead Profile                                     6855
Lead Quality                                     4767
City                                             3669
Specialization                                   3380
Tags                                             3353
What matters most to you in choosing a course    2709
What is your current occupation                  2690
Country                                          2466
TotalVisits                                       137
Page Views Per Visit                              137
Last Activity                                     103
Lead Source                                        36
Converted                                           0
Do Not Call                                         0
d

In [None]:
print("="*80)
print("Overwriting processed dataset")
print("="*80)

output_path = "../data/processed/lead_scoring_cleaned.csv"

df_clean.to_csv(output_path, index=False)

print(f"File overwritten at: {output_path}")
print("Overwrite complete.")


Overwriting processed dataset
File overwritten at: ../data/processed/lead_scoring_cleaned.csv
Overwrite complete.


In [None]:
df_test = pd.read_csv("../data/processed/lead_scoring_cleaned.csv")
print(df_test.shape)

(9240, 31)


In [None]:
print("="*100)
print("DATA TYPE INSPECTION")
print("="*100)

for col in df_clean.columns:
    print("\n" + "-"*60)
    print(f"Column: {col}")
    print(f"Data Type: {df_clean[col].dtype}")
    
    unique_count = df_clean[col].nunique(dropna=False)
    print(f"Unique Values (incl NaN): {unique_count}")
    
    # Check for numeric-looking object columns
    if df_clean[col].dtype == "object":
        sample_vals = df_clean[col].dropna().astype(str).head(5).tolist()
        print("Sample Values:", sample_vals)
    
    # Check for suspicious numeric columns
    if df_clean[col].dtype in ["int64", "float64"]:
        print("Min:", df_clean[col].min())
        print("Max:", df_clean[col].max())

print("\nDATA TYPE CHECK COMPLETE")
print("="*100)

DATA TYPE INSPECTION

------------------------------------------------------------
Column: Prospect ID
Data Type: object
Unique Values (incl NaN): 9240
Sample Values: ['7927b2df-8bba-4d29-b9a2-b6e0beafe620', '2a272436-5132-4136-86fa-dcc88c88f482', '8cc8c611-a219-4f35-ad23-fdfd2656bd8a', '0cc2df48-7cf4-4e39-9de9-19797f9b38cc', '3256f628-e534-4826-9d63-4a8b88782852']

------------------------------------------------------------
Column: Lead Number
Data Type: int64
Unique Values (incl NaN): 9240
Min: 579533
Max: 660737

------------------------------------------------------------
Column: Lead Origin
Data Type: object
Unique Values (incl NaN): 5
Sample Values: ['API', 'API', 'Landing Page Submission', 'Landing Page Submission', 'Landing Page Submission']

------------------------------------------------------------
Column: Lead Source
Data Type: object
Unique Values (incl NaN): 21
Sample Values: ['Olark Chat', 'Organic Search', 'Direct Traffic', 'Direct Traffic', 'Google']

---------------

In [None]:
print("="*100)
print("REMOVING LEAKAGE + ID COLUMNS")
print("="*100)

# ------------------------------------------------------------
# 1️⃣ Define columns to remove
# ------------------------------------------------------------

DROP_COLUMNS = [
    "Prospect ID",
    "Lead Number",
    "Lead Quality",
    "Tags",
    "Lead Profile",
    "Last Notable Activity"
]

# ------------------------------------------------------------
# 2️⃣ Safety check (drop only if exists)
# ------------------------------------------------------------

existing_drop = [col for col in DROP_COLUMNS if col in df_clean.columns]

print("\nColumns to remove:")
for col in existing_drop:
    print("-", col)

# ------------------------------------------------------------
# 3️⃣ Drop columns
# ------------------------------------------------------------

df_model = df_clean.drop(columns=existing_drop).copy()

print("\nColumns removed successfully.")
print("\nNew dataset shape:")
print("Rows:", df_model.shape[0])
print("Columns:", df_model.shape[1])

print("\nRemaining columns:")
print(df_model.columns.tolist())

print("\nLEAKAGE REMOVAL COMPLETE")
print("="*100)

REMOVING LEAKAGE + ID COLUMNS

Columns to remove:
- Prospect ID
- Lead Number
- Lead Quality
- Tags
- Lead Profile
- Last Notable Activity

Columns removed successfully.

New dataset shape:
Rows: 9240
Columns: 25

Remaining columns:
['Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call', 'Converted', 'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit', 'Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'What matters most to you in choosing a course', 'Search', 'Magazine', 'Newspaper Article', 'X Education Forums', 'Newspaper', 'Digital Advertisement', 'Through Recommendations', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content', 'Get updates on DM Content', 'City']

LEAKAGE REMOVAL COMPLETE


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

print("="*100)
print("LOGISTIC REGRESSION PIPELINE START")
print("="*100)

# ------------------------------------------------------------
# 1️⃣ Copy dataset
# ------------------------------------------------------------

df = df_model.copy()

# ------------------------------------------------------------
# 2️⃣ Separate target
# ------------------------------------------------------------

y = df["Converted"]
X = df.drop(columns=["Converted"])

# ------------------------------------------------------------
# 3️⃣ Identify column types
# ------------------------------------------------------------

categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("\nCategorical columns:", len(categorical_cols))
print(categorical_cols)

print("\nNumeric columns:", len(numeric_cols))
print(numeric_cols)

# ------------------------------------------------------------
# 4️⃣ Fill missing for categorical columns only
# ------------------------------------------------------------

for col in categorical_cols:
    X[col] = X[col].fillna("Missing")

print("\nMissing values handled for categorical features.")

# ------------------------------------------------------------
# 5️⃣ Train/Test split
# ------------------------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("\nTrain/Test split complete.")
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

# ------------------------------------------------------------
# 6️⃣ Column Transformer (One-Hot for categorical)
# ------------------------------------------------------------

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

# ------------------------------------------------------------
# 7️⃣ Logistic Regression Model
# ------------------------------------------------------------

model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# ------------------------------------------------------------
# 8️⃣ Fit model
# ------------------------------------------------------------

model.fit(X_train, y_train)

print("\nModel training complete.")

# ------------------------------------------------------------
# 9️⃣ Predictions
# ------------------------------------------------------------

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nROC-AUC Score:")
print(roc_auc_score(y_test, y_proba))

print("\nPIPELINE COMPLETE")
print("="*100)

LOGISTIC REGRESSION PIPELINE START


NameError: name 'df_model' is not defined

In [None]:
import sys
print(sys.executable)
print(sys.version)


/Library/Developer/CommandLineTools/usr/bin/python3
3.9.6 (default, Apr 30 2025, 02:07:18) 
[Clang 17.0.0 (clang-1700.0.13.5)]


In [None]:
import sklearn
print(sklearn.__version__)

1.6.1


In [4]:
import pandas as pd

# ------------------------------------------------------------
# 1️⃣ Load cleaned dataset
# ------------------------------------------------------------

df_clean = pd.read_csv("../data/processed/lead_scoring_cleaned.csv")

print("Loaded cleaned dataset:", df_clean.shape)

# ------------------------------------------------------------
# 2️⃣ Remove leakage + ID columns
# ------------------------------------------------------------

DROP_COLUMNS = [
    "Prospect ID",
    "Lead Number",
    "Lead Quality",
    "Tags",
    "Lead Profile",
    "Last Notable Activity"
]

existing_drop = [col for col in DROP_COLUMNS if col in df_clean.columns]

df_model = df_clean.drop(columns=existing_drop).copy()

print("Model dataset shape:", df_model.shape)
print("Columns ready for modeling.")

Loaded cleaned dataset: (9240, 31)
Model dataset shape: (9240, 25)
Columns ready for modeling.


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

print("="*100)
print("LOGISTIC REGRESSION PIPELINE START")
print("="*100)

# ------------------------------------------------------------
# 1️⃣ Copy dataset
# ------------------------------------------------------------

df = df_model.copy()

# ------------------------------------------------------------
# 2️⃣ Separate target
# ------------------------------------------------------------

y = df["Converted"]
X = df.drop(columns=["Converted"])

# ------------------------------------------------------------
# 3️⃣ Identify column types
# ------------------------------------------------------------

categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("\nCategorical columns:", len(categorical_cols))
print(categorical_cols)

print("\nNumeric columns:", len(numeric_cols))
print(numeric_cols)

# ------------------------------------------------------------
# 4️⃣ Fill missing for categorical columns only
# ------------------------------------------------------------

for col in categorical_cols:
    X[col] = X[col].fillna("Missing")

print("\nMissing values handled for categorical features.")

# ------------------------------------------------------------
# 5️⃣ Train/Test split
# ------------------------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("\nTrain/Test split complete.")
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

# ------------------------------------------------------------
# 6️⃣ Column Transformer (One-Hot for categorical)
# ------------------------------------------------------------

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

# ------------------------------------------------------------
# 7️⃣ Logistic Regression Model
# ------------------------------------------------------------

model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# ------------------------------------------------------------
# 8️⃣ Fit model
# ------------------------------------------------------------

model.fit(X_train, y_train)

print("\nModel training complete.")

# ------------------------------------------------------------
# 9️⃣ Predictions
# ------------------------------------------------------------

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nROC-AUC Score:")
print(roc_auc_score(y_test, y_proba))

print("\nPIPELINE COMPLETE")
print("="*100)

LOGISTIC REGRESSION PIPELINE START

Categorical columns: 21
['Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call', 'Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'What matters most to you in choosing a course', 'Search', 'Magazine', 'Newspaper Article', 'X Education Forums', 'Newspaper', 'Digital Advertisement', 'Through Recommendations', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content', 'Get updates on DM Content', 'City']

Numeric columns: 3
['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']

Missing values handled for categorical features.

Train/Test split complete.
Train size: (7392, 24)
Test size: (1848, 24)


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

from sklearn.impute import SimpleImputer

print("="*100)
print("LOGISTIC REGRESSION PIPELINE START (with imputers)")
print("="*100)

# ------------------------------------------------------------
# 1️⃣ Copy dataset
# ------------------------------------------------------------
df = df_model.copy()

# ------------------------------------------------------------
# 2️⃣ Separate target
# ------------------------------------------------------------
y = df["Converted"]
X = df.drop(columns=["Converted"])

# ------------------------------------------------------------
# 3️⃣ Identify column types
# ------------------------------------------------------------
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("\nCategorical columns:", len(categorical_cols))
print(categorical_cols)

print("\nNumeric columns:", len(numeric_cols))
print(numeric_cols)

# Quick NaN check before pipeline (informational)
print("\nNaNs in numeric cols BEFORE pipeline:")
print(X[numeric_cols].isna().sum())

# ------------------------------------------------------------
# 4️⃣ Train/Test split
# ------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("\nTrain/Test split complete.")
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

# ------------------------------------------------------------
# 5️⃣ Preprocessing pipelines
# ------------------------------------------------------------
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, categorical_cols),
        ("num", num_pipeline, numeric_cols)
    ]
)

# ------------------------------------------------------------
# 6️⃣ Logistic Regression Model
# ------------------------------------------------------------
model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=2000))
])

# ------------------------------------------------------------
# 7️⃣ Fit
# ------------------------------------------------------------
model.fit(X_train, y_train)
print("\nModel training complete.")

# ------------------------------------------------------------
# 8️⃣ Predict + Evaluate
# ------------------------------------------------------------
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nROC-AUC Score:")
print(round(roc_auc_score(y_test, y_proba), 4))

print("\nPIPELINE COMPLETE")
print("="*100)

LOGISTIC REGRESSION PIPELINE START (with imputers)

Categorical columns: 21
['Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call', 'Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'What matters most to you in choosing a course', 'Search', 'Magazine', 'Newspaper Article', 'X Education Forums', 'Newspaper', 'Digital Advertisement', 'Through Recommendations', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content', 'Get updates on DM Content', 'City']

Numeric columns: 3
['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']

NaNs in numeric cols BEFORE pipeline:
TotalVisits                    137
Total Time Spent on Website      0
Page Views Per Visit           137
dtype: int64

Train/Test split complete.
Train size: (7392, 24)
Test size: (1848, 24)

Model training complete.

Classification Report:
              precision    recall  f1-score   support

           0       0.8

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
import numpy as np
import pandas as pd

print("="*100)
print("MODEL INTERPRETATION: TOP COEFFICIENTS + ODDS RATIOS")
print("="*100)

# 1) Extract components
pre = model.named_steps["preprocessing"]
clf = model.named_steps["classifier"]

# 2) Get feature names from ColumnTransformer
cat_pipe = pre.named_transformers_["cat"]
ohe = cat_pipe.named_steps["ohe"]
cat_feature_names = ohe.get_feature_names_out(categorical_cols)

num_feature_names = np.array(numeric_cols, dtype=object)
feature_names = np.concatenate([cat_feature_names, num_feature_names])

# 3) Pull coefficients
coefs = clf.coef_.ravel()
intercept = float(clf.intercept_[0])

coef_df = pd.DataFrame({
    "feature": feature_names,
    "coef_logit": coefs,
    "odds_ratio": np.exp(coefs),
    "abs_coef": np.abs(coefs),
}).sort_values("abs_coef", ascending=False)

print("\nIntercept (log-odds):", round(intercept, 4))
print("Intercept (odds):", round(np.exp(intercept), 4))

# 4) Show top drivers by absolute effect
TOP_N = 25

print("\nTOP FEATURES BY ABSOLUTE EFFECT (|coef|)")
print("-"*80)
display(coef_df.head(TOP_N)[["feature", "coef_logit", "odds_ratio"]])

# 5) Show strongest positive and negative
print("\nTOP POSITIVE DRIVERS (increase conversion odds)")
print("-"*80)
display(coef_df.sort_values("coef_logit", ascending=False).head(TOP_N)[["feature", "coef_logit", "odds_ratio"]])

print("\nTOP NEGATIVE DRIVERS (decrease conversion odds)")
print("-"*80)
display(coef_df.sort_values("coef_logit", ascending=True).head(TOP_N)[["feature", "coef_logit", "odds_ratio"]])

# 6) Numeric-only effects
print("\nNUMERIC FEATURES (odds ratios per +1 unit)")
print("-"*80)
num_df = coef_df[coef_df["feature"].isin(numeric_cols)].copy()
display(num_df[["feature", "coef_logit", "odds_ratio"]].sort_values("abs_coef", ascending=False))

print("\nINTERPRETATION BLOCK COMPLETE")
print("="*100)

MODEL INTERPRETATION: TOP COEFFICIENTS + ODDS RATIOS

Intercept (log-odds): -0.218
Intercept (odds): 0.8041

TOP FEATURES BY ABSOLUTE EFFECT (|coef|)
--------------------------------------------------------------------------------


Unnamed: 0,feature,coef_logit,odds_ratio
20,Lead Source_Welingak Website,1.96885,7.162434
2,Lead Origin_Lead Add Form,1.695148,5.447453
60,Country_Missing,1.649572,5.20475
113,What is your current occupation_Working Profes...,1.577276,4.841747
37,Last Activity_SMS Sent,1.446735,4.249217
32,Last Activity_Had a Phone Conversation,1.348164,3.850348
34,Last Activity_Olark Chat Conversation,-1.269479,0.280978
1,Lead Origin_Landing Page Submission,-1.054339,0.348423
39,Last Activity_Unsubscribed,0.971903,2.642968
26,Last Activity_Converted to Lead,-0.965445,0.380814



TOP POSITIVE DRIVERS (increase conversion odds)
--------------------------------------------------------------------------------


Unnamed: 0,feature,coef_logit,odds_ratio
20,Lead Source_Welingak Website,1.96885,7.162434
2,Lead Origin_Lead Add Form,1.695148,5.447453
60,Country_Missing,1.649572,5.20475
113,What is your current occupation_Working Profes...,1.577276,4.841747
37,Last Activity_SMS Sent,1.446735,4.249217
32,Last Activity_Had a Phone Conversation,1.348164,3.850348
39,Last Activity_Unsubscribed,0.971903,2.642968
114,What matters most to you in choosing a course_...,0.766066,2.151285
108,What is your current occupation_Housewife,0.756433,2.130662
21,Do Not Email_No,0.623653,1.865731



TOP NEGATIVE DRIVERS (decrease conversion odds)
--------------------------------------------------------------------------------


Unnamed: 0,feature,coef_logit,odds_ratio
34,Last Activity_Olark Chat Conversation,-1.269479,0.280978
1,Lead Origin_Landing Page Submission,-1.054339,0.348423
26,Last Activity_Converted to Lead,-0.965445,0.380814
8,Lead Source_Facebook,-0.962947,0.381766
3,Lead Origin_Lead Import,-0.962521,0.381929
112,What is your current occupation_Unemployed,-0.897208,0.407706
111,What is your current occupation_Student,-0.819574,0.440619
33,Last Activity_Missing,-0.808177,0.44567
22,Do Not Email_Yes,-0.784683,0.456264
116,What matters most to you in choosing a course_...,-0.778979,0.458874



NUMERIC FEATURES (odds ratios per +1 unit)
--------------------------------------------------------------------------------


KeyError: 'abs_coef'

In [8]:
print("\nNUMERIC FEATURES (odds ratios per +1 unit)")
print("-"*80)

num_df = coef_df[coef_df["feature"].isin(numeric_cols)].copy()
num_df["abs_coef"] = num_df["coef_logit"].abs()   # <- robust, egal was vorher war

display(num_df.sort_values("abs_coef", ascending=False)[["feature", "coef_logit", "odds_ratio"]])


NUMERIC FEATURES (odds ratios per +1 unit)
--------------------------------------------------------------------------------


Unnamed: 0,feature,coef_logit,odds_ratio
142,Page Views Per Visit,-0.096891,0.907655
140,TotalVisits,0.040456,1.041286
141,Total Time Spent on Website,0.002048,1.00205


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

def train_logit_auc(df_in, drop_cols=None):
    df = df_in.copy()
    drop_cols = drop_cols or []
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])

    y = df["Converted"]
    X = df.drop(columns=["Converted"])

    categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
    numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    cat_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ])
    num_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median"))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", cat_pipeline, categorical_cols),
            ("num", num_pipeline, numeric_cols)
        ]
    )

    model = Pipeline(steps=[
        ("preprocessing", preprocessor),
        ("classifier", LogisticRegression(max_iter=10000, solver="saga"))
    ])

    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, proba)
    return auc

auc_full = train_logit_auc(df_model, drop_cols=[])
auc_no_last_activity = train_logit_auc(df_model, drop_cols=["Last Activity"])

print("AUC full model:", round(auc_full, 4))
print("AUC without Last Activity:", round(auc_no_last_activity, 4))
print("ΔAUC (drop):", round(auc_full - auc_no_last_activity, 4))

AUC full model: 0.8596
AUC without Last Activity: 0.838
ΔAUC (drop): 0.0216


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, classification_report

print("="*100)
print("STRUCTURAL / CAUSAL MODEL")
print("="*100)

df = df_model.copy()

# ------------------------------------------------------------
# 1️⃣ Remove process / funnel proximity features
# ------------------------------------------------------------

DROP_COLS = [
    "Last Activity",
    "Last Notable Activity",
    "Tags",
    "Lead Quality"
]

df = df.drop(columns=[c for c in DROP_COLS if c in df.columns])

# ------------------------------------------------------------
# 2️⃣ Separate target
# ------------------------------------------------------------

y = df["Converted"]
X = df.drop(columns=["Converted"])

# ------------------------------------------------------------
# 3️⃣ Identify types
# ------------------------------------------------------------

categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("\nCategorical:", len(categorical_cols))
print("Numeric:", len(numeric_cols))

# ------------------------------------------------------------
# 4️⃣ Pipelines
# ------------------------------------------------------------

cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),  # <- key change
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, categorical_cols),
        ("num", num_pipeline, numeric_cols)
    ]
)

model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=5000))
])

# ------------------------------------------------------------
# 5️⃣ Train/Test
# ------------------------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

model.fit(X_train, y_train)

y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print("\nAUC:", round(roc_auc_score(y_test, y_proba), 4))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("="*100)

STRUCTURAL / CAUSAL MODEL

Categorical: 20
Numeric: 3

AUC: 0.827

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.87      0.83      1136
           1       0.75      0.64      0.69       712

    accuracy                           0.78      1848
   macro avg       0.77      0.75      0.76      1848
weighted avg       0.78      0.78      0.77      1848



In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

# ====================================================================================
# CONFIG
# ====================================================================================

TARGET = "Converted"

# Features we consider "process / funnel proximity" (excluded for structural/causal view)
PROCESS_COLS = ["Last Activity", "Last Notable Activity", "Tags", "Lead Quality"]

# Structural blocks
BLOCKS = {
    "A_Marketing": ["Lead Origin", "Lead Source"],
    "B_Profile": ["What is your current occupation", "Specialization"],
    "C_Behavior": ["TotalVisits", "Total Time Spent on Website", "Page Views Per Visit"],
    "D_Geo": ["Country", "City"],
}

RANDOM_STATE = 42
TEST_SIZE = 0.2
MAX_ITER = 5000

# ====================================================================================
# HELPERS
# ====================================================================================

def build_structural_df(df_model: pd.DataFrame) -> pd.DataFrame:
    """Start from df_model and remove process/proximity variables (structural view)."""
    df = df_model.copy()
    df = df.drop(columns=[c for c in PROCESS_COLS if c in df.columns])
    # Keep target
    assert TARGET in df.columns, f"Target '{TARGET}' not found."
    return df

def fit_logit_auc(df_in: pd.DataFrame, feature_cols: list[str]) -> float:
    """Fit a logistic regression with proper preprocessing and return test AUC."""
    # Select X/y
    X = df_in[feature_cols].copy()
    y = df_in[TARGET].copy()

    # Identify types
    categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
    numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    # Pipelines
    cat_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),  # structural: neutralize missingness
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ])

    num_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", cat_pipeline, categorical_cols),
            ("num", num_pipeline, numeric_cols)
        ]
    )

    model = Pipeline(steps=[
        ("preprocessing", preprocessor),
        ("classifier", LogisticRegression(max_iter=MAX_ITER))
    ])

    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, proba)
    return auc

def available_cols(df: pd.DataFrame, cols: list[str]) -> list[str]:
    return [c for c in cols if c in df.columns]

# ====================================================================================
# 1) PREPARE STRUCTURAL DATAFRAME
# ====================================================================================

df_struct = build_structural_df(df_model)

print("="*100)
print("STRUCTURAL BASE DATASET")
print("="*100)
print("Shape:", df_struct.shape)

all_feature_cols = [c for c in df_struct.columns if c != TARGET]

# ====================================================================================
# 2) FEATURE DROP TEST: Country / City
# ====================================================================================

print("\n" + "="*100)
print("FEATURE DROP TEST: GEO")
print("="*100)

# Structural Full (all features in df_struct)
auc_struct_full = fit_logit_auc(df_struct, feature_cols=all_feature_cols)

# Structural without Country
cols_no_country = [c for c in all_feature_cols if c != "Country"]
auc_no_country = fit_logit_auc(df_struct, feature_cols=cols_no_country)

# Structural without Country + City
cols_no_country_city = [c for c in all_feature_cols if c not in ["Country", "City"]]
auc_no_country_city = fit_logit_auc(df_struct, feature_cols=cols_no_country_city)

print(f"Structural FULL AUC:              {auc_struct_full:.4f}")
print(f"Structural without Country AUC:   {auc_no_country:.4f}   (Δ {auc_struct_full-auc_no_country:+.4f})")
print(f"Structural without Country+City:  {auc_no_country_city:.4f}   (Δ {auc_struct_full-auc_no_country_city:+.4f})")

# ====================================================================================
# 3) BLOCK CONTRIBUTION ANALYSIS
# ====================================================================================

print("\n" + "="*100)
print("BLOCK CONTRIBUTION ANALYSIS")
print("="*100)

# Ensure blocks only contain available columns
blocks_clean = {k: available_cols(df_struct, v) for k, v in BLOCKS.items()}

print("\nBlocks (available columns):")
for k, v in blocks_clean.items():
    print(f"- {k}: {v}")

# A) AUC using each block alone
print("\n--- AUC per Block (ALONE) ---")
block_alone_results = []
for block_name, cols in blocks_clean.items():
    if len(cols) == 0:
        continue
    auc = fit_logit_auc(df_struct, feature_cols=cols)
    block_alone_results.append((block_name, auc, cols))

block_alone_results = sorted(block_alone_results, key=lambda x: x[1], reverse=True)
for name, auc, cols in block_alone_results:
    print(f"{name:>12}: AUC={auc:.4f} | n_features={len(cols)}")

# B) Incremental build-up: A -> A+B -> A+B+C -> A+B+C+D
print("\n--- Incremental AUC (CUMULATIVE) ---")
order = ["A_Marketing", "B_Profile", "C_Behavior", "D_Geo"]

cum_cols = []
cum_results = []
for block_name in order:
    cols = blocks_clean.get(block_name, [])
    cum_cols += cols
    if len(cum_cols) == 0:
        continue
    auc = fit_logit_auc(df_struct, feature_cols=cum_cols)
    cum_results.append((block_name, auc, len(cum_cols)))

prev_auc = None
for block_name, auc, ncols in cum_results:
    delta = "" if prev_auc is None else f"(+{auc-prev_auc:.4f})"
    print(f"Up to {block_name:>12}: AUC={auc:.4f} {delta} | total_features={ncols}")
    prev_auc = auc

print("\nDONE.")

STRUCTURAL BASE DATASET
Shape: (9240, 24)

FEATURE DROP TEST: GEO
Structural FULL AUC:              0.8270
Structural without Country AUC:   0.8274   (Δ -0.0005)
Structural without Country+City:  0.8294   (Δ -0.0024)

BLOCK CONTRIBUTION ANALYSIS

Blocks (available columns):
- A_Marketing: ['Lead Origin', 'Lead Source']
- B_Profile: ['What is your current occupation', 'Specialization']
- C_Behavior: ['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']
- D_Geo: ['Country', 'City']

--- AUC per Block (ALONE) ---
  C_Behavior: AUC=0.7214 | n_features=3
 A_Marketing: AUC=0.6346 | n_features=2
   B_Profile: AUC=0.6103 | n_features=2
       D_Geo: AUC=0.5154 | n_features=2

--- Incremental AUC (CUMULATIVE) ---
Up to  A_Marketing: AUC=0.6346  | total_features=2
Up to    B_Profile: AUC=0.6997 (+0.0652) | total_features=4
Up to   C_Behavior: AUC=0.8273 (+0.1276) | total_features=7
Up to        D_Geo: AUC=0.8244 (+-0.0030) | total_features=9

DONE.
