In [6]:
import pandas as pd

# ============================================================
# 1) Entfernen der definierten Spalten
# ============================================================

DROP_COLUMNS = [
    "Asymmetrique Activity Index",
    "Asymmetrique Profile Index",
    "Asymmetrique Activity Score",
    "Asymmetrique Profile Score",
    "I agree to pay the amount through cheque",
    "A free copy of Mastering The Interview"
]

# Nur existierende Spalten droppen (Sicherheitscheck)
existing_drop = [c for c in DROP_COLUMNS if c in df.columns]
df_reduced = df.drop(columns=existing_drop).copy()

print("Remaining columns:", len(df_reduced.columns))


# ============================================================
# 2) NaNs in den ersten 3 Spalten prüfen
# ============================================================

first_three_cols = df_reduced.columns[:3]
print("\nFirst three columns:", list(first_three_cols))

for col in first_three_cols:
    missing_count = df_reduced[col].isna().sum()
    missing_pct = missing_count / len(df_reduced) * 100
    unique_values = df_reduced[col].nunique(dropna=False)

    print(f"\nColumn: {col}")
    print("Missing count:", missing_count)
    print("Missing %:", round(missing_pct, 4))
    print("Unique values:", unique_values)

Remaining columns: 31

First three columns: ['Prospect ID', 'Lead Number', 'Lead Origin']

Column: Prospect ID
Missing count: 0
Missing %: 0.0
Unique values: 9240

Column: Lead Number
Missing count: 0
Missing %: 0.0
Unique values: 9240

Column: Lead Origin
Missing count: 0
Missing %: 0.0
Unique values: 5


In [7]:
cols_to_check = df_reduced.columns[3:6]
print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*60)
    print(f"Column: {col}")
    
    # Missing values
    missing_count = df_reduced[col].isna().sum()
    missing_pct = missing_count / len(df_reduced) * 100
    
    print("Missing count:", missing_count)
    print("Missing %:", round(missing_pct, 4))
    
    # Unique values
    unique_values = df_reduced[col].nunique(dropna=False)
    print("Number of unique values (incl NaN):", unique_values)
    
    # Value counts (inkl NaN)
    print("\nTop value counts:")
    print(df_reduced[col].value_counts(dropna=False).head(15))
    
    # Check for empty strings
    empty_string_count = (df_reduced[col] == "").sum()
    if empty_string_count > 0:
        print("Empty string count:", empty_string_count)
    
    # Check for whitespace-only entries
    whitespace_count = (df_reduced[col].astype(str).str.strip() == "").sum()
    if whitespace_count > 0:
        print("Whitespace-only entries:", whitespace_count)

Columns under inspection: ['Lead Source', 'Do Not Email', 'Do Not Call']

Column: Lead Source
Missing count: 36
Missing %: 0.3896
Number of unique values (incl NaN): 22

Top value counts:
Lead Source
Google              2868
Direct Traffic      2543
Olark Chat          1755
Organic Search      1154
Reference            534
Welingak Website     142
Referral Sites       125
Facebook              55
NaN                   36
bing                   6
google                 5
Click2call             4
Live Chat              2
Social Media           2
Press_Release          2
Name: count, dtype: int64

Column: Do Not Email
Missing count: 0
Missing %: 0.0
Number of unique values (incl NaN): 2

Top value counts:
Do Not Email
No     8506
Yes     734
Name: count, dtype: int64

Column: Do Not Call
Missing count: 0
Missing %: 0.0
Number of unique values (incl NaN): 2

Top value counts:
Do Not Call
No     9238
Yes       2
Name: count, dtype: int64


In [8]:
cols_to_check = df_reduced.columns[6:10]
print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*70)
    print(f"Column: {col}")
    
    # --- Missing check ---
    nan_mask = df_reduced[col].isna()
    n_nans = nan_mask.sum()
    
    if n_nans == 0:
        print("No NaNs found.")
    else:
        print(f"NaNs found: {n_nans}")
        display(
            df_reduced.loc[nan_mask,
                           ["Prospect ID", "Lead Number", "Lead Origin", col]]
            .head(20)
        )
    
    # --- Basic stats for numeric columns ---
    if pd.api.types.is_numeric_dtype(df_reduced[col]):
        print("\nBasic statistics:")
        print(df_reduced[col].describe())
        
        print("\nTop 10 values:")
        print(df_reduced[col].value_counts(dropna=False).head(10))
    
    # --- Unique values for non-numeric ---
    else:
        print("\nValue counts:")
        print(df_reduced[col].value_counts(dropna=False).head(15))

Columns under inspection: ['Converted', 'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']

Column: Converted
No NaNs found.

Basic statistics:
count    9240.000000
mean        0.385390
std         0.486714
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: Converted, dtype: float64

Top 10 values:
Converted
0    5679
1    3561
Name: count, dtype: int64

Column: TotalVisits
NaNs found: 137


Unnamed: 0,Prospect ID,Lead Number,Lead Origin,TotalVisits
77,895d4905-f534-4f18-915b-8d239a72b5dc,659722,Lead Add Form,
79,3a0ce10f-d2c1-4213-a2bc-4f97bcd29699,659710,Lead Add Form,
81,277ad6a6-4565-4a18-a1ff-e46e03f22663,659705,Lead Add Form,
88,68f496c2-0073-470f-9c3c-7fb48f060ce5,659631,Lead Add Form,
120,144807db-2895-4002-b52e-3eda79c22395,659283,Lead Add Form,
133,63ebde80-a465-4cdc-ab5a-5e880a7138b0,659158,Lead Add Form,
134,0298b9a5-fedb-408b-a284-2d357583600f,659153,Lead Add Form,
177,3b74e995-4407-44de-9e59-622afb514261,658648,Lead Add Form,
179,1730b5e8-e435-41c6-9082-b9c98976bd16,658627,Lead Add Form,
180,db2dc4b5-f603-4818-9b0c-0435923a4cd8,658623,Lead Add Form,



Basic statistics:
count    9103.000000
mean        3.445238
std         4.854853
min         0.000000
25%         1.000000
50%         3.000000
75%         5.000000
max       251.000000
Name: TotalVisits, dtype: float64

Top 10 values:
TotalVisits
0.0    2189
2.0    1680
3.0    1306
4.0    1120
5.0     783
6.0     466
1.0     395
7.0     309
8.0     224
9.0     164
Name: count, dtype: int64

Column: Total Time Spent on Website
No NaNs found.

Basic statistics:
count    9240.000000
mean      487.698268
std       548.021466
min         0.000000
25%        12.000000
50%       248.000000
75%       936.000000
max      2272.000000
Name: Total Time Spent on Website, dtype: float64

Top 10 values:
Total Time Spent on Website
0      2193
60       19
74       18
127      18
75       18
234      17
62       17
157      17
87       17
32       17
Name: count, dtype: int64

Column: Page Views Per Visit
NaNs found: 137


Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Page Views Per Visit
77,895d4905-f534-4f18-915b-8d239a72b5dc,659722,Lead Add Form,
79,3a0ce10f-d2c1-4213-a2bc-4f97bcd29699,659710,Lead Add Form,
81,277ad6a6-4565-4a18-a1ff-e46e03f22663,659705,Lead Add Form,
88,68f496c2-0073-470f-9c3c-7fb48f060ce5,659631,Lead Add Form,
120,144807db-2895-4002-b52e-3eda79c22395,659283,Lead Add Form,
133,63ebde80-a465-4cdc-ab5a-5e880a7138b0,659158,Lead Add Form,
134,0298b9a5-fedb-408b-a284-2d357583600f,659153,Lead Add Form,
177,3b74e995-4407-44de-9e59-622afb514261,658648,Lead Add Form,
179,1730b5e8-e435-41c6-9082-b9c98976bd16,658627,Lead Add Form,
180,db2dc4b5-f603-4818-9b0c-0435923a4cd8,658623,Lead Add Form,



Basic statistics:
count    9103.000000
mean        2.362820
std         2.161418
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max        55.000000
Name: Page Views Per Visit, dtype: float64

Top 10 values:
Page Views Per Visit
0.0    2189
2.0    1795
3.0    1196
4.0     896
1.0     651
5.0     517
1.5     306
6.0     244
2.5     241
NaN     137
Name: count, dtype: int64


In [9]:
print("="*80)
print("ANALYSIS: TotalVisits Missing Values")
print("="*80)

# Maske für Missing
missing_mask = df_reduced["TotalVisits"].isna()

n_missing = missing_mask.sum()
n_total = len(df_reduced)

print(f"\nTotal rows: {n_total}")
print(f"Missing TotalVisits: {n_missing}")
print(f"Percentage missing: {round(n_missing / n_total * 100, 4)} %")

# ------------------------------------------------------------
# 1️⃣ In welchen Lead Origins treten sie auf?
# ------------------------------------------------------------

print("\nLead Origin distribution for missing TotalVisits:")
print("-"*60)
print(df_reduced.loc[missing_mask, "Lead Origin"].value_counts())

# ------------------------------------------------------------
# 2️⃣ Conversion Rate Vergleich
# ------------------------------------------------------------

print("\nConversion comparison:")
print("-"*60)

conversion_missing = df_reduced.loc[missing_mask, "Converted"].mean()
conversion_not_missing = df_reduced.loc[~missing_mask, "Converted"].mean()
conversion_overall = df_reduced["Converted"].mean()

print(f"Overall conversion rate: {round(conversion_overall, 4)}")
print(f"Conversion (Missing TotalVisits): {round(conversion_missing, 4)}")
print(f"Conversion (Not Missing TotalVisits): {round(conversion_not_missing, 4)}")

# ------------------------------------------------------------
# 3️⃣ Beispielhafte Zeilen anzeigen
# ------------------------------------------------------------

print("\nSample rows with missing TotalVisits:")
print("-"*60)

display(
    df_reduced.loc[missing_mask,
                   ["Prospect ID", "Lead Number", "Lead Origin", 
                    "TotalVisits", "Page Views Per Visit", "Converted"]]
    .head(10)
)

print("\nAnalysis block complete.")

ANALYSIS: TotalVisits Missing Values

Total rows: 9240
Missing TotalVisits: 137
Percentage missing: 1.4827 %

Lead Origin distribution for missing TotalVisits:
------------------------------------------------------------
Lead Origin
Lead Add Form     110
Lead Import        24
API                 2
Quick Add Form      1
Name: count, dtype: int64

Conversion comparison:
------------------------------------------------------------
Overall conversion rate: 0.3854
Conversion (Missing TotalVisits): 0.7299
Conversion (Not Missing TotalVisits): 0.3802

Sample rows with missing TotalVisits:
------------------------------------------------------------


Unnamed: 0,Prospect ID,Lead Number,Lead Origin,TotalVisits,Page Views Per Visit,Converted
77,895d4905-f534-4f18-915b-8d239a72b5dc,659722,Lead Add Form,,,1
79,3a0ce10f-d2c1-4213-a2bc-4f97bcd29699,659710,Lead Add Form,,,1
81,277ad6a6-4565-4a18-a1ff-e46e03f22663,659705,Lead Add Form,,,1
88,68f496c2-0073-470f-9c3c-7fb48f060ce5,659631,Lead Add Form,,,1
120,144807db-2895-4002-b52e-3eda79c22395,659283,Lead Add Form,,,1
133,63ebde80-a465-4cdc-ab5a-5e880a7138b0,659158,Lead Add Form,,,1
134,0298b9a5-fedb-408b-a284-2d357583600f,659153,Lead Add Form,,,1
177,3b74e995-4407-44de-9e59-622afb514261,658648,Lead Add Form,,,1
179,1730b5e8-e435-41c6-9082-b9c98976bd16,658627,Lead Add Form,,,1
180,db2dc4b5-f603-4818-9b0c-0435923a4cd8,658623,Lead Add Form,,,1



Analysis block complete.


In [10]:
print("="*80)
print("ANALYSIS: Conversion Rate by Lead Origin")
print("="*80)

# Gesamt-Conversion
overall_conversion = df_reduced["Converted"].mean()
print(f"\nOverall Conversion Rate: {round(overall_conversion, 4)}")

print("\nConversion Rate by Lead Origin:")
print("-"*60)

origin_summary = (
    df_reduced
    .groupby("Lead Origin")["Converted"]
    .agg(["count", "mean"])
    .sort_values("mean", ascending=False)
)

print(origin_summary)

print("\nDetailed Interpretation:")
print("-"*60)

for origin, row in origin_summary.iterrows():
    print(f"{origin}:")
    print(f"  Number of Leads: {row['count']}")
    print(f"  Conversion Rate: {round(row['mean'], 4)}")
    print()

print("="*80)
print("Analysis complete.")

ANALYSIS: Conversion Rate by Lead Origin

Overall Conversion Rate: 0.3854

Conversion Rate by Lead Origin:
------------------------------------------------------------
                         count      mean
Lead Origin                             
Quick Add Form               1  1.000000
Lead Add Form              718  0.924791
Landing Page Submission   4886  0.361850
API                       3580  0.311453
Lead Import                 55  0.236364

Detailed Interpretation:
------------------------------------------------------------
Quick Add Form:
  Number of Leads: 1.0
  Conversion Rate: 1.0

Lead Add Form:
  Number of Leads: 718.0
  Conversion Rate: 0.9248

Landing Page Submission:
  Number of Leads: 4886.0
  Conversion Rate: 0.3619

API:
  Number of Leads: 3580.0
  Conversion Rate: 0.3115

Lead Import:
  Number of Leads: 55.0
  Conversion Rate: 0.2364

Analysis complete.


In [11]:
cols_to_check = df_reduced.columns[10:16]
print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*80)
    print(f"Column: {col}")
    
    # ---- Missing ----
    nan_count = df_reduced[col].isna().sum()
    nan_pct = nan_count / len(df_reduced) * 100
    
    print(f"NaN count: {nan_count}")
    print(f"NaN %: {round(nan_pct, 4)}")
    
    # ---- Unique values ----
    unique_vals = df_reduced[col].nunique(dropna=False)
    print(f"Unique values (incl NaN): {unique_vals}")
    
    # ---- Top values ----
    print("\nTop value counts:")
    print(df_reduced[col].value_counts(dropna=False).head(15))
    
    # ---- Pseudo-Missing Checks ----
    if df_reduced[col].dtype == "object":
        select_count = df_reduced[col].astype(str).str.contains("Select", case=False, na=False).sum()
        not_provided_count = df_reduced[col].astype(str).str.contains("Not Provided", case=False, na=False).sum()
        empty_count = (df_reduced[col] == "").sum()
        
        if select_count > 0:
            print(f"'Select' occurrences: {select_count}")
        if not_provided_count > 0:
            print(f"'Not Provided' occurrences: {not_provided_count}")
        if empty_count > 0:
            print(f"Empty string occurrences: {empty_count}")

Columns under inspection: ['Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'What matters most to you in choosing a course']

Column: Last Activity
NaN count: 103
NaN %: 1.1147
Unique values (incl NaN): 18

Top value counts:
Last Activity
Email Opened                    3437
SMS Sent                        2745
Olark Chat Conversation          973
Page Visited on Website          640
Converted to Lead                428
Email Bounced                    326
Email Link Clicked               267
Form Submitted on Website        116
NaN                              103
Unreachable                       93
Unsubscribed                      61
Had a Phone Conversation          30
Approached upfront                 9
View in browser link Clicked       6
Email Marked Spam                  2
Name: count, dtype: int64

Column: Country
NaN count: 2461
NaN %: 26.6342
Unique values (incl NaN): 39

Top value counts:
Country
India 

In [12]:
cols_to_check = df_reduced.columns[16:22]
print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*80)
    print(f"Column: {col}")
    
    # ---- Missing ----
    nan_count = df_reduced[col].isna().sum()
    nan_pct = nan_count / len(df_reduced) * 100
    
    print(f"NaN count: {nan_count}")
    print(f"NaN %: {round(nan_pct, 4)}")
    
    # ---- Unique values ----
    unique_vals = df_reduced[col].nunique(dropna=False)
    print(f"Unique values (incl NaN): {unique_vals}")
    
    # ---- Value distribution ----
    print("\nValue counts:")
    print(df_reduced[col].value_counts(dropna=False))
    
    # ---- Check for inconsistent labels ----
    if df_reduced[col].dtype == "object":
        print("\nUnique raw labels:")
        print(df_reduced[col].unique())

Columns under inspection: ['Search', 'Magazine', 'Newspaper Article', 'X Education Forums', 'Newspaper', 'Digital Advertisement']

Column: Search
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 2

Value counts:
Search
No     9226
Yes      14
Name: count, dtype: int64

Unique raw labels:
['No' 'Yes']

Column: Magazine
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 1

Value counts:
Magazine
No    9240
Name: count, dtype: int64

Unique raw labels:
['No']

Column: Newspaper Article
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 2

Value counts:
Newspaper Article
No     9238
Yes       2
Name: count, dtype: int64

Unique raw labels:
['No' 'Yes']

Column: X Education Forums
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 2

Value counts:
X Education Forums
No     9239
Yes       1
Name: count, dtype: int64

Unique raw labels:
['No' 'Yes']

Column: Newspaper
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 2

Value counts:
Newspaper
No     9239
Yes       1
Name: count, dtype: int64



In [13]:
cols_to_check = df_reduced.columns[22:28]
print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*80)
    print(f"Column: {col}")
    
    # ---- Missing ----
    nan_count = df_reduced[col].isna().sum()
    nan_pct = nan_count / len(df_reduced) * 100
    
    print(f"NaN count: {nan_count}")
    print(f"NaN %: {round(nan_pct, 4)}")
    
    # ---- Unique values ----
    unique_vals = df_reduced[col].nunique(dropna=False)
    print(f"Unique values (incl NaN): {unique_vals}")
    
    # ---- Value distribution ----
    print("\nValue counts:")
    print(df_reduced[col].value_counts(dropna=False).head(20))
    
    # ---- Check for pseudo-missing ----
    if df_reduced[col].dtype == "object":
        select_count = df_reduced[col].astype(str).str.contains("Select", case=False, na=False).sum()
        not_provided_count = df_reduced[col].astype(str).str.contains("Not Provided", case=False, na=False).sum()
        
        if select_count > 0:
            print(f"'Select' occurrences: {select_count}")
        if not_provided_count > 0:
            print(f"'Not Provided' occurrences: {not_provided_count}")
            

Columns under inspection: ['Through Recommendations', 'Receive More Updates About Our Courses', 'Tags', 'Lead Quality', 'Update me on Supply Chain Content', 'Get updates on DM Content']

Column: Through Recommendations
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 2

Value counts:
Through Recommendations
No     9233
Yes       7
Name: count, dtype: int64

Column: Receive More Updates About Our Courses
NaN count: 0
NaN %: 0.0
Unique values (incl NaN): 1

Value counts:
Receive More Updates About Our Courses
No    9240
Name: count, dtype: int64

Column: Tags
NaN count: 3353
NaN %: 36.2879
Unique values (incl NaN): 27

Value counts:
Tags
NaN                                                  3353
Will revert after reading the email                  2072
Ringing                                              1203
Interested in other courses                           513
Already a student                                     465
Closed by Horizzon                                    358
switche

In [14]:
start_index = 28  # letzte geprüfte Position anpassen falls nötig
cols_to_check = df_reduced.columns[start_index:start_index+6]

print("Columns under inspection:", list(cols_to_check))

for col in cols_to_check:
    print("\n" + "="*80)
    print(f"Column: {col}")
    
    # ---- Missing ----
    nan_count = df_reduced[col].isna().sum()
    nan_pct = nan_count / len(df_reduced) * 100
    
    print(f"NaN count: {nan_count}")
    print(f"NaN %: {round(nan_pct, 4)}")
    
    # ---- Unique values ----
    unique_vals = df_reduced[col].nunique(dropna=False)
    print(f"Unique values (incl NaN): {unique_vals}")
    
    # ---- Value distribution ----
    print("\nTop value counts:")
    print(df_reduced[col].value_counts(dropna=False).head(15))
    
    # ---- Pseudo-Missing ----
    if df_reduced[col].dtype == "object":
        select_count = df_reduced[col].astype(str).str.contains("Select", case=False, na=False).sum()
        not_provided_count = df_reduced[col].astype(str).str.contains("Not Provided", case=False, na=False).sum()
        
        if select_count > 0:
            print(f"'Select' occurrences: {select_count}")
        if not_provided_count > 0:
            print(f"'Not Provided' occurrences: {not_provided_count}")

Columns under inspection: ['Lead Profile', 'City', 'Last Notable Activity']

Column: Lead Profile
NaN count: 2709
NaN %: 29.3182
Unique values (incl NaN): 7

Top value counts:
Lead Profile
Select                         4146
NaN                            2709
Potential Lead                 1613
Other Leads                     487
Student of SomeSchool           241
Lateral Student                  24
Dual Specialization Student      20
Name: count, dtype: int64
'Select' occurrences: 4146

Column: City
NaN count: 1420
NaN %: 15.368
Unique values (incl NaN): 8

Top value counts:
City
Mumbai                         3222
Select                         2249
NaN                            1420
Thane & Outskirts               752
Other Cities                    686
Other Cities of Maharashtra     457
Other Metro Cities              380
Tier II Cities                   74
Name: count, dtype: int64
'Select' occurrences: 2249

Column: Last Notable Activity
NaN count: 0
NaN %: 0.0
Unique values 

In [15]:
import pandas as pd
import numpy as np

print("="*80)
print("CLEANING: Converting pseudo-missing values to NaN")
print("="*80)

df_clean = df_reduced.copy()

# Define pseudo-missing tokens
PSEUDO_TOKENS = ["Select", "Not Provided", "", " "]

# Replace across entire dataframe
for col in df_clean.columns:
    if df_clean[col].dtype == "object":
        df_clean[col] = df_clean[col].replace(PSEUDO_TOKENS, pd.NA)

print("Pseudo-missing values replaced with NaN.")

# ------------------------------------------------------------
# Check how many NaNs we now have per column
# ------------------------------------------------------------

missing_summary = (
    df_clean.isna()
    .sum()
    .sort_values(ascending=False)
)

print("\nUpdated missing summary (top 15 columns):")
print(missing_summary.head(15))

print("\nCleaning complete.")

CLEANING: Converting pseudo-missing values to NaN
Pseudo-missing values replaced with NaN.

Updated missing summary (top 15 columns):
How did you hear about X Education               7250
Lead Profile                                     6855
Lead Quality                                     4767
City                                             3669
Specialization                                   3380
Tags                                             3353
What matters most to you in choosing a course    2709
What is your current occupation                  2690
Country                                          2461
TotalVisits                                       137
Page Views Per Visit                              137
Last Activity                                     103
Lead Source                                        36
Converted                                           0
Do Not Call                                         0
dtype: int64

Cleaning complete.


In [16]:
print("="*80)
print("Creating tracking vs non-tracking datasets")
print("="*80)

tracking_missing_mask = df_clean["TotalVisits"].isna()

df_no_tracking = df_clean.loc[tracking_missing_mask].copy()
df_with_tracking = df_clean.loc[~tracking_missing_mask].copy()

print(f"Total rows: {len(df_clean)}")
print(f"Rows WITHOUT tracking (NaN TotalVisits): {len(df_no_tracking)}")
print(f"Rows WITH tracking: {len(df_with_tracking)}")

print("\nConversion comparison:")
print(f"No tracking conversion: {round(df_no_tracking['Converted'].mean(), 4)}")
print(f"With tracking conversion: {round(df_with_tracking['Converted'].mean(), 4)}")

Creating tracking vs non-tracking datasets
Total rows: 9240
Rows WITHOUT tracking (NaN TotalVisits): 137
Rows WITH tracking: 9103

Conversion comparison:
No tracking conversion: 0.7299
With tracking conversion: 0.3802


In [17]:
df_clean.to_csv("../data/processed/lead_scoring_cleaned.csv", index=False)
df_with_tracking.to_csv("../data/processed/lead_scoring_with_tracking.csv", index=False)
df_no_tracking.to_csv("../data/processed/lead_scoring_no_tracking.csv", index=False)

print("Processed datasets saved.")

Processed datasets saved.


In [18]:
df_clean["Lead Source"].unique()

array(['Olark Chat', 'Organic Search', 'Direct Traffic', 'Google',
       'Referral Sites', 'Welingak Website', 'Reference', 'google',
       'Facebook', nan, 'blog', 'Pay per Click Ads', 'bing',
       'Social Media', 'WeLearn', 'Click2call', 'Live Chat',
       'welearnblog_Home', 'youtubechannel', 'testone', 'Press_Release',
       'NC_EDM'], dtype=object)

In [19]:
df_clean["Lead Source"] = df_clean["Lead Source"].str.strip().str.title()
df_clean["Country"] = df_clean["Country"].replace("unknown", pd.NA)
df_clean[["TotalVisits", 
          "Total Time Spent on Website", 
          "Page Views Per Visit"]].describe()
df_clean.dtypes
df_clean["Converted"].value_counts(normalize=True)
print("="*100)
print("FINAL DATA AUDIT")
print("="*100)

# ------------------------------------------------------------
# 1️⃣ Shape
# ------------------------------------------------------------
print("\n1) DATASET SHAPE")
print("-"*50)
print("Rows:", df_clean.shape[0])
print("Columns:", df_clean.shape[1])


# ------------------------------------------------------------
# 2️⃣ Missing Overview
# ------------------------------------------------------------
print("\n2) MISSING SUMMARY (Top 15)")
print("-"*50)

missing_summary = (
    df_clean.isna()
    .sum()
    .sort_values(ascending=False)
)

print(missing_summary.head(15))


# ------------------------------------------------------------
# 3️⃣ Data Types
# ------------------------------------------------------------
print("\n3) DATA TYPES")
print("-"*50)
print(df_clean.dtypes)


# ------------------------------------------------------------
# 4️⃣ Numeric Summary
# ------------------------------------------------------------
print("\n4) NUMERIC SUMMARY")
print("-"*50)

numeric_cols = df_clean.select_dtypes(include=["int64", "float64"]).columns
print(df_clean[numeric_cols].describe())


# ------------------------------------------------------------
# 5️⃣ Target Distribution
# ------------------------------------------------------------
print("\n5) TARGET DISTRIBUTION")
print("-"*50)

print(df_clean["Converted"].value_counts())
print("\nTarget ratio:")
print(df_clean["Converted"].value_counts(normalize=True))


# ------------------------------------------------------------
# 6️⃣ Check for negative values in numeric columns
# ------------------------------------------------------------
print("\n6) NEGATIVE VALUE CHECK")
print("-"*50)

for col in numeric_cols:
    if (df_clean[col] < 0).any():
        print(f"Negative values found in {col}")
    else:
        print(f"{col}: OK (no negatives)")


# ------------------------------------------------------------
# 7️⃣ Lead Source normalization check
# ------------------------------------------------------------
print("\n7) LEAD SOURCE UNIQUE VALUES")
print("-"*50)

print(df_clean["Lead Source"].unique())


print("\nAUDIT COMPLETE")
print("="*100)

FINAL DATA AUDIT

1) DATASET SHAPE
--------------------------------------------------
Rows: 9240
Columns: 31

2) MISSING SUMMARY (Top 15)
--------------------------------------------------
How did you hear about X Education               7250
Lead Profile                                     6855
Lead Quality                                     4767
City                                             3669
Specialization                                   3380
Tags                                             3353
What matters most to you in choosing a course    2709
What is your current occupation                  2690
Country                                          2466
TotalVisits                                       137
Page Views Per Visit                              137
Last Activity                                     103
Lead Source                                        36
Converted                                           0
Do Not Call                                         0
d

In [20]:
print("="*80)
print("Overwriting processed dataset")
print("="*80)

output_path = "../data/processed/lead_scoring_cleaned.csv"

df_clean.to_csv(output_path, index=False)

print(f"File overwritten at: {output_path}")
print("Overwrite complete.")


Overwriting processed dataset
File overwritten at: ../data/processed/lead_scoring_cleaned.csv
Overwrite complete.


In [21]:
df_test = pd.read_csv("../data/processed/lead_scoring_cleaned.csv")
print(df_test.shape)

(9240, 31)
