<a href="https://colab.research.google.com/github/jhapriya821/B2B-Lead-Scoring-Engine/blob/main/B2B_Lead_Scoring_Engine_MarTech_Data_Cleaning_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# 1. Create a "Messy" Lead Dataset
# Leads are written one record per line (email, job title, web visits,
# whitepaper downloaded) and then pivoted into the column-oriented dict
# that pandas consumes.
column_names = ['Email', 'Job_Title', 'Web_Visits', 'Downloaded_Whitepaper']
lead_records = [
    ('lukas@siemens.de', 'Director', 12, True),
    ('info@gmail.com', 'Student', 1, False),
    ('sarah@sap.com', 'Manager', 5, True),
    ('test@test.com', 'Unknown', 0, False),
    ('markus@bmw.de', 'CFO', 8, True),
]
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*lead_records))
}
df_leads = pd.DataFrame(data)

# 2. Define the Scoring Rules
def calculate_lead_score(row):
    """Compute an additive lead score for a single lead record.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by key (a ``dict`` or the ``pandas.Series`` handed
        over by ``DataFrame.apply(..., axis=1)``) that provides ``'Email'``,
        ``'Job_Title'``, ``'Web_Visits'`` and ``'Downloaded_Whitepaper'``.

    Returns
    -------
    number
        The sum of:
        * 20 if the email domain ends in ``.de``/``.com`` and is not a gmail
          domain,
        * 30 if the job title is Director, CFO or VP,
        * 5 per web visit,
        * 25 if the whitepaper was downloaded.
    """
    corporate_suffixes = ('.de', '.com')
    senior_titles = {'director', 'cfo', 'vp'}
    score = 0

    # Professional Email (Bonus points for corporate domains).
    # Only the part after the last '@' is inspected, case-insensitively, so a
    # '.de'/'.com' fragment in the local part (e.g. 'a.dent@yahoo.fr') does not
    # earn the bonus. Non-string values (missing emails) simply earn nothing
    # instead of raising TypeError.
    email = row['Email']
    if isinstance(email, str):
        _, at_sign, domain = email.strip().lower().rpartition('@')
        if at_sign and domain.endswith(corporate_suffixes) and 'gmail' not in domain:
            score += 20

    # Seniority Level — compared case- and whitespace-insensitively because
    # the input titles are not guaranteed to be normalized.
    title = row['Job_Title']
    if isinstance(title, str) and title.strip().lower() in senior_titles:
        score += 30

    # Engagement (Web visits)
    score += row['Web_Visits'] * 5

    # High-Intent Action
    if row['Downloaded_Whitepaper']:
        score += 25

    return score

# Score every lead row by row with the rules defined in calculate_lead_score.
df_leads['Lead_Score'] = df_leads.apply(calculate_lead_score, axis=1)

# 3. Categorize for the Sales Team
# A lead is "Hot" only when its score is strictly above the threshold, so a
# score of exactly 70 is still "Cold".
MQL_THRESHOLD = 70
df_leads['Status'] = df_leads['Lead_Score'].apply(
    lambda lead_score: '🔥 Hot (MQL)' if lead_score > MQL_THRESHOLD else '🧊 Cold'
)

print(df_leads[['Email', 'Lead_Score', 'Status']])

              Email  Lead_Score       Status
0  lukas@siemens.de         135  🔥 Hot (MQL)
1    info@gmail.com           5       🧊 Cold
2     sarah@sap.com          70       🧊 Cold
3     test@test.com          20       🧊 Cold
4     markus@bmw.de         115  🔥 Hot (MQL)


In [2]:
# 1. Identify "Dirty" Data (Duplicates and Bad Formatting)
def clean_martech_data(df, fake_pattern='test|demo|fake'):
    """Return a de-duplicated copy of ``df`` with test/fake leads removed.

    The input frame is left untouched; all work happens on a copy, so the
    cell calling this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Lead table with a string ``'Email'`` column.
    fake_pattern : str, optional
        Regular expression searched for in the normalized (lowercased,
        stripped) email; any match marks the lead as fake. Defaults to
        ``'test|demo|fake'``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with ``'Email'`` normalized, an added boolean-like
        ``'is_fake'`` column, only the first occurrence of each email kept,
        and flagged rows removed. The original index labels are preserved.
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    leads = df.copy()

    # Standardize Email to lowercase
    leads['Email'] = leads['Email'].str.lower().str.strip()

    # Flag "Test" or "Fake" leads
    leads['is_fake'] = leads['Email'].str.contains(fake_pattern)

    # Remove duplicates based on Email
    leads = leads.drop_duplicates(subset='Email', keep='first')

    # Filter out the fakes. An explicit comparison with False is used rather
    # than `~`, because the flag may hold a missing value when the email
    # itself is missing; such rows do not compare equal to False and are
    # therefore excluded as well.
    return leads[leads['is_fake'].eq(False)]

# Run the scored leads through the cleaning step and report how many remain.
df_final = df_leads.pipe(clean_martech_data)
print(f"Cleaned dataset: {len(df_final)} high-quality leads remaining.")

Cleaned dataset: 4 high-quality leads remaining.
