In [1]:
import pandas as pd
import random
from datetime import datetime

# --- Configuration ---
# Weight of each component score in the final lead score. The scorer divides
# the weighted total by the sum of these weights (times 100), so the result is
# always on a 0-100 scale. Adjust the weights to match business needs.
SCORING_WEIGHTS = {
    'employee_size_score': 0.3,
    'industry_relevance_score': 0.4,
    'tech_stack_score': 0.2,
    'funding_news_score': 0.1,
}

# Industry relevance mapping: higher value for more relevant industries.
# The scorer multiplies the value by 20 (so 5 -> 100) and treats industries
# that are missing from this mapping as 0.
INDUSTRY_RELEVANCE = {
    'Software': 5,
    'SaaS': 5,
    'IT Services': 4,
    'Consulting': 3,
    'E-commerce': 3,
    'Finance': 2,
    'Healthcare': 2,
    'Manufacturing': 1,
    'Retail': 1,
    'Other': 0,
}

# Keywords to detect for "growth" or "funding" news.
# Entries are lowercase because the scorer compares them against the
# lowercased news text; a mixed-case entry such as 'series A' would never match.
GROWTH_KEYWORDS = [
    'growth',
    'expansion',
    'raised',
    'funding',
    'invested',
    'series a',
    'series b',
    'acquisition',
]

# Minimum final score for each priority tier. The scorer reads the 'High' and
# 'Medium' cut-offs; every lead scoring below 'Medium' is labelled 'Low'.
PRIORITY_THRESHOLDS = {
    'High': 80,
    'Medium': 50,
    'Low': 0,
}

# --- Data Generation (Simulating SaaSquatch Leads Output) ---
def generate_dummy_leads(num_leads=100, *, industries=None, growth_keywords=None, seed=None):
    """
    Generate a DataFrame of dummy lead data simulating a lead scraping tool.

    Args:
        num_leads: Number of rows to generate. Zero (or a negative number)
            yields an empty DataFrame that still carries every column.
        industries: Industry names to sample from. Defaults to the keys of
            ``INDUSTRY_RELEVANCE``.
        growth_keywords: Words inserted into the growth-style headlines.
            Defaults to ``GROWTH_KEYWORDS``.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible and the global random state is
            left untouched; when omitted, the global ``random`` module is used.

    Returns:
        A DataFrame with one row per lead and the columns ``company_name``,
        ``employee_size``, ``industry``, ``contact_name``, ``contact_email``,
        ``contact_title``, ``website``, ``last_activity``, ``tech_stack`` and
        ``recent_news``.
    """
    industries = list(INDUSTRY_RELEVANCE.keys()) if industries is None else list(industries)
    growth_keywords = list(GROWTH_KEYWORDS if growth_keywords is None else growth_keywords)
    # The random module and random.Random instances expose the same
    # choice()/random() API, so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)

    columns = [
        'company_name',
        'employee_size',
        'industry',
        'contact_name',
        'contact_email',
        'contact_title',
        'website',
        'last_activity',
        'tech_stack',
        'recent_news',
    ]
    employee_sizes = [5, 15, 50, 200, 1000, 5000]  # Example employee sizes
    tech_stacks = ["Salesforce", "HubSpot", "SAP", "Microsoft Dynamics", "Zoho CRM", "None"]
    job_titles = ["CEO", "VP Sales", "Marketing Manager", "Account Executive", "Business Analyst", "Developer"]
    funding_rounds = ['Series A', 'Series B']
    # The date is the same for every row, so format it once.
    today = datetime.now().strftime("%Y-%m-%d")

    leads_data = []
    for number in range(1, num_leads + 1):
        company_name = f"Company {number}"

        if rng.random() < 0.2:  # 20% chance of a growth/funding-style headline
            keyword = rng.choice(growth_keywords)
            funding_round = rng.choice(funding_rounds)
            recent_news = rng.choice([
                f"{company_name} announced significant {keyword} in Q1.",
                f"{company_name} successfully closed its {funding_round} funding round.",
                f"New strategic {keyword} plan for {company_name}.",
                f"{company_name} expands operations into new markets.",
            ])
        else:
            recent_news = f"{company_name} continues to operate normally."

        leads_data.append({
            'company_name': company_name,
            'employee_size': rng.choice(employee_sizes),
            'industry': rng.choice(industries),
            'contact_name': f"Contact {number}",
            'contact_email': f"contact{number}@company{number}.com",
            'contact_title': rng.choice(job_titles),
            'website': f"www.company{number}.com",
            # Roughly 30% of leads have no recorded activity.
            'last_activity': today if rng.random() > 0.3 else None,
            'tech_stack': rng.choice(tech_stacks),
            'recent_news': recent_news,
        })

    # Passing the column list keeps the schema intact when no rows were generated.
    return pd.DataFrame(leads_data, columns=columns)

# --- Feature Engineering and Scoring Logic ---
def calculate_lead_score(df, *, weights=None, industry_relevance=None,
                         growth_keywords=None, priority_thresholds=None):
    """
    Calculate a 0-100 lead score and a priority tier for each lead.

    The frame is modified in place (score columns are added or overwritten)
    and the same object is returned; pass a copy to keep the input untouched.

    Args:
        df: Leads with the columns ``employee_size``, ``industry``,
            ``tech_stack`` and ``recent_news``.
        weights: Mapping of component-score column name to weight.
            Defaults to ``SCORING_WEIGHTS``.
        industry_relevance: Mapping of industry name to a 0-5 relevance value.
            Defaults to ``INDUSTRY_RELEVANCE``.
        growth_keywords: Words that mark a growth/funding headline; matched
            case-insensitively. Defaults to ``GROWTH_KEYWORDS``.
        priority_thresholds: Mapping with the ``'High'`` and ``'Medium'``
            minimum scores. Defaults to ``PRIORITY_THRESHOLDS``.

    Returns:
        ``df`` with the columns ``employee_size_score``,
        ``industry_relevance_score``, ``tech_stack_score``,
        ``funding_news_score``, ``lead_score`` (rounded to 2 decimals) and
        ``priority`` (``'High'``, ``'Medium'`` or ``'Low'``).
    """
    weights = SCORING_WEIGHTS if weights is None else weights
    industry_relevance = INDUSTRY_RELEVANCE if industry_relevance is None else industry_relevance
    growth_keywords = GROWTH_KEYWORDS if growth_keywords is None else growth_keywords
    priority_thresholds = PRIORITY_THRESHOLDS if priority_thresholds is None else priority_thresholds

    component_columns = (
        'employee_size_score',
        'industry_relevance_score',
        'tech_stack_score',
        'funding_news_score',
    )

    # Initialize score columns
    for column in component_columns:
        df[column] = 0
    df['lead_score'] = 0.0
    df['priority'] = 'Low'  # Default priority

    # Nothing to score: return the frame with the (empty) output columns in place.
    if df.empty:
        return df

    # 1. Employee Size Score: Larger companies often mean larger deals.
    # Unparseable or missing sizes become NaN and keep a score of 0.
    sizes = pd.to_numeric(df['employee_size'], errors='coerce')
    df.loc[sizes.notna(), 'employee_size_score'] = 10
    # Each tier overrides the previous one for companies above its lower bound.
    for lower_bound, score in ((10, 30), (100, 60), (500, 80), (2000, 100)):
        df.loc[sizes > lower_bound, 'employee_size_score'] = score

    # 2. Industry Relevance Score: How well the industry aligns with target segments
    df['industry_relevance_score'] = df['industry'].apply(
        lambda name: industry_relevance.get(name, 0) * 20  # Max 100
    )

    # 3. Tech Stack Score: Presence of specific technologies (e.g., CRM indicating maturity)
    # This is a simplified check. Tiers are applied from lowest to highest so
    # that a stack mentioning several tools keeps its best score instead of
    # being overwritten by a weaker match.
    tech_stack = df['tech_stack']
    tech_tiers = (
        ("None", 10),
        ("Microsoft Dynamics|Zoho CRM", 60),
        ("Salesforce|HubSpot|SAP", 100),
    )
    for pattern, score in tech_tiers:
        matches = tech_stack.str.contains(pattern, case=False, na=False)
        df.loc[matches, 'tech_stack_score'] = score

    # 4. Funding/Growth News Score: Indicates potential for investment in new solutions
    # Both sides are lowercased so mixed-case keywords still match.
    lowered_keywords = [str(keyword).lower() for keyword in growth_keywords]

    def _mentions_growth(news):
        # Missing news (None/NaN) simply carries no growth signal.
        if not isinstance(news, str):
            return False
        text = news.lower()
        return any(keyword in text for keyword in lowered_keywords)

    df['funding_news_score'] = [
        100 if _mentions_growth(news) else 0 for news in df['recent_news']
    ]

    # Calculate final weighted score and normalize it to a 0-100 scale using
    # the maximum reachable total (every component score at 100).
    weighted_total = sum(df[column] * weights[column] for column in component_columns)
    max_possible_weighted_score = sum(weights[column] for column in component_columns) * 100
    if max_possible_weighted_score > 0:
        # Round before assigning tiers so the reported score and the priority
        # always agree, and float noise cannot push a lead under a cut-off.
        df['lead_score'] = (weighted_total / max_possible_weighted_score * 100).round(2)
    else:
        df['lead_score'] = 0.0  # Avoid division by zero if all weights are zero

    # Assign Priority ('Low' is already the default)
    scores = df['lead_score']
    df.loc[scores >= priority_thresholds['Medium'], 'priority'] = 'Medium'
    df.loc[scores >= priority_thresholds['High'], 'priority'] = 'High'

    return df
import pandas as pd
import random
from datetime import datetime

# --- Configuration ---
# Weight of each component score in the final lead score. The scorer divides
# the weighted total by the sum of these weights (times 100), so the result is
# always on a 0-100 scale. Adjust the weights to match business needs.
SCORING_WEIGHTS = {
    'employee_size_score': 0.3,
    'industry_relevance_score': 0.4,
    'tech_stack_score': 0.2,
    'funding_news_score': 0.1,
}

# Industry relevance mapping: higher value for more relevant industries.
# The scorer multiplies the value by 20 (so 5 -> 100) and treats industries
# that are missing from this mapping as 0.
INDUSTRY_RELEVANCE = {
    'Software': 5,
    'SaaS': 5,
    'IT Services': 4,
    'Consulting': 3,
    'E-commerce': 3,
    'Finance': 2,
    'Healthcare': 2,
    'Manufacturing': 1,
    'Retail': 1,
    'Other': 0,
}

# Keywords to detect for "growth" or "funding" news.
# Entries are lowercase because the scorer compares them against the
# lowercased news text; a mixed-case entry such as 'series A' would never match.
GROWTH_KEYWORDS = [
    'growth',
    'expansion',
    'raised',
    'funding',
    'invested',
    'series a',
    'series b',
    'acquisition',
]

# Minimum final score for each priority tier. The scorer reads the 'High' and
# 'Medium' cut-offs; every lead scoring below 'Medium' is labelled 'Low'.
PRIORITY_THRESHOLDS = {
    'High': 80,
    'Medium': 50,
    'Low': 0,
}

# --- Data Generation (Simulating SaaSquatch Leads Output) ---
def generate_dummy_leads(num_leads=100, *, industries=None, growth_keywords=None, seed=None):
    """
    Generate a DataFrame of dummy lead data simulating a lead scraping tool.

    Args:
        num_leads: Number of rows to generate. Zero (or a negative number)
            yields an empty DataFrame that still carries every column.
        industries: Industry names to sample from. Defaults to the keys of
            ``INDUSTRY_RELEVANCE``.
        growth_keywords: Words inserted into the growth-style headlines.
            Defaults to ``GROWTH_KEYWORDS``.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible and the global random state is
            left untouched; when omitted, the global ``random`` module is used.

    Returns:
        A DataFrame with one row per lead and the columns ``company_name``,
        ``employee_size``, ``industry``, ``contact_name``, ``contact_email``,
        ``contact_title``, ``website``, ``last_activity``, ``tech_stack`` and
        ``recent_news``.
    """
    industries = list(INDUSTRY_RELEVANCE.keys()) if industries is None else list(industries)
    growth_keywords = list(GROWTH_KEYWORDS if growth_keywords is None else growth_keywords)
    # The random module and random.Random instances expose the same
    # choice()/random() API, so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)

    columns = [
        'company_name',
        'employee_size',
        'industry',
        'contact_name',
        'contact_email',
        'contact_title',
        'website',
        'last_activity',
        'tech_stack',
        'recent_news',
    ]
    employee_sizes = [5, 15, 50, 200, 1000, 5000]  # Example employee sizes
    tech_stacks = ["Salesforce", "HubSpot", "SAP", "Microsoft Dynamics", "Zoho CRM", "None"]
    job_titles = ["CEO", "VP Sales", "Marketing Manager", "Account Executive", "Business Analyst", "Developer"]
    funding_rounds = ['Series A', 'Series B']
    # The date is the same for every row, so format it once.
    today = datetime.now().strftime("%Y-%m-%d")

    leads_data = []
    for number in range(1, num_leads + 1):
        company_name = f"Company {number}"

        if rng.random() < 0.2:  # 20% chance of a growth/funding-style headline
            keyword = rng.choice(growth_keywords)
            funding_round = rng.choice(funding_rounds)
            recent_news = rng.choice([
                f"{company_name} announced significant {keyword} in Q1.",
                f"{company_name} successfully closed its {funding_round} funding round.",
                f"New strategic {keyword} plan for {company_name}.",
                f"{company_name} expands operations into new markets.",
            ])
        else:
            recent_news = f"{company_name} continues to operate normally."

        leads_data.append({
            'company_name': company_name,
            'employee_size': rng.choice(employee_sizes),
            'industry': rng.choice(industries),
            'contact_name': f"Contact {number}",
            'contact_email': f"contact{number}@company{number}.com",
            'contact_title': rng.choice(job_titles),
            'website': f"www.company{number}.com",
            # Roughly 30% of leads have no recorded activity.
            'last_activity': today if rng.random() > 0.3 else None,
            'tech_stack': rng.choice(tech_stacks),
            'recent_news': recent_news,
        })

    # Passing the column list keeps the schema intact when no rows were generated.
    return pd.DataFrame(leads_data, columns=columns)

# --- Feature Engineering and Scoring Logic ---
def calculate_lead_score(df, *, weights=None, industry_relevance=None,
                         growth_keywords=None, priority_thresholds=None):
    """
    Calculate a 0-100 lead score and a priority tier for each lead.

    The frame is modified in place (score columns are added or overwritten)
    and the same object is returned; pass a copy to keep the input untouched.

    Args:
        df: Leads with the columns ``employee_size``, ``industry``,
            ``tech_stack`` and ``recent_news``.
        weights: Mapping of component-score column name to weight.
            Defaults to ``SCORING_WEIGHTS``.
        industry_relevance: Mapping of industry name to a 0-5 relevance value.
            Defaults to ``INDUSTRY_RELEVANCE``.
        growth_keywords: Words that mark a growth/funding headline; matched
            case-insensitively. Defaults to ``GROWTH_KEYWORDS``.
        priority_thresholds: Mapping with the ``'High'`` and ``'Medium'``
            minimum scores. Defaults to ``PRIORITY_THRESHOLDS``.

    Returns:
        ``df`` with the columns ``employee_size_score``,
        ``industry_relevance_score``, ``tech_stack_score``,
        ``funding_news_score``, ``lead_score`` (rounded to 2 decimals) and
        ``priority`` (``'High'``, ``'Medium'`` or ``'Low'``).
    """
    weights = SCORING_WEIGHTS if weights is None else weights
    industry_relevance = INDUSTRY_RELEVANCE if industry_relevance is None else industry_relevance
    growth_keywords = GROWTH_KEYWORDS if growth_keywords is None else growth_keywords
    priority_thresholds = PRIORITY_THRESHOLDS if priority_thresholds is None else priority_thresholds

    component_columns = (
        'employee_size_score',
        'industry_relevance_score',
        'tech_stack_score',
        'funding_news_score',
    )

    # Initialize score columns
    for column in component_columns:
        df[column] = 0
    df['lead_score'] = 0.0
    df['priority'] = 'Low'  # Default priority

    # Nothing to score: return the frame with the (empty) output columns in place.
    if df.empty:
        return df

    # 1. Employee Size Score: Larger companies often mean larger deals.
    # Unparseable or missing sizes become NaN and keep a score of 0.
    sizes = pd.to_numeric(df['employee_size'], errors='coerce')
    df.loc[sizes.notna(), 'employee_size_score'] = 10
    # Each tier overrides the previous one for companies above its lower bound.
    for lower_bound, score in ((10, 30), (100, 60), (500, 80), (2000, 100)):
        df.loc[sizes > lower_bound, 'employee_size_score'] = score

    # 2. Industry Relevance Score: How well the industry aligns with target segments
    df['industry_relevance_score'] = df['industry'].apply(
        lambda name: industry_relevance.get(name, 0) * 20  # Max 100
    )

    # 3. Tech Stack Score: Presence of specific technologies (e.g., CRM indicating maturity)
    # This is a simplified check. Tiers are applied from lowest to highest so
    # that a stack mentioning several tools keeps its best score instead of
    # being overwritten by a weaker match.
    tech_stack = df['tech_stack']
    tech_tiers = (
        ("None", 10),
        ("Microsoft Dynamics|Zoho CRM", 60),
        ("Salesforce|HubSpot|SAP", 100),
    )
    for pattern, score in tech_tiers:
        matches = tech_stack.str.contains(pattern, case=False, na=False)
        df.loc[matches, 'tech_stack_score'] = score

    # 4. Funding/Growth News Score: Indicates potential for investment in new solutions
    # Both sides are lowercased so mixed-case keywords still match.
    lowered_keywords = [str(keyword).lower() for keyword in growth_keywords]

    def _mentions_growth(news):
        # Missing news (None/NaN) simply carries no growth signal.
        if not isinstance(news, str):
            return False
        text = news.lower()
        return any(keyword in text for keyword in lowered_keywords)

    df['funding_news_score'] = [
        100 if _mentions_growth(news) else 0 for news in df['recent_news']
    ]

    # Calculate final weighted score and normalize it to a 0-100 scale using
    # the maximum reachable total (every component score at 100).
    weighted_total = sum(df[column] * weights[column] for column in component_columns)
    max_possible_weighted_score = sum(weights[column] for column in component_columns) * 100
    if max_possible_weighted_score > 0:
        # Round before assigning tiers so the reported score and the priority
        # always agree, and float noise cannot push a lead under a cut-off.
        df['lead_score'] = (weighted_total / max_possible_weighted_score * 100).round(2)
    else:
        df['lead_score'] = 0.0  # Avoid division by zero if all weights are zero

    # Assign Priority ('Low' is already the default)
    scores = df['lead_score']
    df.loc[scores >= priority_thresholds['Medium'], 'priority'] = 'Medium'
    df.loc[scores >= priority_thresholds['High'], 'priority'] = 'High'

    return df

# --- Main Execution ---
if __name__ == '__main__':
    # Demo pipeline: generate dummy leads, score them, print a per-tier
    # summary and write the scored leads to a CSV file in the working directory.
    print("Starting AI-Powered Lead Scoring and Prioritization Tool...")

    # 1. Generate dummy leads
    print("Generating dummy lead data...")
    leads_df = generate_dummy_leads(num_leads=200)  # Generate 200 leads for demonstration
    print(f"Generated {len(leads_df)} dummy leads.")

    # 2. Calculate lead scores and assign priorities
    print("Calculating lead scores and assigning priorities...")
    scored_leads_df = calculate_lead_score(leads_df.copy())  # Use a copy to avoid modifying original df

    # 3. Display summary
    print("\n--- Lead Priority Distribution ---")
    print(scored_leads_df['priority'].value_counts())

    # One entry per tier report: heading, priority label, columns to show and
    # the number of rows to print.
    tier_reports = [
        (
            "Top 10 High Priority Leads",
            'High',
            ['company_name', 'industry', 'employee_size', 'tech_stack', 'recent_news', 'lead_score', 'priority'],
            10,
        ),
        (
            "Example Medium Priority Leads (first 5)",
            'Medium',
            ['company_name', 'industry', 'employee_size', 'tech_stack', 'lead_score', 'priority'],
            5,
        ),
        (
            "Example Low Priority Leads (first 5)",
            'Low',
            ['company_name', 'industry', 'employee_size', 'lead_score', 'priority'],
            5,
        ),
    ]
    for heading, tier, columns, row_limit in tier_reports:
        print(f"\n--- {heading} ---")
        # Filter to the tier first, then rank its leads best-first.
        tier_leads = scored_leads_df[scored_leads_df['priority'] == tier].sort_values(
            by='lead_score', ascending=False
        )
        print(tier_leads[columns].head(row_limit).to_string())

    # 4. Save results to a CSV file
    output_filename = "scored_leads_output.csv"
    scored_leads_df.to_csv(output_filename, index=False)
    print(f"\nEnhanced lead data saved to '{output_filename}'")

    print("\nAI-Powered Lead Scoring process completed successfully.")


Starting AI-Powered Lead Scoring and Prioritization Tool...
Generating dummy lead data...
Generated 200 dummy leads.
Calculating lead scores and assigning priorities...

--- Lead Priority Distribution ---
priority
Medium    99
Low       90
High      11
Name: count, dtype: int64

--- Top 10 High Priority Leads ---
    company_name    industry  employee_size          tech_stack                                                  recent_news  lead_score priority
51    Company 52    Software           5000            Zoho CRM   Company 52 successfully closed its Series B funding round.        92.0     High
154  Company 155    Software           5000            Zoho CRM  Company 155 successfully closed its Series B funding round.        92.0     High
6      Company 7    Software           5000          Salesforce                     Company 7 continues to operate normally.        90.0     High
190  Company 191    Software           1000             HubSpot                   Company 191 continu