<a href="https://colab.research.google.com/github/jsleweon11/Lead_Scoring_System/blob/main/Lead_Scoring_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

# Read the five state CSV exports (MN, FL, TX, GA, OH per the file names),
# in that order, and bind each one to its own dataframe.
df_mn, df_fl, df_tx, df_ga, df_oh = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/mnt/data/cleaned_combined_MN90DaysAllcounties.csv',
        '/mnt/data/cleaned_combined_FL90DaysAllcounties.csv',
        '/mnt/data/cleaned_combined_TX90DaysAllcounties.csv',
        '/mnt/data/cleaned_combined_GA90DaysAllcounties.csv',
        '/mnt/data/cleaned_combined_OH90DaysAllcounties.csv',
    )
)

# Stack the per-state frames into one table with a fresh 0..n-1 index
per_state_frames = [df_mn, df_fl, df_tx, df_ga, df_oh]
df_combined = pd.concat(per_state_frames, ignore_index=True)

# Report how many null entries each column contains
missing_values = df_combined.isna().sum()
print("Missing values in each column:\n", missing_values)

# Define the scoring function
def score_lead(row):
    """
    Compute a lead score from one record.

    The score is built from five fields read from ``row`` by key:
    10 points per bed and per bath, plus ``AVMValue / 10000``,
    minus ``TotalDebt / 10000``, plus ``AvailablEquity / 10000``.

    ``row`` is anything indexable by those keys. A missing (NaN) field
    propagates through the arithmetic, so the returned score is NaN.
    """
    # NOTE(review): the key is spelled 'AvailablEquity' (no 'e') -- presumably
    # this matches the CSV header; confirm against the data.
    room_points = row['Beds'] * 10 + row['Baths'] * 10
    value_points = row['AVMValue'] / 10000      # property value raises the score
    debt_penalty = row['TotalDebt'] / 10000     # outstanding debt lowers it
    equity_points = row['AvailablEquity'] / 10000  # available equity raises it
    return room_points + value_points - debt_penalty + equity_points

# Create the lead_score column.
# score_lead only applies arithmetic operators to the fields it reads, so it
# can be called once on the whole DataFrame (operating column-wise) instead of
# once per row through DataFrame.apply(axis=1). The per-row values are the
# same, without building a Series object for every row.
df_combined['lead_score'] = score_lead(df_combined)

# Evaluate the lead scores
lead_score_stats = df_combined['lead_score'].describe()
print("Lead score statistics:\n", lead_score_stats)

# Visualize the distribution of the lead scores.
# Rows with a missing input field have a NaN score; drop them explicitly so
# the histogram bins are computed from finite values only.
plt.hist(df_combined['lead_score'].dropna(), bins=50, edgecolor='k')
plt.title('Distribution of Lead Scores')
plt.xlabel('Lead Score')
plt.ylabel('Frequency')
plt.show()

# Define the segmentation function
def segment_lead(score):
    """
    Map a lead score to its segment label.

    Returns 'High' for scores strictly above 150, 'Medium' for scores
    strictly above 50 (up to and including 150), and 'Low' otherwise.
    A NaN score fails both comparisons and is therefore labelled 'Low'.
    """
    # Thresholds are checked from highest to lowest; the first one the
    # score exceeds decides the label.
    for floor, label in ((150, 'High'), (50, 'Medium')):
        if score > floor:
            return label
    return 'Low'

# Label every lead with the segment derived from its lead score
df_combined['lead_segment'] = df_combined['lead_score'].map(segment_lead)

# Count how many leads fall into each segment
lead_segment_distribution = df_combined['lead_segment'].value_counts()
print("Lead segment distribution:\n", lead_segment_distribution)

# Summary statistics of the data, computed separately for each segment
segment_analysis = df_combined.groupby('lead_segment').describe()
print("Segment analysis:\n", segment_analysis)

# Keep only the rows labelled 'High' and write them to a new CSV
# (index=False leaves the dataframe index out of the file)
is_high = df_combined['lead_segment'] == 'High'
high_leads = df_combined.loc[is_high]
high_leads.to_csv('high_leads.csv', index=False)

# Output the path to the new CSV file
print("High leads saved to 'high_leads.csv'")
