<a href="https://colab.research.google.com/github/jsleweon11/Enorme-Strategist-Group-Machine-learning-and-Lead-Gen-Case-Study/blob/main/Enorme_Strategist_Group_Machine_learning_and_Lead_Gen_Case_Study.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Style for plots
# seaborn's "darkgrid" theme is applied first and matplotlib's
# "dark_background" style sheet second. Both calls change global plotting
# settings, so their order matters: presumably the later call wins wherever
# the two overlap -- verify the figures if these lines are ever reordered.
sns.set(style="darkgrid")
plt.style.use("dark_background")

# Read each input table from its CSV file. The variable names are the
# contract with the merge steps that follow, so they are kept as-is.
zombie_properties_df = pd.read_csv(
    '/path/to/Zombie Properties 2024-05-16.csv'
)
propensity_to_default_df = pd.read_csv(
    '/path/to/Propensity to Default By Zip Code 2024-05-16.csv'
)
mn_cleaned_phone_df = pd.read_csv(
    '/path/to/MN Cleaned Cell and Business phone 90 day Auction-5-15-24 - cleaned_combined_MN90DaysAllcounties_Cleaned.csv'
)
rdc_inventory_hotness_df = pd.read_csv(
    '/path/to/RDC_Inventory_Hotness_Metrics_Metro_History.csv'
)
metro_invt_fs_uc_sfrcondo_df = pd.read_csv(
    '/path/to/Metro_invt_fs_uc_sfrcondo_sm_month - Metro_invt_fs_uc_sfrcondo_sm_month.csv'
)
cleaned_foreclosure_df = pd.read_csv(
    '/path/to/cleaned_foreclosure_data.csv'
)


def _as_naive_datetime(column):
    """Parse *column* as datetimes and strip any timezone information."""
    parsed = pd.to_datetime(column)
    return parsed.dt.tz_localize(None)


# Both tables are later joined on AuctionDate, so the column is converted to
# the same timezone-naive datetime representation in each of them.
cleaned_foreclosure_df['AuctionDate'] = _as_naive_datetime(cleaned_foreclosure_df['AuctionDate'])
mn_cleaned_phone_df['AuctionDate'] = _as_naive_datetime(mn_cleaned_phone_df['AuctionDate'])

# Enrich the foreclosure records one source at a time. Every step is a LEFT
# join, so rows of the left-hand table are never dropped; unmatched rows get
# NaN in the newly added columns.

# Step 1: zombie-property data, matched on state.
# NOTE(review): if SitusState is not unique in the zombie file, each
# foreclosure row is repeated once per matching row -- confirm that this row
# multiplication is intended.
merged_df1 = cleaned_foreclosure_df.merge(
    zombie_properties_df,
    how='left',
    left_on='StateCode',
    right_on='SitusState',
)

# Step 2: propensity-to-default data, matched on ZIP code.
merged_df2 = merged_df1.merge(
    propensity_to_default_df,
    how='left',
    left_on='Zipcode',
    right_on='SitusZip',
)

# Step 3: phone/auction data, matched on auction date and county.
# NOTE(review): 'CountyName_x' only exists if an earlier merge hit a column
# name collision on 'CountyName', yet steps 4 and 5 join on plain
# 'CountyName' -- verify which of the two columns is really present here.
merged_df3 = merged_df2.merge(
    mn_cleaned_phone_df,
    how='left',
    left_on=['AuctionDate', 'CountyName_x'],
    right_on=['AuctionDate', 'County'],
)

# Step 4: inventory hotness metrics, matched on state and county vs. metro title.
merged_df4 = merged_df3.merge(
    rdc_inventory_hotness_df,
    how='left',
    left_on=['StateCode', 'CountyName'],
    right_on=['StateName', 'cbsa_title'],
)

# Step 5: metro inventory series, matched on state and county vs. region name.
merged_df5 = merged_df4.merge(
    metro_invt_fs_uc_sfrcondo_df,
    how='left',
    left_on=['StateCode', 'CountyName'],
    right_on=['StateName', 'RegionName'],
)

# Replace every missing value with 0, in place.
# NOTE(review): this applies to all columns, so text and date columns with
# gaps receive the number 0 as well -- confirm that is acceptable downstream.
merged_df5.fillna(value=0, inplace=True)

# Persist the merged table (without the index) for the modelling steps.
merged_df5.to_csv('/path/to/final_merged_foreclosure_data.csv', index=False)

# Load the final cleaned data
df = pd.read_csv('/path/to/final_merged_foreclosure_data.csv')

# Feature engineering: binary target variable.
# A row is labelled a potential deal (1) when it reports at least one REO,
# otherwise 0.
df['Potential_Deal'] = np.where(df['REOs'] > 0, 1, 0)
target = df['Potential_Deal']

# Columns that must not reach the model:
#   * 'Potential_Deal' is the label itself;
#   * 'REOs' is the column the label is computed from -- leaving it in the
#     feature matrix leaks the answer and makes every metric meaningless;
#   * 'StateCode', 'CountyName', 'Zipcode' are the location keys that were
#     already excluded from the feature set.
excluded_columns = ['Potential_Deal', 'REOs', 'StateCode', 'CountyName', 'Zipcode']

# Keep numeric columns only. read_csv is called without date parsing, so
# AuctionDate comes back as text, and the join key columns are text too;
# the random forest cannot be fitted on string values.
features = df.drop(columns=excluded_columns).select_dtypes(include='number')

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion (seeded as well).
forest_settings = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**forest_settings)
rf_model.fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred = rf_model.predict(X_test)

# Overall share of correct predictions on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1 summary.
report_text = classification_report(y_test, y_pred)
print(report_text)

# Confusion matrix rendered as an annotated heatmap of integer counts.
conf_matrix = confusion_matrix(y_test, y_pred)
heatmap_axes = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
heatmap_axes.set_xlabel('Predicted')
heatmap_axes.set_ylabel('Actual')
heatmap_axes.set_title('Confusion Matrix')
plt.show()

# Feature Importance
# Pair every training column with the importance score the fitted forest
# assigned to it, then rank from most to least important.
feature_importances = rf_model.feature_importances_
# Use a dedicated name for the column labels: rebinding `features` here would
# replace the feature matrix with an Index, and the train/test split could no
# longer be re-run in the same session.
feature_names = X_train.columns
importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importances = importances.sort_values(by='importance', ascending=False)

# Plot Feature Importances
# Horizontal bars, one per feature, in ranked order.
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=importances, palette='Blues_r')
plt.title('Feature Importances')
plt.show()

# Serialize the fitted random forest to disk with joblib.
import joblib

model_path = '/path/to/rf_model.pkl'
joblib.dump(rf_model, model_path)

# Write a standalone copy of the pipeline to a .py file.
# The text below is a hand-maintained duplicate of the pipeline source, not
# something generated from the running code, so it only matches the live
# pipeline as long as it is kept in sync manually.
script_source = """
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Style for plots
sns.set(style="darkgrid")
plt.style.use("dark_background")

# Load data files
zombie_properties_df = pd.read_csv('/path/to/Zombie Properties 2024-05-16.csv')
propensity_to_default_df = pd.read_csv('/path/to/Propensity to Default By Zip Code 2024-05-16.csv')
mn_cleaned_phone_df = pd.read_csv('/path/to/MN Cleaned Cell and Business phone 90 day Auction-5-15-24 - cleaned_combined_MN90DaysAllcounties_Cleaned.csv')
rdc_inventory_hotness_df = pd.read_csv('/path/to/RDC_Inventory_Hotness_Metrics_Metro_History.csv')
metro_invt_fs_uc_sfrcondo_df = pd.read_csv('/path/to/Metro_invt_fs_uc_sfrcondo_sm_month - Metro_invt_fs_uc_sfrcondo_sm_month.csv')
cleaned_foreclosure_df = pd.read_csv('/path/to/cleaned_foreclosure_data.csv')

# Ensure all dates are in the same format
cleaned_foreclosure_df['AuctionDate'] = pd.to_datetime(cleaned_foreclosure_df['AuctionDate']).dt.tz_localize(None)
mn_cleaned_phone_df['AuctionDate'] = pd.to_datetime(mn_cleaned_phone_df['AuctionDate']).dt.tz_localize(None)

# Merging datasets
merged_df1 = pd.merge(cleaned_foreclosure_df, zombie_properties_df, how='left', left_on='StateCode', right_on='SitusState')
merged_df2 = pd.merge(merged_df1, propensity_to_default_df, how='left', left_on='Zipcode', right_on='SitusZip')
merged_df3 = pd.merge(merged_df2, mn_cleaned_phone_df, how='left', left_on=['AuctionDate', 'CountyName_x'], right_on=['AuctionDate', 'County'])
merged_df4 = pd.merge(merged_df3, rdc_inventory_hotness_df, how='left', left_on=['StateCode', 'CountyName'], right_on=['StateName', 'cbsa_title'])
merged_df5 = pd.merge(merged_df4, metro_invt_fs_uc_sfrcondo_df, how='left', left_on=['StateCode', 'CountyName'], right_on=['StateName', 'RegionName'])

# Handle missing values
merged_df5.fillna(0, inplace=True)

# Save the final merged dataframe to a CSV file
merged_df5.to_csv('/path/to/final_merged_foreclosure_data.csv', index=False)

# Load the final cleaned data
df = pd.read_csv('/path/to/final_merged_foreclosure_data.csv')

# Feature Engineering: Example of creating a binary target variable
# Assuming we are predicting if a property will be a potential deal
df['Potential_Deal'] = np.where(df['REOs'] > 0, 1, 0)

# Select features and target variable
features = df.drop(['Potential_Deal', 'StateCode', 'CountyName', 'Zipcode'], axis=1)
target = df['Potential_Deal']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Detailed classification report
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Feature Importance
feature_importances = rf_model.feature_importances_
features = X_train.columns
importances = pd.DataFrame({'feature': features, 'importance': feature_importances})
importances = importances.sort_values(by='importance', ascending=False)

# Plot Feature Importances
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=importances, palette='Blues_r')
plt.title('Feature Importances')
plt.show()

# Save the model
import joblib
joblib.dump(rf_model, '/path/to/rf_model.pkl')
"""

with open('/path/to/Enorme_Strategic_Group_Real_Estate_Case_Study.py', 'w') as script_file:
    script_file.write(script_source)
