In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the source table; the path is resolved relative to the working directory.
file_path = 'Re-adjusted demographic data for florida counties.csv'
data = pd.read_csv(file_path, delimiter=',')

# Predictor columns used by the models below.
features = [
    'R_NATURAL_INC_2018', 'R_INTERNATIONAL_MIG_2018', 'R_DOMESTIC_MIG_2018', 'R_NET_MIG_2018',
    'Percent of adults with less than a high school diploma 2014-18',
    'Percent of adults with a high school diploma only 2014-18',
    'Percent of adults completing some college or associate\'s degree 2014-18',
    'Percent of adults with a bachelor\'s degree or higher 2014-18',
    'Unemployment_rate_2018', 'Median_Household_Income_2018',
    'Med_HH_Income_Percent_of_State_Total_2018', 'Housing units',
    'Density per square mile of land area - Population',
    'Density per square mile of land area - Housing units', 'Total_Male',
    'Total_Female', 'Total_age0to17', 'Total_age18to64', 'Total_age65plus', 'Total_age85plusr'
]
# Label column; it is cast to int before modelling.
target = 'Hotspot'

# Force every predictor to a numeric dtype; values that cannot be parsed
# become NaN instead of raising.
data[features] = data[features].apply(pd.to_numeric, errors='coerce')

# Discard rows that have a NaN in any predictor or in the label.
data = data.dropna(subset=features + [target])

X = data[features]
y = data[target].astype(int)

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in the train and test splits, so a small held-out set
# is not skewed toward one class by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training split only, then apply the
# same transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline model: a decision tree with default settings and a fixed seed.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train_scaled, y_train)

y_pred_tree = tree_model.predict(X_test_scaled)

print(classification_report(y_test, y_pred_tree))
print(f"Accuracy: {accuracy_score(y_test, y_pred_tree)}")


              precision    recall  f1-score   support

           0       0.67      0.75      0.71         8
           1       0.60      0.50      0.55         6

    accuracy                           0.64        14
   macro avg       0.63      0.62      0.63        14
weighted avg       0.64      0.64      0.64        14

Accuracy: 0.6428571428571429


# **Cross Validation of Decision Tree**

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Imported here because it is needed only for the cross-validation below.
from sklearn.pipeline import make_pipeline

# Cross-validate on the UNSCALED training rows with the scaler inside the
# pipeline, so the scaler is re-fitted on the training part of every fold.
# Scoring the already-scaled matrix would let each validation fold contribute
# to the scaling statistics. cross_val_score fits clones of the estimator it
# is given, so tree_model itself is not modified by this call.
cv_pipeline = make_pipeline(StandardScaler(), tree_model)
scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", scores.mean())
print("Standard deviation of cross-validation scores:", scores.std())

# Refit on the full (scaled) training split and score the held-out test rows.
tree_model.fit(X_train_scaled, y_train)
y_pred_tree = tree_model.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, y_pred_tree))
print("Test Set Accuracy:", accuracy_score(y_test, y_pred_tree))

Cross-validation scores: [0.72727273 0.63636364 0.54545455 0.6        0.3       ]
Mean cross-validation score: 0.5618181818181818
Standard deviation of cross-validation scores: 0.14367088192827937
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.75      0.71         8
           1       0.60      0.50      0.55         6

    accuracy                           0.64        14
   macro avg       0.63      0.62      0.63        14
weighted avg       0.64      0.64      0.64        14

Test Set Accuracy: 0.6428571428571429


# **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest with a fixed seed and train it on the scaled
# training matrix; fit() returns the estimator, so both steps are chained.
forest_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict the held-out rows and compute both summaries before printing them.
y_pred_forest = forest_model.predict(X_test_scaled)
forest_report = classification_report(y_test, y_pred_forest)
forest_accuracy = accuracy_score(y_test, y_pred_forest)

print(forest_report)
print(f"Accuracy: {forest_accuracy}")

              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.83      0.91         6

    accuracy                           0.93        14
   macro avg       0.94      0.92      0.93        14
weighted avg       0.94      0.93      0.93        14

Accuracy: 0.9285714285714286


# **Grid Search on Random Forest**

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values; the grid search evaluates every combination.
param_grid = dict(
    max_depth=[3, 10, 20, None],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
    n_estimators=[50, 100, 200],
)

# 5-fold cross-validated search over the grid, using a seeded forest as the
# base estimator and the scaled training matrix as input.
base_forest = RandomForestClassifier(random_state=42)
forest_grid = GridSearchCV(estimator=base_forest, param_grid=param_grid, cv=5)
forest_grid.fit(X_train_scaled, y_train)

print("Best parameters:", forest_grid.best_params_)
print("Best cross-validation score: {:.2f}".format(forest_grid.best_score_))

# Score the held-out rows with the search object's selected model.
y_pred_best = forest_grid.predict(X_test_scaled)
grid_report = classification_report(y_test, y_pred_best)
grid_accuracy = accuracy_score(y_test, y_pred_best)

print(grid_report)
print(f"Accuracy: {grid_accuracy}")


Best parameters: {'max_depth': 3, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation score: 0.68
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.83      0.91         6

    accuracy                           0.93        14
   macro avg       0.94      0.92      0.93        14
weighted avg       0.94      0.93      0.93        14

Accuracy: 0.9285714285714286
