In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

file_path = 'Re-adjusted demographic data for florida counties.csv'
data = pd.read_csv(file_path, delimiter=',')

In [None]:
data

Unnamed: 0,County,R_NATURAL_INC_2018,R_INTERNATIONAL_MIG_2018,R_DOMESTIC_MIG_2018,R_NET_MIG_2018,Percent of adults with less than a high school diploma 2014-18,Percent of adults with a high school diploma only 2014-18,Percent of adults completing some college or associate's degree 2014-18,Percent of adults with a bachelor's degree or higher 2014-18,Unemployment_rate_2018,...,Total_age85plusr,All Specialties (AAMC),Cardiovascular Disease (AAMC),Pediatrics** (AAMC),Pulmonary Disease (AAMC),Geriatrics Primary Care (2019),Emergency Medicine specialists (2019),Cardiology specialists (2019),crime_rate_per_100000,Hotspot
0,Alachua County,3.0,6.3,2.4,8.7,7.6,21.7,28.1,42.5,3.3,...,4599,715.900373,20.824026,44.119559,5.323245,1.330811,37.110621,26.704944,6052451953,1
1,Baker County,2.1,0.2,2.2,2.4,15.5,42.8,28.4,13.4,3.4,...,372,75.195051,2.187265,4.634126,0.559130,0.139783,3.897938,2.804971,236922963,0
2,Bay County,1.4,2.3,3.0,5.3,9.7,31.4,36.1,22.8,4.0,...,3803,491.365379,14.292779,30.281901,3.653662,0.913416,25.471245,18.329206,4971797905,1
3,Bradford County,-2.2,0.3,23.5,23.8,21.7,39.7,28.0,10.6,3.3,...,611,73.542907,2.139208,4.532308,0.546845,0.136711,3.812294,2.743342,4841713222,0
4,Brevard County,-3.6,3.0,15.1,18.1,8.0,28.6,34.1,29.3,3.5,...,18607,1582.792831,46.040093,97.544470,11.769226,2.942307,82.048322,59.042286,5379223453,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,Union County,-5.9,1.1,-30.3,-29.3,23.1,37.9,30.7,8.3,3.4,...,163,39.619610,1.152451,2.441680,0.294601,0.073650,2.053789,1.477914,2576638478,1
63,Volusia County,-4.0,3.7,17.4,21.2,10.0,32.2,34.2,23.6,3.7,...,16979,1452.024249,42.236312,89.485454,10.796866,2.699217,75.269581,54.164278,4313099042,1
64,Wakulla County,2.1,0.8,8.3,9.2,12.1,35.9,33.6,18.4,3.1,...,402,86.083814,2.503996,5.305179,0.640096,0.160024,4.462386,3.211150,296563729,0
65,Walton County,1.6,0.4,42.0,42.4,11.5,27.6,32.5,28.4,3.1,...,1340,189.280435,5.505767,11.664988,1.407439,0.351860,9.811860,7.060652,3611617369,0


In [3]:

features = [
    'R_NATURAL_INC_2018', 'R_INTERNATIONAL_MIG_2018', 'R_DOMESTIC_MIG_2018', 'R_NET_MIG_2018',
    'Percent of adults with less than a high school diploma 2014-18',
    'Percent of adults with a high school diploma only 2014-18',
    'Percent of adults completing some college or associate\'s degree 2014-18',
    'Percent of adults with a bachelor\'s degree or higher 2014-18',
    'Unemployment_rate_2018', 'Median_Household_Income_2018',
    'Med_HH_Income_Percent_of_State_Total_2018', 'Housing units',
    'Density per square mile of land area - Population',
    'Density per square mile of land area - Housing units', 'Total_Male',
    'Total_Female', 'Total_age0to17', 'Total_age18to64', 'Total_age65plus', 'Total_age85plusr'
]
target = 'Hotspot'

data[features] = data[features].apply(pd.to_numeric, errors='coerce')

data = data.dropna(subset=features + [target])



In [4]:
X = data[features]
y = data[target].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")


              precision    recall  f1-score   support

           0       0.88      0.88      0.88         8
           1       0.83      0.83      0.83         6

    accuracy                           0.86        14
   macro avg       0.85      0.85      0.85        14
weighted avg       0.86      0.86      0.86        14

Accuracy: 0.8571428571428571


# **Interaction Terms and polynomial features**

In [5]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

model_poly = LogisticRegression()
model_poly.fit(X_train_poly, y_train)
y_pred_poly = model_poly.predict(X_test_poly)

print(classification_report(y_test, y_pred_poly))
print(f"Accuracy: {accuracy_score(y_test, y_pred_poly)}")


              precision    recall  f1-score   support

           0       0.88      0.88      0.88         8
           1       0.83      0.83      0.83         6

    accuracy                           0.86        14
   macro avg       0.85      0.85      0.85        14
weighted avg       0.86      0.86      0.86        14

Accuracy: 0.8571428571428571


# **After cross validation**

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)  # Ensure each fold is a good representative of the whole

cv_scores = cross_val_score(model, X_scaled, y, cv=cv, scoring='accuracy')

print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
print("Standard Deviation of CV Accuracy:", cv_scores.std())

Cross-Validation Accuracy Scores: [0.71428571 0.57142857 0.76923077 0.92307692 0.69230769]
Mean CV Accuracy: 0.7340659340659341
Standard Deviation of CV Accuracy: 0.114496846396757


# **SVM**

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

              precision    recall  f1-score   support

           0       0.78      0.88      0.82         8
           1       0.80      0.67      0.73         6

    accuracy                           0.79        14
   macro avg       0.79      0.77      0.78        14
weighted avg       0.79      0.79      0.78        14

Accuracy: 0.7857142857142857


In [8]:
svm_model = SVC(random_state=42)
parameters = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(svm_model, parameters, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test_scaled)

print(classification_report(y_test, y_pred_best))
print(f"Accuracy: {accuracy_score(y_test, y_pred_best)}")

Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
              precision    recall  f1-score   support

           0       0.80      1.00      0.89         8
           1       1.00      0.67      0.80         6

    accuracy                           0.86        14
   macro avg       0.90      0.83      0.84        14
weighted avg       0.89      0.86      0.85        14

Accuracy: 0.8571428571428571
