In [5]:
import pandas as pd
import numpy as np
from sklearn.base import ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


import project_helper

In [6]:
np.random.seed(200)

In [7]:
df = pd.read_csv("data/cleaned_data_recent_only.csv", index_col="NPA")
df.drop(columns=["Public_Nutrition_Assistance-2023"], inplace=True)
df

Unnamed: 0_level_0,Area-2020,Age_of_Residents-2021,Tree_Canopy-2012,Impervious_Surface-2023,Housing_Density-2023,Housing_Size-2023,Housing_Age-2023,New_Residential-2023,Residential_Renovation-2023,Commuters_Driving_Alone-2022,...,Neighborhood_Organizations-2021,Park_Proximity-2023,Job_Density-2019,Home_Sale_Price-2023,Natural_Gas_Consumption-2013,Fire_Call_Rate-2021,Fincancial_Services_Proximity-2023,Public_Health_Insurance -2017,Subsidized_Housing-2023,Residential_Demolitions-2023
NPA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,410.557,31.0,56.157441,23.433162,2.8,1720.0,70.0,1.2,4.1,86.977887,...,0.000000,59.066901,2.460000,488364.0,43.0,49.000000,24.471831,15.0,4.428698,2.6
3,1156.150,33.0,41.978081,38.166350,8.4,2807.0,73.0,17.6,7.1,76.234365,...,4.000000,91.477741,34.330000,667092.0,49.0,63.690000,100.000000,6.0,3.889542,0.6
4,329.242,43.1,65.987392,21.782062,1.2,4158.0,45.0,2.1,7.8,90.712074,...,0.000000,1.466993,0.280000,1493043.0,81.0,49.660000,15.158924,1.0,0.000000,7.3
5,167.141,32.9,42.173622,21.546276,2.1,1195.0,61.0,31.1,0.6,100.000000,...,2.000000,100.000000,1.160000,255031.0,53.0,45.340000,18.965517,43.0,0.000000,2.9
6,403.223,38.0,43.716921,25.103400,2.0,1301.0,67.0,5.2,3.2,60.084034,...,2.000000,79.829891,1.360000,445698.0,51.0,69.000000,69.866343,38.0,0.000000,8.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,373.419,52.3,50.560590,18.343223,1.6,3325.0,28.0,0.8,1.8,78.224101,...,,69.011725,0.160000,1100345.0,59.0,,9.715243,2.0,0.000000,0.0
473,494.455,43.9,50.075170,22.445100,2.4,1938.0,40.0,5.7,2.4,72.263451,...,,88.841202,8.840000,485542.0,,,73.047210,,9.668756,0.9
474,481.474,43.4,44.560229,25.384978,2.5,2589.0,31.0,0.4,1.6,96.083231,...,,36.065574,1.190000,798000.0,51.0,,69.098361,4.0,0.000000,0.0
475,946.283,52.4,61.106394,13.782155,1.0,3276.0,33.0,1.1,4.2,82.779456,...,,99.891658,0.650000,866494.0,,,47.128927,,0.000000,3.3


In [8]:
dropped_df = df[
    [
        "Single_Family_Housing-2023",
        "Housing_Violations-2023",
        "Home_Sale_Price-2023",
        "Housing_Size-2023",
        "Rental_Costs-2021",
        "Housing_Density-2023",
        "Fire_Call_Rate-2021",
        "Neighborhood_School_Attendance-2023",
        "Residential_Tree_Canopy-2012",
    ]
].copy()

dropped_df.dropna(inplace=True)

## Selecting Features

Looked at the feature importance from the graphs and combined them (From the imputation file). This combination seemed to have the best scores.


In [9]:
target = dropped_df["Single_Family_Housing-2023"]

features = dropped_df[
    [
        "Housing_Violations-2023",
        "Home_Sale_Price-2023",
        "Housing_Size-2023",
        "Rental_Costs-2021",
        "Housing_Density-2023",
        "Fire_Call_Rate-2021",
        "Neighborhood_School_Attendance-2023",
        "Residential_Tree_Canopy-2012",
    ]
]

Convert values to a binary classification (Values above 50% is 1, below 50% is 0)


In [10]:
target = target.apply(lambda x: 1 if x >= 50 else 0)
target

NPA
2      0
3      0
5      1
8      0
9      1
      ..
391    1
392    1
393    0
394    0
476    0
Name: Single_Family_Housing-2023, Length: 310, dtype: int64

Training-Validation-Testing splits


In [11]:
X_trn, y_trn, X_vld, y_vld, X_tst, y_tst = project_helper.trn_vld_tst_split(
    features, target, 0.6
)

print(X_trn.shape, y_trn.shape, X_vld.shape, y_vld.shape, X_tst.shape, y_tst.shape)

(186, 8) (186,) (62, 8) (62,) (62, 8) (62,)


Standardization


In [12]:
mean = X_trn.mean()
stdev = X_trn.std()

X_trn = (X_trn - mean) / stdev
X_vld = (X_vld - mean) / stdev
X_tst = (X_tst - mean) / stdev

Grouping up the models


In [13]:
models: list[tuple[str, ClassifierMixin]] = [
    ("Logistic Regression", LogisticRegression(fit_intercept=True)),
    ("KNN", KNeighborsClassifier(5)),
    ("Random Forest", RandomForestClassifier(10)),
]

## Model Performance

Testing to see how the accuracy of the train, validation, and test splits perform with different models.

Notes:

- Random Forest overfits the trainging data it appears. Even changing the forest size to 10 overfits too much.

- LDA requires normal distributions across its features so new featuers were selected just to test them along with trying the other regressions with those features. They do not perform as well as the first set of features.

The model that appears to do the best jobs are Logistic Regresion and KNN, favoring Logistic Regression for its higher training and validation accuracy.


In [14]:
for name, model in models:
    model.fit(X_trn, y_trn)

    print(f"{name}: ")

    for split, X_split, y_split in [
        ("Training", X_trn, y_trn),
        ("Validation", X_vld, y_vld),
        ("Testing", X_tst, y_tst),
    ]:
        y_pred = model.predict(X_split)

        accur = accuracy_score(y_split, y_pred)
        print(f"    {split} accuracy: {accur}")

    print()

Logistic Regression: 
    Training accuracy: 0.8225806451612904
    Validation accuracy: 0.7258064516129032
    Testing accuracy: 0.7580645161290323

KNN: 
    Training accuracy: 0.7849462365591398
    Validation accuracy: 0.6612903225806451
    Testing accuracy: 0.6612903225806451

Random Forest: 
    Training accuracy: 0.9838709677419355
    Validation accuracy: 0.6935483870967742
    Testing accuracy: 0.7419354838709677



In [15]:
dropped_df = df[
    [
        "Single_Family_Housing-2023",
        "Youth_Population-2022",
        "Long_Commute -2022",
        "Housing_Size-2023",
        "Housing_Age-2023",
        "Age_of_Residents-2021",
    ]
].copy()

dropped_df.dropna(inplace=True)

In [16]:
features = dropped_df[
    [
        "Youth_Population-2022",
        "Long_Commute -2022",
        "Housing_Size-2023",
        "Housing_Age-2023",
        "Age_of_Residents-2021",
    ]
]

target = dropped_df["Single_Family_Housing-2023"]
target = target.apply(lambda x: 1 if x >= 50 else 0)

X_trn, y_trn, X_vld, y_vld, X_tst, y_tst = project_helper.trn_vld_tst_split(
    features, target, 0.6
)

print(X_trn.shape, y_trn.shape, X_vld.shape, y_vld.shape, X_tst.shape, y_tst.shape)

(265, 5) (265,) (88, 5) (88,) (89, 5) (89,)


In [17]:
model = LinearDiscriminantAnalysis()
model.fit(X_trn, y_trn)

print("LDA: ")

for split, X_split, y_split in [
    ("Training", X_trn, y_trn),
    ("Validation", X_vld, y_vld),
    ("Testing", X_tst, y_tst),
]:
    y_pred = model.predict(X_split)

    accur = accuracy_score(y_split, y_pred)
    print(f"    {split} accuracy: {accur}")

LDA: 
    Training accuracy: 0.7433962264150943
    Validation accuracy: 0.7045454545454546
    Testing accuracy: 0.7865168539325843


In [18]:
for name, model in models:
    model.fit(X_trn, y_trn)

    print(f"{name}: ")

    for split, X_split, y_split in [
        ("Training", X_trn, y_trn),
        ("Validation", X_vld, y_vld),
        ("Testing", X_tst, y_tst),
    ]:
        y_pred = model.predict(X_split)

        accur = accuracy_score(y_split, y_pred)
        print(f"    {split} accuracy: {accur}")

    print()

Logistic Regression: 
    Training accuracy: 0.7433962264150943
    Validation accuracy: 0.7045454545454546
    Testing accuracy: 0.7640449438202247

KNN: 
    Training accuracy: 0.7245283018867924
    Validation accuracy: 0.6022727272727273
    Testing accuracy: 0.7078651685393258

Random Forest: 
    Training accuracy: 0.9886792452830189
    Validation accuracy: 0.7386363636363636
    Testing accuracy: 0.6292134831460674

