### Importing and Setting Up the Notebook


In [137]:
import pandas as pd
import numpy as np
import sklearn as sk
import seaborn as sns

from typing import Dict, Generator, List, Sequence
from warnings import simplefilter

In [138]:
simplefilter(action="ignore", category=RuntimeWarning)

### Reading in the Data


In [139]:
clt_housing_df = pd.read_csv(r"./data/Data_Download_20240305.csv")

clt_housing_df.head()

Unnamed: 0,variable_id,NPA,data_year,raw,normalized,Raw_Data_Name,Normalized_Data_Name,Raw_Data_Units,Normalized_Data_Units
0,1,2,2020,,410.557,,Area,land area in acres,land area in acres
1,1,3,2020,,1156.15,,Area,land area in acres,land area in acres
2,1,4,2020,,329.242,,Area,land area in acres,land area in acres
3,1,5,2020,,167.141,,Area,land area in acres,land area in acres
4,1,6,2020,,403.223,,Area,land area in acres,land area in acres


In [140]:
unique_npa = clt_housing_df["NPA"].unique()
unique_npa.sort()

### Helper Functions

Functions to help with data cleaning and preprocessing.

Functions:

- `get_npa_records`:
  - Accepts a dataframe and a seqeunce of NPAs as arguments
  - Returns a generator of records based on NPA.
- `filter_recent_cols`:
  - Accepts a dataframe as an argument
  - Returns a list of columns to use based on recent measurement.


In [141]:
def get_npa_records(
    data_frame: pd.DataFrame, npa_seq: Sequence[int], col_name: str = "normalized"
) -> Generator[pd.Series, None, None]:
    """
    Provides a generator for this chopped data.

    Args:
        data_frame (DataFrame):
            The dataframe with stacked NPAs and distict features in the same column.
        npa_seq (Sequence[int]):
            Sequence of NPA identifiers to process.
        col_name (str):
            Name of the column in `df` whose values will be yielded.

    Yields:
        Series:
            The values of `col_name` for each NPA, indexed by `"Var_Name_Year"`.

    Example:
        >>> # Build a DataFrame of “normalized” series for NPAs 101, 202:
        >>> result_df = pd.DataFrame.from_records(get_npa_records(stacked_df, npa_list)
    """
    df = data_frame.copy()
    df["Var_Name_Year"] = (
        df["Normalized_Data_Name"].astype(str) + "-" + df["data_year"].astype(str)
    )
    for npa in npa_seq:
        record: pd.DataFrame = df[df["NPA"] == npa].copy()
        record.set_index("Var_Name_Year", inplace=True)
        record["variable_id_adj"] = range(1, len(record) + 1)

        yield record[col_name]

In [142]:
def filter_recent_cols(dataFrame: pd.DataFrame) -> List[str]:
    """
    Returns a list of columns based on filtering out variables that
    are the same with different years into the most recently recorded variable.

    Args:
        dataFrame (DataFrame): The dataframe to get columns from.
        Columns must be label with the schema of 'var_name-year'.
        Columns not in said format will be skipped.

    Returns:
        List[str]: The filtered columns
    """
    var_prefix: str = ""
    prev_col = None
    usecols = []
    for col in dataFrame.columns:
        if len(split := col.split("-")) != 2 and not split[1].isnumeric():
            continue

        prefix, _ = split

        if var_prefix != prefix and prev_col:
            usecols.append(prev_col)

        prev_col = col
        var_prefix = prefix

    return usecols

### Unstacking and Ordering Columns


In [None]:
unique_npa = clt_housing_df["NPA"].unique()
unique_npa.sort()

In [143]:
df = pd.DataFrame.from_records(
    get_npa_records(clt_housing_df, unique_npa), index=unique_npa
)

df.index.name = "NPA"

df

Var_Name_Year,Area-2020,Age_of_Residents-2020,Age_of_Residents-2021,Tree_Canopy-2012,Impervious_Surface-2011,Impervious_Surface-2013,Impervious_Surface-2015,Impervious_Surface-2016,Impervious_Surface-2017,Impervious_Surface-2018,...,Residential_Demolitions-2015,Residential_Demolitions-2016,Residential_Demolitions-2017,Residential_Demolitions-2018,Residential_Demolitions-2020,Residential_Demolitions-2022,Residential_Demolitions-2023,High_Speed_Internet-2020,High_Speed_Internet-2021,High_Speed_Internet-2022
NPA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,410.557,30.0,31.0,56.157441,23.340000,23.100000,22.130000,22.160000,22.410000,22.410000,...,1.850000,1.85000,4.630000,3.690000,2.810000,3.542958,2.6,78.318584,75.449102,76.566757
3,1156.150,33.0,33.0,41.978081,36.940000,36.940000,37.290000,37.890000,38.580000,38.760000,...,9.470000,5.71000,4.920000,7.080000,1.570000,1.057579,0.6,89.982079,89.391892,91.624685
4,329.242,45.1,43.1,65.987392,19.740000,19.740000,19.740000,20.050000,20.650000,20.960000,...,7.430000,9.85000,4.900000,19.090000,5.000000,7.352941,7.3,84.560570,95.011338,91.914894
5,167.141,32.4,32.9,42.173622,22.140000,25.730000,22.140000,22.140000,21.540000,21.540000,...,21.150000,12.27000,3.110000,0.000000,0.000000,0.000000,2.9,36.111111,38.034188,63.948498
6,403.223,38.0,38.0,43.716921,25.540000,25.050000,25.050000,25.050000,25.050000,24.800000,...,2.450000,7.35000,6.150000,2.480000,2.530000,16.029593,8.5,57.348703,63.378176,59.839357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,373.419,48.5,52.3,50.560590,14.510000,14.020000,14.510000,16.020000,16.560000,16.820000,...,7.420000,1.85000,10.930000,1.790000,5.320000,5.163511,0.0,92.337917,93.626374,91.327434
473,494.455,39.8,43.9,50.075170,19.350000,19.140000,20.790000,20.970000,21.370000,21.570000,...,4.800000,0.96000,2.860000,0.920000,1.830000,6.266786,0.9,77.741935,80.576923,82.166200
474,481.474,42.0,43.4,44.560229,21.720000,21.320000,21.720000,22.860000,22.860000,23.070000,...,0.870000,0.00000,4.330000,0.860000,0.000000,0.856898,0.0,88.160920,82.111111,81.971154
475,946.283,53.8,52.4,61.106394,9.400000,9.830000,10.570000,10.880000,11.100000,11.200000,...,2.370000,2.36000,2.280000,3.390000,4.820000,0.000000,3.3,90.806754,91.384615,93.897365


### Re-ordered Dataframe Insights

There are 540 columns in the reorganized data. For how many entries we have (463), we aren't going to use all the columns. Many of the columns are about the same variable but different years so many variables may be the same.

Many of the columns follow a Gausian-Normal Distribution, making them potential candidates for a Discriminant Analysis, if need be. The numbers are said to be "Normalized" but they don't appear to be as there are no negative numbers (indicating numbers that fall below the mean) and they are very large (often occuring between -4 and 4).


In [144]:
df = (df - df.mean()) / df.std()

df

Var_Name_Year,Area-2020,Age_of_Residents-2020,Age_of_Residents-2021,Tree_Canopy-2012,Impervious_Surface-2011,Impervious_Surface-2013,Impervious_Surface-2015,Impervious_Surface-2016,Impervious_Surface-2017,Impervious_Surface-2018,...,Residential_Demolitions-2015,Residential_Demolitions-2016,Residential_Demolitions-2017,Residential_Demolitions-2018,Residential_Demolitions-2020,Residential_Demolitions-2022,Residential_Demolitions-2023,High_Speed_Internet-2020,High_Speed_Internet-2021,High_Speed_Internet-2022
NPA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-0.418097,-0.960189,-0.858584,0.599348,0.541611,0.475069,0.355854,0.333440,0.331939,0.314210,...,-0.008876,0.083730,0.154721,0.121431,0.351498,-0.019806,0.796587,0.036770,-0.254797,-0.334001
3,0.557507,-0.557516,-0.588272,-0.513685,1.950626,1.879606,1.922728,1.964108,2.007194,2.008817,...,0.813732,1.112186,0.175616,0.409402,0.051755,-0.059251,-0.171605,0.790008,0.697735,0.796813
4,-0.524497,1.066599,0.776807,1.370966,0.168636,0.134084,0.108833,0.114705,0.149598,0.163924,...,0.593507,2.215244,0.174175,1.429617,0.880883,0.040662,3.071840,0.439882,1.081640,0.818607
5,-0.736605,-0.638050,-0.601787,-0.498335,0.417286,0.741972,0.356887,0.331367,0.241805,0.224039,...,2.074634,2.860027,0.045199,-0.192024,-0.327758,-0.076036,0.941816,-2.689022,-2.810878,-1.281602
6,-0.427693,0.113606,0.087510,-0.377192,0.769540,0.672963,0.657653,0.633035,0.605450,0.561923,...,0.055896,1.549146,0.264243,0.018645,0.283814,0.178369,3.652756,-1.317481,-1.079448,-1.590189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,-0.466691,1.522962,2.020246,0.160014,-0.373213,-0.446404,-0.431717,-0.303070,-0.274136,-0.265169,...,0.592427,0.083730,0.608660,-0.039969,0.958236,0.005914,-0.462063,0.942150,0.987023,0.774491
473,-0.308317,0.355210,0.884933,0.121910,0.128230,0.073193,0.217357,0.210077,0.224192,0.227148,...,0.309588,-0.153400,0.027186,-0.113873,0.114605,0.023424,-0.026376,-0.000470,0.095522,0.086504
474,-0.325302,0.650504,0.817354,-0.310995,0.373772,0.294428,0.313478,0.406006,0.378560,0.382616,...,-0.114671,-0.409182,0.133105,-0.118969,-0.327758,-0.062436,-0.462063,0.672396,0.200333,0.071856
475,0.282897,2.234351,2.033762,0.987824,-0.902630,-0.871621,-0.838939,-0.835914,-0.839807,-0.847657,...,0.047260,0.219615,-0.014606,0.095947,0.837372,-0.076036,1.135455,0.843266,0.833873,0.967486


In [145]:
df = df[filter_recent_cols(df)]

df

Var_Name_Year,Area-2020,Age_of_Residents-2021,Tree_Canopy-2012,Impervious_Surface-2023,Housing_Density-2023,Housing_Size-2023,Housing_Age-2023,New_Residential-2023,Residential_Renovation-2023,Commuters_Driving_Alone-2022,...,Park_Proximity-2023,Job_Density-2019,Home_Sale_Price-2023,Natural_Gas_Consumption-2013,Fire_Call_Rate-2021,Fincancial_Services_Proximity-2023,Public_Nutrition_Assistance-2023,Public_Health_Insurance -2017,Subsidized_Housing-2023,Residential_Demolitions-2023
NPA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-0.418097,-0.858584,0.599348,0.326066,0.157465,-0.586356,1.876569,-0.240473,0.264299,0.224716,...,-0.207756,0.025873,0.049995,-0.602641,0.204785,-0.603198,,-0.057741,0.201382,0.796587
3,0.557507,-0.588272,-0.513685,1.860837,2.768740,0.807002,2.053879,0.562732,0.941271,-0.647024,...,0.791253,6.273766,0.708942,-0.056074,0.597797,1.819447,,-0.778879,0.136022,-0.171605
4,-0.524497,0.776807,1.370966,0.154070,-0.588614,2.538766,0.398982,-0.196394,1.099232,0.527712,...,-1.983175,-0.401501,3.754119,2.858951,0.222443,-0.901920,,-1.179511,-0.335489,3.071840
5,-0.736605,-0.601787,-0.498335,0.129508,-0.168945,-1.259321,1.344637,1.223907,-0.525501,1.281344,...,1.053937,-0.228983,-0.810275,0.308304,0.106867,-0.779819,,2.185799,-0.335489,0.941816
6,-0.427693,0.087510,-0.377192,0.500057,-0.215574,-1.123447,1.699258,-0.044569,0.061208,-1.957479,...,0.432227,-0.189774,-0.107310,0.126115,0.739859,0.852878,,1.785167,-0.335489,3.652756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,-0.466691,2.020246,0.160014,-0.204158,-0.402094,1.470995,-0.605777,-0.260063,-0.254713,-0.485575,...,0.098776,-0.425026,2.306291,0.854871,,-1.076531,,-1.099384,-0.335489,-0.462063
473,-0.308317,0.884933,0.121910,0.223139,-0.029055,-0.306915,0.103464,-0.020081,-0.119318,-0.969228,...,0.709986,1.276628,0.039590,,,0.954908,,,0.836609,-0.026376
474,-0.325302,0.817354,-0.310995,0.529389,0.017575,0.527562,-0.428467,-0.279653,-0.299844,0.963533,...,-0.916733,-0.223102,1.191583,0.126115,,0.828244,,-0.939131,-0.335489,-0.462063
475,0.282897,2.033762,0.987824,-0.679289,-0.681874,1.408185,-0.310260,-0.245370,0.286865,-0.115949,...,1.050597,-0.328965,1.444112,,,0.123552,,,-0.335489,1.135455
