# Data Cleaning and Preparation for STAT5605 Project

In [None]:
# Package Imports and Data Upload

## Package Imports
import pandas as pd
import geopandas as gpd
import numpy as np 
from typing import Iterable, Union

## Data Upload
### Spatial layers (shapefiles read with geopandas)
#### Connecticut roads
roads_gdf = gpd.read_file(
    '../data/Connecticut_Road/Road.shp'
)
#### Connecticut county subdivisions (towns)
ct_towns_gdf = gpd.read_file(
    '../data/tl_2020_09_all/tl_2020_09_cousub20.shp'
)

### ACS 2023 tables (CSV files read with pandas)
#### DP03: Selected Economic Characteristics
DP03_df = pd.read_csv(
    '../data/ACS_Selected_Economic_Characteristics_CT_2023/ACSDP5Y2023.DP03-Data.csv'
)
#### DP05: Demographic and Housing Estimates
DP05_df = pd.read_csv(
    '../data/ACS_Demographic_and_Housing_Estimates_CT_2023/ACSDP5Y2023.DP05-Data.csv'
)

In [16]:
# Preview the first five rows of the DP03 table. In the captured output,
# row 0 holds the descriptive ACS column labels rather than town data.
print(DP03_df.head(n=5))

                GEO_ID                                               NAME  \
0            Geography                               Geographic Area Name   
1  0600000US0911001080  Andover town, Capitol Planning Region, Connect...   
2  0600000US0911002060    Avon town, Capitol Planning Region, Connecticut   
3  0600000US0911004300  Berlin town, Capitol Planning Region, Connecticut   
4  0600000US0911005910  Bloomfield town, Capitol Planning Region, Conn...   

                                          DP03_0001E  \
0  Estimate!!EMPLOYMENT STATUS!!Population 16 yea...   
1                                               2811   
2                                              15105   
3                                              16813   
4                                              18788   

                                          DP03_0001M  \
0  Margin of Error!!EMPLOYMENT STATUS!!Population...   
1                                                 75   
2                               

### Exploration of Data (Mapping Shapefiles)

In [1]:
# Mapping Shapefiles

## Mapping CT towns (interactive map; uncomment the line below to render)

#ct_towns_gdf.explore()

## Merge of ACS Census Data into CT towns map

In [None]:
## Managing Merge of ACS Census Data into CT towns map

### Function: Merging csv into gdf 
def merge_csv_into_gdf(
    gdf: gpd.GeoDataFrame,
    csv_or_df: Union[str, pd.DataFrame],
    gdf_key: str = "GEOID10",
    csv_geo_col: str = "Geography",
    keep_columns: Iterable[str] = (), ## COLUMNS TO KEEP SPECIFICATION
    join_type: str = "left",
    geoid_regex: str = r"US(\d+)",
) -> gpd.GeoDataFrame:
    """
    Merge a CSV (or DataFrame) into a GeoDataFrame using a GEOID embedded
    in a geography-identifier column.

    Neither ``gdf`` nor ``csv_or_df`` is modified; a new frame is returned.

    Requires pandas, geopandas, and typing:
    'from typing import Iterable, Union'

    Parameters
    ----------
    gdf : GeoDataFrame
        Target GeoDataFrame (e.g. CT towns)
    csv_or_df : str or DataFrame
        Path to CSV file or already-loaded DataFrame
    gdf_key : str
        GEOID column in the GeoDataFrame. Values are compared as strings;
        no zero padding is applied, so keys stored as integers that have
        lost leading zeros will not match.
    csv_geo_col : str
        Column containing the GEOID string (e.g. 'US09001')
    keep_columns : iterable of str
        Columns to keep from the CSV (excluding GEOID). A single column
        name given as a plain string is accepted as well.
    join_type : str
        pandas merge type ('left', 'right', 'inner', 'outer')
    geoid_regex : str
        Regex used to extract the GEOID; it must contain exactly one
        capture group. Rows of the CSV whose ``csv_geo_col`` value does
        not match (for example a descriptive label row) are left out of
        the merge.

    Returns
    -------
    GeoDataFrame
        ``gdf`` with ``gdf_key`` cast to string and the requested CSV
        columns appended.

    Raises
    ------
    KeyError
        If ``gdf_key`` is missing from ``gdf``, or ``csv_geo_col`` or any
        of ``keep_columns`` is missing from the CSV data.
    """

    # Load CSV if data is not already imported as dataframe. A DataFrame is
    # used directly because nothing below writes to it.
    if isinstance(csv_or_df, str):
        df = pd.read_csv(csv_or_df)
    else:
        df = csv_or_df

    # A bare string would otherwise be unpacked into single characters.
    if isinstance(keep_columns, str):
        keep_columns = [keep_columns]
    keep = list(keep_columns)

    # Fail early with a message that names the offending columns.
    if gdf_key not in gdf.columns:
        raise KeyError(f"gdf has no column {gdf_key!r}")
    missing = [col for col in (csv_geo_col, *keep) if col not in df.columns]
    if missing:
        raise KeyError(f"CSV data is missing column(s): {missing}")

    # Extract GEOID from dataframe
    geoids = (
        df[csv_geo_col]
        .astype(str)
        .str.extract(geoid_regex, expand=False)
    )

    # Rows without a GEOID cannot be joined meaningfully. Casting their
    # missing key to str would turn it into the literal "nan", which could
    # match other "nan" keys or leak into right/outer joins.
    has_geoid = geoids.notna()

    # Subset to only needed columns, plus the temporary join key
    right = df.loc[has_geoid, keep].copy()
    right["_merge_geoid"] = geoids[has_geoid].astype(str)

    # Compare keys as strings, working on a copy so the caller's
    # GeoDataFrame keeps its original dtype.
    left = gdf.copy()
    left[gdf_key] = left[gdf_key].astype(str)

    # Merge
    merged = left.merge(
        right,
        how=join_type,
        left_on=gdf_key,
        right_on="_merge_geoid",
    )

    # Clean up the temporary join key
    return merged.drop(columns="_merge_geoid")



## Perform merge for census data into CT towns

### ACS estimate columns to attach, mapped to readable names.
### DP05 is the demographic table (total population); DP03 is the economic
### table (median household income).
acs_sources = [
    (DP05_df, {"DP05_0001E": "Population"}),
    (DP03_df, {"DP03_0062E": "MedianIncome"}),
]

### Execute merging of the ACS tables into the towns GeoDataFrame.
### The loaded DataFrames are passed directly (a string argument is treated
### as a CSV path by merge_csv_into_gdf), and DP03_df / DP05_df themselves
### are left unchanged.
###
### Join key: the ACS NAME values place towns in planning regions (e.g.
### "Capitol Planning Region"), while the towns layer is a 2020 TIGER file,
### so the county part of the two GEOIDs is expected to differ. The join
### therefore uses only the trailing 5-digit county-subdivision code.
### NOTE(review): confirm that the towns layer exposes 'COUSUBFP20' and
### that placeholder subdivisions (code 00000) are removed in the cleanup
### step, since they can repeat across counties.
### The merged values are still strings because the ACS files carry a
### label row beneath the header; convert them during cleanup.
for acs_df, renames in acs_sources:
    ct_towns_gdf = merge_csv_into_gdf(
        ct_towns_gdf,
        acs_df.rename(columns=renames),
        gdf_key="COUSUBFP20",
        csv_geo_col="GEO_ID",
        keep_columns=list(renames.values()),
        geoid_regex=r"US\d{5}(\d{5})$",
    )


#### Data Cleanup (Removing excess information, false towns)

In [None]:
## 

## Calculation of Road Lengths in Towns

In [None]:
# Calculate Road Lengths in Town
# TODO: not implemented yet -- this cell currently holds only the note below.

'''
Here, we want to calculate the length of primary and secondary roads
'''

## 

## Merge Road Data into CT Towns Dataset

In [None]:
## 