In [17]:
# Imports
import geopandas as gpd
import numpy as np
import pandas as pd
import os 
import requests
import json
import datetime
import time
import random
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


In [13]:
# Load Data
cdf = gpd.read_file('../static/data/centroids_data.geojson')
cdf.head()

Unnamed: 0,OBJECTID,GEOID,county,Tract,Low,Lowmod,Lowmod_pct,category,geometry
0,199060,490111251021,Davis County,125102,120,340,0.1833,blue,POINT (-111.88081 41.08032)
1,199061,490111251022,Davis County,125102,245,490,0.1892,blue,POINT (-111.88004 41.04248)
2,199062,490111251031,Davis County,125103,370,850,0.3131,green,POINT (-111.95631 41.13919)
3,199063,490111251032,Davis County,125103,430,1000,0.4264,green,POINT (-111.94339 41.11548)
4,199064,490111251041,Davis County,125104,165,475,0.1967,blue,POINT (-111.91575 41.12877)


In [15]:
# Get Latitude and Longitude from Geometry
cdf['longitude'] = cdf['geometry'].x

# Extract latitude
cdf['latitude'] = cdf['geometry'].y

In [36]:
cdf.to_csv('../static/data/centroids_data.csv')

In [33]:
# Assuming 'df' is your DataFrame and 'category' is your categorical column
one_hot_encoded_df = pd.get_dummies(cdf, columns=['category'], prefix='category')
one_hot_encoded_df = pd.get_dummies(one_hot_encoded_df, columns=['county'], prefix='county')
one_hot_encoded_df.head()


Unnamed: 0,OBJECTID,GEOID,Tract,Low,Lowmod,Lowmod_pct,geometry,longitude,latitude,County_encoded,category_encoded,category_blue,category_green,category_orange,category_red,county_Davis County,county_Salt Lake County,county_Weber County
0,199060,490111251021,125102,120,340,0.1833,POINT (-111.88081 41.08032),-111.88081,41.080325,0,0,True,False,False,False,True,False,False
1,199061,490111251022,125102,245,490,0.1892,POINT (-111.88004 41.04248),-111.880037,41.042484,0,0,True,False,False,False,True,False,False
2,199062,490111251031,125103,370,850,0.3131,POINT (-111.95631 41.13919),-111.956309,41.139194,0,1,False,True,False,False,True,False,False
3,199063,490111251032,125103,430,1000,0.4264,POINT (-111.94339 41.11548),-111.943393,41.115477,0,1,False,True,False,False,True,False,False
4,199064,490111251041,125104,165,475,0.1967,POINT (-111.91575 41.12877),-111.915753,41.128774,0,0,True,False,False,False,True,False,False


In [31]:
one_hot_encoded_df[['longitude','latitude','county_Davis County','county_Weber County','county_Salt Lake County',\
    'category_red','category_orange','category_blue','category_blue']]

Unnamed: 0,longitude,latitude,county_Davis County,county_Weber County,county_Salt Lake County,category_red,category_orange,category_blue,category_blue.1
0,-111.880810,41.080325,True,False,False,False,False,True,True
1,-111.880037,41.042484,True,False,False,False,False,True,True
2,-111.956309,41.139194,True,False,False,False,False,False,False
3,-111.943393,41.115477,True,False,False,False,False,False,False
4,-111.915753,41.128774,True,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...
907,-111.920500,41.152009,False,True,False,False,False,True,True
908,-111.932628,41.156133,False,True,False,False,False,False,False
909,-111.964104,41.165035,False,True,False,False,False,False,False
910,-111.951588,41.154071,False,True,False,False,False,False,False


In [32]:
one_hot_encoded_df.to_csv('../static/data/centroid_prep.csv')

# LOAD DATA --------------------------------------------------

In [19]:
# Columns to standardi
standardize_cols = ['latitude','longitude',\
                                    'avg_pm2']

# Columns to one-hot encode
encode_cols = ['category_encoded', 'County_encoded','month']  # Ensure 'County' is the correct column name for your dataset


In [20]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), standardize_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), encode_cols)
    ])

In [23]:
# Fit and transform the training data
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the validation and test data
X_val_transformed = preprocessor.transform(X_val)
X_test_transformed = preprocessor.transform(X_test)