In [585]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## ACS and Citibike

In [586]:
acs = pd.read_parquet("Data/Cleaned/ACS.parquet")
citi =  pd.read_parquet("Data/Cleaned/Citibike_Clean/citibike_all_processed.parquet")

# merge df's together on hard tract name (with decimal points)
df = acs.merge(citi, left_on=['full_tract','year'], right_on=['Tract','Year'], how='left')
df.drop(['Tract','Year'],axis=1,inplace=True)
df.shape

(19286, 19)

In [587]:
# do a fuzzy join on the remaining counties by removing decimal points from tract

no_bike = df[df.Proportion_citibike_rides.isna()].drop('Proportion_citibike_rides', axis=1)
no_bike['fuzzy_tract'] = no_bike['full_tract'].apply(lambda x: x[:-2])

citi_fuzzy = citi.copy()
citi_fuzzy['fuzzy_tract'] = citi_fuzzy['Tract'].apply(lambda x: x[:-2])
citi_fuzzy.drop('Tract',axis=1,inplace=True)

df_fuzzy = no_bike.merge(citi_fuzzy, left_on=['fuzzy_tract','year'], right_on=['fuzzy_tract','Year'], how='left').drop_duplicates()
df_fuzzy.drop(['fuzzy_tract','Year'], axis=1, inplace=True)

no_bike.shape
df_fuzzy.shape

(10779, 19)

In [588]:
# do a fuzzy join on the remaining counties by removing decimal points from tract

no_bike = df[df.Proportion_citibike_rides.isna()].drop('Proportion_citibike_rides', axis=1)
no_bike['fuzzy_tract'] = no_bike['full_tract'].apply(lambda x: x[:-2])

citi_fuzzy = citi.copy()
citi_fuzzy['fuzzy_tract'] = citi_fuzzy['Tract'].apply(lambda x: x[:-2])
citi_fuzzy.drop('Tract',axis=1,inplace=True)

df_fuzzy = no_bike.merge(citi_fuzzy, left_on=['fuzzy_tract','year'], right_on=['fuzzy_tract','Year'], how='left').drop_duplicates()
df_fuzzy.drop(['fuzzy_tract','Year'], axis=1, inplace=True)

no_bike.shape
df_fuzzy.shape

(10779, 19)

In [589]:
# add the fuzzy matches back into main df
df_w_bike = df[~df.Proportion_citibike_rides.isna()]
df = pd.concat([df_fuzzy, df_w_bike]).drop_duplicates(['full_tract','year'])
df['Proportion_citibike_rides'] = df['Proportion_citibike_rides'].fillna(0)
df.shape

(19286, 19)

## Add Tree Census

In [590]:
tree = pd.read_parquet("Data/Cleaned/tree_census.parquet")
tree['year'] = tree['year'].astype(str)
df = df.merge(tree, on=['full_tract', 'year'], how='left')
df['num_trees'] = df['num_trees'].fillna(0)

#### TODO: fill tree nans with the average of that area


## Outliers

In [591]:
# replace all outliers with a common value

df['med_income'] = df['med_income'].astype(float)
for col in ['med_income','med_value','med_hh_age']:
    df[col] = np.where(df[col]<0, -999, df[col])

df['med_hh_residence'] = np.where(df['med_hh_residence']>666668689, -999, df['med_hh_residence'])

# change house age of 2024 to zero
df.loc[df['med_hh_age']>1000, 'med_hh_age'] = 2024-df.loc[df['med_hh_age']>1000, 'med_hh_age']

# replace missing values with the mean of that year. TODO: make it the mean of that geo area too
for yr in df.year.unique():
    for col in ['med_value','med_hh_age','med_income','med_hh_residence']:
        mean = df[df.year==yr][col].mean()
        df.loc[df.year==yr, col] = df.loc[df.year==yr, col].replace(to_replace= -999, value=mean)

In [594]:
df.describe()

Unnamed: 0,percent_white,percent_black,percent_native,percent_asian,percent_pacific,percent_latino,percent_bachelors,percent_grad,percent_instate,med_income,percent_owned,percent_rented,med_value,med_hh_age,med_hh_residence,Proportion_citibike_rides,num_trees
count,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0,19286.0
mean,0.422996,0.256324,0.004259,0.13903,0.000523,0.266838,0.203618,0.137696,0.248753,65551.589651,0.588829,0.411171,619916.3,63.078514,19.273188,0.013777,281.772783
std,0.297577,0.300465,0.011406,0.167011,0.003949,0.225653,0.104678,0.117624,0.108836,33632.606403,0.357335,0.357335,289676.4,27.386113,4.886243,0.061164,280.105712
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9001.0,0.0,0.0,9999.0,0.0,4.0,0.0,-379.5
25%,0.142131,0.019725,0.0,0.021584,0.0,0.092199,0.127234,0.054783,0.17433,41989.25,0.255629,0.0,442225.0,59.0,16.0,0.0,146.0
50%,0.387614,0.098034,0.0,0.071981,0.0,0.184552,0.184101,0.102276,0.235342,60167.5,0.597937,0.402063,566588.4,72.0,20.0,0.0,218.6
75%,0.692257,0.442421,0.003949,0.196184,0.0,0.394121,0.262059,0.179918,0.305949,81667.0,1.0,0.744371,728100.0,85.0,22.0,0.0,319.5
max,1.0,1.0,0.37037,0.943174,0.181625,1.0,1.0,0.735763,1.0,250001.0,1.0,1.0,2000001.0,86.0,55.0,1.0,3433.5


## Feature Engineering

In [436]:
# normalize certain columns

for yr in df.year.unique():
    yr_df = df[df.year == yr]
    for col in ['med_income','med_value','med_hh_age','med_hh_residence','num_trees']:
        yr_df[col] = yr_df[col].astype(float)
        yr_df[col] = (yr_df[col] - yr_df[col].min()) / (yr_df[col].max() - yr_df[col].min())
    dropped = df.drop(df.iloc[yr_df.index].index)  # drop that year from main df
    df = pd.concat([dropped, yr_df]).reset_index(drop=True)  # add that year back


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yr_df[col] = yr_df[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yr_df[col] = (yr_df[col] - yr_df[col].min()) / (yr_df[col].max() - yr_df[col].min())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yr_df[col] = yr_df[col].astype(float)
A value is trying to be set on a copy of a s

Unnamed: 0,NAME,percent_white,percent_black,percent_native,percent_asian,percent_pacific,percent_latino,percent_bachelors,percent_grad,percent_instate,med_income,percent_owned,percent_rented,med_value,year,med_hh_age,med_hh_residence,full_tract,Proportion_citibike_rides,num_trees
0,"Census Tract 19, Bronx County, New York",0.388368,0.255269,0.000000,0.013271,0.0,0.611632,0.114817,0.062861,0.202186,0.116155,0.294118,0.705882,0.999024,2013,1.000000,0.121212,005001900,0.0,0.000000
1,"Census Tract 20, Bronx County, New York",0.190397,0.393921,0.016202,0.023297,0.0,0.629139,0.090335,0.039250,0.248108,0.044663,1.000000,0.000000,0.999059,2013,1.000000,0.303030,005002000,0.0,0.000000
2,"Census Tract 38, Bronx County, New York",0.147947,0.401239,0.000000,0.006971,0.0,0.542215,0.062417,0.091633,0.262587,0.127759,1.000000,0.000000,0.999169,2013,1.000000,0.333333,005003800,0.0,0.032278
3,"Census Tract 40.01, Bronx County, New York",0.202721,0.197279,0.000000,0.242857,0.0,0.365306,0.143541,0.036364,0.246939,0.17907,1.000000,0.000000,0.999059,2013,1.000000,0.333333,005004001,0.0,0.051669
4,"Census Tract 42, Bronx County, New York",0.217257,0.489837,0.005250,0.010499,0.0,0.536277,0.125163,0.034393,0.331673,0.087027,1.000000,0.000000,0.998715,2013,1.000000,0.363636,005004200,0.0,0.093520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19281,"Census Tract 871, Queens County, New York",0.206659,0.293341,0.010333,0.249139,0.0,0.450057,0.172603,0.042922,0.213548,0.024535,1.000000,0.000000,0.998627,2010,0.757576,0.277778,081087100,0.0,0.048638
19282,"Census Tract 964, Queens County, New York",0.123423,0.749099,0.021396,0.013063,0.0,0.173874,0.176573,0.068881,0.260135,0.236938,0.929655,0.070345,0.999087,2010,0.500000,0.083333,081096400,0.0,0.130333
19283,"Census Tract 987, Queens County, New York",0.808324,0.011837,0.000000,0.159603,0.0,0.085911,0.189031,0.141640,0.426880,0.297743,1.000000,0.000000,0.999507,2010,0.772727,0.250000,081098700,0.0,0.205912
19284,"Census Tract 1429, Queens County, New York",0.574949,0.000000,0.000000,0.393157,0.0,0.089591,0.272912,0.204380,0.281531,0.259191,0.941650,0.058350,0.999440,2010,0.787879,0.277778,081142900,0.0,0.250328


In [438]:
df.sort_values(['full_tract','year'], inplace=True)
df.drop('NAME', axis=1, inplace=True)
df.set_index(['full_tract', 'year'],inplace=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,percent_white,percent_black,percent_native,percent_asian,percent_pacific,percent_latino,percent_bachelors,percent_grad,percent_instate,med_income,percent_owned,percent_rented,med_value,med_hh_age,med_hh_residence,Proportion_citibike_rides,num_trees
full_tract,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
005000200,2010,0.145124,0.309146,0.008866,0.042697,0.0,0.715119,0.102853,0.039775,0.231918,0.210892,0.992405,0.007595,0.999090,0.863636,8.333333e-02,0.0,0.089704
005000200,2011,0.165115,0.321145,0.008858,0.036339,0.0,0.695889,0.141304,0.036879,0.214626,0.224313,0.956407,0.043593,0.999136,1.000000,1.499995e-09,0.0,0.095278
005000200,2012,0.176806,0.318547,0.007816,0.057879,0.0,0.658428,0.157368,0.042918,0.250739,0.260214,1.000000,0.000000,0.999135,0.833333,1.349996e-08,0.0,0.101046
005000200,2013,0.278304,0.309592,0.007205,0.053314,0.0,0.666118,0.131664,0.036086,0.237546,0.289403,1.000000,0.000000,0.999128,1.000000,2.727273e-01,0.0,0.107020
005000200,2014,0.315211,0.279293,0.001374,0.064966,0.0,0.687929,0.123925,0.022828,0.244553,0.272194,1.000000,0.000000,0.999117,0.695652,2.962963e-01,0.0,0.113211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
085032300,2018,0.399824,0.509666,0.000000,0.006151,0.0,0.318102,0.127686,0.069532,0.358524,0.255397,0.756818,0.243182,0.997413,0.999997,1.349996e-08,0.0,0.070664
085032300,2019,0.465983,0.391426,0.000000,0.014911,0.0,0.422181,0.132791,0.062331,0.375582,0.229861,0.694013,0.305987,0.997408,0.999997,1.949994e-08,0.0,0.091374
085032300,2020,0.386667,0.345833,0.000000,0.012500,0.0,0.323333,0.123762,0.237624,0.417500,0.352113,0.838202,0.161798,0.997393,0.999997,1.799995e-08,0.0,0.121307
085032300,2021,0.305882,0.390045,0.000000,0.029864,0.0,0.297738,0.093315,0.207521,0.396380,0.312937,0.752381,0.247619,0.997364,0.999997,2.249993e-08,0.0,0.132458


# Modeling

In [None]:
df.columns


In [None]:
# define feature set
feat = ['percent_white', 'percent_black', 'percent_native', 'percent_asian', 'percent_pacific', 'percent_latino',
       'percent_bachelors', 'percent_grad', 'percent_instate', 'percent_owned', 'percent_rented', 'med_income','med_hh_age',
       'med_hh_residence', 'Proportion_citibike_rides', 'num_trees']

# split the data
test = df[df.year.isin(['2021','2022'])]
train = df[df.year.isin(['2021','2022'])==False]
test.set_index(['full_tract','year'],inplace=True)
train.set_index(['full_tract','year'],inplace=True)

y_train = train['med_value'].astype(float)
y_test = test['med_value'].astype(float)
X_train = train[feat]
X_test = test[feat]

X_train.head()

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)
