In [80]:
### Postprocessing the cleaned Citibike data

In [81]:
import pandas as pd
import os

In [110]:
directory = "Data/Cleaned/Citibike_Clean/"
all_years_df = pd.DataFrame()

for filename in os.listdir(directory):
    if filename.endswith('.parquet'):
        print(filename)
        file_path = os.path.join(directory,filename)
        df = pd.read_parquet(file_path)
        df.rename(columns={'Num_citibike_rides':df['Year'].iloc[0]}, inplace=True)
        df.drop(['Year','Name'], axis=1, inplace=True)
        if all_years_df.empty:
            all_years_df = df
        else:
            all_years_df = all_years_df.merge(df, on='Tract', how='outer')


2017_citibike.parquet
2022_citibike.parquet
2018_citibike.parquet
2013_citibike.parquet
2016_citibike.parquet
2019_citibike.parquet
2020_citibike.parquet
2015_citibike.parquet
2021_citibike.parquet
2014_citibike.parquet


In [112]:
# fix tract numbers
all_years_df['Tract'] = all_years_df['Tract'].apply(lambda x: '0'+x if len(x)==8 else x)
all_years_df


Unnamed: 0,Tract,2017,2022,2018,2013,2016,2019,2020,2015,2021,2014
0,005000200,,3.0,,,,,,,,
1,005000400,,1.0,,,,,,,,
2,005001600,,12.0,,,,,,,,
3,005001901,,14881.0,,,,,6600.0,,,
4,005001902,,25021.0,,,,,18073.0,,,
...,...,...,...,...,...,...,...,...,...,...,...
1826,081055700,,,,,,,,,18159.0,
1827,081055900,,,,,,,,,12001.0,
1828,081058900,,,,,,,,,25438.0,
1829,081059100,,,,,,,,,19854.0,


In [113]:
# fill nans and missing years
all_years_df.fillna(0, inplace=True)
all_years_df['2012'] = 0
all_years_df['2011'] = 0
all_years_df['2010'] = 0
all_years_df

Unnamed: 0,Tract,2017,2022,2018,2013,2016,2019,2020,2015,2021,2014,2012,2011,2010
0,005000200,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,005000400,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,005001600,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
3,005001901,0.0,14881.0,0.0,0.0,0.0,0.0,6600.0,0.0,0.0,0.0,0,0,0
4,005001902,0.0,25021.0,0.0,0.0,0.0,0.0,18073.0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1826,081055700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18159.0,0.0,0,0,0
1827,081055900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12001.0,0.0,0,0,0
1828,081058900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25438.0,0.0,0,0,0
1829,081059100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19854.0,0.0,0,0,0


In [114]:
# melt df into long format
long = all_years_df.melt(id_vars='Tract')
long['variable'] = long['variable'].astype(str)
long = long.sort_values(['Tract','variable']).reset_index(drop=True)
long

Unnamed: 0,Tract,variable,value
0,005000200,2010,0.0
1,005000200,2011,0.0
2,005000200,2012,0.0
3,005000200,2013,0.0
4,005000200,2014,0.0
...,...,...,...
23798,085020702,2018,0.0
23799,085020702,2019,0.0
23800,085020702,2020,0.0
23801,085020702,2021,0.0


In [115]:
# normalize counts
wide = long.pivot_table(index='Tract', columns='variable', values='value')
for yr in wide.columns[3:]:
    wide[yr] = (wide[yr] - wide[yr].min()) / (wide[yr].max() - wide[yr].min())
wide.sample(10)

variable,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
Tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
81018502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015849
47021100,0.0,0.0,0.0,0.079093,0.073124,0.081959,0.060641,0.057583,0.051285,0.052105,0.036228,0.060523,0.051786
81004500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00381,0.017719,0.019347,0.029304,0.028268,0.027676
61015102,0.0,0.0,0.0,0.019665,0.129347,0.162357,0.09927,0.098587,0.110841,0.059121,0.118088,0.173968,0.182948
5021502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.1e-05,0.0,0.0,0.005016,0.009071
5024600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8e-06
47004300,0.0,0.0,0.0,0.096907,0.084873,0.081502,0.047601,0.033118,0.03449,0.033106,0.028126,0.062019,0.055375
61023600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012467,0.028212,0.030336
47000200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000889,2.4e-05,0.0,0.0,0.004723,0.005833
47105804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7e-06


In [116]:
# melt df into long format again
df_out = wide.reset_index().melt('Tract')
df_out.rename(columns={'variable':'Year', 'value':'Proportion_citibike_rides'}, inplace=True)
df_out

Unnamed: 0,Tract,Year,Proportion_citibike_rides
0,005000200,2010,0.000000
1,005000400,2010,0.000000
2,005001600,2010,0.000000
3,005001901,2010,0.000000
4,005001902,2010,0.000000
...,...,...,...
14425,081122704,2022,0.000003
14426,081142900,2022,0.000002
14427,085006700,2022,0.000002
14428,085008100,2022,0.000007


In [117]:
# df_out.to_parquet('Data/Cleaned/Citibike_Clean/citibike_all_processed.parquet')

In [120]:
# look at name column
df_22 = pd.read_parquet("Data/Cleaned/Citibike_Clean/2022_citibike.parquet")


In [124]:
df_22[df_22.Tract=='005006302']

Unnamed: 0,Tract,Name,Num_citibike_rides,Year
322,5006302,Yankee Stadium-Macombs Dam Park,44712,2022
