### Postprocessing the cleaned Citibike data

In [1]:
import pandas as pd
import os

### Concat the separate files for 2023

In [2]:
directory = "Data/Cleaned/Citibike_Clean/"
df1 = pd.read_parquet(directory+'2023_pt1_citibike.parquet')
df2 = pd.read_parquet(directory+'2023_pt2_citibike.parquet')
df3 = pd.read_parquet(directory+'2023_pt3_citibike.parquet')
df4 = pd.read_parquet(directory+'2023_pt4_citibike.parquet')

In [76]:
merge = df1[['Tract','Num_citibike_rides']].merge(df2[['Tract','Num_citibike_rides']], on='Tract', how='outer')
merge.fillna(0, inplace=True)
merge['Num_citibike_rides'] = merge['Num_citibike_rides_x'] + merge['Num_citibike_rides_y']
merge.drop(['Num_citibike_rides_x', 'Num_citibike_rides_y'], axis=1, inplace=True)

merge = merge.merge(df3[['Tract','Num_citibike_rides']], on='Tract', how='outer')
merge.fillna(0, inplace=True)
merge['Num_citibike_rides'] = merge['Num_citibike_rides_x'] + merge['Num_citibike_rides_y']
merge.drop(['Num_citibike_rides_x', 'Num_citibike_rides_y'], axis=1, inplace=True)

merge = merge.merge(df4[['Tract','Num_citibike_rides']], on='Tract', how='outer')
merge.fillna(0, inplace=True)
merge['Num_citibike_rides'] = merge['Num_citibike_rides_x'] + merge['Num_citibike_rides_y']
merge.drop(['Num_citibike_rides_x', 'Num_citibike_rides_y'], axis=1, inplace=True)
merge['Year'] = '2023'
merge['Name'] = 'na'

# merge.to_parquet(directory+'2023_citibike.parquet')


### Read in all files and merge into one df

In [70]:
directory = "Data/Cleaned/Citibike_Clean/"
all_years_df = pd.DataFrame()

for filename in os.listdir(directory):
    if filename.endswith('.parquet'):
        print(filename)
        file_path = os.path.join(directory,filename)
        df = pd.read_parquet(file_path)
        df.rename(columns={'Num_citibike_rides':df['Year'].iloc[0]}, inplace=True)
        df.drop(['Year','Name'], axis=1, inplace=True)
        df['Tract'] = df['Tract'].apply(lambda x: '360'+x if len(x)==8 else'36'+x if len(x)==9 else x)
        if all_years_df.empty:
            all_years_df = df
        else:
            all_years_df = all_years_df.merge(df, on='Tract', how='outer')
all_years_df.rename(columns={'Tract':'GEOID'}, inplace=True)


2017_citibike.parquet
2022_citibike.parquet
2018_citibike.parquet
2013_citibike.parquet
2016_citibike.parquet
2023_citibike.parquet
2019_citibike.parquet
2020_citibike.parquet
2015_citibike.parquet
2021_citibike.parquet
2014_citibike.parquet


In [72]:
# fill nans and missing years
all_years_df.fillna(0, inplace=True)
all_years_df['2012'] = 0
all_years_df['2011'] = 0
all_years_df['2010'] = 0
all_years_df

Unnamed: 0,GEOID,2017,2022,2018,2013,2016,2023,2019,2020,2015,2021,2014,2012,2011,2010
0,36005000200,0.0,3.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,36005000400,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,36005001600,0.0,12.0,0.0,0.0,0.0,35.0,0.0,0.0,0.0,0.0,0.0,0,0,0
3,36005001901,0.0,14881.0,0.0,0.0,0.0,23026.0,0.0,6600.0,0.0,12870.0,0.0,0,0,0
4,36005001902,0.0,25021.0,0.0,0.0,0.0,32127.0,0.0,18073.0,0.0,21922.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1400,36085008100,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1401,36085012500,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1402,36085020702,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1403,36085022802,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [73]:
# melt df into long format
long = all_years_df.melt(id_vars='GEOID')
long['variable'] = long['variable'].astype(str)
long = long.sort_values(['GEOID','variable']).reset_index(drop=True)
long.rename(columns={'variable':'year', 'value':'num_citibike_rides'}, inplace=True)
long

Unnamed: 0,GEOID,year,num_citibike_rides
0,36005000200,2010,0.0
1,36005000200,2011,0.0
2,36005000200,2012,0.0
3,36005000200,2013,0.0
4,36005000200,2014,0.0
...,...,...,...
19665,36085024700,2019,0.0
19666,36085024700,2020,0.0
19667,36085024700,2021,0.0
19668,36085024700,2022,0.0


In [75]:
# long.to_parquet('Data/Cleaned/Citibike_Clean/citibike_all_processed.parquet')

In [32]:
# # normalize counts
# wide = long.pivot_table(index='GEOID', columns='variable', values='value')
# for yr in wide.columns[3:]:
#     wide[yr] = (wide[yr] - wide[yr].min()) / (wide[yr].max() - wide[yr].min())
# # melt df into long format again
# df_out = wide.reset_index().melt('Tract')