In [None]:
import pandas as pd
import os
import gc
from numba import jit,cuda
import matplotlib.pyplot as plt
import numpy as np

def read(file, start='07/01/2021', end='10/31/2021'):
    """Load one ridership parquet file and keep the rows inside a date window.

    The kind of data set is inferred from substrings of ``file``, tested in
    this order: ``'taxi'`` or ``'hvfhv'``, then ``'citibike'``, then
    ``'subway'``.

    * taxi / hvfhv: ``pickup_datetime`` and ``dropoff_datetime`` are parsed,
      ``pickup_date`` / ``dropoff_date`` columns are added, and only rows
      whose pickup AND dropoff dates fall inside the window are kept.  For
      hvfhv files a fixed list of fare/base columns is dropped as well.
    * citibike: ``started_datetime`` / ``ended_datetime`` are renamed to the
      pickup/dropoff names and then handled exactly like taxi data.
    * subway: ``DATE_TIME`` is parsed, a ``DATE`` column is added and rows
      are filtered on it.

    Timestamps that cannot be parsed become ``NaT`` (``errors='coerce'``) and
    are therefore removed by the date filter.

    The input must already contain a ``'Day of week'`` column; rows with a
    value greater than 4 are treated as weekend rows.
    NOTE(review): presumably Monday == 0 so that >4 means Sat/Sun -- confirm
    against whatever produced the parquet files.

    NOTE: numba's ``@jit`` is intentionally not applied here.  The body is
    pandas I/O and DataFrame calls, which numba does not compile.

    Args:
        file: Path of the parquet file; also used to infer the data set kind.
        start: First day of the window (inclusive), anything accepted by
            ``pd.date_range``.
        end: Last day of the window (inclusive).

    Returns:
        A ``(df, weekend_dates)`` tuple: the filtered DataFrame and a list of
        the unique weekend dates present in it.

    Raises:
        ValueError: If ``file`` contains none of the recognised substrings.
    """
    # Decide how to interpret the file before paying for the parquet read, so
    # an unsupported name fails fast with a clear message.
    if ('taxi' in file) or ('hvfhv' in file):
        kind = 'trip'
    elif 'citibike' in file:
        kind = 'citibike'
    elif 'subway' in file:
        kind = 'subway'
    else:
        raise ValueError(
            "cannot infer data set from file name %r; expected it to contain "
            "'taxi', 'hvfhv', 'citibike' or 'subway'" % (file,)
        )

    # Array of datetime.date objects, matching what ``.dt.date`` yields below.
    dt_range = pd.date_range(start=start, end=end).date
    df = pd.read_parquet(file)

    if kind == 'subway':
        df['DATE_TIME'] = pd.to_datetime(df['DATE_TIME'], errors='coerce')
        df['DATE'] = df['DATE_TIME'].dt.date
        df = df[df['DATE'].isin(dt_range)]
        dt_wkends = df[df['Day of week'] > 4]['DATE'].unique()
        return df, dt_wkends.tolist()

    if kind == 'citibike':
        # Bring citibike onto the taxi column names so one code path serves
        # both.  ``df`` is the freshly read frame, so renaming in place is safe.
        df.rename(columns={'started_datetime': 'pickup_datetime',
                           'ended_datetime': 'dropoff_datetime'}, inplace=True)

    for stamp_col, date_col in (('pickup_datetime', 'pickup_date'),
                                ('dropoff_datetime', 'dropoff_date')):
        df[stamp_col] = pd.to_datetime(df[stamp_col], errors='coerce')
        df[date_col] = df[stamp_col].dt.date

    df = df[df['pickup_date'].isin(dt_range) & df['dropoff_date'].isin(dt_range)]
    dt_wkends = df[df['Day of week'] > 4]['pickup_date'].unique()

    if 'hvfhv' in file:
        # Preview of the frame before the columns below are removed.
        print(df.head())
        # Non-inplace drop: ``df`` is a filtered selection at this point, and
        # mutating it in place can trigger pandas' chained-assignment warning.
        df = df.drop(columns=['congestion_surcharge', 'dispatching_base_num',
                              'Affiliated_base_number', 'sales_tax', 'SR_Flag',
                              'bcf', 'base_passenger_fare', 'driver_pay'])

    return df, dt_wkends.tolist()


def trips_by_day(df):
    """Count the number of rows (trips) per pickup date.

    Args:
        df: DataFrame with a ``pickup_date`` column.  It is not modified.

    Returns:
        A DataFrame indexed by ``pickup_date`` (sorted ascending) with a
        single integer column ``cnt`` holding the row count for each date.
        Rows whose ``pickup_date`` is missing are not counted.
    """
    # ``size()`` counts rows per group directly, so there is no need to add a
    # helper column to the caller's frame or to copy it through ``fillna``.
    return df.groupby(df['pickup_date']).size().to_frame('cnt')
    

def run(dir,file):
    """Read one input file and reduce it to a per-day statistic.

    The file is loaded through ``read(dir+file)`` and its first rows are
    printed.  If ``file`` contains ``'taxi'``, ``'hvfhv'`` or ``'citibike'``
    (checked in that order) the statistic is ``trips_by_day`` of the frame;
    any other file is treated as subway data and the statistic is the sum of
    ``ENTRIES_D`` per ``DATE``.

    Args:
        dir: Prefix concatenated directly in front of ``file``.
        file: File name; also used to pick the transport mode.

    Returns:
        ``(sta, col, dt_wkends)``: the per-day statistic, the mode name used
        as column label, and the weekend date list returned by ``read``.
    """
    frame, weekend_dates = read(dir+file)
    print(frame.head())

    # First trip-based mode whose name occurs in the file name, if any.
    trip_mode = next((m for m in ('taxi', 'hvfhv', 'citibike') if m in file), None)

    if trip_mode is None:
        col = 'subway'
        sta = frame.groupby(["DATE"])['ENTRIES_D'].sum()
    else:
        col = trip_mode
        sta = trips_by_day(frame)

    # The raw frame is no longer needed; release it before returning.
    del frame
    gc.collect()

    return sta,col,weekend_dates

def vis(idir,sta_df,sta_df_nwk):
    """Plot the daily ridership curves, save the figure as PNG and show it.

    Four fixed 2021 date windows (Henri, Elsa, Ida, Nor'easter) are shaded
    and labelled, every column of ``sta_df`` is drawn as a line, and the
    rows of ``sta_df_nwk`` are overlaid as black squares labelled
    'non-workday'.

    Args:
        idir: Prefix concatenated in front of the output file name.
        sta_df: DataFrame indexed by date; each column becomes one line.
        sta_df_nwk: DataFrame indexed by date with columns ``taxi``, ``fhv``,
            ``citibike`` and ``subway`` holding the points to highlight.
    """
    title = 'Daily ridership pattern of multi-modal transportation'
    y_low, y_high = 0, 5

    plt.figure(figsize=(20, 5), dpi=300)

    # (label, [first day, last day]) of each shaded weather event.
    events = [
        ('Henri', pd.to_datetime(['2021-08-21', '2021-08-23'])),
        ('Elsa', pd.to_datetime(['2021-07-08', '2021-07-09'])),
        ('Ida', pd.to_datetime(['2021-09-01', '2021-09-02'])),
        ("Nor'easter", pd.to_datetime(['2021-10-25', '2021-10-26'])),
    ]

    # All bands first, then all captions, so the artist order is unchanged.
    for _, window in events:
        plt.fill_between(window, 0, y_high, facecolor='lightskyblue', alpha=0.4)
    for caption, window in events:
        plt.text(window[0], 0.8 * y_high, caption, fontsize=14)

    plt.plot(sta_df.index, sta_df, marker='o', label=sta_df.columns, alpha=0.5)
    for mode in ('taxi', 'fhv', 'citibike', 'subway'):
        plt.scatter(sta_df_nwk.index, sta_df_nwk[mode],
                    marker='s', c='black', label='non-workday')

    plt.xticks(sta_df.index, rotation=90)

    # Collapse repeated labels (the four 'non-workday' scatters) into a
    # single legend entry.
    handles, labels = plt.gca().get_legend_handles_labels()
    unique_entries = dict(zip(labels, handles))
    plt.legend(unique_entries.values(), unique_entries.keys(),
               ncol=10, loc='upper left', fontsize=14)

    plt.margins(x=0.01, y=0.1)
    plt.title(title, fontsize=16)
    plt.ylim((y_low, y_high))
    plt.grid(axis='y', linestyle='--')
    plt.savefig(idir+title+'.png', format='png', bbox_inches='tight', dpi=300)
    plt.show()


if __name__=='__main__':
    # Script entry point: build (or reload) the per-day ridership table for
    # every transport mode, then plot it.

    # Prefix that is concatenated directly in front of every file name, so a
    # non-empty value has to end with a path separator.
    input_dir = ""
    daily_csv = input_dir+'sta_daily.csv'
    weekend_csv = input_dir+'dt_wkends.csv'

    # Columns handed to the plot, and the names they are shown under.
    norm_columns = ['taxi_norm', 'hvfhv_norm', 'citibike_norm', 'subway_norm']
    plot_names = {'taxi_norm': 'taxi', 'hvfhv_norm': 'fhv',
                  'citibike_norm': 'citibike', 'subway_norm': 'subway'}
    # Each mode is normalised to its own maximum and then shifted by a whole
    # number so the curves are stacked in separate bands of the figure.
    band_offset = {'taxi': 0, 'hvfhv': 1, 'citibike': 2, 'subway': 3}

    # The cached path needs BOTH files; if either is missing, recompute
    # instead of failing halfway through.
    if os.path.exists(daily_csv) and os.path.exists(weekend_csv):
        daily = pd.read_csv(daily_csv)
        daily['date'] = pd.to_datetime(daily['date'], errors='coerce').dt.date
        daily = daily.reset_index().set_index('date')

        dt_non_work = pd.read_csv(weekend_csv)
        dt_non_work['non-workday'] = pd.to_datetime(dt_non_work['non-workday'], errors='coerce').dt.date
        non_work_dates = dt_non_work['non-workday']

    else:
        print('.......except........')
        # os.listdir('') raises FileNotFoundError, so an empty prefix has to
        # be mapped to the current directory explicitly.
        items = [i for i in os.listdir(input_dir or '.') if os.path.splitext(i)[1] == '.gzip']
        if not items:
            raise FileNotFoundError(
                "no '.gzip' input files found in %r" % (input_dir or '.',)
            )

        daily = pd.DataFrame()
        non_work_dates = []
        for file in items:
            # As before, the weekend list of the last processed file is used.
            sta, col, non_work_dates = run(input_dir, file)
            if isinstance(sta, pd.DataFrame):
                # trips_by_day returns a one-column frame; store it as a
                # plain column.
                sta = sta.iloc[:, 0]
            daily[col] = sta
            # Series.max() skips NaN (dates missing for this mode after index
            # alignment); the built-in max() is order-dependent with NaN.
            daily[col+'_norm'] = daily[col] / daily[col].max() + band_offset.get(col, 0)

        # Persist both cache files before plotting, so a failure or an
        # interrupted plot window does not throw the aggregation away.
        daily['date'] = daily.index
        daily.to_csv(daily_csv, index=False)
        pd.DataFrame({'non-workday': non_work_dates}).to_csv(weekend_csv, index=False)

    # rename() returns a new frame, which avoids mutating a column slice of
    # ``daily`` in place.
    vre = daily[norm_columns].rename(columns=plot_names)
    re_nwk = vre[vre.index.isin(non_work_dates)]
    vis(input_dir, vre, re_nwk)
    
    