In [None]:
%load_ext autoreload
%autoreload 1

%aimport Seasonal_Outliers
import importlib

importlib.reload(Seasonal_Outliers)

import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy import stats
import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable
from numba import jit,cuda

import Ipynb_importer
# import Data_Cleaning_Taxi
from prophet import Prophet
import logging
logger = logging.getLogger('cmdstanpy')
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.CRITICAL)
import gc

def read(file):
    """Load one trip-record parquet file and snap its timestamps to 4-hour slots.

    The path and the first rows of the frame are printed as progress output.
    A file is treated as an FHV file when its path contains ``'fhv_'``; its
    column names are then renamed to the spelling used by the other files
    (``dropoff_datetime``, ``PULocationID``, ``DOLocationID``).

    For every file the pickup/drop-off columns are parsed as datetimes
    (unparseable values become ``NaT``) and three helper columns are added:
    ``pickup_date``, ``dropoff_date`` and ``Day of week``.

    For non-FHV files the licence/base/flag columns are dropped, ``trip_time``
    (treated as seconds) is converted to hours, and ``request_datetime`` /
    ``on_scene_datetime`` are parsed as datetimes.

    Parameters
    ----------
    file : str
        Path of the parquet file to load.

    Returns
    -------
    tuple
        ``(df, weekend_dates)`` where ``weekend_dates`` is a list of the
        distinct pickup dates whose ``dayofweek`` is greater than 4.
    """
    print(file)
    df = pd.read_parquet(file)
    print(df.head())

    is_fhv = 'fhv_' in file
    if is_fhv:
        df.rename(columns={'dropOff_datetime':'dropoff_datetime',
                           'PUlocationID':'PULocationID',
                           'DOlocationID':'DOLocationID'}, inplace=True)

    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
    df['pickup_date'] = df['pickup_datetime'].dt.date
    df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'], errors='coerce')
    df['dropoff_date'] = df['dropoff_datetime'].dt.date
    df['Day of week'] = df['pickup_datetime'].dt.dayofweek
    # Dates are taken before the timestamps are binned below.
    dt_wkends = df.loc[df['Day of week'] > 4, 'pickup_date'].unique()

    if not is_fhv:
        df.drop(['hvfhs_license_num', 'dispatching_base_num', 'originating_base_num','shared_request_flag',
                 'shared_match_flag', 'access_a_ride_flag', 'wav_request_flag', 'wav_match_flag'],
                axis=1, inplace=True)

        # Lower-case 's' is the unit spelling accepted by every pandas
        # release; the upper-case alias is deprecated.
        df['trip_time'] = pd.to_timedelta(df['trip_time'], unit='s') / pd.Timedelta(1, 'h')
        df['request_datetime'] = pd.to_datetime(df['request_datetime'], errors='coerce')
        df['on_scene_datetime'] = pd.to_datetime(df['on_scene_datetime'], errors='coerce')

    # Same binning for both file kinds: round to the nearest hour, move up to
    # the next even hour, then floor to a multiple of four hours.
    for column in ('pickup_datetime', 'dropoff_datetime'):
        df[column] = df[column].dt.round(freq='1h').dt.ceil(freq='2h').dt.floor(freq='4h')

    return df, dt_wkends.tolist()
    



def combine_df(idir,files):
    """Load several parquet files and stack them into one DataFrame.

    Parameters
    ----------
    idir : str
        Path prefix that is concatenated directly with each file name, so it
        must be empty or end with a path separator.
    files : iterable of str
        File names to load with ``read``.

    Returns
    -------
    tuple
        ``(df, dt_non_work)``: the row-wise concatenation of all loaded
        frames (an empty DataFrame when ``files`` is empty) and a NumPy array
        holding the weekend dates reported by ``read`` followed by a fixed
        list of 2021 holiday dates.
    """
    frames = []
    dt_non_work = []  # non-working days
    for fi in files:
        dfi, dt_wkends = read(idir + fi)
        frames.append(dfi)
        dt_non_work += dt_wkends

    # One concat at the end replaces the per-file DataFrame.append, which was
    # removed in pandas 2.0 and re-copied the accumulated frame each time.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    del frames
    gc.collect()

    holiday = pd.to_datetime(pd.Series(['2021-07-04','2021-07-05','2021-09-06','2021-09-16','2021-10-11'])).dt.date.unique()
    dt_non_work = np.append(dt_non_work, holiday)  # weekends + holidays

    return df, dt_non_work


def vis(idir,outliers,col,title):
    """Plot outlier residuals as a zone-by-time heat map and save it as a PNG.

    Rows are the distinct values of ``outliers[col[0]]``; columns are 4-hour
    steps covering 2021-07-01 to 2021-10-31. A cell holds the ``norm_resid``
    of the matching outlier row and 0 where there is none. Four date windows,
    labelled Ida, Henri, Elsa and Nor'eastern in the legend, are outlined on
    top of the heat map.

    Parameters
    ----------
    idir : str
        Path prefix; the image is written under ``idir + 'result/'`` (the
        directory is not created here).
    outliers : pandas.DataFrame
        Must contain the two columns named in ``col`` plus ``norm_resid``.
    col : sequence of str
        ``col[0]`` is the row (zone) column, ``col[1]`` the timestamp column.
    title : str
        Used for the output file name after replacing ``':'`` with ``'_'``
        and ``'/'`` with a space.
    """
    start = pd.to_datetime('07/01/2021 00:00:00')
    end = pd.to_datetime('10/31/2021 23:59:59')
    dt_range = pd.date_range(start=start, end=end, freq='4h')

    zone_col, time_col = col[0], col[1]
    df = pd.DataFrame(0.0, index=outliers[zone_col].unique(), columns=dt_range)

    # Write each residual with a single label-based assignment. The earlier
    # chained form (df.loc[row][col] = ...) may assign into a temporary copy
    # and leave the grid untouched.
    plotted = outliers[outliers[time_col].isin(dt_range) & outliers[zone_col].notna()]
    for zone, stamp, resid in zip(plotted[zone_col], plotted[time_col], plotted['norm_resid']):
        df.at[zone, stamp] = resid

    fig, ax = plt.subplots(figsize=(60, 20))

    ax = sns.heatmap(df, cmap='bwr', center=0, xticklabels=df.columns.strftime('%Y-%m-%d,%H'),
                     cbar_kws={"shrink": 0.5}, zorder=1)

    # White separator between zone rows.
    for i in range(df.shape[0] + 1):
        ax.axhline(i, color='white', lw=1.5, zorder=2)
    y1 = 0
    y2 = len(df)

    xlabels = ax.get_xticklabels()
    print(xlabels[0].get_text())
    print(xlabels[0])

    # (legend label, first tick label, last tick label, edge colour, alpha)
    storm_windows = [
        ('Ida', "2021-09-01,00", "2021-09-03,00", 'blue', 0.5),
        ('Henri', "2021-08-21,00", "2021-08-24,00", 'gold', 0.8),
        ('Elsa', "2021-07-08,00", "2021-07-10,00", 'green', 0.5),
        ("Nor'eastern", "2021-10-25,00", "2021-10-27,00", 'deeppink', 0.5),
    ]
    for name, first, last, edge, alpha in storm_windows:
        xs = [lbl.get_position()[0] for lbl in xlabels if str(lbl.get_text()) in (first, last)]
        print(xs)
        # Widen by half a unit on each side of the two tick positions.
        ax.fill_between([xs[0] - 0.5, xs[1] + 0.5], y1, y2, alpha=alpha, color='none',
                        edgecolor=edge, zorder=3, label=name, linewidth=3)

    # Show every other tick label only.
    for i, label in enumerate(xlabels):
        label.set_visible(i % 2 == 0)
    if "Line's" in title:
        ax.tick_params(axis='x', labelsize=7)
    ax.legend(ncol=4, loc='upper right', fontsize=40)

    fig = plt.gcf()
    # Last axes of the figure; presumably the colour bar added by the heat map.
    cax = fig.axes[-1]
    plt.subplots_adjust(left=0.15, right=0.95, bottom=0.15, top=0.9, wspace=0.2, hspace=0.2)
    cax.set_position([.796, .2, .03, .6])
    plt.savefig(idir+'result/'+title.replace(':','_').replace('/',' ')+'.png',format='png',bbox_inches='tight',dpi=300)
    plt.show()


    


def find_outliers(idir,df,title):
    """Detect per-zone outliers in 4-hourly trip counts, then plot and save them.

    The zone and timestamp columns are chosen from ``title``: ``"Pickups"``
    selects ``PULocationID`` / ``pickup_datetime`` and ``"Dropoffs"`` selects
    ``DOLocationID`` / ``dropoff_datetime``. For each zone the trips are
    counted per timestamp; zones whose mean count is below the threshold
    (6 when ``' FHV'`` occurs in ``title``, otherwise 12) are skipped. The
    remaining count series are passed to
    ``Seasonal_Outliers.seasonal_de_hour``; its two results are used here as
    the outlier timestamps and a frame with ``ds`` and ``norm_resid`` columns
    (NOTE(review): confirm against that module).

    The merged outliers are drawn with ``vis`` and the stacked per-zone
    frames are written to ``idir + 'result/' + <title>.csv``.

    This function is intentionally not numba-compiled: it only orchestrates
    pandas objects, which numba cannot type.

    Parameters
    ----------
    idir : str
        Path prefix for the ``result/`` output directory.
    df : pandas.DataFrame
        Trip records containing the zone and timestamp columns.
    title : str
        Must contain ``"Pickups"`` or ``"Dropoffs"``; also used for the
        output file names.

    Raises
    ------
    ValueError
        If ``title`` names neither direction.
    """
    if "Pickups" in title:
        col = ["PULocationID", 'pickup_datetime']
    elif "Dropoffs" in title:
        col = ["DOLocationID", 'dropoff_datetime']
    else:
        raise ValueError("title must contain 'Pickups' or 'Dropoffs': %r" % (title,))
    zone_col, time_col = col

    key = 'cnt'
    min_mean_count = 6 if ' FHV' in title else 12

    out_frames = []
    sta_frames = []
    # Group by the scalar key: a one-element list key makes newer pandas
    # yield 1-tuples, which would turn the zone label into e.g. "Z(12,)".
    for nm, df2 in df.groupby(zone_col):
        nm = 'Z' + str(nm)
        sta_df = df2.groupby(time_col).size().to_frame(key)
        if sta_df[key].mean() < min_mean_count:
            continue
        tmp, forecast = Seasonal_Outliers.seasonal_de_hour(sta_df, key, plot=False)
        if len(tmp) > 0:
            out = pd.DataFrame()
            out[time_col] = tmp
            out[zone_col] = nm
            out_frames.append(out[col])
            # Tag only the zone column; assigning to both names in `col`
            # would overwrite the timestamp column with the zone label.
            forecast[zone_col] = nm
            sta_frames.append(forecast)

    if not sta_frames:
        # Nothing passed the threshold or produced outliers; there is
        # nothing to merge, plot or save.
        print('No outliers found for ' + title)
        return

    outs = pd.concat(out_frames, ignore_index=True)
    stas = pd.concat(sta_frames, ignore_index=True)

    # Work on a copy so the assignments below do not target a slice of stas.
    sst = stas[['ds', 'norm_resid', zone_col]].copy()
    print(sst, outs)
    sst['ds'] = pd.to_datetime(sst['ds'], errors='coerce')
    sst[time_col] = sst['ds']
    outs = pd.merge(outs, sst, on=col)
    print(outs)

    vis(idir, outs, col, title)
    stas.to_csv(idir+'result/'+title.replace(':','_').replace('/',' ')+'.csv',index=False)
    


    
    
def run(idir,files,_title):
    """Run the outlier analysis for pickups and then for drop-offs.

    Parameters
    ----------
    idir : str
        Path prefix passed on to the loading and output steps.
    files : pandas.DataFrame or iterable of str
        Either the already-combined trip records, or a list of parquet file
        names that is loaded with ``combine_df(idir, files)``.
    _title : str
        Prefix of the titles; ``"Zone's Pickups"`` and ``"Zone's Dropoffs"``
        are appended to it.
    """
    # Use the argument instead of a module-level `df`, so the function also
    # works when it is not called from this script's __main__ section.
    if isinstance(files, pd.DataFrame):
        df = files
    else:
        df, _ = combine_df(idir, files)

    for direction in ("Pickups", "Dropoffs"):
        find_outliers(idir, df, _title + "Zone's " + direction)
        
 


    


if __name__=='__main__':

    # Path prefix of the parquet inputs and of the 'result/' output folder.
    # It is concatenated directly with file names, so it must be empty
    # (current directory) or end with a path separator.
    input_dir = ""
    # os.listdir('') raises FileNotFoundError, so list the current directory
    # when the prefix is empty.
    items = [i for i in os.listdir(input_dir or os.curdir) if os.path.splitext(i)[1] == '.parquet']
    f_fi = [i for i in items if 'fhv_' in i]
    hf_fi = [i for i in items if 'fhvhv' in i]

    # Titles for the per-fleet runs that are commented out at the bottom.
    _f_title = 'Hourly Stationarity of FHV Taxi: '
    _hf_title = 'Hourly Stationarity of HVFHV Taxi: '

    # dt_non_work from the first call is overwritten by the second, and the
    # value is not used afterwards.
    df_f,dt_non_work = combine_df(input_dir,f_fi)
    df_hf,dt_non_work = combine_df(input_dir,hf_fi)

    df = pd.concat([df_f,df_hf],ignore_index=True)
    # print(df)

    # Release the per-fleet frames once they are merged.
    del df_f
    del df_hf
    gc.collect()

    _title = 'Hourly Stationarity of HVFHV Taxi '
    run(input_dir, df, _title)

    # run(input_dir, f_fi, _f_title)
    # run(input_dir, hf_fi, _hf_title)
    
