#### Purpose: New York Visualization Project
#### Author: Kubam Ivo
#### Date: 1/19/2021

### Importing and preparing dataset

In [49]:
#importing necessary libraries
import pandas as pd
import numpy as np
from sort_dataframeby_monthorweek import Sort_Dataframeby_Month
import matplotlib.pyplot as plt
import seaborn as sns

In [99]:

# Input files: yellow-trip extract, green-trip extract and the taxi zone lookup table.
# NOTE(review): these are absolute paths on one machine; the notebook will not run elsewhere as-is.
yellow_trips_csv = "C:\\Users\\ivomb\\OneDrive\\Msc Data Science\\INFOH600-Computing Foundations\\Project\\df_yellow.csv"
green_trips_csv = "C:\\Users\\ivomb\\OneDrive\\Msc Data Science\\INFOH600-Computing Foundations\\Project\\df_green.csv"
zone_lookup_csv = "C:\\Users\\ivomb\\Downloads\\taxi+_zone_lookup.csv"

# Read the three files, in the order listed, with pandas' default CSV options.
df_y, df_g, df_loc = (
    pd.read_csv(csv_path)
    for csv_path in (yellow_trips_csv, green_trips_csv, zone_lookup_csv)
)

In [100]:
# Keep a random 50% sample of the yellow trips; the green trips are used in full.
# The fixed random_state makes the sample (and every figure derived from it)
# reproducible across kernel restarts.
df_y = df_y.sample(frac=0.5, random_state=42)


In [101]:
# Keep only the columns the dashboard needs.
# The two taxi types differ only in the prefix of their pickup/dropoff columns.
trip_columns = ['trip_distance', 'pulocationid', 'dolocationid',
                'tip_amount', 'total_amount', 'trip_month']

# .copy() gives each frame its own data so that the columns added in later
# cells do not trigger pandas' SettingWithCopyWarning.
df_y = df_y[['tpep_pickup_datetime', 'tpep_dropoff_datetime'] + trip_columns].copy()

df_g = df_g[['lpep_pickup_datetime', 'lpep_dropoff_datetime'] + trip_columns].copy()

In [102]:
import warnings
warnings.filterwarnings('ignore')

In [103]:
# Make sure the pickup and dropoff columns of both frames hold datetimes.
for frame, datetime_columns in (
    (df_y, ('tpep_pickup_datetime', 'tpep_dropoff_datetime')),
    (df_g, ('lpep_pickup_datetime', 'lpep_dropoff_datetime')),
):
    for column in datetime_columns:
        frame[column] = pd.to_datetime(frame[column])

In [104]:
def _add_trip_features(trips, pickup_col, dropoff_col, taxi_type):
    '''Add the derived dashboard columns to ``trips`` in place.

    Parameters
    ----------
    trips : DataFrame holding ``pickup_col``, ``dropoff_col`` (both datetime),
        ``trip_distance`` and ``total_amount``.
    pickup_col, dropoff_col : names of the pickup / dropoff datetime columns.
    taxi_type : label stored in the new ``taxi_type`` column.

    All calendar fields are taken from the pickup time so that both taxi
    types are bucketed the same way.
    '''
    pickup = trips[pickup_col]

    trips['day'] = pickup.dt.dayofweek          # 0 = Monday ... 6 = Sunday
    trips['day_name'] = pickup.dt.day_name()
    trips['month_name'] = pickup.dt.month_name()
    trips['year'] = pickup.dt.year
    trips['hour'] = pickup.dt.hour

    # Whole minutes between pickup and dropoff (floored). Negative when the
    # dropoff precedes the pickup. Avoids astype('timedelta64[m]'), which
    # recent pandas versions reject.
    trips['trip_duration'] = (trips[dropoff_col] - pickup).dt.total_seconds() // 60

    # Cost per mile in dollars: amount divided by distance. Trips without a
    # positive distance get NaN instead of an infinite or negative rate.
    distance = trips['trip_distance']
    trips['uc_mile'] = trips['total_amount'] / distance.where(distance > 0)

    trips['taxi_type'] = taxi_type


# Extracting new columns (same logic for both taxi types)
_add_trip_features(df_y, 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'yellow')

_add_trip_features(df_g, 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'green')

In [105]:
# Give the green frame the same pickup/dropoff column names as the yellow frame.
green_to_yellow_columns = {
    'lpep_dropoff_datetime': 'tpep_dropoff_datetime',
    'lpep_pickup_datetime': 'tpep_pickup_datetime',
}
df_g.rename(columns=green_to_yellow_columns, inplace=True)


In [106]:
# Stack the yellow and green trips into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the two
# inputs contribute overlapping labels and any label-based operation (drop,
# .loc[label]) would hit rows from both taxi types.
df_concat = pd.concat([df_y, df_g], ignore_index=True)

In [107]:
# Remove trips whose dropoff time is before the pickup time (negative duration).
# A boolean mask is used instead of drop(index=...): df_concat may carry
# duplicate index labels from the two source frames, and a label-based drop
# would then also delete valid rows that share a label with an invalid one.
# Rows with a missing duration are kept, as before.
negative_duration = df_concat['trip_duration'] < 0
df_concat = df_concat.loc[~negative_duration].reset_index(drop=True)

In [137]:
def _month_label(month, labels):
    '''Return the short month name for a 1-12 month number, else ``str(month)``.'''
    try:
        number = int(month)
    except (TypeError, ValueError):
        return str(month)
    return labels[number - 1] if 1 <= number <= len(labels) else str(month)


def _zone_label(location_id):
    '''Return the zone name stored in ``df_loc`` for ``location_id``.

    Falls back to 'ALL' for id 265 (the id taxi_viz registers under the
    'ALL' label) and to the id itself when the lookup has no usable name.
    '''
    names = df_loc.loc[df_loc['LocationID'] == location_id, 'Zone'].dropna()
    if len(names):
        return str(names.iloc[0])
    return 'ALL' if location_id == 265 else str(location_id)


def taxi_plot(df,stat,taxi,field,start,end):
    '''Print summary figures and draw the three-panel dashboard for ``df``.

    Parameters
    ----------
    df : trips to plot; must contain ``hour``, ``day``, ``trip_month``,
        ``taxi_type``, ``uc_mile`` and the column named by ``field``.
    stat : aggregation passed to ``groupby().agg`` (e.g. 'mean', 'median').
    taxi : taxi label used in the panel titles.
    field : column being summarised.
    start, end : pickup / dropoff LocationIDs, used for the figure title.

    Panels: monthly bar chart per taxi type (top left), distribution of
    ``field`` (top right), weekday x hour heatmap (middle row).
    '''
    month_labels = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'] # labels for x-axis
    day_labels = ['Mon','Tue','Wed','Thur','Fri','Sat','Sun'] # labels for y-axis
    ylabel_by_field = {
        'trip_duration': 'Trip_duration (mins)',
        'total_amount': 'Total amount (dollars)',
        'tip_amount': 'Tip amount (dollars)',
        'trip_distance': 'Trip distance (Miles)',
    }

    # Nothing to aggregate or draw; the plotting calls below fail on empty data.
    if df.empty:
        print('No record to display for this route')
        return

    by_hour_day = df.groupby(['hour','day']).agg(value=(field, stat)).reset_index()
    # Keyword arguments: positional pivot arguments are not accepted by recent pandas.
    # Reindexing to all seven weekdays keeps the fixed day labels aligned even
    # when a route has no trips on some weekday (those rows are left blank).
    heat = by_hour_day.pivot(index='day', columns='hour', values='value').reindex(range(7))

    by_month = df.groupby(['trip_month','taxi_type']).agg(value=(field, stat)).reset_index()
    # Label only the months that are present, so labels and bars always match.
    # assumes trip_month holds month numbers 1-12 -- TODO confirm; other values
    # are shown as-is.
    months = sorted(by_month['trip_month'].unique())

    print('Number of Trips: ', f'{df[field].count():,}')
    print('Unit cost per Mile: ', '${:,.2f}'.format(df['uc_mile'].median()))

    fig = plt.figure(figsize=(15, 12))

    ax_month = plt.subplot2grid((3, 3), (0, 0), colspan=2)
    sns.barplot(x='trip_month', y="value", data=by_month, hue='taxi_type',
                palette="Paired", order=months, ax=ax_month)
    ax_month.set_xticks(range(len(months)))
    ax_month.set_xticklabels([_month_label(month, month_labels) for month in months])
    ax_month.set_xlabel('')
    ax_month.set_ylabel(ylabel_by_field.get(field, field))
    # 'outside' is not a valid Axes legend location; let matplotlib pick a spot.
    ax_month.legend(loc='best')

    ax_heat = plt.subplot2grid((3, 3), (1, 0), colspan=3)
    sns.heatmap(heat, annot=True, linewidths=.5, yticklabels=day_labels,
                cmap="YlOrBr", ax=ax_heat)

    ax_dist = plt.subplot2grid((3, 3), (0, 2), rowspan=1)
    sns.violinplot(x=field, palette="Paired", data=df, ax=ax_dist)

    ax_month.set_title('Monthly '+stat+ ' '+ field +' for '+ taxi + ' taxi '  )
    ax_heat.set_title('Daily Heatmap for '+stat+ ' ' + field + ' for ' + taxi +  ' taxi ')
    ax_dist.set_title('Distribution for '+ field + ' for ' + taxi + ' taxi ')

    fig.suptitle('Visualization Dashboard  from ' + _zone_label(start) + \
                             ' to ' + _zone_label(end) + ' Zone', fontsize=16)

    plt.show()
    

In [134]:
# (label, value) pairs offered by the statistic and field selectors.
stat_list = [('mean','mean'),('median','median')]
field_list = [('Trip duration','trip_duration'),('Trip Amount','total_amount'),('Tips Amount','tip_amount'),\
                  ('Trip distance','trip_distance')]

# Zone name -> LocationID for the Start/End dropdowns.
# 'ALL' -> 265 is the wildcard entry that taxi_viz also registers; rows whose
# zone name is missing are skipped so the dropdown has no 'nan' entry.
# (Named zone_to_id rather than `dict` to avoid shadowing the builtin.)
zone_to_id = {'ALL': 265}
for zone, location_id in zip(df_loc['Zone'], df_loc['LocationID']):
    if pd.notna(zone):
        zone_to_id[zone] = location_id

loc_list = list(zone_to_id.items())


#### Auxiliary Functions to create Visuals

In [110]:
def taxi_viz(start, end,stat,field,taxi,borough,df=df_concat,df_zone = df_loc):
    '''Filter the trips for the selected route and taxi type, then plot them.

    Parameters
    ----------
    start, end : pickup / dropoff LocationID; 265 means "all locations".
    stat : aggregation forwarded to taxi_plot (e.g. 'mean', 'median').
    field : column forwarded to taxi_plot.
    taxi : 'All', or a value of the ``taxi_type`` column ('green', 'yellow').
    borough : iterable of borough names (a single name is also accepted);
        only trips that start and end in these boroughs are considered.
    df : trips frame with ``pulocationid``, ``dolocationid``, ``taxi_type``.
    df_zone : zone lookup with ``Borough`` and ``LocationID`` columns.

    Prints 'No record to display for this route' instead of plotting when the
    selection is empty.
    '''
    all_locations = 265

    # A bare string would otherwise be split into single characters by list().
    if isinstance(borough, str):
        borough = [borough]

    # Collect LocationIDs directly. Keying by zone name would silently drop
    # ids whenever two locations share a name.
    in_borough = df_zone['Borough'].isin(list(borough))
    location_ids = set(df_zone.loc[in_borough, 'LocationID'])
    location_ids.add(all_locations)
    location_ids = list(location_ids)

    # Positional boolean arrays are used so the filter does not depend on the
    # frame's index labels being unique.
    selected = (df['pulocationid'].isin(location_ids)
                & df['dolocationid'].isin(location_ids)).to_numpy()

    # Each criterion is applied independently, so "all" on one side of the
    # route combines correctly with a specific zone on the other side.
    if start != all_locations:
        selected = selected & (df['pulocationid'] == int(start)).to_numpy()
    if end != all_locations:
        selected = selected & (df['dolocationid'] == int(end)).to_numpy()
    if taxi != 'All':
        selected = selected & (df['taxi_type'] == taxi).to_numpy()

    trips = df.loc[selected]
    if trips.shape[0] == 0:
        print('No record to display for this route')
        return

    taxi_plot(trips,stat,taxi,field,start,end)
                   
        

In [138]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# Route selectors; 265 is the "all locations" entry.
a = widgets.Dropdown(description='Start',options=loc_list,value=265)
b = widgets.Dropdown(description='End',options=loc_list,value=265)
# Statistic, field and taxi-type selectors.
c = widgets.RadioButtons(description='Stat',options=stat_list, value='median')
d = widgets.Select(description='Field',options=field_list, value='trip_duration')
e = widgets.RadioButtons(description='Taxi',options=['All','green','yellow'], value='All')
# Borough filter. Brooklyn was missing from the list, so its trips could never be selected.
# NOTE(review): option names are assumed to match the Borough values in df_loc -- confirm.
f = widgets.SelectMultiple(description='Borough',options=['Bronx','Brooklyn','EWR','Manhattan','Queens','Staten Island'], value=['Manhattan'])

# Layout: one accordion section for the route, one for the statistics.
left_box = widgets.HBox([f, a,b])
right_box = widgets.HBox([e,c,d])
accordion = widgets.Accordion(children=[left_box,right_box])
accordion.set_title(0, 'Location')
accordion.set_title(1, 'Statistics')
# Re-run taxi_viz whenever a widget value changes and render its output below the controls.
out = widgets.interactive_output(taxi_viz,{'start':a,'end':b,'taxi':e, 'stat':c, 'field':d,'borough':f})
display(accordion, out)

Accordion(children=(HBox(children=(SelectMultiple(description='Borough', index=(2,), options=('Bronx', 'EWR', …

Output()