#### Purpose: New York Visualization Project
#### Author: Kubam Ivo
#### Date: 1/19/2021

### Importing and preparing dataset

In [1059]:
import os
from pathlib import Path

import pandas as pd

# Data locations. The defaults are the original author's folders; set the
# environment variables below to run the notebook on another machine without
# editing any code.
PROJECT_DIR = Path(os.environ.get(
    "TAXI_PROJECT_DIR",
    "C:\\Users\\ivomb\\OneDrive\\Msc Data Science\\INFOH600-Computing Foundations\\Project",
))
ZONE_LOOKUP_CSV = Path(os.environ.get(
    "TAXI_ZONE_LOOKUP_CSV",
    "C:\\Users\\ivomb\\Downloads\\taxi+_zone_lookup.csv",
))

df_y = pd.read_csv(PROJECT_DIR / "df_yellow.csv")  # yellow-taxi trips
df_g = pd.read_csv(PROJECT_DIR / "df_green.csv")   # green-taxi trips
df_loc = pd.read_csv(ZONE_LOOKUP_CSV)              # LocationID -> Borough / Zone lookup

In [865]:
# Filtering needed columns.
# .copy() makes each result an independent frame, so the column assignments
# made in later cells do not trigger pandas' SettingWithCopyWarning.
df_y = df_y[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance', 'pulocationid',
             'dolocationid', 'tip_amount', 'total_amount', 'trip_month']].copy()

df_g = df_g[['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'trip_distance', 'pulocationid',
             'dolocationid', 'tip_amount', 'total_amount', 'trip_month']].copy()

In [866]:
import warnings
warnings.filterwarnings('ignore')

In [867]:
# Parse the pickup / dropoff timestamp columns of both frames as datetimes
for frame, stamp_cols in (
    (df_y, ['tpep_pickup_datetime', 'tpep_dropoff_datetime']),
    (df_g, ['lpep_pickup_datetime', 'lpep_dropoff_datetime']),
):
    for stamp_col in stamp_cols:
        frame[stamp_col] = pd.to_datetime(frame[stamp_col])

In [868]:
# Extracting new columns
def _add_trip_features(frame, pickup_col, dropoff_col, taxi_type):
    """Add the derived trip columns to ``frame`` in place and return it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Trip records; must contain ``pickup_col``, ``dropoff_col`` (both
        datetime64), 'trip_distance' and 'total_amount'.
    pickup_col, dropoff_col : str
        Names of the pickup and dropoff timestamp columns.
    taxi_type : str
        Constant stored in the new 'taxi_type' column.

    Columns added: 'day' (0 = Monday .. 6 = Sunday), 'day_name', 'month_name',
    'year', 'hour', 'trip_duration', 'uc_mile', 'taxi_type'.
    """
    # Every calendar field is taken from the pickup timestamp so that 'day',
    # 'hour', 'month_name' and 'year' of a row all describe the same instant.
    pickup = frame[pickup_col]
    frame['day'] = pickup.dt.dayofweek
    frame['day_name'] = pickup.dt.day_name()
    frame['month_name'] = pickup.dt.month_name()
    frame['year'] = pickup.dt.year
    frame['hour'] = pickup.dt.hour
    # Duration in whole minutes, stored as float. total_seconds() is used
    # because .astype('timedelta64[m]') is rejected by pandas 2.x, which only
    # supports the s / ms / us / ns timedelta resolutions.
    frame['trip_duration'] = (frame[dropoff_col] - pickup).dt.total_seconds() // 60
    # Each frame is divided by its own 'total_amount' column.
    # NOTE(review): this ratio is distance per unit of amount; if 'uc_mile' is
    # meant to be a cost per mile the operands are inverted - confirm intent.
    frame['uc_mile'] = frame['trip_distance'] / frame['total_amount']
    frame['taxi_type'] = taxi_type
    return frame


df_y = _add_trip_features(df_y, 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'yellow')
df_g = _add_trip_features(df_g, 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'green')

In [869]:
# Give the green-taxi timestamp columns the same names as the yellow-taxi ones
green_to_yellow = {
    'lpep_pickup_datetime': 'tpep_pickup_datetime',
    'lpep_dropoff_datetime': 'tpep_dropoff_datetime',
}
df_g.rename(columns=green_to_yellow, inplace=True)


In [870]:
# concat both datasets
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Both inputs
# were read with the default integer index, so without it their row labels
# overlap and any label-based operation (e.g. DataFrame.drop) aimed at one row
# would also hit the unrelated row that shares its label.
df_concat = pd.concat([df_y, df_g], ignore_index=True)
df_concat.head()


3258745

In [1206]:
# Largest tip in the combined data. Series.max() is vectorised and skips NaN,
# whereas the builtin max() iterates row by row in Python and gives an
# order-dependent answer when NaN values are present.
df_concat['tip_amount'].max()

446.64

In [1039]:
# Preview the first rows of the zone lookup table (LocationID, Borough, Zone, service_zone)
df_loc.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [871]:
# Deleting rows whose dropoff precedes the pickup (negative trip duration).
# A boolean mask is used instead of drop-by-label: df_concat's row labels are
# not guaranteed to be unique, and dropping by label would also remove valid
# rows that happen to share a label with a bad one.
# Rows with a missing (NaN) duration are kept, as before.
df_concat = df_concat[~(df_concat['trip_duration'] < 0)].reset_index(drop=True)

In [1224]:
def taxi_plot(df,stat,taxi,field,start,end):
    """Draw a monthly bar chart and a day-by-hour heatmap of one trip metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips to plot; must contain 'hour', 'day', 'trip_month', 'taxi_type'
        and the column named by ``field``.
    stat : str
        Aggregation name handed to ``groupby().agg()`` (e.g. 'mean', 'median').
    taxi : str
        Taxi label used in the chart title.
    field : str
        Column to aggregate.
    start, end : int
        LocationID values; their zone names are looked up in ``df_loc`` for
        the title.

    Prints the record count and maximum of ``field``, shows the figure and
    returns None.
    """
    # Imported locally: these names are not otherwise in scope for this function.
    import matplotlib.pyplot as plt
    import seaborn as sns

    month_labels = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'] # labels for x-axis
    day_labels = ['Mon','Tue','Wed','Thur','Fri','Sat','Sun'] # labels for y-axis

    # field -> (y-axis label, phrase used in the title)
    field_text = {
        'trip_duration': ('Trip_duration (mins)', 'trip duration'),
        'total_amount': ('Total amount (dollars)', 'total amount'),
        'tip_amount': ('Tip amount (dollars)', 'tip amount'),
        'trip_distance': ('Trip distance (Miles)', 'trip distance'),
    }

    def zone_name(location_id):
        # Explicit lookup of the zone string. Series.any() on a column of
        # strings is not a reliable way to fetch the value: newer pandas
        # returns a plain bool, which cannot be concatenated into the title.
        zones = df_loc.loc[df_loc['LocationID'] == location_id, 'Zone'].dropna()
        if len(zones) == 0:
            return 'location ' + str(location_id)
        return str(zones.iloc[0])

    by_hour_day = df.groupby(['hour','day']).agg(value=(field, stat)).reset_index()
    # Keyword arguments: positional pivot() arguments were removed in pandas 2.
    # Reindexing to all seven weekdays keeps the rows aligned with day_labels
    # even when the selection has no trips on some weekday.
    heat = by_hour_day.pivot(index='day', columns='hour', values='value').reindex(range(7))
    monthly = df.groupby(['trip_month','taxi_type']).agg(value=(field, stat)).reset_index()

    fig, axes = plt.subplots(2, 1, figsize=(15,8))
    # order=1..12 reserves a slot for every month so the twelve tick labels
    # always match the ticks, even if some months are absent from `df`.
    # Assumes 'trip_month' holds calendar month numbers 1-12, as the Jan-Dec
    # labels imply - TODO confirm.
    bar_ax = sns.barplot(x='trip_month', y="value", data=monthly, hue='taxi_type', palette="Set2",
                         order=list(range(1, 13)), ax=axes[0])
    bar_ax.set(xlabel='Month', ylabel=field, xticklabels=month_labels)
    # 'outside' is not a valid Axes legend location; anchor the legend just
    # to the right of the plot area instead.
    bar_ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
    sns.heatmap(heat, annot=True, linewidths=.5, ax=axes[1], yticklabels=day_labels, cmap="YlOrBr")
    print('Records: ', int(df[field].count()), 'Max: ', float(df[field].max()))

    # The title names the metric actually plotted (unknown fields fall back to
    # the raw column name).
    y_label, phrase = field_text.get(field, (field, field))
    bar_ax.set_ylabel(y_label)
    bar_ax.set_xlabel('Month')
    bar_ax.set_title('Monthly ' + stat + ' ' + taxi + ' taxi ' + phrase + ' from ' +
                     zone_name(start) + ' to ' + zone_name(end))

    fig.tight_layout()

    plt.show()
    

#### Auxiliary Functions to create Visuals

In [1225]:
def taxi_viz(start, end,stat,field,taxi='All'):
    '''Plot one metric for the trips matching a route and taxi type.

    Parameters
    ----------
    start, end : int
        Pickup and dropoff LocationID. The id 265 acts as "no filter":
        ``end == 265`` keeps every destination, and ``start == end == 265``
        keeps every trip.
    stat : str
        Aggregation name forwarded to ``taxi_plot``.
    field : str
        Column of ``df_concat`` to plot, forwarded to ``taxi_plot``.
    taxi : str
        'All' for both taxi types, otherwise the 'taxi_type' value to keep.

    Prints a message instead of plotting when no trip matches.
    '''
    import numpy as np

    any_zone = 265
    start, end = int(start), int(end)

    # Positional boolean mask (plain numpy) so that row selection does not
    # depend on df_concat's index labels being unique.
    selected = np.ones(len(df_concat), dtype=bool)
    if end != any_zone:
        # Specific destination: match both ends of the route.
        # NOTE(review): start == 265 is matched literally here, exactly as the
        # previous branch chain did - confirm whether it should mean "any pickup".
        selected &= (df_concat['pulocationid'] == start).to_numpy()
        selected &= (df_concat['dolocationid'] == end).to_numpy()
    elif start != any_zone:
        # Any destination from one pickup zone.
        selected &= (df_concat['pulocationid'] == start).to_numpy()
    # else: both ends are 265 -> no location filter at all.

    if taxi != 'All':
        # Applied in every case, including start == end == 265, which
        # previously fell through all branches and produced no output.
        selected &= (df_concat['taxi_type'] == taxi).to_numpy()

    # Avoid copying the whole frame when nothing was filtered out.
    route_df = df_concat if selected.all() else df_concat.loc[selected]

    if route_df.shape[0] == 0:
        print('No record to display for this route')
    else:
        taxi_plot(route_df, stat, taxi, field, start, end)
                   
        

In [1226]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# The option lists are built in this cell so that it runs on a fresh kernel;
# they are not available as globals from anywhere else in the notebook.
ALL_LOCATIONS_ID = 265  # id that taxi_viz treats as "no location filter"

stat_list = [('mean','mean'),('median','median')]
field_list = [('Duration trip','trip_duration'),('Trip Amount','total_amount'),('Tip Amount','tip_amount'),
              ('Trip distance','trip_distance')]

# (zone name, LocationID) pairs for every named zone, preceded by an explicit
# 'All' entry for the no-filter id. Rows without a zone name are skipped.
named_zones = df_loc.dropna(subset=['Zone'])
loc_list = [('All', ALL_LOCATIONS_ID)] + [
    (zone, int(loc_id))
    for zone, loc_id in zip(named_zones['Zone'], named_zones['LocationID'])
    if int(loc_id) != ALL_LOCATIONS_ID
]
loc_list1 = list(loc_list)  # the End dropdown offers the same zones

# A Dropdown rejects a value that is not among its options, so only default
# the Start zone to 132 when the lookup table actually contains it.
default_start = 132 if any(loc_id == 132 for _, loc_id in loc_list) else ALL_LOCATIONS_ID

a = widgets.Dropdown(description='Start',options=loc_list,value=default_start)
b = widgets.Dropdown(description='End',options=loc_list1,value=ALL_LOCATIONS_ID)
c = widgets.RadioButtons(description='Stat',options=stat_list, value='median')
d = widgets.Select(description='Field',options=field_list, value='trip_duration')
e = widgets.RadioButtons(description='Taxi',options=['All','green','yellow'], value='All')
left_box = widgets.VBox([a, b])
right_box = widgets.VBox([e,c])
right_box1 = widgets.VBox([d])
ui = widgets.HBox([left_box,right_box,right_box1])
# Re-run taxi_viz with the current widget values whenever one of them changes.
out = widgets.interactive_output(taxi_viz,{'start':a,'end':b,'taxi':e, 'stat':c, 'field':d})
display(ui, out)

HBox(children=(VBox(children=(Dropdown(description='Start', index=128, options=(('Newark Airport', 1), ('Jamai…

Output()