### PREPARATION

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('yellow_tripdata_2018-01.csv')

In [3]:
zone = pd.read_csv('taxi _zone_lookup.csv')

In [4]:
zone = zone[['LocationID', 'Borough']].set_index('LocationID')

In [5]:
df = df.join(zone, on = 'DOLocationID', how = 'left')

In [6]:
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'], format = '%Y%m%d %H:%M:%S')

In [7]:
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], format = '%Y%m%d %H:%M:%S')

In [8]:
y = pd.to_datetime('2018')
y = y.year
m = pd.to_datetime('01', format='%m')
m = m.month

In [9]:
df = df.loc[(df['tpep_dropoff_datetime'].dt.year == y) & df['tpep_dropoff_datetime'].dt.month == m]

In [10]:
df = df.loc[(df['tpep_pickup_datetime'].dt.year == y) & df['tpep_pickup_datetime'].dt.month == m]

In [11]:
df = df.sort_values('tpep_pickup_datetime')

In [12]:
df = df.reset_index(drop=True)

In [13]:
df.drop(df.index[list(range(8757432, 8757438))], inplace = True)

# RESEARCH QUESTION 2

What are the time slots with more passengers? Set your own time slots and discover which are those when Taxis drive the highest number of passengers overall New York and repeat the analysis for each borough. Provide the results through a visualization and comment them.

We have decided to take into account four time-slots:
1. 06:01 to 12:00
2. 12:01 to 18:00
3. 18:01 to 00:00
4. 00:01 to 06:00

In [173]:
jan = df.copy()

In [175]:
#get rid of the date

jan['tpep_pickup_datetime'] = jan['tpep_pickup_datetime'].dt.time
jan['tpep_dropoff_datetime'] = jan['tpep_dropoff_datetime'].dt.time

In [176]:
#keep just needed columns

jan = jan[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'Borough']].copy()

In [177]:
#We created a series containig our times and converted into datetime.time

import datetime
times = pd.Series(['06:00:00', '12:00:00', '18:00:00', '00:00:00'])
times = pd.to_datetime(times, format = '%H:%M:%S')
times = times.dt.time

In [179]:
#For every slot we create a dataframe from the original one (jan) that contains only the taxi rides
#within that slot, and count the number of passengers.

slot_1 = jan.loc[jan['tpep_pickup_datetime'] >= times[0]]
slot_1 = jan.loc[jan['tpep_dropoff_datetime'] < times[1]]
slot_1 = slot_1['passenger_count'].sum()

slot_2 = jan.loc[jan['tpep_pickup_datetime'] >= times[1]]
slot_2 = jan.loc[jan['tpep_dropoff_datetime'] < times[2]]
slot_2 = slot_2['passenger_count'].sum()

slot_3 = jan.loc[jan['tpep_pickup_datetime'] >= times[2]]
slot_3 = jan.loc[jan['tpep_dropoff_datetime'] < times[3]]
slot_3 = slot_3['passenger_count'].sum()

slot_4 = jan.loc[jan['tpep_pickup_datetime'] >= times[3]]
slot_4 = jan.loc[jan['tpep_dropoff_datetime'] < times[0]]
slot_4 = slot_4['passenger_count'].sum()

In [184]:
#we create a series containing the total number of passengers for each slot, and then we find 
#the index of the maximum value.

people = pd.Series([slot_1, slot_2, slot_3, slot_4], 
                   index = ['06:00-12:00', '12:00-18:00', '18:00-00:00', '00:00-06:00'])
answer = people.idxmax()
print('Time slot with the highest number of passengers overall New York is', answer+'.')

Time slot with the highest number of passengers overall New York is 12:00-18:00.


### Now we repeat the analysis for each borough:

In [273]:
#Get names of boroughs in alphabetical order

boroughs = jan['Borough'].unique()
boroughs.sort()

In [274]:
#Use the previous code:

def passenger_peak(jan, str_):

    slot_1 = jan.loc[jan['tpep_pickup_datetime'] >= times[0]]
    slot_1 = jan.loc[jan['tpep_dropoff_datetime'] < times[1]]
    slot_1 = slot_1['passenger_count'].sum()

    slot_2 = jan.loc[jan['tpep_pickup_datetime'] >= times[1]]
    slot_2 = jan.loc[jan['tpep_dropoff_datetime'] < times[2]]
    slot_2 = slot_2['passenger_count'].sum()

    slot_3 = jan.loc[jan['tpep_pickup_datetime'] >= times[2]]
    slot_3 = jan.loc[jan['tpep_dropoff_datetime'] < times[3]]
    slot_3 = slot_3['passenger_count'].sum()

    slot_4 = jan.loc[jan['tpep_pickup_datetime'] >= times[3]]
    slot_4 = jan.loc[jan['tpep_dropoff_datetime'] < times[0]]
    slot_4 = slot_4['passenger_count'].sum()
    
    people = pd.Series([slot_1, slot_2, slot_3, slot_4], 
                   index = ['06:00-12:00', '12:00-18:00', '18:00-00:00', '00:00-06:00'])
    answer = people.idxmax()

    print('The time slot with the highest number of passengers in', str_, 'is', answer+'.')

In [275]:
for i in range(len(boroughs)):
    if boroughs[i] == 'Unknown':
        pass
    else:
        borough = jan.loc[jan['Borough'] == boroughs[i]]
        passenger_peak(borough, boroughs[i])

The time slot with the highest number of passengers in Bronx is 12:00-18:00.
The time slot with the highest number of passengers in Brooklyn is 12:00-18:00.
The time slot with the highest number of passengers in EWR is 12:00-18:00.
The time slot with the highest number of passengers in Manhattan is 12:00-18:00.
The time slot with the highest number of passengers in Queens is 12:00-18:00.
The time slot with the highest number of passengers in Staten Island is 12:00-18:00.


# RESEARCH QUESTION 3

Do the all trips last the same? Let's put our attention on the distribution of trip's duration. Provide a plot for it and comment what you see. Run this analysis for NYC and for each borough (and obviously comment the results!).

In [14]:
j = df.copy()

In [24]:
j = j[['tpep_pickup_datetime', 'tpep_dropoff_datetime']]
dropoff = j['tpep_dropoff_datetime']
pickup = j['tpep_pickup_datetime']
delta = (dropoff-pickup).astype('timedelta64[m]')
n_rides = pd.DataFrame(delta.value_counts(), index=delta)

In [25]:
n_rides = n_rides[n_rides.index >= 0]
dis = n_rides[n_rides.index <= 60]
dis = dis.sort_index()

In [27]:
dis

Unnamed: 0,0
0.0,73933
0.0,73933
0.0,73933
0.0,73933
0.0,73933
0.0,73933
0.0,73933
0.0,73933
0.0,73933
0.0,73933


In [None]:
dis.plot.bar()