### Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
from folium import plugins
from geopy.distance import geodesic
import sklearn
from sklearn.manifold import TSNE
import datetime  
from datetime import date 

#### 1. Importing data

In [None]:
# Load the training and test splits from CSV files in the working directory.
taxi_data_train = pd.read_csv('train.csv')
taxi_data_test = pd.read_csv('test.csv')


# Preview the first rows of the training split.
taxi_data_train.head()

In [None]:
# Preview the first rows of the test split.
taxi_data_test.head()

We don't need the ```id```, ```vendor_id```, and ```store_and_fwd_flag``` columns, so we drop them.

In [None]:
# Remove the three unused columns from both splits; each frame is modified in place.
for frame in (taxi_data_train, taxi_data_test):
    frame.drop(columns=['id', 'vendor_id', 'store_and_fwd_flag'], inplace=True)

# Preview the training split after the columns are removed.
taxi_data_train.head()

In [None]:
# Print the column dtypes and non-null counts of the training split.
taxi_data_train.info()

In [None]:
# Print the column dtypes and non-null counts of the test split.
taxi_data_test.info()

There are no missing values in the dataset, so no data imputation is needed.

#### 2. Visualization

#### Where do most dropoffs occur?

In [None]:
# Heat map of every drop-off coordinate in the training split, drawn on a map
# centred at [40.7128, -74.0060].
NYC_map_dropoff = folium.Map(location=[40.7128, -74.0060])

# HeatMap receives the points as rows of (latitude, longitude).
heatarr = taxi_data_train[['dropoff_latitude', 'dropoff_longitude']].values
plugins.HeatMap(heatarr, radius=15).add_to(NYC_map_dropoff)
NYC_map_dropoff.add_child(folium.LayerControl())

# Leaving the map as the last expression makes the notebook render it.
NYC_map_dropoff

The two hotspots are located near the intersection of West 33rd Street and 7th Ave, and around the intersection of 31st Street and 8th Ave. These locations border Pennsylvania Station.

#### Where do most pickups occur?

In [None]:
# Heat map of every pick-up coordinate in the training split, drawn on a map
# centred at [40.7128, -74.0060].
NYC_map_pickup = folium.Map(location=[40.7128, -74.0060])

# HeatMap receives the points as rows of (latitude, longitude).
heatarr = taxi_data_train[['pickup_latitude', 'pickup_longitude']].values
plugins.HeatMap(heatarr, radius=15).add_to(NYC_map_pickup)
NYC_map_pickup.add_child(folium.LayerControl())

# Leaving the map as the last expression makes the notebook render it.
NYC_map_pickup

Pickups are also abundant at Pennsylvania Station, which serves more than 650,000 daily commuters [1].

***A lot of commuters who ride yellow taxis in New York hail and alight at Pennsylvania Station. This insight may be useful for policy makers managing traffic around the area to ensure a smooth flow of traffic.***

[1] New York, NY - Moynihan Train Hall At Penn Station https://www.greatamericanstations.com/stations/new-york-penn-station-ny-nyp/

#### What is the busiest time for pick up and where is the busiest place?

In [None]:
# Work on a copy so the loaded training frame itself is left untouched.
df = taxi_data_train.copy()

# Take characters 11-12 of each pick-up timestamp string as the hour.
# NOTE(review): assumes the text looks like 'YYYY-MM-DD HH:MM:SS' — confirm.
# The column is addressed by name and sliced with a vectorised string
# operation; the previous row-by-row loop indexed each row by position, which
# silently depends on column order and is slow on a large frame.
pickup_hours = df['pickup_datetime'].str[11:13].astype(int).tolist()

df['Pickup Hour'] = pickup_hours
df.head()

In [None]:
# Distinct pick-up hours (sorted ascending by np.unique) and the number of trips in each.
values, counts = np.unique(pickup_hours, return_counts=True)

In [None]:
# Draw the hourly counts twice on the same axes: blue circle markers ('bo')
# for the individual hours and a red line ('r') connecting them.
plt.plot(values, counts, 'bo', values, counts, 'r')
plt.xlabel('Hours')
plt.ylabel('Count')
plt.title('Number of Pickups in each Hour')

Many passengers hail yellow taxis between 18:00 and 19:00, the time when many are leaving for home. There is also a significant drop in activity starting at 0:00, reaching its lowest point at 5:00. Activity starts to increase rapidly after 5:00. This may be the time when people are heading to their respective workplaces and schools.

In [None]:
# Keep only trips picked up during hours 18 and 19 (both bounds inclusive) and
# draw a heat map of their pick-up coordinates.
in_peak_hours = df['Pickup Hour'].between(18, 19)
pickup_hours_peak = df[in_peak_hours]

NYC_map_pickup_hours = folium.Map(location=[40.7128, -74.0060])

# HeatMap receives the points as rows of (latitude, longitude).
heatarr = pickup_hours_peak[['pickup_latitude', 'pickup_longitude']].values
plugins.HeatMap(heatarr, radius=15).add_to(NYC_map_pickup_hours)
NYC_map_pickup_hours.add_child(folium.LayerControl())
NYC_map_pickup_hours

The busiest places for pickup are Pennsylvania Station, LaGuardia Airport, and John F. Kennedy Airport. People who are leaving work are probably heading to these transport hubs to go home.

#### What is the busiest hour for drop off and where is it?

In [None]:
# Take characters 11-12 of each drop-off timestamp string as the hour.
# NOTE(review): assumes the text looks like 'YYYY-MM-DD HH:MM:SS' — confirm.
# The column is addressed by name and sliced with a vectorised string
# operation; the previous row-by-row loop indexed each row by position, which
# silently depends on column order and is slow on a large frame.
dropoff_hours = df['dropoff_datetime'].str[11:13].astype(int).tolist()

df['Dropoff Hour'] = dropoff_hours
df.head()

In [None]:
# Distinct drop-off hours (sorted ascending by np.unique) and the number of trips in each.
values, counts = np.unique(dropoff_hours, return_counts=True)

In [None]:
# Draw the hourly counts twice on the same axes: blue circle markers ('bo')
# for the individual hours and a red line ('r') connecting them.
plt.plot(values, counts, 'bo', values, counts, 'r')
plt.xlabel('Hours')
plt.ylabel('Count')
plt.title('Number of Dropoffs in each Hour')

Similar to pickups, the peak hour for dropoffs is 19:00. However, unlike pickups, 18:00 does not match 19:00 in level of activity. This is probably due to the travel time of trips starting at 18:00.

In [None]:
# The analysis identifies 19:00 as the peak drop-off hour, so keep only trips
# dropped off during hour 19. A `>= 19` comparison would also pull in every
# later hour (20-23), and the map would no longer show the peak hour alone.
dropoff_hours_peak = df[df['Dropoff Hour'] == 19]

NYC_map_dropoff_hours = folium.Map(location=[40.7128, -74.0060])

# HeatMap receives the points as rows of (latitude, longitude).
heatarr = dropoff_hours_peak[['dropoff_latitude', 'dropoff_longitude']].values
NYC_map_dropoff_hours.add_child(plugins.HeatMap(heatarr, radius=15))
folium.LayerControl().add_to(NYC_map_dropoff_hours)

# Leaving the map as the last expression makes the notebook render it.
NYC_map_dropoff_hours

The busiest places for dropoff are still the train station and airports mentioned above. This makes sense, since it is highly likely that many people are leaving work between 18:00 and 19:00 and heading to transportation hubs to go home.

#### When is the busiest day?

In [None]:
def get_day(date_str):
    """Return the weekday name (e.g. ``'Monday'``) for a date string.

    Args:
        date_str: Text whose first whitespace-separated token is a date
            written as ``year-month-day``. Anything after that token, such
            as a time of day, is ignored, so a date-only string also works.

    Returns:
        The full weekday name produced by ``strftime('%A')``.

    Raises:
        ValueError: If the string is empty or blank, or the first token is
            not three ``-``-separated integers forming a valid date.
    """
    # Only the leading date token matters; splitting once tolerates both a
    # missing time part and extra trailing tokens.
    tokens = date_str.split(maxsplit=1)
    if not tokens:
        raise ValueError(f"empty date string: {date_str!r}")

    year, month, day = (int(part) for part in tokens[0].split('-'))
    return datetime.date(year, month, day).strftime('%A')

In [None]:
# Derive the weekday name of each pick-up. Only the pick-up timestamp column is
# needed, so apply get_day to that column directly instead of building a full
# row object for every trip with a row-wise apply.
df['Day'] = df['pickup_datetime'].apply(get_day)
df.head()

In [None]:
# Count trips per weekday name, then order the (name, count) pairs from Monday
# to Sunday; np.unique on its own returns the names in sorted (alphabetical) order.
day_names, day_counts = np.unique(df['Day'], return_counts=True)

# Lookup from weekday name to its position in the week, used as the sort key.
days = dict(zip(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    range(7),
))

sorted_ = sorted(zip(day_names, day_counts), key=lambda pair: days[pair[0]])
values = [pair[0] for pair in sorted_]
counts = [pair[1] for pair in sorted_]
sorted_

In [None]:
# Set the default figure size to 10x5; this rcParams change is global and so
# also applies to figures created after this cell.
plt.rcParams["figure.figsize"] = (10,5)
# Bar chart of trip counts, one bar per entry in `values`.
plt.bar(values, counts)
plt.xlabel('Day')
plt.ylabel('Count')
plt.title('Number of Pickups in each Day')

The number of pickups steadily increases as the week progresses and drops on Sunday.

In [None]:
# Keep only trips whose pick-up day is Friday and draw a heat map of their
# pick-up coordinates.
is_friday = df['Day'] == 'Friday'
pickup_day_peak = df[is_friday]

NYC_map_pickup_day = folium.Map(location=[40.7128, -74.0060])

# HeatMap receives the points as rows of (latitude, longitude).
heatarr = pickup_day_peak[['pickup_latitude', 'pickup_longitude']].values
plugins.HeatMap(heatarr, radius=15).add_to(NYC_map_pickup_day)
NYC_map_pickup_day.add_child(folium.LayerControl())
NYC_map_pickup_day

#### When is the busiest month?

In [None]:
def get_month(date_str):
    """Return the month name (e.g. ``'March'``) for a date string.

    Args:
        date_str: Text whose first whitespace-separated token is a date
            written as ``year-month-day``. Anything after that token, such
            as a time of day, is ignored, so a date-only string also works.

    Returns:
        The full month name produced by ``strftime('%B')``.

    Raises:
        ValueError: If the string is empty or blank, or the first token is
            not three ``-``-separated integers forming a valid date.
    """
    # Only the leading date token matters; splitting once tolerates both a
    # missing time part and extra trailing tokens.
    tokens = date_str.split(maxsplit=1)
    if not tokens:
        raise ValueError(f"empty date string: {date_str!r}")

    year, month, day = (int(part) for part in tokens[0].split('-'))
    return datetime.date(year, month, day).strftime('%B')

In [None]:
# Derive the month name of each pick-up. Only the pick-up timestamp column is
# needed, so apply get_month to that column directly instead of building a full
# row object for every trip with a row-wise apply.
df['Month'] = df['pickup_datetime'].apply(get_month)
df.head()

In [None]:
# Count trips per pick-up month, then order the (name, count) pairs by calendar
# position; np.unique on its own returns the names in sorted (alphabetical) order.
month_names, month_counts = np.unique(df['Month'], return_counts=True)

# Lookup from month name to its position in the year, used as the sort key.
months = dict(zip(
    ['January', 'February', 'March', 'April', 'May', 'June', 'July',
     'August', 'September', 'October', 'November', 'December'],
    range(12),
))

sorted_ = sorted(zip(month_names, month_counts), key=lambda pair: months[pair[0]])
values = [pair[0] for pair in sorted_]
counts = [pair[1] for pair in sorted_]

# Bar chart of the ordered counts; the rcParams figure size is a global setting.
plt.rcParams["figure.figsize"] = (10,5)
plt.bar(values, counts)
plt.xlabel('Month')
plt.ylabel('Count')
plt.title('Number of Pickups in each Month')

In [None]:
# Save the enriched frame (with the hour, day and month columns added above) to
# 'df.csv' in the working directory. With default arguments the index is written
# as the first column.
df.to_csv('df.csv')

#### Adding distance feature

In [None]:
def get_distance(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    """Return the geodesic distance in kilometres between two coordinates.

    Each point is handed to geopy's ``geodesic`` as a (latitude, longitude)
    pair, and the result is read from its ``.km`` attribute.
    """
    pickup_point = (pickup_lat, pickup_long)
    dropoff_point = (dropoff_lat, dropoff_long)
    return geodesic(pickup_point, dropoff_point).km

In [None]:
def _trip_distance(trip):
    """Return get_distance for the pick-up and drop-off coordinates of one row."""
    return get_distance(
        trip.pickup_latitude, trip.pickup_longitude,
        trip.dropoff_latitude, trip.dropoff_longitude,
    )


# Row-wise apply: each trip needs four of its own columns to compute a distance.
df['distance'] = df.apply(_trip_distance, axis=1)

In [None]:
# Preview the frame with the new 'distance' column.
df.head()

#### Visualize High Dimensional Data