In [2]:
# Silence every warning (the filter below is global, not limited to specific pandas warnings)
import warnings
warnings.filterwarnings("ignore")

# Import library
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from datetime import datetime
import time

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.externals.joblib import parallel_backend

import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

import time
from tqdm import tqdm_notebook as tqdm



# Seed value for randomized steps, defined once so runs can be reproduced.
# NOTE(review): not referenced in the cells shown here; confirm it is passed to any stochastic routine.
seed = 0


## Helper Functions

In [3]:
def taxi_data_preprocess(taxi_data, max_trip_distance = 30, max_income_per_second = 0.06,
                         max_tip_amount = 16, max_total_amount = 77):
    """Clean a raw yellow-taxi trip table and derive per-trip features.

    The input frame is NOT modified; all work happens on a new frame.

    Steps, in order:
      1. Cast the ID-like columns (VendorID, RatecodeID, PULocationID,
         DOLocationID, payment_type) to ``object`` and parse the pickup and
         dropoff columns as datetimes.
      2. Add ``duration`` (dropoff - pickup) and ``time``, a label derived
         from the pickup hour: Morning [6, 11), Noon [11, 14),
         Afternoon [14, 18), Night [18, 24), Latenight [0, 6). Rows whose
         pickup time cannot be parsed keep ``None``.
      3. Drop rows where the dropoff is not after the pickup, the duration is
         24 hours or more, ``trip_distance`` exceeds ``max_trip_distance`` or
         ``total_amount`` is not positive, then reset the index.
      4. Add ``income_per_second`` = total_amount / duration in seconds.
      5. Drop rows exceeding ``max_income_per_second``, ``max_tip_amount``
         or ``max_total_amount``. The index is not reset again after this
         step, so it may contain gaps.

    Parameters
    ----------
    taxi_data : pandas.DataFrame
        Must contain VendorID, RatecodeID, PULocationID, DOLocationID,
        payment_type, tpep_pickup_datetime, tpep_dropoff_datetime,
        trip_distance, total_amount and tip_amount.
    max_trip_distance : number, default 30
        Largest ``trip_distance`` kept (the progress message calls the unit miles).
    max_income_per_second : number, default 0.06
        Largest ``income_per_second`` kept.
    max_tip_amount : number, default 16
        Largest ``tip_amount`` kept.
    max_total_amount : number, default 77
        Largest ``total_amount`` kept.

    Returns
    -------
    pandas.DataFrame
        The filtered table with the added ``duration``, ``time`` and
        ``income_per_second`` columns.
    """
    # astype returns a new frame, so every assignment below leaves the caller's data untouched
    categorical_columns = ['VendorID', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type']
    taxi_data = taxi_data.astype({column: object for column in categorical_columns})
    # Convert data type for date attribute
    taxi_data['tpep_pickup_datetime'] = pd.to_datetime(taxi_data['tpep_pickup_datetime'])
    taxi_data['tpep_dropoff_datetime'] = pd.to_datetime(taxi_data['tpep_dropoff_datetime'])

    # Compute the duration of each trip and store in a new column, 'duration'
    print('Compute the duration of each taxi trip...')
    taxi_data['duration'] = taxi_data['tpep_dropoff_datetime'] - taxi_data['tpep_pickup_datetime']

    # Label each trip with one of 5 time periods, using the vectorized .dt accessor
    # instead of a Python-level loop over every row
    print('Create 5 different time period: morning, noon, afternoon, night, latenight...')
    pickup_hour = taxi_data['tpep_pickup_datetime'].dt.hour
    taxi_data['time'] = None
    # (label, first hour inclusive, last hour exclusive)
    period_bounds = [('Morning', 6, 11), ('Noon', 11, 14), ('Afternoon', 14, 18),
                     ('Night', 18, 24), ('Latenight', 0, 6)]
    for label, first_hour, end_hour in period_bounds:
        in_period = (pickup_hour >= first_hour) & (pickup_hour < end_hour)
        taxi_data.loc[in_period.values, 'time'] = label
        print('{} is done!'.format(label))

    # Remove unreasonable records
    print('Remove unreasonable records...')
    # 1. Taxi trips that have negative or zero duration
    print('Remove trip records whose dropoff time is earlier than pickup time...')
    positive_duration = taxi_data['tpep_dropoff_datetime'] > taxi_data['tpep_pickup_datetime']
    # 2. Taxi trips that last 24 hours or longer (timedelta .days is 0 only for [0h, 24h))
    print('Remove trip records that last longer than 24 hours or have negative duration time...')
    within_one_day = taxi_data['duration'].dt.days == 0
    # 3. Taxi trips that travel farther than the distance limit
    print('Remove trip records that travel more than {} miles...'.format(max_trip_distance))
    reasonable_distance = taxi_data['trip_distance'] <= max_trip_distance
    # 4. Taxi trips with non-positive total amount
    print('Remove trip records that have negative or zero total amount...')
    positive_amount = taxi_data['total_amount'] > 0

    is_valid = positive_duration & within_one_day & reasonable_distance & positive_amount

    # Reset row index
    print('Reset row index...')
    taxi_data = taxi_data.loc[is_valid.values].reset_index(drop = True)

    # Compute the income/second; every remaining duration is strictly positive, so no division by zero
    print('Compute the income per second for each trip in dollar amount...')
    taxi_data['income_per_second'] = taxi_data['total_amount'] / taxi_data['duration'].dt.total_seconds()

    # Remove extreme values
    print('Remove data points with extreme values in the following attributes: income_per_second, tip_amount, total_amount...')
    not_extreme = ((taxi_data['income_per_second'] <= max_income_per_second)
                   & (taxi_data['tip_amount'] <= max_tip_amount)
                   & (taxi_data['total_amount'] <= max_total_amount))
    taxi_data = taxi_data.loc[not_extreme.values]

    print('Taxi data preprocess completed!')

    return taxi_data

In [5]:
def add_March_weather(taxi_data):
    """Tag each trip with the March winter-storm warning that applied at pickup.

    Adds (or overwrites) a ``weather`` column on ``taxi_data`` in place: trips
    picked up in March on day 6, 7, 8, 20 or 22 get 'Winter Storm', every
    other trip gets the string 'None'. Rows with a missing pickup time are
    labelled 'None'.

    The month is checked as well as the day so that records from other months
    are not tagged just because the day-of-month matches. The year is not
    checked.

    Parameters
    ----------
    taxi_data : pandas.DataFrame
        Must contain a datetime-typed ``tpep_pickup_datetime`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying the ``weather`` column.
    """
    print('Add weather warnings for March data...')
    # Create weather column in taxi data
    taxi_data['weather'] = 'None'
    # Days of March that have a winter storm warning
    winter_storm_date = [6,7,8,20,22]
    # True/False vector to indicate whether trip was impacted by the weather or not
    # (vectorized through the .dt accessor rather than a per-row Python loop)
    pickup = taxi_data['tpep_pickup_datetime'].dt
    impact = (pickup.month == 3) & pickup.day.isin(winter_storm_date)
    # Label impacted trips
    taxi_data.loc[impact.values,'weather'] = 'Winter Storm'
    print('Weather warnings have been added!')

    return taxi_data

In [6]:
def add_August_weather(taxi_data):
    """Tag each trip with the August thunderstorm warning that applied at pickup.

    Adds (or overwrites) a ``weather`` column on ``taxi_data`` in place: trips
    picked up in August on day 7 or 11 get 'Thunderstorm', every other trip
    gets the string 'None'. Rows with a missing pickup time are labelled
    'None'.

    The month is checked as well as the day so that records from other months
    are not tagged just because the day-of-month matches. The year is not
    checked.

    Parameters
    ----------
    taxi_data : pandas.DataFrame
        Must contain a datetime-typed ``tpep_pickup_datetime`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying the ``weather`` column.
    """
    print('Add weather warnings for August data...')
    # Create weather column in taxi data
    taxi_data['weather'] = 'None'
    # Days of August that have a thunderstorm warning
    thunderstorm_date = [7,11]
    # True/False vector to indicate whether trip was impacted by the weather or not
    # (vectorized through the .dt accessor rather than a per-row Python loop)
    pickup = taxi_data['tpep_pickup_datetime'].dt
    impact = (pickup.month == 8) & pickup.day.isin(thunderstorm_date)
    # Label impacted trips
    taxi_data.loc[impact.values,'weather'] = 'Thunderstorm'
    print('Weather warnings have been added!')

    return taxi_data

In [None]:
def compare_time_density(taxi_data, feature, fig_width, fig_height):
    """Overlay the distribution of ``feature`` for each time-of-day period.

    Creates a new figure of size (fig_width, fig_height) and draws one
    ``sns.distplot`` per value of the ``time`` column, all on the same axes.

    Returns the matplotlib axes holding the plot.
    """
    fig, ax = plt.subplots(figsize = (fig_width, fig_height))

    # The legend labels are passed positionally, so they must follow the drawing order
    periods = ['Latenight', 'Night', 'Afternoon', 'Morning', 'Noon']
    for period in periods:
        in_period = taxi_data.time == period
        sns.distplot(taxi_data.loc[in_period, feature], ax = ax)

    ax.legend(periods)
    ax.set_ylabel('Density')

    return ax

In [None]:
def compare_WS_density(taxi_data, feature, fig_width, fig_height):
    """Overlay the distribution of ``feature`` for winter-storm trips and all other trips.

    Creates a new figure of size (fig_width, fig_height) and draws one
    ``sns.distplot`` for rows whose ``weather`` is 'Winter Storm' and one for
    rows whose ``weather`` is 'None', both on the same axes.

    Returns the matplotlib axes holding the plot.
    """
    fig, ax = plt.subplots(figsize = (fig_width, fig_height))

    # The legend labels are passed positionally, so they must follow the drawing order
    conditions = ['Winter Storm', 'None']
    for condition in conditions:
        under_condition = taxi_data.weather == condition
        sns.distplot(taxi_data.loc[under_condition, feature], ax = ax)

    ax.legend(conditions)
    ax.set_ylabel('Density')

    return ax

In [None]:
def compare_SV_density(taxi_data, feature, fig_width, fig_height):
    """Overlay the distribution of ``feature`` for thunderstorm trips and all other trips.

    Creates a new figure of size (fig_width, fig_height) and draws one
    ``sns.distplot`` for rows whose ``weather`` is 'Thunderstorm' and one for
    rows whose ``weather`` is 'None', both on the same axes.

    Returns the matplotlib axes holding the plot.
    """
    fig, ax = plt.subplots(figsize = (fig_width, fig_height))

    # The legend labels are passed positionally, so they must follow the drawing order
    conditions = ['Thunderstorm', 'None']
    for condition in conditions:
        under_condition = taxi_data.weather == condition
        sns.distplot(taxi_data.loc[under_condition, feature], ax = ax)

    ax.legend(conditions)
    ax.set_ylabel('Density')

    return ax

In [None]:
def compare_group_boxplot(taxi_data, group, feature, fig_width = 15, fig_height = 5):
    """Draw a box plot of ``feature`` for each level of ``group``.

    Parameters
    ----------
    taxi_data : pandas.DataFrame
        Table containing the ``group`` and ``feature`` columns.
    group : str
        Column shown on the x axis (one box per distinct value).
    feature : str
        Column shown on the y axis.
    fig_width, fig_height : number, default 15 and 5
        Figure size passed to ``plt.subplots``. The defaults let the helper
        be called without an explicit size.

    Returns
    -------
    The matplotlib axes holding the plot, like the other compare_* helpers.
    """
    fig, ax = plt.subplots(figsize = (fig_width, fig_height))

    # Draw on the axes created above instead of relying on the implicit current axes
    sns.boxplot(x = group, y = feature, data = taxi_data, ax = ax)

    return ax

# Yellow Taxi

In [7]:
# Time the full load -> preprocess -> weather-tagging pipeline for the March file
start = time.time()
yellow_taxi_March = (
    pd.read_csv('../data/yellow_tripdata_2018-03.csv')   # load the csv file into a pandas dataframe
    .pipe(taxi_data_preprocess)                           # preprocess the taxi data
    .pipe(add_March_weather)                              # add weather info
)
end = time.time()
print('Entire process took {:.2f} minutes'.format((end-start)/60))

Compute the duration of each taxi trip...
Create 5 different time period: morning, noon, afternoon, night, latenight...
Morning is done!
Noon is done!
Afternoon is done!


KeyboardInterrupt: 

In [None]:
# Time the full load -> preprocess -> weather-tagging pipeline for the August file
start = time.time()
yellow_taxi_August = (
    pd.read_csv('../data/yellow_tripdata_2018-08.csv')   # load the csv file into a pandas dataframe
    .pipe(taxi_data_preprocess)                           # preprocess the taxi data
    .pipe(add_August_weather)                             # add weather info
)
end = time.time()
print('Entire process took {:.2f} minutes'.format((end-start)/60))

## 1. Compare Driver Income by Time of Day

In [None]:
# Distribution of per-second income in the March data, one curve per time-of-day period
compare_time_density(
    taxi_data = yellow_taxi_March,
    feature = 'income_per_second',
    fig_width = 15,
    fig_height = 5,
)

In [None]:
# Box plot of per-second income by time-of-day period; figure size passed explicitly, matching the density plot above
compare_group_boxplot(taxi_data = yellow_taxi_March, group = 'time', feature = 'income_per_second', fig_width = 15, fig_height = 5)