# Data Preparation

In [26]:
from datetime import date, datetime, timedelta, time
import holidays
import pytz
import pandas as pd
import numpy as np

## Functions

Helper functions for cleaning and reindexing the retrieved data.

In [27]:
def removeTzRows(data, dtList):
    """Drop, in place, every index label in dtList that occurs in data.

    Intended for datetimes that do not exist because of time-zone (DST)
    changes. One line is printed per dropped label, and the same (mutated)
    frame is handed back.
    """
    for stamp in dtList:
        if stamp not in data.index:
            # nothing to remove for this label
            continue
        data.drop(stamp, inplace=True)
        print(str(stamp) + " dropped.")
    return data
def newIndex(start, end, delta):
    """Generate start, start + delta, ... for as long as the value is < end.

    The end value itself is never produced. Used to build a regular grid
    for reindexing.
    """
    stamp = start
    while True:
        if not (stamp < end):
            return
        yield stamp
        stamp += delta
        
def prepData(data, delta=60):
    """Clean a datetime-indexed frame and align it to a regular time grid.

    Steps, in order:

    1. missing values are replaced with 0 (some zones have gaps that are
       supposed to be zero),
    2. rows whose index is listed in the module-level ``tzDates`` are dropped
       (the hour between 2 and 3 that is skipped when US time switches to
       summer time),
    3. rows with a repeated index value are dropped, keeping the first,
    4. the frame is reindexed to the grid running from ``startDt``
       (inclusive) to ``endDt`` (exclusive) in steps of ``delta`` minutes,
    5. rows introduced by the reindex are forward-filled from the previous
       row. Rows missing at the very start of the grid have no previous
       row and therefore stay NaN.

    Relies on the module-level names ``tzDates``, ``startDt`` and ``endDt``
    and on the helpers ``removeTzRows`` and ``newIndex``.

    Keyword arguments:
    data -- DataFrame indexed by datetime
    delta -- grid step in minutes (default 60)

    Returns the cleaned frame. The frame passed in is not modified; always
    use the return value.
    """
    # fillna returns a new frame, so the in-place drop below never touches
    # the caller's object.
    data = data.fillna(0)
    data = removeTzRows(data, tzDates)
    # drop duplicate index rows, keeping the first occurrence
    isDuplicate = data.index.duplicated(keep='first')
    print(str(isDuplicate.sum()) + " duplicates found and removed.")
    data = data.loc[~isDuplicate]
    # reindex so there are no missing datetimes on the delta-minute grid
    grid = list(newIndex(startDt, endDt, timedelta(minutes=delta)))
    data = data.reindex(grid)
    # ffill() replaces the deprecated fillna(method="ffill")
    return data.ffill()

## Constants

* start and end of observation horizon
* tzDates: datetimes that do not exist due to DST changes

In [28]:
# Observation horizon; prepData treats it as [startDt, endDt).
startDt = datetime(2017, 1, 1, 0, 0)
endDt = datetime(2020, 1, 1, 0, 0)
# Datetimes that should not exist due to tz changes: 02:00 and 02:30 on
# the March day of each year given in the table below.
tzDates = [
    datetime(year, 3, day, 2, minute)
    for year, day in ((2017, 12), (2018, 11), (2019, 10))
    for minute in (0, 30)
]
usHolidays = holidays.UnitedStates()

In [29]:
# Load the taxi zone lookup table. The trip-loading functions below read
# its "LocationID" and "Borough" columns to filter trips by borough.
zones = pd.read_csv('../taxi_data/taxiZoneLookup.csv')

## Aggregated Data

* aggregates trips for every dt for each zone
* time-series format
* used to explore trips more easily

In [30]:
def get_aggr_data(years):
    """Build an hourly drop-off count table with one column per zone.

    For every month of every year in *years* the file
    ``../taxi_data/yellow_tripdata_<year>-<mm>.csv`` is read, filtered and
    aggregated. The monthly tables are concatenated in processing order.

    A trip is kept when all of the following hold:

    * ``trip_distance`` is below 100 (miles, per the original comment),
    * ``passenger_count`` is not 0,
    * the trip lasted more than 1 minute and less than 3 hours,
    * both ``PULocationID`` and ``DOLocationID`` belong to Manhattan,
      Queens or Brooklyn according to the module-level ``zones`` table,
    * the drop-off datetime lies inside the calendar month of the file.

    Keyword arguments:
    years -- iterable of years (ints) to process

    Returns a DataFrame indexed by drop-off datetime floored to the full
    hour (index name ``tpep_dropoff_datetime``), with one column per
    ``DOLocationID`` holding the number of trips. An empty DataFrame is
    returned when *years* is empty.
    """
    boroughs = ["Manhattan", "Queens", "Brooklyn"]
    # Zone ids of the wanted boroughs. Testing membership in this set keeps
    # the same trips as merging the borough onto each trip and filtering on
    # it, provided each LocationID appears once in the lookup table.
    allowedZones = zones.loc[zones["Borough"].isin(boroughs), "LocationID"]

    monthly = []
    for year in years:
        for month in range(1, 13):
            path = "../taxi_data/yellow_tripdata_{}-{:02d}.csv".format(year, month)
            df = pd.read_csv(path,
                             usecols=["tpep_pickup_datetime",
                                      "tpep_dropoff_datetime",
                                      "trip_distance",
                                      "passenger_count",
                                      "PULocationID",
                                      "DOLocationID"],
                             parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
                             low_memory=False)

            dropoff = df["tpep_dropoff_datetime"]
            duration = dropoff - df["tpep_pickup_datetime"]

            # only drop-offs that fall in the current year and month
            monthStart = datetime(year, month, 1)
            if month != 12:
                monthEnd = datetime(year, month + 1, 1)
            else:
                monthEnd = datetime(year + 1, 1, 1)

            keep = (
                (df["trip_distance"] < 100)
                & (df["passenger_count"] != 0)
                & (duration > timedelta(minutes=1))
                & (duration < timedelta(hours=3))
                & df["PULocationID"].isin(allowedZones)
                & df["DOLocationID"].isin(allowedZones)
                & (dropoff >= monthStart)
                & (dropoff < monthEnd)
            )

            # floor the drop-off datetime to the full hour (vectorised;
            # also clears any sub-second part)
            dropoffHour = dropoff[keep].dt.floor("h")
            # one dummy column per zone, summed per drop-off hour
            zoneDummies = pd.get_dummies(df.loc[keep, "DOLocationID"])
            monthly.append(zoneDummies.groupby(dropoffHour).sum())

            print(str(year) + "-" + str(month))

    if not monthly:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated table every month.
    return pd.concat(monthly, sort=False)

In [25]:
# Years covered by the analysis.
# The aggregation call is left commented out; presumably its result is
# cached as CSV and re-read below -- uncomment to rebuild it from raw files.
years = [2017,2018,2019]
#df1 = get_aggr_data(years)

In [31]:
#df1.to_csv("../taxi_data/aggr_dropoffs.csv")

In [35]:
# Load a previously saved aggregation; the first CSV column is parsed as
# datetimes and used as the index.
# NOTE(review): get_aggr_data above aggregates drop-offs, while this reads
# the pickups file -- confirm which one is intended here.
df1 = pd.read_csv("../taxi_data/aggr_pickups.csv", 
                  index_col=[0], 
                  parse_dates=[0])

In [36]:
# Clean the aggregated frame: prepData drops the datetimes listed in tzDates
# (non-existent due to summertime/wintertime shifts) and duplicate index rows,
# then reindexes to the regular time grid and forward-fills the gaps.
df1 = prepData(df1)

0 duplicates found and removed.


In [38]:
# df1.to_csv("taxi_data/aggr_pickups.csv")

In [40]:
# Display the last rows of the prepared frame as a quick sanity check.
df1.tail()

Unnamed: 0_level_0,2,4,7,8,9,10,11,12,13,14,...,252,253,255,256,257,258,260,261,262,263
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31 19:00:00,0.0,18.0,13.0,1.0,0.0,0.0,0.0,0.0,87.0,2.0,...,0.0,0.0,4.0,7.0,1.0,0.0,4.0,67.0,196.0,273.0
2019-12-31 20:00:00,0.0,25.0,24.0,0.0,0.0,1.0,0.0,0.0,86.0,1.0,...,0.0,0.0,13.0,13.0,2.0,0.0,7.0,75.0,211.0,422.0
2019-12-31 21:00:00,0.0,22.0,20.0,0.0,0.0,2.0,1.0,1.0,62.0,2.0,...,0.0,0.0,4.0,5.0,1.0,1.0,8.0,53.0,168.0,326.0
2019-12-31 22:00:00,0.0,31.0,16.0,1.0,0.0,2.0,0.0,1.0,41.0,0.0,...,0.0,0.0,6.0,11.0,1.0,2.0,9.0,31.0,85.0,225.0
2019-12-31 23:00:00,0.0,27.0,17.0,0.0,0.0,3.0,0.0,0.0,28.0,0.0,...,0.0,0.0,12.0,17.0,1.0,0.0,3.0,32.0,69.0,164.0


## Data for Forecasts

* raw data which will be the base for predictions later
* data is accumulated such that every dt has an aggregated number of trips for each zone
* long format (one row per zone and hour) instead of one time-series column per zone, so all zones can be included in one model

In [41]:
def get_data(years, pu, do, path, only_pu=1, months=range(2, 13)):
    """Count rides per zone and hour from monthly trip files (long format).

    For every year in *years* and month in *months* the file
    ``../taxi_data/<path>_<year>-<mm>.csv`` is read. Only trips whose pickup
    and drop-off zone both belong to Manhattan, Queens or Brooklyn
    (according to the module-level ``zones`` table) and whose selected
    datetime lies inside the calendar month of the file are counted.

    Keyword arguments:
    years -- iterable of years (ints) to process
    pu -- name of the pickup datetime column
    do -- name of the drop-off datetime column
    path -- file name prefix, e.g. "fhvhv_tripdata"
    only_pu -- 1 (default) counts pickups using ``pu`` and "PULocationID";
               any other value counts drop-offs using ``do`` and
               "DOLocationID"
    months -- months to process; the default 2..12 keeps the previous
              hard-coded behaviour of skipping January.
              NOTE(review): pass range(1, 13) if January files exist.

    Returns a DataFrame with the columns location, year, month, day, hour
    and rides, made of the monthly results concatenated in processing
    order (the index is not reset). An empty frame with those columns is
    returned when no file could be loaded.

    Files that do not exist are reported and skipped; any other read error
    is raised instead of being silently swallowed.
    """
    resultCols = ["location", "year", "month", "day", "hour", "rides"]
    boroughs = ["Manhattan", "Queens", "Brooklyn"]
    # Zone ids of the wanted boroughs. Testing membership in this set keeps
    # the same trips as merging the borough onto each trip and filtering on
    # it, provided each LocationID appears once in the lookup table.
    allowedZones = zones.loc[zones["Borough"].isin(boroughs), "LocationID"]

    # Datetime/zone column pair that is counted. The previous code always
    # renamed the pickup column, so the drop-off mode failed with a KeyError.
    if only_pu == 1:
        timeCol, zoneCol = pu, "PULocationID"
    else:
        timeCol, zoneCol = do, "DOLocationID"

    monthly = []
    for year in years:
        for month in months:
            print(str(year) + "-" + str(month))

            # Build the file name in its own variable: reusing ``path``
            # corrupted the prefix for every month after the first one.
            filePath = "../taxi_data/{}_{}-{:02d}.csv".format(path, year, month)
            try:
                df = pd.read_csv(filePath, parse_dates=[pu, do])
            except FileNotFoundError:
                print(filePath + " not found, skipped.")
                continue

            start = datetime(year, month, 1)
            if month != 12:
                end = datetime(year, month + 1, 1)
            else:
                end = datetime(year + 1, 1, 1)

            keep = (
                (df[timeCol] >= start)
                & (df[timeCol] < end)
                & df["PULocationID"].isin(allowedZones)
                & df["DOLocationID"].isin(allowedZones)
            )
            stamps = df.loc[keep, timeCol]
            parts = pd.DataFrame({
                "location": df.loc[keep, zoneCol],
                "year": stamps.dt.year,
                "month": stamps.dt.month,
                "day": stamps.dt.day,
                "hour": stamps.dt.hour,
            })
            # number of trips per zone and hour
            counts = (
                parts.groupby(["location", "year", "month", "day", "hour"])
                .size()
                .reset_index(name="rides")
            )
            monthly.append(counts)
            print(str(year) + "-" + str(month) + " done!")

    if not monthly:
        return pd.DataFrame(columns=resultCols)
    # A single concat avoids re-copying the accumulated table every month.
    return pd.concat(monthly, sort=False)

In [43]:
# df2 = get_data(years, "pickup_datetime", "dropoff_datetime", "fhvhv_tripdata")

In [44]:
# df2.to_csv("taxi_data/dropoffs.csv")

## Weather Data

* adjust data from UTC to NYC time (including daylight saving time)

In [45]:
def loadWeatherData(path, tarTimezone="US/Eastern", dateFormat="%Y-%m-%d %H:%M"):
    """Read weather data, transform datetime to target timezone, set as index.

    The "dt_iso" column is read as UTC, converted to ``tarTimezone`` and
    stored as a naive (timezone-free) local datetime in the index "dt".
    Anything ``dateFormat`` does not contain (seconds, with the default)
    is cut off.

    Keyword arguments:
    path -- path of weather data with data type
    tarTimezone -- name of the target timezone the datetime is transformed into
    dateFormat -- strftime format the local datetime is reduced to

    Returns a DataFrame indexed by "dt" with the columns temp, feels_like,
    temp_min, temp_max, humidity, wind_speed, rain_1h, rain_3h, snow_1h,
    snow_3h, weather_main and weather_description. Local times that occur
    twice (e.g. when clocks are set back) produce duplicate index values.
    """
    w = pd.read_csv(path)
    # The last 10 characters of dt_iso are cut off before parsing
    # (presumably a " +0000 UTC" suffix -- confirm against the data file).
    utcTimes = pd.to_datetime(w["dt_iso"].str[:-10], utc=True)
    localTimes = utcTimes.dt.tz_convert(tarTimezone)
    # Format and re-parse with the same format: this drops the timezone and
    # everything dateFormat does not contain, for the whole column at once.
    w["dt"] = pd.to_datetime(localTimes.dt.strftime(dateFormat), format=dateFormat)
    w = w.set_index("dt")
    return w[["temp", "feels_like", "temp_min", "temp_max", "humidity", "wind_speed", 
              "rain_1h", "rain_3h", "snow_1h", "snow_3h", "weather_main", "weather_description"]]

In [50]:
# Load the raw weather export, indexed by local (US/Eastern) datetime.
w = loadWeatherData("../taxi_data/weather.csv")

In [51]:
# Replace every missing value with 0 (presumably the rain/snow columns,
# which are likely empty when there was no precipitation -- TODO confirm).
w = w.fillna(0)

In [52]:
# Drop the datetimes listed in tzDates and duplicate timestamps, reindex to
# the hourly grid (prepData's default delta of 60 minutes) and forward-fill gaps.
w = prepData(w)

322 duplicates found and removed.


In [54]:
# Persist the prepared weather data for later use.
w.to_csv("../taxi_data/weather_prepared.csv")