# Data Mining

In [3]:
from datetime import date, datetime, timedelta
import pandas as pd
import numpy as np

In [7]:
def theTime():
    """Return the current local wall-clock time as an ``H:M:S`` string.

    The components are not zero-padded (e.g. ``"9:5:3"``), matching the
    format used in the progress messages printed by this notebook.

    Returns:
        str: hour, minute and second of ``datetime.now()`` joined by ``":"``.
    """
    # `datetime` is the class imported via `from datetime import datetime`,
    # so `now()` is called on it directly. Reading the clock once keeps the
    # three components consistent with each other.
    now = datetime.now()
    return f"{now.hour}:{now.minute}:{now.second}"

In [10]:
def getData(startMonth, months, year, saveFileName, tDelta=30, dataType=".csv"):
    """Build a pickup time series from consecutive monthly trip files and save it.

    For every month in the requested range the file
    ``data/yellow_tripdata_<year>-<MM><dataType>`` is read with
    ``pd.read_csv``, reduced by ``transform`` and binned by ``to_ts``. The
    monthly results are concatenated, the ``PULocationID`` column is dropped,
    and the combined frame is written to ``saveFileName`` as CSV.

    Only pickups are processed; drop-off handling is not implemented here.

    Args:
        startMonth: First month to load (1-12).
        months: Number of consecutive months to load, starting at ``startMonth``.
            The range must stay within ``year``.
        year: Calendar year used in the file names and bin boundaries.
        saveFileName: Path of the CSV file the combined frame is written to.
        tDelta: Bin width in minutes, forwarded to ``to_ts``.
        dataType: File extension appended to each monthly file name.

    Returns:
        pandas.DataFrame: the combined frame that was written to ``saveFileName``.

    Raises:
        ValueError: if ``months`` is less than 1 or the month range does not
            lie within 1-12.
    """
    last_month = startMonth + months - 1
    # Validate up front so a bad range fails before any large file is read.
    if months < 1:
        raise ValueError("months must be at least 1, got {}".format(months))
    if startMonth < 1 or last_month > 12:
        raise ValueError(
            "month range {}-{} must lie within 1-12".format(startMonth, last_month)
        )

    print(theTime() + " -- Starting...")
    monthly_frames = []
    for month in range(startMonth, last_month + 1):
        # read data
        path = "data/yellow_tripdata_{}-{}{}".format(year, f"{month:02d}", dataType)
        df = pd.read_csv(path)
        print(theTime() + " -- " + path + " loaded...")

        # keep only the pickup columns
        pu = transform(df)
        print(theTime() + " -- " + path + " transformed...")

        # bin boundaries: [first instant of this month, first instant of the next)
        start = datetime(year, month, 1, 0, 0)
        if month != 12:
            end = datetime(year, month + 1, 1, 0, 0)
        else:
            end = datetime(year + 1, 1, 1, 0, 0)

        pu = to_ts(pu, tDelta, start, end, pu.columns)
        print(theTime() + " -- " + path + " to time-series...")

        monthly_frames.append(pu)

    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    pu_ts = pd.concat(monthly_frames, sort=False)
    pu_ts = pu_ts.drop("PULocationID", axis=1)
    print(theTime() + " -- Finished.")

    pu_ts.to_csv(saveFileName)

    return pu_ts


def transform(data):
    """Reduce a raw trip frame to the two pickup columns.

    Args:
        data: DataFrame containing at least ``tpep_pickup_datetime`` and
            ``PULocationID``.

    Returns:
        pandas.DataFrame: ``data`` restricted to those two columns, in that
        order (timestamp first, location ID second).
    """
    # Drop-off columns (tpep_dropoff_datetime / DOLocationID) are not extracted.
    pickup_columns = ["tpep_pickup_datetime", "PULocationID"]
    return data[pickup_columns]


def to_ts(data, tDelta, start, end, cols):
    """Aggregate rows into fixed-width time bins with one count column per ID.

    Each timestamp in ``data[cols[0]]`` is moved back to the start of its bin
    by subtracting ``minute % tDelta`` minutes and its seconds. Bins are
    therefore aligned to the top of each hour. Rows whose bin falls outside
    ``[start, end)`` are discarded. The values of ``data[cols[1]]`` are
    one-hot encoded and everything is summed per bin.

    The input frame is left unmodified.

    Args:
        data: DataFrame holding a timestamp column and an ID column.
        tDelta: Bin width in minutes.
        start: Inclusive lower bound for the bin timestamps.
        end: Exclusive upper bound for the bin timestamps.
        cols: Sequence whose first entry names the timestamp column and whose
            second entry names the ID column.

    Returns:
        pandas.DataFrame: indexed by ``time_bin``; contains the per-bin sum of
        the ID column followed by one column per distinct ID holding the
        number of rows with that ID in the bin.
    """
    time_col, id_col = cols[0], cols[1]

    stamps = pd.to_datetime(data[time_col])
    # Vectorised equivalent of subtracting
    # timedelta(minutes=minute % tDelta, seconds=second) from every row.
    time_bin = (
        stamps
        - pd.to_timedelta(stamps.dt.minute % tDelta, unit="min")
        - pd.to_timedelta(stamps.dt.second, unit="s")
    )

    # Build a new frame rather than writing "time_bin" into the caller's data.
    binned = data.drop(columns=[time_col]).assign(time_bin=time_bin)
    in_range = (binned["time_bin"] >= start) & (binned["time_bin"] < end)
    binned = binned[in_range]

    binned = pd.concat([binned, pd.get_dummies(binned[id_col])], axis=1)

    return binned.groupby("time_bin").sum()

In [None]:
# Regenerating the time-series files requires the raw trip records from
# https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
# (around 25 GB). The compressed time-series files are only a fraction of
# that size and are included in the repository, so this step is disabled
# by default. Set the flag to True to rebuild them.
REGENERATE_TIME_SERIES = False

if REGENERATE_TIME_SERIES:
    # File names match the ones read back by the loading cell.
    df1 = getData(1, 6, 2017, "zonePickups_17_1.csv")
    df2 = getData(7, 6, 2017, "zonePickups_17_2.csv")
    df3 = getData(1, 12, 2018, "zonePickups_18.csv")
    df4 = getData(1, 12, 2019, "zonePickups_19.csv")

In [11]:
# Load the pre-computed pickup time series, indexed by the parsed `time_bin`.
# 2017 is stored as two files because computing it in one pass failed
# (still unresolved).
df17_1, df17_2, df18, df19 = (
    pd.read_csv(csv_name, index_col="time_bin", parse_dates=["time_bin"])
    for csv_name in (
        "zonePickups_17_1.csv",
        "zonePickups_17_2.csv",
        "zonePickups_18.csv",
        "zonePickups_19.csv",
    )
)

In [12]:
# Stack the per-period frames in chronological order into one frame.
# NOTE(review): columns missing from a file are filled with NaN by concat
# (visible in the preview below) - confirm whether they should be 0.
df = pd.concat(
    [df17_1, df17_2, df18, df19],
    sort=False,
)

In [14]:
# Remove the "trips" column if it is present. Reassigning with
# errors="ignore" (instead of an in-place drop) keeps this cell safe to
# re-run: a second execution no longer raises KeyError.
df = df.drop(columns="trips", errors="ignore")

In [16]:
# Optional: uncomment to save the combined time series to a single CSV file.
# df.to_csv("zonePickups.csv")

In [19]:
# Preview the combined frame; the notebook renders a truncated view of its
# rows and columns.
df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,262,263,264,265,27,44,99,199,104,103
time_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01 00:00:00,0,0,0,53,0.0,0,16,0,0,0,...,92,201,187,4,,,,,,
2017-01-01 00:30:00,0,0,0,83,0.0,0,62,0,0,0,...,110,349,229,11,,,,,,
2017-01-01 01:00:00,0,0,0,69,0.0,0,83,0,0,1,...,124,386,230,7,,,,,,
2017-01-01 01:30:00,0,0,0,76,0.0,0,87,0,0,2,...,91,373,240,10,,,,,,
2017-01-01 02:00:00,0,0,0,101,0.0,0,113,1,0,2,...,85,341,212,9,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 21:30:00,0,0,0,13,0.0,0,9,0,0,0,...,80,163,43,2,0.0,0.0,0.0,,,
2019-12-31 22:00:00,0,0,0,16,0.0,0,13,1,0,0,...,45,141,36,3,0.0,0.0,0.0,,,
2019-12-31 22:30:00,0,0,1,15,0.0,0,5,0,0,2,...,48,102,26,2,0.0,0.0,0.0,,,
2019-12-31 23:00:00,0,0,1,11,0.0,0,13,0,0,3,...,44,110,23,3,0.0,0.0,0.0,,,
