In [1]:
import numpy as np
import pandas as pd
import pandas.io.sql as pdsql
import pickle

import psycopg2

In [2]:
# Credentials must not be stored in the notebook. The password is deliberately not
# passed here: libpq (which psycopg2 wraps) reads it from the PGPASSWORD environment
# variable or from the ~/.pgpass file when no password is supplied.
conn = psycopg2.connect(dbname='nyc_taxi', user='postgres', host='localhost')

In [3]:
# Read the pickup_count table and the day_info table
try:
    # The zero-filling step that follows walks these rows in (gid, doy, hour) order,
    # so the order is requested explicitly instead of relying on whatever order the
    # database happens to return.
    cdf = pdsql.read_sql(
        "SELECT * FROM pickup_count ORDER BY pickup_gid, pickup_doy, pickup_hour",
        conn, coerce_float=True, params=None)
    # NOTE(review): day_info is later looked up by (day of year - 1), which presumes
    # its rows come back in day-of-year order — confirm and add an ORDER BY here.
    day_info = pdsql.read_sql("SELECT * FROM day_info", conn, coerce_float=True, params=None)
finally:
    # Release the connection even when one of the reads fails.
    conn.close()

In [5]:
# Add the rows where count is 0
# Every (gid, day of year, hour) combination has to be present exactly once, because
# get_cdf_delayed later slices the frame positionally in blocks of 365 * 24 rows per gid.
N_GID, N_DOY, N_HOUR = 195, 365, 24
key_cols = ['pickup_gid', 'pickup_doy', 'pickup_hour']

full_grid = pd.MultiIndex.from_product(
    [range(1, N_GID + 1), range(1, N_DOY + 1), range(N_HOUR)], names=key_cols)
present = pd.MultiIndex.from_arrays([cdf[c] for c in key_cols], names=key_cols)

# A set difference instead of a row-by-row scan: the result does not depend on the
# order of the rows in cdf, and it avoids ~1.7 million scalar Series lookups.
cdf_append = full_grid.difference(present).to_frame(index=False)
cdf_append['count'] = np.zeros(len(cdf_append), dtype=np.int64)

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
cdf = pd.concat([cdf, cdf_append], ignore_index=True)
cdf = cdf.sort_values(by=key_cols).reset_index(drop=True)

In [6]:
# Display the zero-filled pickup counts to spot-check the previous step
cdf

Unnamed: 0,pickup_gid,pickup_doy,pickup_hour,count
0,1,1,0,0
1,1,1,1,3
2,1,1,2,7
3,1,1,3,6
4,1,1,4,3
5,1,1,5,4
6,1,1,6,1
7,1,1,7,3
8,1,1,8,1
9,1,1,9,0


In [7]:
# Add new columns, namely day of the week and the temperature, precipitation, holiday info
pickup_dow = (cdf['pickup_doy'] + 2) % 7 # For 2014, Jan. 1 is Wednesday
cdf['pickup_dow'] = pickup_dow

# Label of the matching day_info row for every row of cdf (pickup_doy is 1-based).
# NOTE(review): presumes day_info carries the default 0..n-1 index with one row per
# day in day-of-year order — confirm against the day_info table.
day_labels = cdf['pickup_doy'] - 1

# A tuple rather than a set: the iteration order of a set of strings changes between
# kernel restarts, which made the column order of cdf irreproducible.
for col_name in ('temperature', 'precipitation', 'holiday'):
    # .values drops the day_info labels so the assignment is purely positional.
    cdf[col_name] = day_info[col_name].loc[day_labels].values

# Raw dataframe extracted from the original dataset ready for further process
cdf_raw = cdf[['count', 'pickup_gid', 'pickup_dow', 'pickup_hour', 'temperature', 'precipitation', 'holiday']]

In [9]:
# Delays (in hours) whose pickup counts are added as predictor columns
D = [1]

def get_cdf_delayed(cdf_raw, D, n_gid=195, hours_per_gid=365 * 24):
    """Append lagged pickup counts of the same gid as extra predictor columns.

    ``cdf_raw`` is read positionally as ``n_gid`` consecutive blocks of
    ``hours_per_gid`` rows each (one block per gid). For every delay ``d`` in
    ``D`` a column ``count_<d>`` is added that holds the ``count`` value found
    ``d`` rows earlier inside the same block. The first ``max(D)`` rows of each
    block are dropped, because their history would reach into another block.

    Parameters
    ----------
    cdf_raw : pandas.DataFrame
        Frame with a ``count`` column and at least ``n_gid * hours_per_gid`` rows.
    D : iterable of int
        Non-negative delays, in rows.
    n_gid : int, optional
        Number of blocks to process (default 195).
    hours_per_gid : int, optional
        Number of rows per block (default 365 * 24).

    Returns
    -------
    pandas.DataFrame
        The retained rows of ``cdf_raw`` (original columns first, dtypes kept)
        followed by one ``count_<d>`` column per delay, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``D`` is empty, contains a negative delay, or ``cdf_raw`` has fewer
        than ``n_gid * hours_per_gid`` rows.
    """
    delays = list(D)
    if not delays:
        raise ValueError("D must contain at least one delay")
    if min(delays) < 0:
        raise ValueError("delays in D must be non-negative")
    if len(cdf_raw) < n_gid * hours_per_gid:
        raise ValueError("cdf_raw has fewer than n_gid * hours_per_gid rows")

    dm = max(delays)  # Maximum delay
    kept_per_gid = max(hours_per_gid - dm, 0)

    # Positions of the rows that keep a full history: for each block, everything
    # after its first dm rows.
    block_starts = np.arange(n_gid) * hours_per_gid + dm
    target_pos = (block_starts[:, None] + np.arange(kept_per_gid)).ravel()

    cdf_with_history = cdf_raw.iloc[target_pos].reset_index(drop=True)

    counts = cdf_raw['count'].values
    for d in delays:
        # target_pos >= dm >= d, so the shifted positions never leave the block.
        cdf_with_history['count_{0}'.format(d)] = counts[target_pos - d]

    return cdf_with_history

# Build the lagged-count frame for the configured delays and persist it to disk
cdf_with_history = get_cdf_delayed(cdf_raw=cdf_raw, D=D)
cdf_with_history.to_pickle(path='data.p')

# The saved frame can be read back with pd.read_pickle('data.p');
# only unpickle files that come from a trusted source.