# View preprocessed NYC tripdata

In [12]:
import glob
import os
import shutil
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import PyPDF2
from tqdm import tqdm

In [13]:
# Seaborn styling for the plots in this notebook: "whitegrid" axes style
# and the "muted" color palette.
sns.set_style("whitegrid")
sns.set_palette("muted")

In [14]:
# Directory holding the preprocessed CSV files. The trailing "/" is required:
# file paths in this notebook are built from it by plain string concatenation.
DATASET_PATH = "../dat/citibike_201903_202104/"
# Root directory under which the summary PDFs are written.
OUTPUT_PATH = "../out/view/"

In [15]:
def replace_dir(path):
    """Leave `path` as a freshly created, empty directory.

    An existing directory tree at `path` is deleted first; missing parent
    directories are created as needed.
    """
    if not os.path.exists(path):
        os.makedirs(path)
        return

    # Something is already there: drop the whole tree, then start over.
    shutil.rmtree(path)
    os.makedirs(path)

## View data samples

In [16]:
# Every tripdata CSV in the dataset directory, in sorted filename order
tripdata_list = sorted(glob.glob(DATASET_PATH + "*tripdata*.csv"))
tripdata_list

['../dat/citibike_201903_202104/tripdata_201903.csv',
 '../dat/citibike_201903_202104/tripdata_201904.csv',
 '../dat/citibike_201903_202104/tripdata_201905.csv',
 '../dat/citibike_201903_202104/tripdata_201906.csv',
 '../dat/citibike_201903_202104/tripdata_201907.csv',
 '../dat/citibike_201903_202104/tripdata_201908.csv',
 '../dat/citibike_201903_202104/tripdata_201909.csv',
 '../dat/citibike_201903_202104/tripdata_201910.csv',
 '../dat/citibike_201903_202104/tripdata_201911.csv',
 '../dat/citibike_201903_202104/tripdata_201912.csv',
 '../dat/citibike_201903_202104/tripdata_202001.csv',
 '../dat/citibike_201903_202104/tripdata_202002.csv',
 '../dat/citibike_201903_202104/tripdata_202003.csv',
 '../dat/citibike_201903_202104/tripdata_202004.csv',
 '../dat/citibike_201903_202104/tripdata_202005.csv',
 '../dat/citibike_201903_202104/tripdata_202006.csv',
 '../dat/citibike_201903_202104/tripdata_202007.csv',
 '../dat/citibike_201903_202104/tripdata_202008.csv',
 '../dat/citibike_201903_202

In [17]:
# Load the station table stored next to the tripdata CSVs and display it.
station_data = pd.read_csv(DATASET_PATH + "station_data.csv")
station_data

Unnamed: 0,index,stationid,stationname,stationlatitude,stationlongitude
0,0,3496,1 Ave & E 110 St,40.792327,-73.938300
1,1,504,1 Ave & E 15 St,40.732219,-73.981656
2,2,2003,1 Ave & E 18 St,40.733812,-73.980544
3,3,536,1 Ave & E 30 St,40.741444,-73.975361
4,4,4121,1 Ave & E 39 St,40.747140,-73.971130
...,...,...,...,...,...
1398,1398,3911,Wyckoff St & Nevins St,40.683426,-73.984275
1399,1399,2002,Wythe Ave & Metropolitan Ave,40.716887,-73.963198
1400,1400,3182,Yankee Ferry Terminal,40.686931,-74.016966
1401,1401,3481,York St,40.716490,-74.041050


In [18]:
# Preview one file (second-to-last in the sorted list), indexed by the
# parsed trip start time.
data = (
    pd.read_csv(tripdata_list[-2])
    .assign(starttime=lambda frame: pd.to_datetime(frame["starttime"]))
    .set_index("starttime")
)
data


Unnamed: 0_level_0,startstationid,endstationid,userage,start_station_dim,end_station_dim
starttime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-02-01 00:04:23.078,3175,4045,17,1313.0,1362.0
2021-02-01 00:07:08.808,3154,3725,22,618.0,36.0
2021-02-01 00:07:55.939,502,411,33,775.0,600.0
2021-02-01 00:09:32.682,505,3687,25,149.0,557.0
2021-02-01 00:11:53.386,346,285,38,221.0,262.0
...,...,...,...,...,...
2021-02-28 23:51:54.160,303,257,45,931.0,886.0
2021-02-28 23:53:45.016,3052,437,21,868.0,897.0
2021-02-28 23:56:09.694,485,526,13,1274.0,559.0
2021-02-28 23:57:55.556,3555,3129,25,60.0,1063.0


## View aggregated data with shape: (timestamp, station; count)

In [32]:
def aggrigate_tripdata(freq="D", key="startstationid"):
    """
    Count trips per time bin and per `key` value across all tripdata files.

    Every CSV listed in the module-level `tripdata_list` is read in turn,
    its `starttime` column is parsed into the index, and rows are counted
    per (time bin, key value) pair.

    Args:
        freq (str): frequency of the time series
            D: daily, W: weekly
        key (str): column used to aggregate the original tripdata
            startstationid or userage

    Returns:
        pandas.DataFrame: one row per time bin and one column per distinct
        `key` value, holding the trip counts (0 where no trip was seen).

    Raises:
        ValueError: if `tripdata_list` is empty (raised by `pd.concat`).
    """

    df_list = []

    for fn in tqdm(tripdata_list, total=len(tripdata_list)):

        # Original data (CSV)
        data = pd.read_csv(fn)
        data.starttime = pd.to_datetime(data.starttime)
        data = data.set_index("starttime")

        # CSV to TimeSeries: size() stores the counts under the column
        # label 0, which is then pivoted into a (time bin x key) table.
        g = data.groupby([pd.Grouper(freq=freq), key]).size()
        g = g.reset_index(key)
        g = g.pivot_table(index="starttime", columns=key, values=0)

        df_list.append(g)

    merged = pd.concat(df_list).fillna(0)

    # A time bin that straddles two files (e.g. a week crossing the boundary
    # between two consecutive files when freq="W") appears once per file,
    # each time with partial counts. Summing the rows that share a timestamp
    # merges them into one complete row; bins that occur only once are
    # left unchanged.
    return merged.groupby(level=0).sum()

In [35]:
def save_aggrigated_tripdata(freq="D", key="startstationid", n_axis=10):
    """
    Plot every aggregated time series and bundle the plots into one PDF.

    The table returned by `aggrigate_tripdata` is drawn `n_axis` columns per
    page (one panel per column). The pages are first saved as temporary PDF
    files, then merged into
    `<OUTPUT_PATH>/<dataset directory name>/<freq>/<key>/summary.pdf`,
    after which the temporary files are deleted. The output directory is
    wiped and recreated on every call.

    Args:
        freq (str): time-bin frequency forwarded to `aggrigate_tripdata`.
        key (str): aggregation column forwarded to `aggrigate_tripdata`.
        n_axis (int): number of stacked panels per page; must be >= 1.

    Raises:
        ValueError: if `n_axis` is smaller than 1.
    """
    if n_axis < 1:
        raise ValueError("n_axis must be a positive integer")

    # Prepare for output directory. normpath() strips a trailing separator,
    # so the dataset name is found whether or not DATASET_PATH ends in "/".
    dataset_name = os.path.basename(os.path.normpath(DATASET_PATH))
    outpath_pdf = os.path.join(OUTPUT_PATH, dataset_name, freq, key)
    replace_dir(outpath_pdf)

    # Get aggregated tripdata
    df = aggrigate_tripdata(freq=freq, key=key)
    n_cols = df.shape[1]

    # Page files are remembered in creation order. Sorting the file names
    # instead would be lexicographic ("_100" before "_20") and would shuffle
    # the pages of the merged PDF.
    page_files = []

    for start in tqdm(range(0, n_cols, n_axis)):

        # Each page covers the half-open column range [start, stop); the
        # last page may hold fewer than n_axis columns.
        stop = min(start + n_axis, n_cols)

        # squeeze=False keeps an array of axes even when n_axis == 1.
        fig, axes = plt.subplots(n_axis, figsize=(16, 2 * n_axis), squeeze=False)
        axes = axes.ravel()

        for slot, col in enumerate(range(start, stop)):
            df.iloc[:, col].plot(ax=axes[slot])
            axes[slot].set_ylabel(df.columns[col])

        # Hide the unused panels of a partially filled last page.
        for unused in axes[stop - start:]:
            unused.set_visible(False)

        fig.tight_layout()
        page_path = os.path.join(
            outpath_pdf, "tmp_summary_{}_{}.pdf".format(key, stop)
        )
        fig.savefig(page_path)
        page_files.append(page_path)

        # keep this order: clf() -> close()
        fig.clf()
        plt.close(fig)

    # Save the summary as a single pdf file. Newer PyPDF2 releases expose the
    # merger as PdfMerger and deprecate PdfFileMerger; use the new name when
    # it exists and fall back to the old one otherwise.
    merger_cls = getattr(PyPDF2, "PdfMerger", None) or PyPDF2.PdfFileMerger
    merger = merger_cls()
    try:
        for fn in page_files:
            merger.append(fn)
        merger.write(os.path.join(outpath_pdf, "summary.pdf"))
    finally:
        # Always release the file handles held by the merger.
        merger.close()

    # Delete temporary files
    for fn in page_files:
        os.remove(fn)

In [36]:
# Write one summary PDF per aggregation key, both at daily resolution.
save_aggrigated_tripdata(freq="D", key="startstationid")  # (timestamp, stationid; count)
save_aggrigated_tripdata(freq="D", key="userage")  # (timestamp, age; count)

100%|██████████| 25/25 [00:23<00:00,  1.07it/s]
100%|██████████| 12/12 [00:16<00:00,  1.39s/it]
