# Preprocessing for NYC Citibike Trip Data

In [1]:
import glob
import multiprocessing
import os
import warnings
import pandas as pd
import pprint
import tqdm
import zipfile

# Pretty-printer configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)
# Directory prefix of the raw monthly CSV files. load_data() appends the file
# name by plain string concatenation, so the trailing slash is required.
ROOTPATH = "../dat/raw/"

In [2]:
def load_data(date_index):
    """Read one raw trip-data CSV from ROOTPATH into a DataFrame.

    Args:
        date_index: String prefix of the file name; the file read is
            ``ROOTPATH + date_index + "-citibike-tripdata.csv"``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_path = "".join((ROOTPATH, date_index, "-citibike-tripdata.csv"))
    return pd.read_csv(csv_path)

In [None]:
# Load a single file (date index "202009") as a sample for exploration.
eg = load_data(date_index="202009")

In [None]:
# Number of distinct values in each column of the sample.
eg.nunique()

In [None]:
# Row count for each value of the "usertype" column.
eg.groupby("usertype").size()

In [None]:
# Row count for each value of the "gender" column.
eg.groupby("gender").size()

In [None]:
# Histogram of the "birth year" column of the sample.
eg["birth year"].hist()

In [3]:
def cleaning(df, year, usertype='customer', *, min_age=10, max_age=80):
    """Filter raw trip records to one user type and a bounded age range.

    Processing steps, in order:

    1. Drop every row that has a missing value in any column.
    2. Normalise column names: remove spaces, then lower-case.
    3. Keep rows whose ``usertype`` column equals ``usertype.capitalize()``.
    4. Convert ``birthyear`` to numeric; rows whose value cannot be parsed
       are dropped.
    5. Add ``userage = int(year) - birthyear`` and keep rows with
       ``min_age < userage <= max_age``.
    6. Drop the columns ``bikeid``, ``tripduration``, ``birthyear``,
       ``startstationname``, ``endstationname``, ``usertype`` and ``gender``.
    7. Cast ``startstationid``, ``endstationid`` and ``userage`` to ``int``.

    Args:
        df: Raw trip DataFrame. It is not modified; a new frame is returned.
        year: Year used to compute the age; any value ``int()`` accepts.
        usertype: User type to keep. It is passed through
            ``str.capitalize()`` before being compared with the column.
        min_age: Exclusive lower bound on ``userage``.
        max_age: Inclusive upper bound on ``userage``.

    Returns:
        A new DataFrame holding the remaining columns of the kept rows.

    Raises:
        KeyError: If a column listed above is missing after normalisation.
    """
    # dropna() returns a new frame, so renaming its columns leaves `df` intact.
    cleaned = df.dropna(how="any")
    cleaned.columns = cleaned.columns.str.replace(" ", "").str.lower()

    cleaned = cleaned[cleaned["usertype"] == usertype.capitalize()]

    # Unparseable birth years become NaN and are removed by dropna().
    # assign() builds a new frame instead of writing into a filtered slice,
    # which is what used to trigger pandas' chained-assignment warning.
    birthyear = pd.to_numeric(cleaned["birthyear"], errors="coerce")
    cleaned = cleaned.assign(birthyear=birthyear).dropna(how="any")

    userage = int(year) - cleaned["birthyear"].astype(int)
    cleaned = cleaned.assign(userage=userage)
    in_range = (cleaned["userage"] > min_age) & (cleaned["userage"] <= max_age)
    cleaned = cleaned[in_range]

    cleaned = cleaned.drop(columns=[
        "bikeid",
        "tripduration",
        "birthyear",
        "startstationname",
        "endstationname",
        "usertype",
        "gender",
    ])

    return cleaned.astype({
        "startstationid": int,
        "endstationid": int,
        "userage": int,
    })

In [5]:
def _preprocessing(date_index, usertype="customer"):
    """Clean one raw trip file and write the result as a gzipped CSV.

    Args:
        date_index: File-name prefix passed to ``load_data``; its first four
            characters are used as the year for the age computation.
        usertype: User type passed to ``cleaning``; also the name of the
            output sub-directory under ``../dat/processed/``.

    Returns:
        None. If fewer than 100 rows remain after cleaning, nothing is
        written.
    """
    raw = load_data(date_index)
    cleaned = cleaning(raw, date_index[:4], usertype)
    if len(cleaned) < 100:
        return

    # Create the target directory here so the function does not depend on a
    # separate setup step having been run for this particular usertype.
    out_dir = "../dat/processed/{}".format(usertype)
    os.makedirs(out_dir, exist_ok=True)

    cleaned.to_csv(
        "{}/{}-citibike-tripdata.csv.gz".format(out_dir, date_index),
        index=False, compression="gzip")

In [7]:
# Silence every warning for the duration of the batch run.
# NOTE(review): this is a global filter and also hides unrelated warnings.
warnings.filterwarnings("ignore")
os.makedirs("../dat/processed/customer/", exist_ok=True)
os.makedirs("../dat/processed/subscriber/", exist_ok=True)

# One "YYYYMM" key per month from 2013-06 through 2021-03 (94 months).
# period_range with freq="M" is used instead of date_range with a month-end
# alias: the month-end alias spelling differs between pandas versions,
# whereas the monthly period alias "M" is accepted by all of them.
date_indices = [
    month.strftime("%Y%m")
    for month in pd.period_range(start="2013-06", end="2021-03", freq="M")]

for idx in tqdm.tqdm(date_indices):
    # Only the subscriber dataset is built here; pass usertype='customer'
    # to build the customer dataset instead.
    _preprocessing(idx, usertype='subscriber')

100%|██████████| 94/94 [30:19<00:00, 15.42s/it]
