In [234]:
import csv
import os
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
# NOTE(review): this suppresses every warning category for the whole kernel,
# so any deprecation or pandas warnings emitted by the cells below are hidden.
warnings.filterwarnings("ignore")

In [230]:
def preprocess(df, dictCompany):

    df = df[['Trip Seconds', 'Trip Miles', 'Pickup Community Area', 'Dropoff Community Area', 'Company', 'Trip Start Timestamp']]
    df = df[(df['Trip Miles']>=0.5) & (df['Trip Miles']<=100)]
    df['tripKM'] = df['Trip Miles'].apply(lambda x: round(x*1.609,2))
    df = df[(df['Trip Seconds']>=60) & (df['Trip Seconds']<=18000)]
    df['Pickup Community Area'] = df['Pickup Community Area'].mask( ~((df['Pickup Community Area']>=1) & (df['Pickup Community Area']<=77)), 78)
    df['Dropoff Community Area'] = df['Dropoff Community Area'].mask( ~((df['Dropoff Community Area']>=1) & (df['Dropoff Community Area']<=77)), 78)
    df = df[~((df['Pickup Community Area']==78) & (df['Dropoff Community Area']==78))]
    df = df.replace({"Company": dictCompany})
    df = df.dropna()
    df['Trip Start Timestamp'] = pd.to_datetime(df['Trip Start Timestamp'])
    df['Trip Start Timestamp'] = df['Trip Start Timestamp'].apply(lambda x: pd.datetime(x.year, x.month, x.day, x.hour, 0, 0))
    df['Pickup Community Area'] = df['Pickup Community Area'].astype(int)
    df['Dropoff Community Area'] = df['Dropoff Community Area'].astype(int)
    df['Trip Seconds'] = df['Trip Seconds'].astype(int)
    df = df.rename(columns={'Trip Seconds': 'tripSeconds', 'Trip Miles': 'tripMiles', 'Pickup Community Area': 'pickupArea', 'Dropoff Community Area': 'dropArea', 'Company': 'company', 'Trip Start Timestamp': 'tripStartTime'})

    return df
    

In [231]:
def _countByKey(keys, label):
    """Count rows per distinct value of ``keys``; returns columns [label, 'Count'] sorted by key."""
    return keys.groupby(keys).size().rename_axis(label).reset_index(name='Count')


def _countByBin(values, edges, labels, column):
    """Bin ``values`` into right-closed intervals and count rows per bin; returns [column, 'Count']."""
    binned = pd.cut(values, bins=edges, labels=labels)
    # observed=False keeps empty bins as zero-count rows regardless of the
    # pandas version's default for categorical groupers.
    counts = binned.groupby(binned, observed=False).size()
    return counts.rename_axis(column).reset_index(name='Count')


def preCalcAllArea(df, str):
    """Write trip-count summary tables for ``df`` as CSV files under ``allAreas/``.

    Seven files named ``allAreas/all_areas_<kind>_<str>.csv`` are produced,
    each with a key column and a 'Count' column:

    * ``date``, ``hour``, ``day`` (0 = Monday ... 6 = Sunday), ``month`` -
      trips grouped by the corresponding part of ``tripStartTime``;
    * ``mileage_miles``, ``mileage_km`` - trips per distance bin;
    * ``time`` - trips per duration bin.

    Values that fall outside the bin edges are not counted. Empty bins are
    written with a count of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'tripStartTime' (datetime), 'tripMiles', 'tripKM'
        and 'tripSeconds'. The frame is only read, never modified.
    str : str
        Suffix inserted into every output file name. (The parameter name
        shadows the built-in ``str`` inside this function; it is kept so
        existing callers continue to work.)

    Returns
    -------
    None
    """
    start = df['tripStartTime']

    tables = {
        'date': _countByKey(start.dt.date, 'Date'),
        'hour': _countByKey(start.dt.hour, 'Hour'),
        'day': _countByKey(start.dt.dayofweek, 'Day'),  # Values: 0 to 6. (0: Monday, 6: Sunday)
        'month': _countByKey(start.dt.month, 'Month'),
        'mileage_miles': _countByBin(
            df['tripMiles'],
            [0.49, 5, 10, 15, 20, 30, 50, 100],
            # Third label fixed: the bin is (10, 15], it previously read '10 - 25'.
            ['0.5 to 5', '5 - 10', '10 - 15', '15 - 20', '20 - 30', '30 - 50',  '50 - 100'],
            'Mileage (miles)',
        ),
        'mileage_km': _countByBin(
            df['tripKM'],
            # Top edge is 161 rather than 160 so a 100-mile trip
            # (100 * 1.609 = 160.9 km) is counted here just as it is in the
            # miles table.
            [0.79, 8.0, 16.0, 24.0, 32.0, 48.0, 80.0, 161.0],
            ['0.8 to 8', '8 - 16', '16 - 24', '24 - 32', '32 - 48', '48 - 80',  '80 - 160'],
            'Mileage (km)',
        ),
        'time': _countByBin(
            df['tripSeconds'],
            [59.99, 300, 600, 900, 1200, 1800, 3600, 7200, np.inf],
            ['1 - 5 min', '5 - 10 min', '10 - 15 min', '15 - 20 min', '20 - 30 min', '1/2 hr - 1 hr',  '1 -  2 hr', '> 2 Hr'],
            'timeTaken',
        ),
    }

    # to_csv does not create missing directories.
    os.makedirs('allAreas', exist_ok=True)
    for kind, table in tables.items():
        table.to_csv('allAreas/all_areas_{}_{}.csv'.format(kind, str), index=False)

In [235]:
if __name__ == '__main__':
    # Driver: load the raw trips, clean them, write one CSV per company id and
    # then the aggregate tables for all trips and for city-only trips.

    filename = "Taxi_Trips_-_2019.csv"
    df = pd.read_csv(filename)

    # newline='' is what the csv module requires for files handed to csv.reader.
    # Each row must have exactly two fields; presumably "<id>,<company name>" -
    # TODO confirm against taxiDict.csv.
    with open('taxiDict.csv', newline='') as csv_file:
        mydict = dict(csv.reader(csv_file))

    # Invert to {second field: int(first field)} for use with DataFrame.replace.
    dictCompany = {name: int(code) for code, name in mydict.items()}

    dfTaxi = preprocess(df, dictCompany)

    # to_csv does not create missing directories, so make sure both output
    # folders exist before anything is written.
    os.makedirs('csv', exist_ok=True)
    os.makedirs('allAreas', exist_ok=True)

    # One file per company id; ids with no trips still get a header-only file.
    # NOTE(review): assumes company ids run 0..54 - confirm against taxiDict.csv.
    for i in range(0, 55):
        dfTaxi[dfTaxi['company'] == i].to_csv('csv/taxi{}.csv'.format(i), index=False)

    # "outsideCity": every cleaned trip, including those with one end coded 78.
    preCalcAllArea(dfTaxi, "outsideCity")
    # "onlyCity": trips where neither end is coded 78.
    dfTaxiSub = dfTaxi[(dfTaxi['pickupArea'] != 78) & (dfTaxi['dropArea'] != 78)]
    preCalcAllArea(dfTaxiSub, "onlyCity")