In [1]:
import pandas as pd
import numpy as np 
import os
import datetime



In [2]:
# Load the monthly green-taxi trip files for July-December 2015 and
# January-June 2016, then stack them into one frame.
months_2015 = [str(month).zfill(2) for month in range(7, 13)]
months_2016 = [str(month).zfill(2) for month in range(1, 7)]

greens = [pd.read_csv(f"../raw_data/green_tripdata_2015-{num}.csv") for num in months_2015]
greens.extend(pd.read_csv(f"../raw_data/green_tripdata_2016-{num}.csv") for num in months_2016)

# NOTE(review): reset_index() without drop=True keeps each file's row number
# as an extra "index" column -- confirm that downstream code expects it.
green = pd.concat(greens).reset_index()


In [3]:
# Calendar day of each pickup, stored as a string (daily period rendered via str).
green["DATE"] = (
    pd.to_datetime(green["lpep_pickup_datetime"])
    .dt.to_period("D")
    .astype(str)
)


In [4]:
# Keep only trips reported with VendorID 1 or 2.
green = green.loc[green["VendorID"].isin([1, 2])]

In [5]:
# Drop trips whose fare is below 2.50.
# NOTE(review): 2.50 is presumably the minimum (initial) charge -- confirm.
green = green.loc[green["Fare_amount"].ge(2.5)]

In [6]:

# Trip duration is currently not computed; the disabled expression is kept for reference.
# green["duration"] = pd.to_datetime(green["Lpep_dropoff_datetime"]) - pd.to_datetime(green["lpep_pickup_datetime"])

# Names of the pickup coordinate columns: longitude first, then latitude.
coords = [
    "Pickup_longitude",
    "Pickup_latitude",
]



In [7]:
# Keep pickups strictly inside the bounding box
# -74 < longitude < -73.7 and 40.48 < latitude < 41.
# NOTE(review): the western bound of -74 may be tighter than the actual city
# limits -- confirm that this cut-off is intentional.
green = green.loc[
    green["Pickup_longitude"].gt(-74)
    & green["Pickup_longitude"].lt(-73.7)
    & green["Pickup_latitude"].gt(40.48)
    & green["Pickup_latitude"].lt(41)
]

In [8]:
# Keep only trips with Payment_type 1 or 2.
# NOTE(review): the TLC data dictionary lists 1 = credit card and 2 = cash;
# the earlier wording "cheque" appears to mean credit card -- confirm.
green = green.loc[green["Payment_type"].isin([1, 2])]

In [9]:
# only trip distances greater than 0 and less than 200 miles 
green = green.loc[(green["Trip_distance"] > 0) & (green["Trip_distance"] < 200)]

In [10]:
# Total amount paid with the tip removed.
# Tips are only subtracted for payment type 1; for every other payment type
# the total amount is returned unchanged.
# NOTE(review): per the TLC data dictionary payment type 1 is credit card (the
# only type with recorded tips); the earlier wording "cheque" looks like a
# misnomer -- confirm.
def tip_excluded(x):
    """Return the tip-excluded total for one trip.

    Parameters
    ----------
    x : pandas.Series or sequence
        Holds, by position: payment type, total amount, tip amount.

    Returns
    -------
    The total amount minus the tip when the payment type equals 1,
    otherwise the total amount as-is.
    """
    # Index by position explicitly: plain x[0] on a Series with string labels
    # relies on a positional fallback that pandas deprecates.
    row = x.iloc if hasattr(x, "iloc") else x
    if row[0] == 1:
        return row[1] - row[2]
    return row[1]


# Vectorized equivalent of applying tip_excluded to every row: keep
# Total_amount where Payment_type is not 1, otherwise subtract Tip_amount.
# This avoids a Python-level call per trip.
green["tip_excluded"] = green["Total_amount"].where(
    green["Payment_type"] != 1,
    green["Total_amount"] - green["Tip_amount"],
)

In [11]:
# Keep trips whose tip-excluded amount lies strictly between 0 and 100.
green = green.loc[green["tip_excluded"].gt(0) & green["tip_excluded"].lt(100)]

In [12]:
# Tip-excluded amount divided by trip distance; computed so that outliers
# can be filtered on it.
green["pay_per_mile"] = green["tip_excluded"].div(green["Trip_distance"])

In [13]:
# Outlier removal: keep only trips paying less than 30 per mile.
green = green.loc[green["pay_per_mile"].lt(30)]

In [14]:
# Persist the cleaned trips. Create the output folder first so the notebook
# also runs on a fresh checkout where it does not exist yet.
os.makedirs("../preprocessed_data", exist_ok=True)
green.to_pickle("../preprocessed_data/green.pkl")

In [15]:

# Read the four climate CSV files and stack them into one frame.
climate_files = ["2680632.csv", "2680634.csv", "2680635.csv", "2680639.csv"]
clis = [pd.read_csv(f"../raw_data/{name}") for name in climate_files]
climate_data = pd.concat(clis).reset_index()

In [16]:
# Keep only station USW00094789 (noted by the author as JFK airport), restrict
# to the columns of interest, and drop rows with a missing value in any of them.
station_mask = climate_data["STATION"] == "USW00094789"
climate_columns = ["STATION", "NAME", "DATE", "PRCP", "SNOW", "TAVG"]
climate_data = climate_data.loc[station_mask, climate_columns].dropna().reset_index()


In [17]:
# Persist the selected climate columns. Create the output folder first so the
# notebook also runs on a fresh checkout where it does not exist yet.
os.makedirs("../preprocessed_data", exist_ok=True)
climate_data[["STATION", "NAME", "DATE", "SNOW", "PRCP", "TAVG"]].to_pickle("../preprocessed_data/climate.pkl")

