# Read Me

- This code cleans the original Kaggle training data, which has about 55M rows. Because the dataset is big, the code was written so that it can be run on Google Cloud.
- The code sampled 25% of the cleaned data which was used in further analysis
- Cleaning specs include: 
1. NA 
2. Fare: 2.5<=Fare<=500
3. Drop-off/Pick-up location: has to be in NYC
4. Passenger count: 0<passenger count<=9

In [1]:
# Dependencies
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import math
import calendar
import statsmodels.formula.api as sm
from sklearn.linear_model import LinearRegression
from datetime import datetime

#gcp dependencies
from google.cloud import storage
from io import BytesIO
# Name of the Cloud Storage bucket this notebook reads from and writes to.
GCS_BUCKET_NAME = "nytaxi_mz"

# Create the storage client and look up the bucket; `bucket` is reused by the
# download and upload cells further down.
client = storage.Client()
bucket = client.get_bucket(GCS_BUCKET_NAME)

In [2]:
# Fetch the zipped training CSV from the bucket and parse it straight from
# memory, without writing the archive to local disk.
# NOTE(review): newer google-cloud-storage releases deprecate
# `download_as_string` in favour of `download_as_bytes` -- confirm the
# installed version before switching.
blob = bucket.blob("train.csv.zip")
content = blob.download_as_string()
train_data = pd.read_csv(
    BytesIO(content),
    compression='zip',
    header=0,
    sep=',',
    quotechar='"',
)

# Preview the first rows.
train_data.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [3]:
# Summary statistics of the raw training data, used to spot anything that
# requires cleaning (compare the min/max rows of fare_amount, the coordinate
# columns and passenger_count in the output below).
train_data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,55423860.0,55423860.0,55423860.0,55423480.0,55423480.0,55423860.0
mean,11.34505,-72.50968,39.91979,-72.51121,39.92068,1.68538
std,20.71083,12.84888,9.642353,12.7822,9.633346,1.327664
min,-300.0,-3442.06,-3492.264,-3442.025,-3547.887,0.0
25%,6.0,-73.99207,40.73493,-73.9914,40.73403,1.0
50%,8.5,-73.9818,40.75265,-73.98015,40.75316,1.0
75%,12.5,-73.96708,40.76713,-73.96367,40.7681,2.0
max,93963.36,3457.626,3408.79,3457.622,3537.133,208.0


In [4]:
# Build the cleaned frame from the raw data in one chain:
#   1. drop every row that contains a missing value;
#   2. keep only fares with 2.5 <= fare_amount <= 500 (both ends inclusive).
#      2.5 is the lowest fare of a NYC taxi.
train_cleaned = (
    train_data
    .dropna()
    .loc[lambda df: df['fare_amount'].between(2.5, 500)]
)

In [5]:
# Keep only trips whose pick-up AND drop-off both fall inside the New York City
# bounding box; rows outside the box are removed.
# Bounding box source: https://www.mapdevelopers.com/geocode_bounding_box.php
NYC_WEST_LON, NYC_EAST_LON = -74.259090, -73.700272    # longitude limits
NYC_SOUTH_LAT, NYC_NORTH_LAT = 40.477399, 40.917577    # latitude limits

# `between` is inclusive on both ends, so points exactly on the box edge stay.
train_cleaned = train_cleaned.loc[
    lambda df: (
        df['pickup_longitude'].between(NYC_WEST_LON, NYC_EAST_LON)
        & df['dropoff_longitude'].between(NYC_WEST_LON, NYC_EAST_LON)
        & df['pickup_latitude'].between(NYC_SOUTH_LAT, NYC_NORTH_LAT)
        & df['dropoff_latitude'].between(NYC_SOUTH_LAT, NYC_NORTH_LAT)
    )
]

In [6]:
# Keep only trips with a plausible passenger count: 0 < passenger_count <= 9
# (strictly more than zero, at most nine).
train_cleaned = train_cleaned.loc[
    lambda df: df['passenger_count'].gt(0) & df['passenger_count'].le(9)
]

# Summary statistics of the fully cleaned data.
train_cleaned.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,53992450.0,53992450.0,53992450.0,53992450.0,53992450.0,53992450.0
mean,11.29041,-73.9755,40.75085,-73.97464,40.7512,1.691318
std,9.462659,0.03445833,0.026897,0.03385355,0.0307315,1.307174
min,2.5,-74.25903,40.47759,-74.25905,40.47765,1.0
25%,6.0,-73.99228,40.73657,-73.99159,40.73558,1.0
50%,8.5,-73.9821,40.75336,-73.98062,40.75384,1.0
75%,12.5,-73.96835,40.76752,-73.96542,40.76837,2.0
max,500.0,-73.70028,40.91754,-73.70028,40.91756,9.0


In [None]:
# Sample 25% of the cleaned training data for further analysis. The seed is
# fixed so that re-running the notebook produces the same sample.
sample_data = train_cleaned.sample(frac=.25, random_state=42)

# Write the sample as a zip-compressed CSV for easy sharing. A single file
# name is used for both the local file and the bucket object: previously the
# CSV was written under one name and the upload looked for a different,
# non-existent file. The ".zip" suffix reflects the actual file content.
sample_file = "sample_data_cleaned.csv.zip"
sample_data.to_csv(sample_file, compression='zip')

# Upload the file that was just written to the bucket under the same name.
blob = bucket.blob(sample_file)
blob.upload_from_filename(sample_file)

In [None]:
# Export the complete cleaned training set as a gzip-compressed CSV, then
# upload that same file to the bucket under the same object name.
# NOTE(review): the content is gzip-compressed although the name ends in
# ".csv" -- confirm that downstream readers pass compression='gzip'.
cleaned_file = 'train_data_cleaned.csv'
train_cleaned.to_csv(cleaned_file, compression='gzip')

blob = bucket.blob(cleaned_file)
blob.upload_from_filename(cleaned_file)