In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing data

In [2]:
# Read the trip-fare CSV into a DataFrame; column dtypes are inferred by pandas.
raw_fare = pd.read_csv(filepath_or_buffer="data/trip_fare_4.csv")

In [3]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning that
# chunked inference can raise on a file this large.
raw_data = pd.read_csv("data/trip_data_4.csv", low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


The files were too big for pandas to automatically assign the best data types, so the dtypes will need to be set manually.

In [4]:
# Bind the "clean" names to the loaded frames. These are plain references,
# not copies, so the cleaning steps below also change raw_fare / raw_data.
clean_fare, clean_data = raw_fare, raw_data

# Initial data inspection and cleaning

## Checking the variable names and dtypes

In [5]:
clean_fare.columns

Index(['medallion', ' hack_license', ' vendor_id', ' pickup_datetime',
       ' payment_type', ' fare_amount', ' surcharge', ' mta_tax',
       ' tip_amount', ' tolls_amount', ' total_amount'],
      dtype='object')

Visual inspection of the column names shows there is unnecessary whitespace.

In [6]:
# Drop every space character from the column labels of both frames
clean_fare.columns = [label.replace(" ", "") for label in clean_fare.columns]
clean_data.columns = [label.replace(" ", "") for label in clean_data.columns]

In [7]:
# Change columns to a category.
# vendor_id has only 2 values ("CMT" and "VTF");
# payment_type has 5 ('CRD', 'CSH', 'UNK', 'NOC', 'DIS').
for column in ("medallion", "hack_license", "vendor_id", "payment_type"):
    clean_fare[column] = clean_fare[column].astype("category")

# Change type to datetime. The unit is spelled out because pandas 2.x
# rejects the unit-less "datetime64" alias in astype.
clean_fare["pickup_datetime"] = clean_fare["pickup_datetime"].astype("datetime64[ns]")

In [8]:
# Change columns to a category.
# vendor_id has only 2 values ("CMT" and "VTF");
# rate_code takes the values 0-9 and 65, 77, 206, 208, 210;
# store_and_fwd_flag is Y, N or NA.
for column in ("medallion", "hack_license", "vendor_id",
               "rate_code", "store_and_fwd_flag"):
    clean_data[column] = clean_data[column].astype("category")

# Change type to datetime. The unit is spelled out because pandas 2.x
# rejects the unit-less "datetime64" alias in astype.
for column in ("pickup_datetime", "dropoff_datetime"):
    clean_data[column] = clean_data[column].astype("datetime64[ns]")

In [9]:
def downcaster(df):
    """
    Downcast every numeric column to the smallest dtype that holds its values.

    Integer columns (of any width) are passed to ``pd.to_numeric`` with
    ``downcast="integer"`` and floating-point columns with
    ``downcast="float"``. All other columns (category, datetime, object, ...)
    are left untouched.

    Note: the columns are reassigned on ``df`` itself, so the frame passed in
    is modified; the same object is returned for convenience.

    param: pandas.core.frame.DataFrame

    returns: the downcasted dataframe
    """
    for column in df.columns:
        column_dtype = df[column].dtype
        # Use the dtype predicates rather than comparing against the strings
        # "int"/"float": those only match the platform-default width, so an
        # int32 column, for example, would be skipped.
        if pd.api.types.is_integer_dtype(column_dtype):
            downcast_type = "integer"
        elif pd.api.types.is_float_dtype(column_dtype):
            downcast_type = "float"
        else:
            continue
        # The column is already numeric, so no errors= handling is needed.
        df[column] = pd.to_numeric(df[column], downcast=downcast_type)
    return df

In [10]:
# Shrink the numeric columns of both frames to cut memory use
clean_data, clean_fare = map(downcaster, (clean_data, clean_fare))

In [11]:
# Print the column dtypes and memory footprint of each cleaned frame
for frame in (clean_data, clean_fare):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15100468 entries, 0 to 15100467
Data columns (total 14 columns):
 #   Column              Dtype         
---  ------              -----         
 0   medallion           category      
 1   hack_license        category      
 2   vendor_id           category      
 3   rate_code           category      
 4   store_and_fwd_flag  category      
 5   pickup_datetime     datetime64[ns]
 6   dropoff_datetime    datetime64[ns]
 7   passenger_count     int8          
 8   trip_time_in_secs   int16         
 9   trip_distance       float32       
 10  pickup_longitude    float32       
 11  pickup_latitude     float32       
 12  dropoff_longitude   float32       
 13  dropoff_latitude    float32       
dtypes: category(5), datetime64[ns](2), float32(5), int16(1), int8(1)
memory usage: 693.1 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15100468 entries, 0 to 15100467
Data columns (total 11 columns):
 #   Column           Dtype         


Saved a few GB of memory!

## Missing Values

In [12]:
# Number of missing values in each column of the fare frame
clean_fare.isna().sum(axis="index")

medallion          0
hack_license       0
vendor_id          0
pickup_datetime    0
payment_type       0
fare_amount        0
surcharge          0
mta_tax            0
tip_amount         0
tolls_amount       0
total_amount       0
dtype: int64

In [13]:
# Number of missing values in each column of the trip frame
clean_data.isna().sum(axis="index")

medallion                   0
hack_license                0
vendor_id                   0
rate_code                   0
store_and_fwd_flag    7518657
pickup_datetime             0
dropoff_datetime            0
passenger_count             0
trip_time_in_secs           0
trip_distance               0
pickup_longitude            0
pickup_latitude             0
dropoff_longitude         146
dropoff_latitude          146
dtype: int64

The only missing values are in the dropoff coordinates and in `store_and_fwd_flag`. The flag relates to when the fare system is down and the taxi driver needs to store the fare and upload it later; we assume NA means the system was functioning.

## Summary Statistics

In [14]:
# Summary statistics for the numeric columns, rendered with 2 decimals.
# include=[np.number] keeps datetime columns out of the summary (pandas 2.x
# describes them by default); the float format below is not meaningful for
# timestamps.
clean_fare.describe(include=[np.number]).apply(lambda s: s.apply('{0:.2f}'.format))

Unnamed: 0,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
count,15100468.0,15100468.0,15100468.0,15100468.0,15100468.0,15100468.0
mean,12.27,0.33,0.5,1.35,0.24,14.69
std,9.96,0.37,0.03,2.13,1.19,11.94
min,2.5,0.0,0.0,0.0,0.0,2.5
25%,6.5,0.0,0.5,0.0,0.0,8.0
50%,9.5,0.0,0.5,1.0,0.0,11.0
75%,14.0,0.5,0.5,2.0,0.0,16.5
max,500.0,15.0,0.5,200.0,20.0,628.1


In [15]:
# Summary statistics for the numeric columns, rendered with 2 decimals.
# include=[np.number] keeps datetime columns out of the summary (pandas 2.x
# describes them by default); the float format below is not meaningful for
# timestamps.
clean_data.describe(include=[np.number]).apply(lambda s: s.apply('{0:.2f}'.format))

Unnamed: 0,passenger_count,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,15100468.0,15100468.0,15100468.0,15100468.0,15100468.0,15100322.0,15100322.0
mean,1.71,746.61,2.86,-72.73,40.07,-72.69,40.05
std,1.39,550.44,3.34,9.73,6.96,9.86,6.98
min,0.0,0.0,0.0,-2323.42,-3481.14,-2771.29,-3547.9
25%,1.0,360.0,1.04,-73.99,40.74,-73.99,40.73
50%,1.0,600.0,1.78,-73.98,40.75,-73.98,40.75
75%,2.0,960.0,3.2,-73.97,40.77,-73.96,40.77
max,9.0,10800.0,100.0,2228.72,3210.39,2228.75,3577.13


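Most ranges look plausible, but not all of them: the coordinate columns reach values in the thousands (valid latitudes lie in [-90, 90] and longitudes in [-180, 180]), and there are trips with zero passengers, zero duration, or zero distance. No further cleaning is applied in this notebook, so these rows should be filtered before any analysis that relies on those fields.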

## Saving for later use

In [16]:
# Persist both cleaned frames as pickles so another notebook can load them
for frame, target in ((clean_fare, 'data/clean_fare.pickle'),
                      (clean_data, 'data/clean_data.pickle')):
    frame.to_pickle(target)