# Uber Project - Datathon

## Setup environment

In [1]:
# Import libraries
import os
from glob import glob
import pandas as pd
import numpy as np
import pandas_profiling as pp

In [2]:
# Move one level up so the relative "./data/raw/" paths used by the cells below
# resolve. The guard keeps the cell idempotent: re-running it once the data
# directory is already reachable must not climb a further directory up.
if not os.path.isdir("./data/raw"):
    os.chdir("..")
print(os.getcwd())

/mnt/d/Projects/ds4a_2019/datathon


## Import documents

In [3]:
# os.listdir returns entries in arbitrary order; sort them so the listing is
# reproducible across runs and file systems.
sorted(os.listdir("./data/raw/"))

['demographics.csv',
 'geographic.csv',
 'green_trips.csv.gz',
 'mta_trips.csv.gz',
 'uber_trips_2014.csv.gz',
 'uber_trips_2015.csv.gz',
 'weather.csv',
 'yellow_trips.csv.gz',
 'zones.csv']

In [4]:
# Load the four uncompressed CSV files, in order, each into its own DataFrame
demographics, geographic, weather, zones = [
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/raw/demographics.csv",
        "./data/raw/geographic.csv",
        "./data/raw/weather.csv",
        "./data/raw/zones.csv",
    )
]

In [5]:
# Preview the compressed CSVs: only the first 100,000 rows of each file are loaded.
# nrows reads the same leading rows that chunksize=100000 + get_chunk() returned,
# without leaving an open chunk reader (and its file handle) behind for each file.
mta_trips = pd.read_csv("./data/raw/mta_trips.csv.gz", compression="gzip", nrows=100000)
uber_trips_2014 = pd.read_csv("./data/raw/uber_trips_2014.csv.gz", compression="gzip", nrows=100000)
uber_trips_2015 = pd.read_csv("./data/raw/uber_trips_2015.csv.gz", compression="gzip", nrows=100000)
yellow_trips = pd.read_csv("./data/raw/yellow_trips.csv.gz", compression="gzip", nrows=100000)

In [6]:
# Define a function that reads a gzip-compressed CSV file in chunks
def green_trips_chuks(file_name, chunk_size=100000):
    """Read a gzip-compressed trips CSV in chunks and add pickup/dropoff time features.

    Parameters
    ----------
    file_name : str
        Path to the gzip-compressed CSV. The file must contain
        ``pickup_datetime`` and ``dropoff_datetime`` columns.
    chunk_size : int, default 100000
        Number of rows parsed per chunk.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order, with both datetime columns
        parsed and the columns ``pickup_month``, ``pickup_hour``,
        ``pickup_time`` and ``dropoff_time`` added.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the reader yields no chunks.
    """
    chunks = []

    for chunk in pd.read_csv(file_name,
                             compression="gzip",
                             chunksize=chunk_size):

        # pd.to_datetime (as in mta_trips_chuks) also copes with missing values,
        # which it turns into NaT.
        chunk["pickup_datetime"] = pd.to_datetime(chunk["pickup_datetime"])
        chunk["dropoff_datetime"] = pd.to_datetime(chunk["dropoff_datetime"])

        chunk["pickup_month"] = chunk["pickup_datetime"].dt.month
        chunk["pickup_hour"] = chunk["pickup_datetime"].dt.hour
        chunk["pickup_time"] = chunk["pickup_datetime"].dt.time
        chunk["dropoff_time"] = chunk["dropoff_datetime"].dt.time

        chunks.append(chunk)

    # Concatenate once, after the loop: doing it per iteration re-copies every
    # previously read chunk each time and leaves the result unbound when the
    # reader yields nothing.
    return pd.concat(chunks)

In [55]:
# Define a function that reads the gzip-compressed MTA trips CSV in chunks
def mta_trips_chuks(file_name, chunk_size=100000):
    """Read a gzip-compressed MTA trips CSV in chunks and add time features.

    Parameters
    ----------
    file_name : str
        Path to the gzip-compressed CSV. The file must contain the columns
        ``datetime``, ``new_entries``, ``new_exits``, ``station``,
        ``line_name``, ``division``, ``audit_type`` and ``unit_id``.
    chunk_size : int, default 100000
        Number of rows parsed per chunk.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order, with ``datetime`` parsed,
        ``new_entries`` / ``new_exits`` multiplied by 1,000,000, the columns
        ``month``, ``hour`` and ``time`` added, and the five identifier
        columns cast to ``category``.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the reader yields no chunks.
    """
    chunks = []

    for chunk in pd.read_csv(file_name,
                             compression="gzip",
                             chunksize=chunk_size):

        chunk["datetime"] = pd.to_datetime(chunk["datetime"])
        # NOTE(review): the reason for scaling the counts by 1,000,000 is not
        # visible here -- confirm the unit of the raw columns before relying on it.
        chunk["new_entries"] = chunk["new_entries"] * 1000000
        chunk["new_exits"] = chunk["new_exits"] * 1000000
        chunk["month"] = chunk["datetime"].dt.month
        chunk["hour"] = chunk["datetime"].dt.hour
        chunk["time"] = chunk["datetime"].dt.time

        chunks.append(chunk)

    # Concatenate and cast once, after the loop: doing both per iteration
    # re-copies and re-encodes every previously read chunk each time and leaves
    # the result unbound when the reader yields nothing.
    df = pd.concat(chunks)
    for column in ("station", "line_name", "division", "audit_type", "unit_id"):
        df[column] = df[column].astype("category")

    return df

In [56]:
# Load the MTA trips file through mta_trips_chuks and bind the result to mta_trips.
mta_trips = mta_trips_chuks("./data/raw/mta_trips.csv.gz")

In [58]:
import pandas_profiling as pp

In [57]:
# Show the dtype of every column in mta_trips.
mta_trips.dtypes

station              category
line_name            category
division             category
audit_type           category
unit_id              category
datetime       datetime64[ns]
new_entries             int64
new_exits               int64
latitude              float64
longitude             float64
month                   int64
hour                    int64
time                   object
dtype: object

In [52]:
# Store the station column as a pandas categorical
mta_trips["station"] = mta_trips["station"].astype("category")

In [27]:
# Display the first rows of mta_trips.
mta_trips.head()

Unnamed: 0,station,line_name,division,audit_type,unit_id,datetime,new_entries,new_exits,latitude,longitude
0,WHITEHALL ST,R1,BMT,REGULAR,R001_A058_01-00-00,03/29/2014 05:00:00,4,6,40.703087,-74.012994
1,WHITEHALL ST,R1,BMT,REGULAR,R001_A058_01-00-00,03/29/2014 09:00:00,1,13,40.703087,-74.012994
2,WHITEHALL ST,R1,BMT,REGULAR,R001_A058_01-00-00,03/29/2014 13:00:00,1,8,40.703087,-74.012994
3,WHITEHALL ST,R1,BMT,REGULAR,R001_A058_01-00-00,03/29/2014 17:00:00,1,8,40.703087,-74.012994
4,WHITEHALL ST,R1,BMT,REGULAR,R001_A058_01-00-00,03/29/2014 21:00:00,2,3,40.703087,-74.012994


In [62]:
# Load the green trips file through green_trips_chuks and bind the result to green_trips.
green_trips = green_trips_chuks("./data/raw/green_trips.csv.gz")

In [64]:
# Print the DataFrame.info() summary of green_trips.
green_trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3589048 entries, 0 to 3589047
Data columns (total 13 columns):
pickup_datetime      datetime64[ns]
dropoff_datetime     datetime64[ns]
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      int64
trip_distance        float64
total_amount         float64
pickup_month         int64
pickup_hour          int64
pickup_time          object
dropoff_time         object
dtypes: datetime64[ns](2), float64(6), int64(3), object(2)
memory usage: 356.0+ MB


In [63]:
# Build a profiling report for green_trips, passing a full-width style option.
# NOTE(review): profile_report is presumably made available on DataFrames by the
# pandas_profiling import -- confirm.
green_trips.profile_report(style={'full_width':True})



In [22]:
# Preview the datetime column parsed into pandas timestamps (result is not stored)
pd.to_datetime(mta_trips["datetime"])

0       2014-03-29 05:00:00
1       2014-03-29 09:00:00
2       2014-03-29 13:00:00
3       2014-03-29 17:00:00
4       2014-03-29 21:00:00
                ...        
99995   2014-05-14 00:00:00
99996   2014-05-14 04:00:00
99997   2014-05-14 07:42:17
99998   2014-05-14 08:00:00
99999   2014-05-14 12:00:00
Name: datetime, Length: 100000, dtype: datetime64[ns]

In [25]:
# Print the dtype of each mta_trips column.
print(mta_trips.dtypes)

station         object
line_name       object
division        object
audit_type      object
unit_id         object
datetime        object
new_entries      int64
new_exits        int64
latitude       float64
longitude      float64
dtype: object


In [None]:

# Print the column dtypes of each sampled trips frame, with a blank-line
# separator before the first frame and after every frame.
print("\n")
for trips_frame in (uber_trips_2014, uber_trips_2015, yellow_trips):
    print(trips_frame.dtypes)
    print("\n")

In [None]:
# Print summary statistics for every trips frame, each followed by a blank-line separator.
for trips_frame in (green_trips, mta_trips, uber_trips_2014, uber_trips_2015, yellow_trips):
    print(trips_frame.describe())
    print("\n")