In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import glob
import os

# 2019 Data

In [2]:
# One yellow-taxi trip-data parquet URL per month of 2019 (01 through 12),
# generated from the shared URL pattern rather than listed by hand.
parquet_urls_19 = [
    f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2019-{month:02d}.parquet'
    for month in range(1, 13)
]

In [3]:
# Read every monthly 2019 parquet file into its own DataFrame. The entries in
# parquet_urls_19 are remote URLs handed straight to pd.read_parquet, so no
# local file globbing is involved.
dfs_19 = [pd.read_parquet(url) for url in parquet_urls_19]

# Stack the monthly frames into a single frame; ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each file's row labels.
df19 = pd.concat(dfs_19, ignore_index=True)

#### Add a column for the tip as a percentage of the fare amount

In [4]:
# Tip expressed as a percentage of fare_amount. A fare_amount of exactly 0
# would make the division yield inf (or NaN for 0/0), so zero fares are masked
# to NaN first; the resulting tip_percent is NaN for those rows, which pandas
# aggregations such as median skip by default.
df19['tip_percent'] = (
    df19['tip_amount'] / df19['fare_amount'].mask(df19['fare_amount'] == 0)
) * 100

#### To clean the data, keep only rides whose total amount and trip distance are both greater than 0

In [5]:
# Keep only the rows where both total_amount and trip_distance are strictly
# positive.
df19_cleaned = df19.loc[
    df19['total_amount'].gt(0) & df19['trip_distance'].gt(0)
]

### Total Ride Count by Month and Year
> This includes all payment types, and is filtered to only include 2019 data

In [6]:
# Count rides per (pickup year, pickup month). 'size' counts rows per group,
# so the VendorID column only serves as a handle for the named aggregation.
pickup_ts_19 = df19_cleaned['tpep_pickup_datetime']
df19_count = (
    df19_cleaned
    .groupby([
        pickup_ts_19.dt.year.rename('year'),
        pickup_ts_19.dt.month.rename('month'),
    ])
    .agg(count=('VendorID', 'size'))
    .reset_index()
)

# Drop any groups whose pickup year is not 2019.
df19_count_clean = df19_count.loc[df19_count['year'] == 2019]
df19_count_clean

Unnamed: 0,year,month,count
7,2019,1,7635304
8,2019,2,6991042
9,2019,3,7802205
10,2019,4,7414533
11,2019,5,7530381
12,2019,6,6895948
13,2019,7,6231001
14,2019,8,5990659
15,2019,9,6482328
16,2019,10,7127820


### Credit Card vs. Cash Payments
> <p> Keep only rides paid by credit card or cash, and store them in a new DataFrame </p>

In [7]:
# Restrict the cleaned rides to payment_type codes 1 (credit card) and 2 (cash).
df19_paymenttype = df19_cleaned[df19_cleaned['payment_type'].isin([1, 2])]

In [8]:
# Number of rides per payment_type code.
pay_breakdown_19 = df19_paymenttype['payment_type'].value_counts()

# Turn the counts into a tidy table: readable payment labels, the ride count,
# and a constant Year column.
pay_breakdown_19_df = pd.DataFrame({
    'Payment Type': pd.Series(pay_breakdown_19.index).replace({1: 'Credit Card', 2: 'Cash'}),
    'Count': pay_breakdown_19.values,
})
pay_breakdown_19_df['Year'] = 2019
pay_breakdown_19_df

Unnamed: 0,Payment Type,Count,Year
0,Credit Card,60243015,2019
1,Cash,22604397,2019


### Median tip percentage by month
> only for trips where payment method is credit card (1)

In [9]:
# Credit-card rides only (payment_type == 1).
df19_credit = df19_cleaned.loc[df19_cleaned['payment_type'].eq(1)]

# Per (pickup year, pickup month): number of rides and the median tip_percent.
credit_pickup_ts_19 = df19_credit['tpep_pickup_datetime']
df19_tip = (
    df19_credit
    .groupby([
        credit_pickup_ts_19.dt.year.rename('year'),
        credit_pickup_ts_19.dt.month.rename('month'),
    ])
    .agg(count=('VendorID', 'size'), median_tip=('tip_percent', 'median'))
    .reset_index()
)

# Drop any groups whose pickup year is not 2019.
df19_tip_clean = df19_tip.loc[df19_tip['year'] == 2019]
df19_tip_clean

Unnamed: 0,year,month,count,median_tip
7,2019,1,5462338,22.222222
8,2019,2,5164758,26.0
9,2019,3,5697701,26.190476
10,2019,4,5331684,26.190476
11,2019,5,5431608,26.0
12,2019,6,4952447,25.931034
13,2019,7,4387521,26.08
14,2019,8,4177053,26.163934
15,2019,9,4686109,25.909091
16,2019,10,5177750,26.0


## Export each df to csv
> to be able to combine with 2020-2023 data in other notebook

2019 Ride Count

In [11]:
# Write the 2019 monthly ride counts to CSV, leaving out the row index.
df19_count_clean.to_csv(path_or_buf='df19_count.csv', index=False)

2019 Payment Method Breakdown

In [10]:
# Write the 2019 credit-card/cash breakdown to CSV, leaving out the row index.
pay_breakdown_19_df.to_csv(path_or_buf='df19_pay_breakdown.csv', index=False)

2019 Median Tip Percentage

In [12]:
# Write the 2019 monthly median tip percentages to CSV, leaving out the row index.
df19_tip_clean.to_csv(path_or_buf='df19_tip.csv', index=False)