 create a data frame from two different CSV files containing New York taxi data: one from July 2019 (before the pandemic) and a second
from July 2020 (near the height of the pandemic, at least in New York). 

The dataframe should contain three columns from the files: passenger_count, total_amount, and payment_type. 

The DataFrame should also include a fifth column, year, which should be set to 2019 or 2020, depending on the file from which the data was loaded.

In [94]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series

In [4]:
df2019 = pd.read_csv('../data/nyc_taxi_2019-07.csv', 
        usecols=['passenger_count','total_amount', 'payment_type'])

In [5]:
df2019['year'] = 2019

In [6]:
df2019.shape

(6310419, 4)

In [7]:
df2019.head()

Unnamed: 0,passenger_count,payment_type,total_amount,year
0,1.0,1.0,4.94,2019
1,1.0,2.0,20.3,2019
2,1.0,1.0,70.67,2019
3,1.0,1.0,66.36,2019
4,0.0,1.0,15.3,2019


In [8]:
df2020 = pd.read_csv('../data/nyc_taxi_2020-07.csv', 
        usecols=['passenger_count','total_amount', 'payment_type'])
df2020['year'] = 2020

In [9]:
df2020.shape

(800412, 4)

In [47]:
df_all = pd.concat([df2019, df2020])

In [16]:
num_rides_2019 = df2019.shape[0]
num_rides_2020 = df2020.shape[0]
diff = num_rides_2020 - num_rides_2019

In [50]:
print(f"diff = ${diff:,.2f}")

diff = $-5,510,007.00


In [18]:
pct_change = diff / num_rides_2019 * 100
pct_change

-87.31602449853172

In [53]:
print(f"2019$: ${df2019['total_amount'].sum():,.2f}")

2019$: $123,761,823.33


In [54]:
print(f"2020$: ${df2020['total_amount'].sum():,.2f}")

2020$: $14,912,844.09


In [55]:
diff_dollars = (df2020['total_amount'].sum() - 
         df2019['total_amount'].sum())
print(f"diff_dollars = ${diff_dollars:,.2f}")

diff_dollars = $-108,848,979.24


In [56]:
diff_pct = (df2020['total_amount'].sum() - 
         df2019['total_amount'].sum())/df2019['total_amount'].sum()

In [59]:
print(f"total amount change pct = {diff_pct:,.2f}%")

total amount change pct = -0.88%


In [60]:
df2019['passenger_count'].value_counts(normalize=True).sort_index()

passenger_count
0.0    0.018623
1.0    0.697987
2.0    0.151958
3.0    0.044481
4.0    0.022259
5.0    0.040510
6.0    0.024171
7.0    0.000005
8.0    0.000004
9.0    0.000003
Name: proportion, dtype: float64

In [69]:
print(f"2019 multi-pass ride count = {df2019['passenger_count'].value_counts(normalize=True).sort_index().loc[2:].sum()}")

2019 multi-pass ride count = 0.2833900000955953


In [70]:
print(f"2020 pass count % = {df2020['passenger_count'].value_counts(normalize=True).sort_index()}")

      

2020 pass count % = passenger_count
0.0    0.026446
1.0    0.767402
2.0    0.123243
3.0    0.029527
4.0    0.010589
5.0    0.023194
6.0    0.019587
7.0    0.000007
8.0    0.000003
9.0    0.000001
Name: proportion, dtype: float64


In [84]:
print(f"2020 multi-pass ride count = {df2020['passenger_count'].value_counts(normalize=True).sort_index().loc[2:].sum()}")

2020 multi-pass ride count = 0.2061513222563435


In [68]:
df2019['payment_type'].value_counts(normalize=True)[2]  #.loc[2:].sum()

0.2870595845428793

In [45]:
df2020['payment_type'].value_counts(normalize=True)[2]  #.loc[2:].sum()

0.320558865998251

In [85]:
df_all.loc[(df_all['year'] == 2020) & 
       (df_all['passenger_count'] > 1), 'passenger_count'].count() / \
        df_all.loc[df_all['year'] == 2020, 'payment_type'].count()

0.2061513222563435

In [86]:
# corrected
df_all.loc[
        (df_all['year'] == 2020) & 
       (df_all['passenger_count'] > 1), 'passenger_count'].count() / \
        df_all.loc[df_all['year'] == 2020, 'passenger_count'].count()



0.2061513222563435

In [87]:
df_all.corr()

Unnamed: 0,passenger_count,payment_type,total_amount,year
passenger_count,1.0,0.01641,0.014943,-0.049558
payment_type,0.01641,1.0,-0.138561,0.029277
total_amount,0.014943,-0.138561,1.0,-0.019706
year,-0.049558,0.029277,-0.019706,1.0


In [88]:
(
    df_all.loc[df_all['year'] == 2020, 'total_amount'].describe().round(2) - 
    df_all.loc[df_all['year'] == 2019, 'total_amount'].describe().round(2)
)

count   -5510007.00
mean          -0.98
std           -0.75
min           53.20
25%           -0.50
50%           -0.60
75%           -0.75
max        -4672.45
Name: total_amount, dtype: float64

In [91]:
df2020['passenger_count'].value_counts().sort_index()

passenger_count
0.0     19506
1.0    566009
2.0     90900
3.0     21778
4.0      7810
5.0     17107
6.0     14447
7.0         5
8.0         2
9.0         1
Name: count, dtype: int64

In [92]:
df2019['passenger_count'].value_counts().sort_index()

passenger_count
0.0     116884
1.0    4380890
2.0     953756
3.0     279181
4.0     139710
5.0     254262
6.0     151710
7.0         29
8.0         22
9.0         16
Name: count, dtype: int64

In [100]:
df_2019_jul = pd.read_csv('../data/nyc_taxi_2019-07.csv',
    usecols=['passenger_count', 'total_amount', 'payment_type'])
# ,
#     dtype={'passenger_count':np.int8, 'total_amount':np.float32, 'payment_type':np.float64})

In [110]:
df_2019_jul.loc[df_2019_jul['passenger_count'].isnull()]['passenger_count']

6276460   NaN
6276461   NaN
6276462   NaN
6276463   NaN
6276464   NaN
           ..
6310414   NaN
6310415   NaN
6310416   NaN
6310417   NaN
6310418   NaN
Name: passenger_count, Length: 33959, dtype: float64

In [111]:
df_2019_jul.loc[df_2019_jul['payment_type'].isnull()]['payment_type']

6276460   NaN
6276461   NaN
6276462   NaN
6276463   NaN
6276464   NaN
           ..
6310414   NaN
6310415   NaN
6310416   NaN
6310417   NaN
6310418   NaN
Name: payment_type, Length: 33959, dtype: float64

In [113]:
df_2019_jul.loc[df_2019_jul['total_amount'].isnull()]['total_amount'].count()

0

In [102]:
df_2019_jul.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6310419 entries, 0 to 6310418
Data columns (total 3 columns):
 #   Column           Dtype  
---  ------           -----  
 0   passenger_count  float64
 1   payment_type     float64
 2   total_amount     float64
dtypes: float64(3)
memory usage: 144.4 MB


In [103]:
df_2019_jul.describe()

Unnamed: 0,passenger_count,payment_type,total_amount
count,6276460.0,6276460.0,6310419.0
mean,1.572045,1.30558,19.6123
std,1.214311,0.4877842,15.81046
min,0.0,1.0,-450.8
25%,1.0,1.0,11.3
50%,1.0,1.0,14.76
75%,2.0,2.0,21.3
max,9.0,4.0,6667.45
