In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Map each CSV header we need to the short column name used by the queries below.
# Renaming by header (instead of assigning df.columns positionally) matters because
# read_csv returns the usecols columns in the order they appear in the file, not in
# the order they are listed here; a positional rename silently mislabels columns
# whenever the two orders differ.
column_names = {
    'Plate ID': 'pid',
    'Registration State': 'state',
    'Plate Type': 'ptype',
    'Vehicle Make': 'make',
    'Vehicle Color': 'color',
    'Feet From Curb': 'feet',
}

filename = '../data/nyc-parking-violations-2020.csv'
# Load only the six columns of interest, then apply the short names.
df = pd.read_csv(filename, usecols=list(column_names)).rename(columns=column_names)

In [3]:
# Find all of the cars whose registration state is New York, New Jersey, or Connecticut, using .loc.
# Three equality masks on the 'state' column are OR-ed together; each comparison is
# parenthesized because | binds tighter than == in Python.
# How long does it take to perform this query?
%timeit df.loc[(df['state'] == 'NY') | (df['state'] == 'NJ') |  (df['state'] == 'CT')]

3.2 s ± 420 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Find all of the cars whose registration state is New York, New Jersey, or Connecticut, using df.query.
# The same three-way test as the .loc version, written as a query string with `or`.
# How long does it take to perform this query?
%timeit df.query("state == 'NY' or state == 'NJ' or state == 'CT'")

1.49 s ± 69.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# How much faster was it to use "query"?
# Ratio of the df.query mean to the .loc mean, taken from the two %timeit results
# shown above (1.49 s for df.query, 3.2 s for .loc). A value below 1 means df.query
# was faster; 0.47 corresponds to roughly a 2.1x speedup. The .loc measurement was
# noisy (± 420 ms), so treat the ratio as approximate.
round(1.49 / 3.2, 2)

0.47

In [6]:
# Use "isin" to search for the states. How does this technique compare?
# A single isin() call on the 'state' column builds the row mask that is passed to .loc,
# replacing the three OR-ed equality comparisons.
%timeit df.loc[df['state'].isin(['NY', 'NJ', 'CT'])]

838 ms ± 28.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Same isin() membership test for NY, NJ, and CT, but written inside a df.query string; time it.
%timeit df.query('state.isin(["NY", "NJ", "CT"])')

854 ms ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Find cars from New York using df.loc, and time it.
# Single-condition baseline: one equality mask on the 'state' column.
%timeit df.loc[(df['state'] == 'NY')]

924 ms ± 14.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Find cars from New York using df.query, and time it.
# Same single condition as the .loc version, written as a query string.
%timeit df.query('state == "NY"')

799 ms ± 13.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Find cars from New York using df.eval, and time it.
# df.eval evaluates the expression string against df; its result is used here
# as the row selector with plain [] indexing.
%timeit df[df.eval('state == "NY"')]

775 ms ± 22.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Same df.eval expression for New York cars, but the result is passed to .loc
# instead of plain [] indexing; time it for comparison.
%timeit df.loc[df.eval('state == "NY"')]

771 ms ± 19.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Find cars from New York with PAS plates using df.loc, and time it.
# Two equality masks ('state' and 'ptype') are AND-ed; each comparison is
# parenthesized because & binds tighter than == in Python.
%timeit df.loc[((df['state'] == 'NY') & (df['ptype'] == 'PAS'))]

1.27 s ± 10.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# Find cars from New York with PAS plates using df.query, and time it.
# No parentheses are needed around the comparisons here: inside query/eval strings
# pandas gives & the same precedence as `and`.
%timeit df.query('state == "NY" & ptype == "PAS"')

965 ms ± 58.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Find cars from New York with PAS plates using df.eval, and time it.
# The two-condition expression is evaluated by df.eval and its result is passed to .loc.
%timeit df.loc[df.eval('state == "NY" & ptype == "PAS"')]

924 ms ± 16.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# Find cars from New York with PAS plates and WHITE color using df.loc, and time it.
# Three equality masks ('state', 'ptype', 'color') are AND-ed into one row mask.
%timeit df.loc[((df['state'] == 'NY') & (df['ptype'] == 'PAS') & (df['color'] == 'WHITE'))]

1.34 s ± 14.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# Find cars from New York with PAS plates and WHITE color using df.query, and time it.
# Same three conditions as the .loc version, written as a single query string.
%timeit df.query('state == "NY" & ptype == "PAS" & color == "WHITE"')

728 ms ± 3.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# Find cars from New York with PAS plates and WHITE color using df.eval, and time it.
# The three-condition expression is evaluated by df.eval and its result is passed to .loc.
%timeit df.loc[df.eval('state == "NY" & ptype == "PAS" & color == "WHITE"')]

727 ms ± 2.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
# Find cars from New York with PAS plates and WHITE color parked > 1 foot from the curb using df.loc, and time it.
# Adds a numeric comparison on 'feet' to the three equality masks; all four are AND-ed.
%timeit df.loc[((df['state'] == 'NY') & (df['ptype'] == 'PAS') & (df['color'] == 'WHITE') & (df['feet'] > 1))]

1.31 s ± 3.62 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# Find cars from New York with PAS plates and WHITE color parked > 1 foot from the curb using df.query, and time it.
# Same four conditions as the .loc version, written as a single query string.
%timeit df.query('state == "NY" & ptype == "PAS" & color == "WHITE" & feet > 1')

712 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
# Find cars from New York with PAS plates and WHITE color parked > 1 foot from the curb using df.eval, and time it.
# The four-condition expression is evaluated by df.eval and its result is passed to .loc.
%timeit df.loc[df.eval('state == "NY" & ptype == "PAS" & color == "WHITE" & feet > 1')]

706 ms ± 3.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
# Find cars with make TOYOT from New York with PAS plates and WHITE color parked > 1 foot from the curb using df.loc, and time it.
# Five conditions in total: four equality masks plus the numeric 'feet' comparison, all AND-ed.
%timeit df.loc[((df['state'] == 'NY') & (df['ptype'] == 'PAS') & (df['color'] == 'WHITE') & (df['feet'] > 1) & (df['make'] == 'TOYOT'))]

1.75 s ± 4.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
# Find cars with make TOYOT from New York with PAS plates and WHITE color parked > 1 foot from the curb using df.query, and time it.
# Same five conditions as the .loc version, written as a single query string.
%timeit df.query('state == "NY" & ptype == "PAS" & color == "WHITE" & feet > 1 & make == "TOYOT"')

896 ms ± 2.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
# Find cars with make TOYOT from New York with PAS plates and WHITE color parked > 1 foot from the curb using df.eval, and time it.
# The five-condition expression is evaluated by df.eval and its result is passed to .loc.
%timeit df.loc[df.eval('state == "NY" & ptype == "PAS" & color == "WHITE" & feet > 1 & make == "TOYOT"')]

899 ms ± 4.33 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
