In [35]:
import pandas as pd
import numpy as np

In [36]:
# The six columns to pull from the parking-violations CSV.
cols = [
    'Plate ID',
    'Registration State',
    'Vehicle Make',
    'Vehicle Color',
    'Violation Time',
    'Street Name',
]
cols

['Plate ID',
 'Registration State',
 'Vehicle Make',
 'Vehicle Color',
 'Violation Time',
 'Street Name']

In [37]:
# Read only the columns named in `cols`; the first line of the file is the
# header. The loaded frame keeps the file's column order, not the order
# of `cols` (compare the head() output with the list above).
df = pd.read_csv(
    '../data/nyc-parking-violations-2020.csv',
    usecols=cols,
    header=0,
)

In [38]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Plate ID,Registration State,Vehicle Make,Violation Time,Street Name,Vehicle Color
0,J58JKX,NJ,HONDA,0523P,43 ST,BK
1,KRE6058,PA,ME/BE,0428P,UNION ST,BLK
2,444326R,NJ,LEXUS,0625A,CLERMONT AVENUE,BLACK
3,F728330,OH,CHEVR,1106A,DIVISION AVE,
4,FMY9090,NY,JEEP,1253A,GRAND ST,GREY


In [39]:
# Summarise the frame: row count, per-column dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12495734 entries, 0 to 12495733
Data columns (total 6 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   Plate ID            object
 1   Registration State  object
 2   Vehicle Make        object
 3   Violation Time      object
 4   Street Name         object
 5   Vehicle Color       object
dtypes: object(6)
memory usage: 572.0+ MB


In [40]:
# Baseline row count of the unfiltered frame; later cells subtract filtered
# row counts from this to see how many rows each dropna() variant removes.
orig_rows = df.shape[0]
orig_rows

12495734

In [41]:
# Alternative way to get the row count: len() of the frame itself.
# Whether this is faster than df.shape[0] is an open question here;
# time both with %timeit to find out.
len(df)

12495734

In [42]:
# Inspect the row index; its length is yet another way to count rows.
df.index

RangeIndex(start=0, stop=12495734, step=1)

In [43]:
# Third variant: take len() of the index directly. Whether this beats
# len(df) is again an open question; compare with %timeit.
len(df.index)

12495734

In [44]:
# Strictest cleaning: discard every row that has a NaN in any column.
# (axis=0 / how='any' are the dropna() defaults, spelled out for clarity.)
df_drop_all_na = df.dropna(axis=0, how='any')

In [45]:
# Number of rows removed by the strict dropna().
df.shape[0] - df_drop_all_na.shape[0]

447359

In [46]:
# Rows that survive the strict dropna().
row_count_after_drop_all_na = len(df_drop_all_na.index)
row_count_after_drop_all_na

12048375

In [47]:
# Rows lost by requiring every column to be non-null.
na_rows = orig_rows - row_count_after_drop_all_na
# Scale the lost-row count by 100 and print it as a dollar figure.
# NOTE(review): the 100 is presumably an assumed per-ticket amount - confirm
# against the exercise text.
print(f"${na_rows * 100:,}")

$44,735,900


In [48]:
# Columns that must be non-null for a row to be kept; the violation time
# and vehicle colour are deliberately left out of this list.
drop_na_subset = [
    'Plate ID',
    'Registration State',
    'Vehicle Make',
    'Street Name',
]

In [49]:
# Drop a row only when one of the columns in drop_na_subset is NaN
# (how='any' is the default, spelled out for clarity).
df_drop_na_subset = df.dropna(how='any', subset=drop_na_subset)

In [50]:
# Rows that survive the subset-based dropna().
new_rows_count_drop_subset = len(df_drop_na_subset.index)

In [51]:
# Rows lost when only the columns in drop_na_subset must be non-null.
na_row_count_dropped_subset = orig_rows - new_rows_count_drop_subset
# Scale the lost-row count by 100 and print it as a dollar figure.
# NOTE(review): the 100 is presumably an assumed per-ticket amount - confirm
# against the exercise text.
print(f"${na_row_count_dropped_subset * 100:,}")

$6,378,500


In [52]:
# Looser requirement: only plate, registration state and street must be present.
drop_na_subset2 = [
    'Plate ID',
    'Registration State',
    'Street Name',
]
df_loose = df.dropna(how='any', subset=drop_na_subset2)

In [53]:
# Number of rows removed under the looser three-column requirement.
df.shape[0] - df_loose.shape[0]

1618

So far, you have specified which columns must all be non-null. But sometimes
it’s OK for some columns to have null values, as long as it’s not too many. How
many rows would you eliminate if you required at least three non-null values
from the four columns Plate ID, Registration State, Vehicle Make, and
Street Name?

In [54]:
# Keep a row when at least 3 of the 4 columns in drop_na_subset are non-null.
semi_good_subset = df.dropna(thresh=3, subset=drop_na_subset)

In [55]:
# Number of rows removed by the "at least three non-null" rule.
df.shape[0] - semi_good_subset.shape[0]

253

Which of the columns you’ve imported has the greatest number of NaN values?
Is this a problem?

In [61]:
# Count missing values per column (isna() is the same check as isnull()).
# NOTE(review): this cell's execution count (61) is later than that of the
# BLANKPLATE replacement cell (59), and the saved output is identical to the
# post-replacement counts further down. Restart the kernel and run all cells
# in order to see the true pre-replacement numbers here.
df.isna().sum()

Plate ID                9084
Registration State         0
Vehicle Make           62420
Violation Time           278
Street Name             1417
Vehicle Color         391982
dtype: int64

Null data is bad, but there is plenty of bad non-null data, too. For example,
many cars with BLANKPLATE as a plate ID were ticketed. Turn these into NaN
values, and rerun the previous query.

In [57]:
# List the column labels exactly as they were loaded.
df.columns

Index(['Plate ID', 'Registration State', 'Vehicle Make', 'Violation Time',
       'Street Name', 'Vehicle Color'],
      dtype='object')

In [58]:
# Select the tickets whose plate is the literal placeholder 'BLANKPLATE'
# and count the non-null entries in each column of that selection.
is_blank_plate = df['Plate ID'].eq('BLANKPLATE')
df[is_blank_plate].count()

Plate ID              8882
Registration State    8882
Vehicle Make          8139
Violation Time        8879
Street Name           8794
Vehicle Color         8045
dtype: int64

In [59]:
# Treat the 'BLANKPLATE' placeholder as missing data so that the null counts
# include it. np.nan is used rather than np.NaN: the capitalised alias was
# removed in NumPy 2.0 and raises AttributeError there.
# Re-running this cell is harmless (nothing is left to replace).
df['Plate ID'] = df['Plate ID'].replace('BLANKPLATE', np.nan)

In [62]:
# Re-count missing values per column now that BLANKPLATE plates are NaN.
df.isna().sum()

Plate ID                9084
Registration State         0
Vehicle Make           62420
Violation Time           278
Street Name             1417
Vehicle Color         391982
dtype: int64