# Ex 28 - Inconsistent data
pg. 154

In [2]:
import pandas as pd

In [3]:
# Columns to load from the parking-violations CSV for this exercise.
cols = [
    'Plate ID',
    'Registration State',
    'Vehicle Make',
    'Vehicle Color',
    'Street Name',
]

In [52]:
# Read only the columns listed in `cols`; row 0 of the file supplies the column names.
df = pd.read_csv(
    '../data/nyc-parking-violations-2020.csv',
    header=0,
    usecols=cols,
)

In [53]:
# How many distinct colour values (NaN counted once) exist before any cleaning?
distinct_colors = df['Vehicle Color'].unique()
distinct_colors.shape

(1897,)

In [54]:
# The 30 most frequent raw colour values, most common first.
df['Vehicle Color'].value_counts().head(30)

Vehicle Color
WH       2344858
GY       2307704
BK       2066374
WHITE    1061234
BL        775124
RD        483298
BLACK     465110
GREY      306787
BROWN     292348
SILVE     191477
GR        182929
BLUE      178298
RED       161693
TN        120576
BR        102204
YW         98700
BLK        91539
OTHER      60245
GREEN      58765
GL         54851
GRY        46527
MR         42812
GRAY       40854
WHT        35433
YELLO      32792
WHI        29760
OR         28100
BK.        27830
WT         25583
WT.        24593
Name: count, dtype: int64

In [55]:
# Mapping from the abbreviations / variants seen in 'Vehicle Color' to one
# canonical spelling per colour. Targets reuse spellings that already occur
# in the data (e.g. 'SILVE', 'YELLO'), so a few entries map a value to itself.
# Each key appears exactly once: a repeated key in a dict literal is silently
# overwritten by the later entry, which hides typos.
colors = {
    'WT': 'WHITE',
    'WT.': 'WHITE',
    'WHI': 'WHITE',
    'WH': 'WHITE',
    'WHT': 'WHITE',
    'WH.': 'WHITE',
    'TN': 'TAN',
    'BK': 'BLACK',
    'BLK': 'BLACK',
    'BK.': 'BLACK',
    'BLACK': 'BLACK',
    'BLK.': 'BLACK',
    'GY': 'GREY',
    'GY.': 'GREY',
    'GRY': 'GREY',
    'BRO': 'BROWN',
    'GRAY': 'GREY',
    'BL': 'BLUE',
    'BL.': 'BLUE',
    'BLU': 'BLUE',
    'SILVE': 'SILVE',
    'SIL': 'SILVE',
    'SL.': 'SILVE',
    'BR': 'BROWN',
    'BRN': 'BROWN',
    'OR': 'ORANG',
    'GR': 'GREEN',
    'GRN': 'GREEN',
    'YW': 'YELLO',
    'YELLO': 'YELLO',
    'RD': 'RED',
    'RD.': 'RED',
}
colors

{'WT': 'WHITE',
 'WT.': 'WHITE',
 'WHI': 'WHITE',
 'WH': 'WHITE',
 'WHT': 'WHITE',
 'WH.': 'WHITE',
 'TN': 'TAN',
 'BK': 'BLACK',
 'BLK': 'BLACK',
 'BK.': 'BLACK',
 'BLACK': 'BLACK',
 'BLK.': 'BLACK',
 'GY': 'GREY',
 'GY.': 'GREY',
 'GRY': 'GREY',
 'BRO': 'BROWN',
 'GRAY': 'GREY',
 'BL': 'BLUE',
 'BL.': 'BLUE',
 'BLU': 'BLUE',
 'SILVE': 'SILVE',
 'SIL': 'SILVE',
 'SL.': 'SILVE',
 'BR': 'BROWN',
 'BRN': 'BROWN',
 'OR': 'ORANG',
 'GR': 'GREEN',
 'GRN': 'GREEN',
 'YW': 'YELLO',
 'YELLO': 'YELLO',
 'RD': 'RED',
 'RD.': 'RED'}

In [56]:
# Swap every value that is a key of `colors` for its canonical spelling;
# values not listed in the mapping are left as they are.
normalized_colors = df['Vehicle Color'].replace(colors)
df['Vehicle Color'] = normalized_colors

In [57]:
# Distinct colour values (NaN counted once) remaining after the mapping was applied.
distinct_colors = df['Vehicle Color'].unique()
distinct_colors.shape

(1868,)

In [58]:
# The 30 most frequent colour values after cleaning, most common first.
df['Vehicle Color'].value_counts().head(30)

Vehicle Color
WHITE    3525272
GREY     2724332
BLACK    2655706
BLUE      978311
RED       652119
BROWN     401936
GREEN     246523
SILVE     224738
TAN       141667
YELLO     131492
OTHER      60245
GL         54851
MR         42812
ORANG      39606
GOLD       21687
LTGY       13055
SL         10343
LTG        10093
LT/         8976
PR          7518
DK/         7498
W           7367
DKGY        6004
GYGY        5039
B           4145
DKG         3702
PURPL       3635
BKGY        3504
WHBL        3489
DKBL        2912
Name: count, dtype: int64

## Beyond 1

Run value_counts on the Vehicle Make column, and look at some vehicle
names. (There are more than 5,200 distinct makes, which almost certainly indicates
a lot of inconsistency in this data.) What problems do you see? Write a
function that, given a value, cleans up the data: putting the name in all caps,
removing punctuation, and standardizing whatever names you can. Then use
the apply method to fix the column. How many distinct vehicle makes are
there when you’re done?

In [73]:
# Number of distinct vehicle makes, counting missing values as one entry.
df['Vehicle Make'].nunique(dropna=False)


5211

In [74]:
# Order the per-make counts by the make label to expose odd spellings
# (leading punctuation, lowercase entries, ...) at either end.
make_counts = df['Vehicle Make'].value_counts()
make_counts.sort_index()

Vehicle Make
,IT      1
-VL      3
.        1
.IA      1
.SUB     1
        ..
ZVWIE    2
ZYMER    1
ZYNTE    2
ZYWIE    5
ken      1
Name: count, Length: 5210, dtype: int64

In [75]:
# I could have used regular expressions, but decided to make it a bit easier to follow

import string

def clean_name(one_string):
    """Return ``one_string`` upper-cased, keeping only the letters A-Z.

    Every character that is not an ASCII letter after upper-casing is
    dropped: punctuation, whitespace, digits and non-ASCII characters alike.

    Parameters
    ----------
    one_string : object
        The value to clean. Anything that is not a ``str`` (for example the
        NaN that pandas uses for a missing value) is returned unchanged.

    Returns
    -------
    object
        The cleaned string, possibly empty, or the original non-string value.
    """
    if not isinstance(one_string, str):
        return one_string

    # No explicit strip() is needed: whitespace is not in ascii_uppercase,
    # so the filter below removes it wherever it occurs.
    return ''.join(one_character
                   for one_character in one_string.upper()
                   if one_character in string.ascii_uppercase)


In [76]:
# Print the number of distinct (non-null) makes before and after cleaning.
print(df['Vehicle Make'].nunique())
df['Vehicle Make'] = df['Vehicle Make'].apply(clean_name)
print(df['Vehicle Make'].nunique())

5210
4915


How standardized are the street names in the data set? What changes could you
apply to improve things?

In [None]:
# Work on the street names that are actually present (missing values dropped).
street_names = df['Street Name']
s = street_names.dropna()

In [84]:
# The 40 most common spellings among street names that mention "110".
mentions_110 = s.str.contains('110')
s[mentions_110].value_counts().head(40)

Street Name
W 110th St              2970
110th St                2388
E 110th St              2048
WB 110TH AVE/BRINKER     922
110th Ave                704
110 ST                    94
110th Rd                  93
W 110 ST                  87
E 110 ST                  71
SB 110TH ST @ 67TH D      65
Beach 110th St            43
110 STREET                36
EAST 110 STREET           36
110 AVE                   33
E 110 STREET              23
W 110TH ST                14
110 AVENUE                13
EAST 110 ST               11
E 110TH ST                10
WEST 110 ST                9
W 110 STREET               5
110TH ST                   5
S/S E 110 ST               4
110TH AVE                  4
E 110                      4
S/E/C 110 STREET           4
110TH STREET               3
C/O 110 AVE                3
S/S/O 110 ST               3
110TH AVENUE               3
E 110TH STREET             3
N/E/C 110 STREET           3
BEACH 110 ST               3
WEST 110 STREET            2
S/

In [79]:
# The 40 most frequent street names overall, most common first.
df['Street Name'].value_counts().head(40)

Street Name
Broadway                180225
3rd Ave                 133003
5th Ave                  78211
2nd Ave                  75533
Madison Ave              75419
Lexington Ave            62859
1st Ave                  58491
Queens Blvd              58423
8th Ave                  54641
WB ATLANTIC AVE @ CL     54298
WB SEAGIRT BLVD @ CR     53142
7th Ave                  51583
6th Ave                  50750
EB HORACE HARDING EX     50263
Amsterdam Ave            46657
EB CONDUIT BLVD @ GL     46185
SB MAIN ST @ 82ND DR     45274
NB SPRINGFIELD BLVD      44446
Jamaica Ave              42859
EB E 233RD ST @ KATO     42727
37th Ave                 39865
Park Ave                 36140
WB ATLANTIC AVE @ SH     34989
Roosevelt Ave            34702
WB GOETHALS RD N @ J     34210
WB LINDEN BLVD @ LIN     34187
SB FRANCIS LEWIS BLV     34026
Fulton St                33491
EB HILLSIDE AVE @ 25     33382
Columbus Ave             32971
SB WEST ST @ LEROY S     32225
White Plains Rd          31

In [85]:
# Count every spelling that contains "BWAY" or "BROADWAY".
# NOTE: str.contains is case-sensitive by default, so mixed-case spellings
# such as 'Broadway' are not matched by this filter.
has_bway = s.str.contains('BWAY')
has_broadway = s.str.contains('BROADWAY')
s[has_bway | has_broadway].value_counts()

Street Name
SB BROADWAY @ 252ND     21939
NB BROADWAY @ W 228T    13367
BROADWAY                10771
SB BROADWAY @ W 196T     6623
NB BROADWAY @ W 120T     5691
                        ...  
S/B BWAY                    1
BROADWAY PL                 1
S/S BWAY                    1
S/O 1350 BROADWAY           1
N/E 220 BROADWAY            1
Name: count, Length: 181, dtype: int64

In [92]:
# The 52 most frequent registration-state codes, most common first.
df['Registration State'].value_counts().head(52)

Registration State
NY    9753643
NJ    1096110
PA     338779
FL     174056
CT     165205
IN     138824
MA      83726
VA      78272
NC      63687
MD      59940
TX      54600
GA      43497
ME      37158
IL      32855
OH      27309
SC      26997
AZ      26034
99      24654
CA      23897
MN      23432
GV      21283
TN      20535
OK      17213
DE      15673
MI      15170
OR      14535
WI      12776
RI      11239
AL      10727
WA       9137
NH       8944
VT       8097
ON       5548
MO       5439
LA       4441
CO       4298
KY       4285
QB       3551
DC       3448
WV       3032
DP       2904
MS       2587
IA       2277
SD       2202
AR       1912
NV       1700
KS       1382
UT       1368
ID       1328
NE       1286
NM       1248
MT       1097
Name: count, dtype: int64