In [1]:
import pandas as pd
from pathlib import Path
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
big_path = "Traffic_Violations.csv"
big_df_smaller = pd.read_csv(big_path, low_memory=False)

In [3]:
# Drop rows that do not result in a citation or warning
big_df_smaller.drop(big_df_smaller[big_df_smaller['Violation Type'] == 'ESERO'].index, inplace=True)
big_df_smaller.drop(big_df_smaller[big_df_smaller['Violation Type'] == 'SERO'].index, inplace=True)

In [4]:
# Clean the Year column
# Drop rows with null year
big_df_smaller = big_df_smaller.dropna(subset=["Year"])
# Change year to integer
big_df_smaller["Year"]= big_df_smaller["Year"].astype(int)
# Get years in a list
years = big_df_smaller["Year"].unique()
# Make a list of garbage years
years_to_remove = [0, 6338, 1005, 1196, 2912, 1009, 2088, 1007, 2102, 2109, 2105, 2997]
# Clean the list of years
good_years = [value for value in years if value not in years_to_remove]
# Keep only rows with good years
big_df_smaller = big_df_smaller[big_df_smaller["Year"].isin(good_years)]
big_df_smaller.shape

(1790375, 43)

In [5]:
big_df_smaller["Search Reason"].value_counts(dropna=False)

NaN                       1707686
Incident to Arrest          47315
Probable Cause              20678
Consensual                  11540
K-9                          1616
Other                        1029
Exigent Circumstances         500
Probable Cause for CDS          4
Arrest/Tow                      3
plain view marijuana            3
DUI                             1
Name: Search Reason, dtype: int64

In [6]:
# Get value counts including null values
value_counts_result = big_df_smaller['Driver State'].value_counts(dropna=False).reset_index()

# Rename the columns for clarity
value_counts_result.columns = ['Driver State', 'Count']

In [7]:
big_df_smaller.fillna('None', inplace=True)

In [8]:
# Assuming 'Driver State' is a column in your DataFrame
driver_states_info = big_df_smaller['Driver State']

# Create a new column 'State Category' and set default value to 'out of state'
big_df_smaller['Driver State Category'] = 'out of state'

# Update the values based on the condition
big_df_smaller.loc[driver_states_info == 'MD', 'Driver State Category'] = 'in state'

In [9]:
# Assuming 'Driver State' is a column in your DataFrame
plate_states_info = big_df_smaller['State']

# Create a new column 'State Category' and set default value to 'out of state'
big_df_smaller['License Plate State Category'] = 'out of state'

# Update the values based on the condition
big_df_smaller.loc[plate_states_info == 'MD', 'License Plate State Category'] = 'in state'

In [10]:
big_df_smaller['Make'].replace('TOYT', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('HOND', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('CHEV', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('NISS', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('MERZ', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('HYUN', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('VOLK', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('ACUR', 'ACURA', inplace=True)
big_df_smaller['Make'].replace('DODG', 'DODGE', inplace=True)
big_df_smaller['Make'].replace('CHEVY', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('CHRY', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('MITS', 'MITSUBISHI', inplace=True)

In [11]:
big_df_smaller['Make'].replace('SUBA', 'SUBARU', inplace=True)
big_df_smaller['Make'].replace('MAZD', 'MAZDA', inplace=True)
big_df_smaller['Make'].replace('CADI', 'CADILLAC', inplace=True)
big_df_smaller['Make'].replace('VW', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('INFI', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('VOLKS', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('MERC', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('LEXS', 'LEXUS', inplace=True)
big_df_smaller['Make'].replace('LEXU', 'LEXUS', inplace=True)
big_df_smaller['Make'].replace('VOLV', 'VOLVO', inplace=True)
big_df_smaller['Make'].replace('PONT', 'PONTIAC', inplace=True)
big_df_smaller['Make'].replace('TOYO', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('LINC', 'LINCOLN', inplace=True)
big_df_smaller['Make'].replace('INIFINITY', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('BUIC', 'BUICK', inplace=True)
big_df_smaller['Make'].replace('MERCEDEZ', 'MERCEDES', inplace=True)

In [12]:
big_df_smaller['Make'].replace('MERCEDES BENZ', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('TOY', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('OLDS', 'OLDSMOBILE', inplace=True)
big_df_smaller['Make'].replace('NISSIAN', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('LNDR', 'LAND-ROVER', inplace=True)
big_df_smaller['Make'].replace('LAND ROVER', 'LAND-ROVER', inplace=True)
big_df_smaller['Make'].replace('PORSHE', 'PORSCHE', inplace=True)
big_df_smaller['Make'].replace('SATU', 'SATURN', inplace=True)
big_df_smaller['Make'].replace('HYUND', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('RANGE ROVER', 'RANGE-ROVER', inplace=True)

In [13]:
big_df_smaller['Make'].replace('TOYOT', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('TOYTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('CHEVORLET', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('INTL', 'INTERNATIONAL', inplace=True)

In [14]:
big_df_smaller['Make'].replace('PTRB', 'PETERBILT', inplace=True)
big_df_smaller['Make'].replace('ISUZ', 'ISUZU', inplace=True)
big_df_smaller['Make'].replace('CHRYS', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('LANDROVER', 'LAND-ROVER', inplace=True)
big_df_smaller['Make'].replace('JAGU', 'JAGUAR', inplace=True)
big_df_smaller['Make'].replace('ISU', 'ISUZU', inplace=True)
big_df_smaller['Make'].replace('SCIO', 'SCION', inplace=True)
big_df_smaller['Make'].replace('LEX', 'LEXUS', inplace=True)
big_df_smaller['Make'].replace('SUZU', 'SUZUKI', inplace=True)
big_df_smaller['Make'].replace('FRHT', 'FREIGHTLINER', inplace=True)
big_df_smaller['Make'].replace('SUZI', 'SUZUKI', inplace=True)
big_df_smaller['Make'].replace('STRN', 'SATURN', inplace=True)
big_df_smaller['Make'].replace('HYUNDIA', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('CHRYSTLER', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('CADILAC', 'CADILLAC', inplace=True)
big_df_smaller['Make'].replace('SUB', 'SUBARU', inplace=True)
big_df_smaller['Make'].replace('PLYM', 'PLYMOUTH', inplace=True)
big_df_smaller['Make'].replace('MITZ', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('TESL', 'TESLA', inplace=True)
big_df_smaller['Make'].replace('CHEVEROLET', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('INFIN', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('MNNI', 'MINI', inplace=True)
big_df_smaller['Make'].replace('SUBURU', 'SUBARU', inplace=True)
big_df_smaller['Make'].replace('MITSU', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('MERCEDES-BENZ', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('HYUNDI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('TOYTOA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('CRYSLER', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('YAMA', 'YAMAHA', inplace=True)
big_df_smaller['Make'].replace('SUBU', 'SUBARU', inplace=True)
big_df_smaller['Make'].replace('MAZ', 'MAZDA', inplace=True)
big_df_smaller['Make'].replace('HYUNDA', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('HUMM', 'HUMMER', inplace=True)
big_df_smaller['Make'].replace('RANG', 'RANGE-ROVER', inplace=True)
big_df_smaller['Make'].replace('CAD', 'CADILLAC', inplace=True)
big_df_smaller['Make'].replace('TOTY', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('MERZ BENZ', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('JAG', 'JAGUAR', inplace=True)
big_df_smaller['Make'].replace('MAZADA', 'MAZDA', inplace=True)
big_df_smaller['Make'].replace('BENZ', 'MERCEDES', inplace=True)

In [15]:
big_df_smaller['Make'].replace('KAWK', 'KAWASAKI', inplace=True)
big_df_smaller['Make'].replace('TAO TAO', 'TAOTAO', inplace=True)
big_df_smaller['Make'].replace('NISSA', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('SAA', 'SAAB', inplace=True)
big_df_smaller['Make'].replace('MINI COOPER', 'MINI-COOPER', inplace=True)
big_df_smaller['Make'].replace('MINI', 'MINI-COOPER', inplace=True)
big_df_smaller['Make'].replace('KW', 'KAWASAKI', inplace=True)
big_df_smaller['Make'].replace('SATR', 'SATURN', inplace=True)
big_df_smaller['Make'].replace('INF', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('HARLEY DAVIDSON', 'HARLEY-DAVIDSON', inplace=True)
big_df_smaller['Make'].replace('PETE', 'PETERBILT', inplace=True)
big_df_smaller['Make'].replace('HINDA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('PETERBUILT', 'PETERBILT', inplace=True)
big_df_smaller['Make'].replace('IZUZU', 'ISUZU', inplace=True)
big_df_smaller['Make'].replace('HYUNDAY', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('VOLKWAGON', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('FREI', 'FREIGHTLINER', inplace=True)
big_df_smaller['Make'].replace('INFINTI', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('TOYOYA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('HUNDAI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('HYNDAI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('MERCEDEZ BENZ', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('NISAN', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('TSMR', 'TESLA', inplace=True)
big_df_smaller['Make'].replace('CHEVE', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('VOLKS WAGON', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('HARLEY', 'HARLEY-DAVIDSON', inplace=True)
big_df_smaller['Make'].replace('TOTOTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('PORCHE', 'PORSCHE', inplace=True)
big_df_smaller['Make'].replace('HON', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('HYUDAI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('MASE', 'MASERATI', inplace=True)
big_df_smaller['Make'].replace('HYANDAI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('KENW', 'KENWORTH', inplace=True)
big_df_smaller['Make'].replace('MER', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('HUYN', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('LAND', 'LAND-ROVER', inplace=True)
big_df_smaller['Make'].replace('INFINIT', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('BWM', 'BMW', inplace=True)
big_df_smaller['Make'].replace('MADZA', 'MAZDA', inplace=True)
big_df_smaller['Make'].replace('CHYRSLER', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('CHEVEY', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('HONA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('CHYSLER', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('TOYTOTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('MB', 'MERCEDES', inplace=True)


In [16]:
big_df_smaller['Make'].replace('INTE', 'INTERNATIONAL', inplace=True)
big_df_smaller['Make'].replace('ACCURA', 'ACURA', inplace=True)
big_df_smaller['Make'].replace('FREIGHT', 'FREIGHTLINER', inplace=True)
big_df_smaller['Make'].replace('MERZEDES', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('CHEVERLOT', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('CEHVY', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('TOYOA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('TYOTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('TOYOYTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('VOLKSWAGGON', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('VOLKWAGEN', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('VOLSWAGON', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('VOLTSWAGON', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('FOR', 'FORD', inplace=True)
big_df_smaller['Make'].replace('SMRT', 'SMART', inplace=True)
big_df_smaller['Make'].replace('HUYNDAI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('HYU', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('HYND', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('HYN', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('CRYS', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('CHRYLSER', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('CRYSTLER', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('HNDA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('HODA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('MIT', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('MITSIBISHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('MISTUBISHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('MITSUBUSHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('MERC BENZ', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('MERCADES', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('NEW FLYER', 'NEW-FLYER', inplace=True)
big_df_smaller['Make'].replace('HD', 'HARLEY-DAVIDSON', inplace=True)
big_df_smaller['Make'].replace('RANGE', 'RANGE-ROVER', inplace=True)
big_df_smaller['Make'].replace('SUBURA', 'SUBARU', inplace=True)
big_df_smaller['Make'].replace('ACRUA', 'ACURA', inplace=True)
big_df_smaller['Make'].replace('INFINI', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('INFINITE', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('INIFINITI', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('SUZ', 'SUZUKI', inplace=True)
big_df_smaller['Make'].replace('ROV', 'LEXUS', inplace=True)
big_df_smaller['Make'].replace('INT', 'INTERNATIONAL', inplace=True)
big_df_smaller['Make'].replace('DUCA', 'DUCATI', inplace=True)

In [17]:
big_df_smaller['Make'].replace('SAT', 'SATURN', inplace=True)
big_df_smaller['Make'].replace('CADDILAC', 'CADILLAC', inplace=True)
big_df_smaller['Make'].replace('MITTS', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('COOPER', 'MINI-COOPER', inplace=True)
big_df_smaller['Make'].replace('ALFA ROMEO', 'ALFA-ROMEO', inplace=True)
big_df_smaller['Make'].replace('FOED', 'FORD', inplace=True)
big_df_smaller['Make'].replace('STER', 'STERLING', inplace=True)
big_df_smaller['Make'].replace('THOM', 'THOMAS', inplace=True)
big_df_smaller['Make'].replace('MAZA', 'MAZDA', inplace=True)
big_df_smaller['Make'].replace('ALFA', 'ALFA-ROMEO', inplace=True)
big_df_smaller['Make'].replace('MECURY', 'MERCURY', inplace=True)
big_df_smaller['Make'].replace('INIFINITI', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('LEXIS', 'LEXUS', inplace=True)
big_df_smaller['Make'].replace('HODNA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('SUZIKI', 'SUZUKI', inplace=True)
big_df_smaller['Make'].replace('DOGE', 'DODGE', inplace=True)
big_df_smaller['Make'].replace('MITSUBSHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('HIUNDAI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('HYUNAI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('DIDGE', 'DODGE', inplace=True)
big_df_smaller['Make'].replace('HYUANDI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('UNK', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('HOMDA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('HIONDA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('CHEVROLETE', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('NIS', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('CADILLIAC', 'CADILLAC', inplace=True)
big_df_smaller['Make'].replace('VOLKSW', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('SION', 'SCION', inplace=True)
big_df_smaller['Make'].replace('VOLSWAGEN', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('VOLKSWAGAN', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('LINCON', 'LINCOLN', inplace=True)
big_df_smaller['Make'].replace('TOOTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('CHEROLET', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('MINNI', 'MINI-COOPER', inplace=True)

In [18]:
big_df_smaller['Make'].replace('WHITE', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('INFINITY', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('CHRISLER', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('TOTOYA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('HUNDAY', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('CHYR', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('FIRD', 'FORD', inplace=True)
big_df_smaller['Make'].replace('SUSUKI', 'SUZUKI', inplace=True)
big_df_smaller['Make'].replace('MITZUBISHI', 'MITZUBISHI', inplace=True)
big_df_smaller['Make'].replace('HYUANDAI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('MIFU', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('None', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('CRY', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('HARL', 'HARLEY-DAVIDSON', inplace=True)
big_df_smaller['Make'].replace('WEST', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('TOYOTA SCION', 'SCION', inplace=True)
big_df_smaller['Make'].replace('BUIK', 'BUICK', inplace=True)
big_df_smaller['Make'].replace('SUBAR', 'SUBARU', inplace=True)
big_df_smaller['Make'].replace('VOLV0', 'VOLVO', inplace=True)
big_df_smaller['Make'].replace('PREM', 'PREMIER', inplace=True)
big_df_smaller['Make'].replace('NSSAN', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('HYUNDAU', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('TOYORA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('NISSAM', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('MITUBISHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('MITSHUBISHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('BENT', 'BENTLEY', inplace=True)
big_df_smaller['Make'].replace('MADZ', 'MAZDA', inplace=True)
big_df_smaller['Make'].replace('CHVY', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('LEXSUS', 'LEXUS', inplace=True)
big_df_smaller['Make'].replace('INTER', 'INTERNATIONAL', inplace=True)
big_df_smaller['Make'].replace('SSR', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('VIP', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('MRAEZ', 'MASERATI', inplace=True)
big_df_smaller['Make'].replace('MISSAN', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('HONAD', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('WORK', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('CHECY', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('CHEVROLEY', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('CHRSYLER', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('ORD', 'FORD', inplace=True)
big_df_smaller['Make'].replace('MITSUBISH', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('VOLS', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('MERCZ', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('MITIS', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('CAMRY', 'TOYOTA', inplace=True)

In [19]:
big_df_smaller['Make'].replace('CHRISLER', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('NONE', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('INFINITY', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('MITZUBISHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('MREZ', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('TOYATA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('SABB', 'SAAB', inplace=True)
big_df_smaller['Make'].replace('CHEVRLET', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('', 'CADILLAC', inplace=True)


In [20]:
big_df_smaller['Make'].replace('PORSH', 'PORSCHE', inplace=True)
big_df_smaller['Make'].replace('FRT', 'FREIGHTLINER', inplace=True)
big_df_smaller['Make'].replace('SCOOTER', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('BMX', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('MIST', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('KAWA', 'KAWASAKI', inplace=True)
big_df_smaller['Make'].replace('ALTIMA', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('ISZU', 'ISUZU', inplace=True)
big_df_smaller['Make'].replace('FOD', 'FORD', inplace=True)
big_df_smaller['Make'].replace('MERCEDS', 'MERCEDES', inplace=True)


In [21]:
big_df_smaller['Make'].replace('ONDA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('BENTLY', 'BENTLEY', inplace=True)
big_df_smaller['Make'].replace('ACCORD', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('VK', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('KAW', 'KAWASAKI', inplace=True)
big_df_smaller['Make'].replace('MD', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('SUNNY', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('HOMD', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('COROLLA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('MECEDES', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('NISSSAN', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('TPYOTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('SUBR', 'SUBARU', inplace=True)
big_df_smaller['Make'].replace('XX', 'UNKNOWN', inplace=True)

In [22]:
big_df_smaller['Make'].replace('FERR', 'FERRARI', inplace=True)
big_df_smaller['Make'].replace('WV', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('VOKSWAGON', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('LEXUX', 'LEXUS', inplace=True)
big_df_smaller['Make'].replace('WOLKSWAGEN', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('KAWASKI', 'KAWASAKI', inplace=True)
big_df_smaller['Make'].replace('TOYPTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('NISSON', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('FORK', 'FORD', inplace=True)
big_df_smaller['Make'].replace('CADALIC', 'CADILLAC', inplace=True)
big_df_smaller['Make'].replace('VOLKSWA', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('MITTSUBISHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('HUYUNDAI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('RANGEROVER', 'RANGE-ROVER', inplace=True)
big_df_smaller['Make'].replace('IZU', 'ISUZU', inplace=True)
big_df_smaller['Make'].replace('HOME', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('KENILWORTH', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('DODGE RAM', 'DODGE', inplace=True)
big_df_smaller['Make'].replace('VOVLO', 'VOLVO', inplace=True)
big_df_smaller['Make'].replace('LUXUS', 'LEXUS', inplace=True)
big_df_smaller['Make'].replace('WESTERN STAR', 'WESTERN-STAR', inplace=True)
big_df_smaller['Make'].replace('MBENZ', 'MERCEDES', inplace=True)

In [23]:
big_df_smaller['Make'].replace('LEXAS', 'LEXUS', inplace=True)
big_df_smaller['Make'].replace('NISSN', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('STLG', 'STERLING', inplace=True)
big_df_smaller['Make'].replace('MADA', 'MAZDA', inplace=True)
big_df_smaller['Make'].replace('MAD', 'MAZDA', inplace=True)
big_df_smaller['Make'].replace('VOLKWAG', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('TELSA', 'TESLA', inplace=True)
big_df_smaller['Make'].replace('PORSCH', 'PORSCHE', inplace=True)
big_df_smaller['Make'].replace('PLYMOTH', 'PLYMOUTH', inplace=True)
big_df_smaller['Make'].replace('LANR', 'LAND-ROVER', inplace=True)
big_df_smaller['Make'].replace('TAO', 'TAOTAO', inplace=True)

In [24]:
big_df_smaller['Make'].replace('CHEVYROLET', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('DOGDE', 'DODGE', inplace=True)
big_df_smaller['Make'].replace('VOL', 'VOLVO', inplace=True)
big_df_smaller['Make'].replace('TAIWAN GOLDEN B', 'TAIWAN GOLDEN BEE', inplace=True)
big_df_smaller['Make'].replace('MITSUB', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('MERCE', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('BNW', 'BMW', inplace=True)
big_df_smaller['Make'].replace('LICOLN', 'LINCOLN', inplace=True)
big_df_smaller['Make'].replace('MERCED', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('CHRYLER', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('LINCOLIN', 'LINCOLN', inplace=True)
big_df_smaller['Make'].replace('MERECEDES', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('TOYTO', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('CHVROLET', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('EPO', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('CHEVR', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('TOT', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('BWI', 'UNKNOWN', inplace=True)

In [25]:
big_df_smaller['Make'].replace('MISS', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('TOTYOTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('NISAAN', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('CIVIC', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('TEST', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('CHEVOLET', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('MITISUBISHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('TPYPTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('OLDMOBILE', 'OLDSMOBILE', inplace=True)
big_df_smaller['Make'].replace('CHY', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('YOYOTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('CHEY', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('SMAR', 'MART', inplace=True)
big_df_smaller['Make'].replace('HONDS', 'HONDA', inplace=True)

In [26]:
big_df_smaller['Make'].replace('HYUNADAI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('GENS', 'GENESIS', inplace=True)
big_df_smaller['Make'].replace('ASTON MARTIN', 'ASTON-MARTIN', inplace=True)
big_df_smaller['Make'].replace('CADILLA', 'CADILLAC', inplace=True)
big_df_smaller['Make'].replace('HUND', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('FTL', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('SUBUARU', 'SUBARU', inplace=True)
big_df_smaller['Make'].replace('MITSUBIHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('CEHV', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('RAV4', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('BUCK', 'BUICK', inplace=True)
big_df_smaller['Make'].replace('PONIAC', 'PONTIAC', inplace=True)
big_df_smaller['Make'].replace('MART', 'SMART', inplace=True)
big_df_smaller['Make'].replace('MERCEDEES', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('FROD', 'FORD', inplace=True)
big_df_smaller['Make'].replace('VOLKESWAGON', 'VOLKWAGON', inplace=True)
big_df_smaller['Make'].replace('SURE', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('JEP', 'JEEP', inplace=True)
big_df_smaller['Make'].replace('MERCEDEZ-BENZ', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('HUDS', 'HUDSON', inplace=True)
big_df_smaller['Make'].replace('TOYOTOA', 'TOYOTA', inplace=True)

In [27]:
big_df_smaller['Make'].replace('NIISSAN', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('VOLKWAGON', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('CEHVROLET', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('HONDA`', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('TOMAS', 'THOMAS', inplace=True)
big_df_smaller['Make'].replace('MERDZ', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('CADDILLAC', 'CADILLAC', inplace=True)
big_df_smaller['Make'].replace('VOLKSWAG', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('CHEVROET', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('HAUL', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('CHR', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('MERCEDE', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('ASTO', 'ASTON-MARTIN', inplace=True)
big_df_smaller['Make'].replace('RANGE ROV', 'RANGE-ROVER', inplace=True)
big_df_smaller['Make'].replace('JETTA', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('LODA', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('TAGA', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('FRIEGHTLINER', 'FREIGHTLINER', inplace=True)
big_df_smaller['Make'].replace('KAWI', 'KAWASAKI', inplace=True)
big_df_smaller['Make'].replace('TOYOTS', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('SPRINTER', 'MERCEDES', inplace=True)

In [28]:
big_df_smaller['Make'].replace('PORS', 'PORSCHE', inplace=True)
big_df_smaller['Make'].replace('HIUNDAY', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('STAR', 'WESTERN-STAR', inplace=True)
big_df_smaller['Make'].replace('RANG ROVER', 'RANGE-ROVER', inplace=True)
big_df_smaller['Make'].replace('CADDI', 'CADILLAC', inplace=True)
big_df_smaller['Make'].replace('FRD', 'FORD', inplace=True)
big_df_smaller['Make'].replace('STRG', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('GORD', 'FORD', inplace=True)
big_df_smaller['Make'].replace('VOKS', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('HUNDI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('BUEL', 'BUELL', inplace=True)
big_df_smaller['Make'].replace('TAOT', 'TAOTAO', inplace=True)
big_df_smaller['Make'].replace('HOBDA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('ATV', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('TOYOTA/SCION', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('PONTAIC', 'PONTIAC', inplace=True)
big_df_smaller['Make'].replace('UTIL', 'UTILITY', inplace=True)
big_df_smaller['Make'].replace('DATS', 'DATSUN', inplace=True)
big_df_smaller['Make'].replace('CEV', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('TOYOTSA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('DAEW', 'DAEWOO', inplace=True)
big_df_smaller['Make'].replace('GILG', 'GILGEE', inplace=True)
big_df_smaller['Make'].replace('MOPED', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('MBW', 'BMW', inplace=True)
big_df_smaller['Make'].replace('HIND', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('JEEF', 'JEEP', inplace=True)
big_df_smaller['Make'].replace('SUBURAU', 'SUBARU', inplace=True)
big_df_smaller['Make'].replace('LEUS', 'LEXUS', inplace=True)
big_df_smaller['Make'].replace('CRAN', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('MASERATTI', 'MASERATI', inplace=True)
big_df_smaller['Make'].replace('MERCDES', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('INIFITI', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('GNC', 'GM', inplace=True)
big_df_smaller['Make'].replace('CADALLAC', 'CADILLAC', inplace=True)
big_df_smaller['Make'].replace('TOYOTO', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('MITSBISHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('HUN', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('TIYITA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('HOINDA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('HYUD', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('SUBARY', 'SUBARU', inplace=True)
big_df_smaller['Make'].replace('MITSUBISSHI', 'MITSUBISHI', inplace=True)

In [29]:
big_df_smaller['Make'].replace('ELANTRA', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('BLUE', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('OLDS MOBILE', 'OLDSMOBILE', inplace=True)
big_df_smaller['Make'].replace('JAQUAR', 'JAGUAR', inplace=True)
big_df_smaller['Make'].replace('CEVROLET', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('MISTSUBISHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('YAHMAHA', 'YAMAHA', inplace=True)
big_df_smaller['Make'].replace('TOUOTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('FRTL', 'FREIGHTLINER', inplace=True)
big_df_smaller['Make'].replace('DOD', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('JONDA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('UHAUL', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('VOLKE', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('RIDE ON', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('CHEVT', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('MZDA', 'MAZDA', inplace=True)
big_df_smaller['Make'].replace('TIYT', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('TRUCK', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('CHRYSL', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('SATUR', 'SATURN', inplace=True)
big_df_smaller['Make'].replace('CHEVU', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('PORC', 'PORSCHE', inplace=True)
big_df_smaller['Make'].replace('2016', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('TOOYOTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('VOLKW', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('IHC', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('MRZ', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('MTIS', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('HYNDIA', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('FRGHT', 'FREIGHTLINER', inplace=True)
big_df_smaller['Make'].replace('INFNITY', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('POINTIAC', 'PONTIAC', inplace=True)

In [30]:
big_df_smaller['Make'].replace('AURA', 'SATURN', inplace=True)
big_df_smaller['Make'].replace('CHEVROLT', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('BIGT', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('HYUNDUI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('AUCRA', 'ACURA', inplace=True)
big_df_smaller['Make'].replace('INFNITI', 'INFINITI', inplace=True)
big_df_smaller['Make'].replace('TTOYOTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('VULC', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('CHE', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('HOONDA', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('FPRD', 'FORD', inplace=True)
big_df_smaller['Make'].replace('SUBARA', 'SUBARY', inplace=True)
big_df_smaller['Make'].replace('BISSAN', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('PORCH', 'PORSCHE', inplace=True)
big_df_smaller['Make'].replace('FORDQ', 'FORD', inplace=True)
big_df_smaller['Make'].replace('MITSIBUSHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('YAM', 'YAMAHA', inplace=True)
big_df_smaller['Make'].replace('TOYA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('TOY0TA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('MITZU', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('MIITS', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('ROUTER', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('MINICOOPER', 'MINI-COOPER', inplace=True)
big_df_smaller['Make'].replace('JEEO', 'JEEP', inplace=True)
big_df_smaller['Make'].replace('MEBE', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('LANDR', 'LAND-ROVER', inplace=True)
big_df_smaller['Make'].replace('ACRA', 'ACURA', inplace=True)
big_df_smaller['Make'].replace('HONDAY', 'HONDA', inplace=True)
big_df_smaller['Make'].replace('MISUBISHI', 'MITSUBISHI', inplace=True)
big_df_smaller['Make'].replace('BRIM', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('GENISIS', 'GENESIS', inplace=True)
big_df_smaller['Make'].replace('DDODGE', 'DODGE', inplace=True)
big_df_smaller['Make'].replace('PASSAT', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('NISSAB', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('CHRYSER', 'CHRYSLER', inplace=True)
big_df_smaller['Make'].replace('LAMBO', 'LAMBORGHINI', inplace=True)
big_df_smaller['Make'].replace('YUKON', 'GM', inplace=True)
big_df_smaller['Make'].replace('MERCERY', 'FORD', inplace=True)
big_df_smaller['Make'].replace('MAXDA', 'MAZDA', inplace=True)
big_df_smaller['Make'].replace('VWOLKS', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('DORD', 'FORD', inplace=True)
big_df_smaller['Make'].replace('HUMAN', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('OYOTA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('CHEVROLE', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('Z71', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('TOYOAT', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('OLSMOBILE', 'OLDSMOBILE', inplace=True)
big_df_smaller['Make'].replace('SUBARY', 'SUBARU', inplace=True)

In [31]:
big_df_smaller['Make'].replace('TRAILER', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('CHEVOROLET', 'CHEVROLET', inplace=True)
big_df_smaller['Make'].replace('NISSANA', 'NISSAN', inplace=True)
big_df_smaller['Make'].replace('MURCURY', 'FORD', inplace=True)
big_df_smaller['Make'].replace('MNI', 'MINI-COOPER', inplace=True)
big_df_smaller['Make'].replace('POR', 'PORSCHE', inplace=True)
big_df_smaller['Make'].replace('KIS', 'KIA', inplace=True)
big_df_smaller['Make'].replace('LANDOVER', 'LAND-ROVER', inplace=True)
big_df_smaller['Make'].replace('THOMAS BUILT', 'THOMAS', inplace=True)
big_df_smaller['Make'].replace('MAZFA', 'MAZDA', inplace=True)
big_df_smaller['Make'].replace('ACUA', 'ACURA', inplace=True)
big_df_smaller['Make'].replace('SILVER', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('VOKSWAGEN', 'VOLKSWAGON', inplace=True)
big_df_smaller['Make'].replace('GOLD', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('MEZ', 'MERCEDES', inplace=True)
big_df_smaller['Make'].replace('TOYITA', 'TOYOTA', inplace=True)
big_df_smaller['Make'].replace('AVURA', 'ACURA', inplace=True)
big_df_smaller['Make'].replace('LOAD', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('FHRT', 'FREIGHTLINER', inplace=True)
big_df_smaller['Make'].replace('HYUMDAI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('DODE', 'DODGE', inplace=True)
big_df_smaller['Make'].replace('HUYANDI', 'HYUNDAI', inplace=True)
big_df_smaller['Make'].replace('SABARU', 'SUBARU', inplace=True)
big_df_smaller['Make'].replace('SPNR', 'UNKNOWN', inplace=True)
big_df_smaller['Make'].replace('VOLO', 'VOLVO', inplace=True)

In [32]:
makes_list = big_df_smaller['Make'].value_counts()
print(makes_list)
filtered_makes = makes_list[makes_list < 15]
filtered_makes.head(50)

TOYOTA         317315
HONDA          267382
FORD           166197
NISSAN         138338
CHEVROLET      133213
                ...  
BUICK VAN           1
JOND                1
MAZDVAL2013         1
DADG                1
HYUNVAL2005         1
Name: Make, Length: 3964, dtype: int64


In [33]:
big_df_smaller['Date Of Stop'] = pd.to_datetime(big_df_smaller['Date Of Stop'])
big_df_smaller['Year Date'] = big_df_smaller['Date Of Stop'].dt.year
big_df_smaller['Month'] = big_df_smaller['Date Of Stop'].dt.month
big_df_smaller['Day'] = big_df_smaller['Date Of Stop'].dt.day

In [34]:
big_df_smaller['Time Of Stop'] = pd.to_datetime(big_df_smaller['Time Of Stop'])
big_df_smaller['Hour of Stop'] = big_df_smaller['Time Of Stop'].dt.hour

In [35]:
# rf_df = big_df_smaller.drop(['Time Of Stop', 'Location', 'Latitude',
#        'Longitude', 'Belts', 'Personal Injury',
#        'Fatal', 'Alcohol', 'Work Zone', 'Search Conducted',
#        'Search Disposition', 'Search Outcome', 'Search Type',
#        'Search Arrest Reason', 'State', 'Year','Description','Make'], axis = 1)

In [36]:
random_forest_df = big_df_smaller.drop(['Date Of Stop','Driver State','State','Time Of Stop'], axis = 1)

In [37]:
random_forest_df.columns

Index(['SeqID', 'Agency', 'SubAgency', 'Description', 'Location', 'Latitude',
       'Longitude', 'Accident', 'Belts', 'Personal Injury', 'Property Damage',
       'Fatal', 'Commercial License', 'HAZMAT', 'Commercial Vehicle',
       'Alcohol', 'Work Zone', 'Search Conducted', 'Search Disposition',
       'Search Outcome', 'Search Reason', 'Search Reason For Stop',
       'Search Type', 'Search Arrest Reason', 'VehicleType', 'Year', 'Make',
       'Model', 'Color', 'Violation Type', 'Charge', 'Article',
       'Contributed To Accident', 'Race', 'Gender', 'Driver City', 'DL State',
       'Arrest Type', 'Geolocation', 'Driver State Category',
       'License Plate State Category', 'Year Date', 'Month', 'Day',
       'Hour of Stop'],
      dtype='object')

In [38]:
# Check if there are any null values in the entire DataFrame
if random_forest_df.isnull().values.any():
    print("There are null values in the DataFrame.")
else:
    print("There are no null values in the DataFrame.")

There are no null values in the DataFrame.


In [39]:
big_df_smaller.to_csv('Traffic_Violations_Updated_Makes.csv', index=False)

In [47]:
big_df_smaller.columns

Index(['SeqID', 'Date Of Stop', 'Time Of Stop', 'Agency', 'SubAgency',
       'Description', 'Location', 'Latitude', 'Longitude', 'Accident', 'Belts',
       'Personal Injury', 'Property Damage', 'Fatal', 'Commercial License',
       'HAZMAT', 'Commercial Vehicle', 'Alcohol', 'Work Zone',
       'Search Conducted', 'Search Disposition', 'Search Outcome',
       'Search Reason', 'Search Reason For Stop', 'Search Type',
       'Search Arrest Reason', 'State', 'VehicleType', 'Year', 'Make', 'Model',
       'Color', 'Violation Type', 'Charge', 'Article',
       'Contributed To Accident', 'Race', 'Gender', 'Driver City',
       'Driver State', 'DL State', 'Arrest Type', 'Geolocation',
       'Driver State Category', 'License Plate State Category', 'Year Date',
       'Month', 'Day', 'Hour of Stop'],
      dtype='object')

In [40]:
# # Convert categorical data to numeric with `pd.get_dummies`
# # Specify the columns you want to one-hot encode
# columns_to_encode = ['Accident', 'Search Reason', 'Race','Gender', 'Driver State','Property Damage','Color']

# # Use get_dummies to convert the specified columns into dummy variables
# random_dummies = pd.get_dummies(random_forest_df[columns_to_encode], prefix=columns_to_encode, drop_first=True)

# # Concatenate the dummy variables with the original DataFrame
# random_forest_df = pd.concat([random_forest_df, random_dummies], axis=1)

# # Drop the original categorical columns if needed
# random_forest_df = random_forest_df.drop(columns_to_encode, axis=1)

# # Print the resulting DataFrame
# print(random_forest_df)

In [41]:
# #Beginning Random Forest Modeling
# X=random_forest_df.copy()
# X.drop("Violation Type",axis=1, inplace=True)
# y=random_forest_df['Violation Type'].ravel()

In [42]:
# X.dtypes

In [43]:
# Splitting into Train and Test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [44]:
# #Fitting the Random Forest Model
# # Create a random forest classifier
# rf_model = RandomForestClassifier(n_estimators=500, random_state=7)
# # Fitting the model
# rf_model = rf_model.fit(X_train, y_train)

In [45]:
# #Making Predictions
# predictions = rf_model.predict(X_test)
# # Calculating the confusion matrix
# cm = confusion_matrix(y_test, predictions)
# cm_df = pd.DataFrame(
#     cm, index=["Actual citation", "Actual warning"], columns=["Predicted citation", "Predicted warning"]
# )

# # Calculating the accuracy score
# acc_score = accuracy_score(y_test, predictions)

In [46]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


NameError: name 'cm_df' is not defined

In [None]:
# # Random Forests in sklearn will automatically calculate feature importance
# importances = rf_model.feature_importances_
# # We can sort the features by their importance
# sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

In [None]:
# # Visualize the features by importance
# importances_df = pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
# importances_df.set_index(importances_df[1], inplace=True)
# importances_df.drop(columns=1, inplace=True)
# importances_df.rename(columns={0: 'Feature Importances'}, inplace=True)
# importances_sorted = importances_df.sort_values(by='Feature Importances').head(20)
# importances_sorted.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False)