In [1]:
import pandas as pd

# Path of the local Traffic Violations CSV, relative to the current working directory.
csv_url = "./data/Traffic_Violations.csv"
# Remote download URL for the same file, left disabled; uncomment it to
# read straight from the web instead of the local copy.
##csv_url = "https://data.montgomerycountymd.gov/api/views/4mse-ku6q/rows.csv?accessType=DOWNLOAD"
# Load the whole CSV into `df`; the following cells modify this frame in place.
df = pd.read_csv(csv_url)

In [2]:
# Columns judged not to contribute to the outcome, or too incomplete to use.
# A few of them could still be handy for visualisations later on.
# (Longitude/Latitude are not in this list and stay in the frame.)
columns_to_drop = [
    'SeqID', 'HAZMAT',
    'Search Conducted', 'Search Disposition', 'Search Outcome', 'Search Type',
    'Article', 'Contributed To Accident', 'Arrest Type',
    'Search Reason', 'Search Reason For Stop',
    'Charge', 'Search Arrest Reason',
]

# Remove them from `df` in place.  errors='ignore' means a listed name that
# is not present in the frame is skipped silently instead of raising.
df.drop(labels=columns_to_drop, axis='columns', errors='ignore', inplace=True)

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer



# --- Integer-encode the categorical text columns ---------------------------
# A fresh LabelEncoder is fitted for every column and stored in
# `label_encoders`, so each column's codes can be turned back into the
# original values with label_encoders[col].inverse_transform(...).
# Refitting a single shared encoder inside the loop would keep only the
# mapping of the last column.
# NOTE(review): missing values in these columns are not treated specially
# here; after encoding the columns hold plain integers, so isna() can no
# longer report them — confirm that this is the intended handling.
categorical_columns = [
    'VehicleType', 'Make', 'Model', 'Color', 'State',
    'Description', 'Location', 'Driver State', 'DL State', 'Driver City',
]
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# --- Split the stop date into calendar parts --------------------------------
# NOTE(review): no explicit `format=` is given, so pandas infers the date
# layout from the data — confirm the inferred day/month order is correct.
df['Date Of Stop'] = pd.to_datetime(df['Date Of Stop'])
df['Year_Stop'] = df['Date Of Stop'].dt.year
df['Month_Stop'] = df['Date Of Stop'].dt.month
df['Day_Stop'] = df['Date Of Stop'].dt.day

# --- Fill missing vehicle model years ---------------------------------------
# NOTE(review): the imputer only sees the 'Year' column, so it has no other
# features to model it from; this presumably reduces to the imputer's initial
# (mean) fill — confirm, and consider passing related columns if a real
# multivariate imputation is wanted.
year_imputer = IterativeImputer(max_iter=10, random_state=0)
# fit_transform returns a 2-D (n_rows, 1) array; flatten it so that a plain
# 1-D column is assigned.
df['Year'] = year_imputer.fit_transform(df[['Year']]).ravel()

# Remaining missing values per column, smallest first (displayed as cell output).
df.isna().sum().sort_values()

Date Of Stop          0
Year_Stop             0
Geolocation           0
DL State              0
Driver State          0
Driver City           0
Gender                0
Race                  0
Violation Type        0
Color                 0
Model                 0
Make                  0
Year                  0
VehicleType           0
State                 0
Month_Stop            0
Work Zone             0
Commercial Vehicle    0
Commercial License    0
Fatal                 0
Property Damage       0
Personal Injury       0
Belts                 0
Accident              0
Longitude             0
Latitude              0
Location              0
Description           0
SubAgency             0
Agency                0
Time Of Stop          0
Alcohol               0
Day_Stop              0
dtype: int64

In [4]:
# Print pandas' summary of the frame (index range, column names and dtypes).
# NOTE(review): this cell's execution count (4) is lower than that of the
# cell above it (5), so the notebook was run out of order; re-run top to
# bottom on a fresh kernel to make sure the shown dtypes match the code.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1879912 entries, 0 to 1879911
Data columns (total 33 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Date Of Stop        datetime64[ns]
 1   Time Of Stop        object        
 2   Agency              object        
 3   SubAgency           object        
 4   Description         int32         
 5   Location            int32         
 6   Latitude            float64       
 7   Longitude           float64       
 8   Accident            object        
 9   Belts               object        
 10  Personal Injury     object        
 11  Property Damage     object        
 12  Fatal               object        
 13  Commercial License  object        
 14  Commercial Vehicle  object        
 15  Alcohol             object        
 16  Work Zone           object        
 17  State               int32         
 18  VehicleType         int32         
 19  Year                float64       
 20  Ma

In [8]:
# Save the processed frame as CSV (path is relative to the working
# directory); index=False leaves the row index out of the file.
df.to_csv('data/Traffic_Violations_Imputed.csv', index=False)