In [1]:
import pandas as pd

# Read the traffic-violations dataset from the local copy under ./data.
csv_url = "./data/Traffic_Violations.csv"
# Alternative source: download the same file from data.montgomerycountymd.gov
# instead of using the local copy (uncomment to use).
##csv_url = "https://data.montgomerycountymd.gov/api/views/4mse-ku6q/rows.csv?accessType=DOWNLOAD"
df = pd.read_csv(csv_url)

In [2]:
# Columns judged either irrelevant to the outcome or too incomplete to keep.
# Some of these may still be worth revisiting for visualisation later.
columns_to_drop = [
    'SeqID', 'HAZMAT',
    'Search Conducted', 'Search Disposition', 'Search Outcome', 'Search Type',
    'Article', 'Contributed To Accident', 'Arrest Type',
    'Search Reason', 'Search Reason For Stop',
    'Charge', 'Search Arrest Reason',
]

# Remove the listed columns; errors='ignore' tolerates any that are already
# absent from the frame.
df.drop(columns_to_drop, axis='columns', errors='ignore', inplace=True)

# Rows without a 'Description' are discarded.
df.dropna(subset=['Description'], inplace=True)

In [3]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Categorical columns whose gaps are filled with the column's most frequent value.
categorical_cols = ['VehicleType', 'Make', 'Model', 'Color', 'State', 'Driver State', 'DL State', 'Driver City']

# Parse the stop date once, then expose year / month / day as separate columns.
df['Date Of Stop'] = pd.to_datetime(df['Date Of Stop'])
df['Year_Stop'] = df['Date Of Stop'].dt.year
df['Month_Stop'] = df['Date Of Stop'].dt.month
df['Day_Stop'] = df['Date Of Stop'].dt.day

# Fill missing values in the 'Year' column.
# NOTE(review): the imputer is fitted on 'Year' alone, so it has no other
# features to model the missing values from; confirm whether a plain
# mean/median fill (or adding predictor columns) is what is intended here.
year_imputer = IterativeImputer(max_iter=10, random_state=0)
df['Year'] = year_imputer.fit_transform(df[['Year']])

for col in categorical_cols:
    modes = df[col].mode()
    if modes.empty:
        # Column has no non-null values, so there is no "most frequent" value
        # to fill with; leave it as-is and let the summary below report it.
        continue
    # Assign the result back instead of calling fillna(inplace=True) on the
    # df[col] selection: the chained in-place form is deprecated and does not
    # update the parent frame when pandas Copy-on-Write is active.
    df[col] = df[col].fillna(modes.iloc[0])

# Per-column count of values that are still missing, smallest first.
missing_values = df.isna().sum().sort_values()
print(missing_values)


Date Of Stop          0
Year_Stop             0
Geolocation           0
DL State              0
Driver State          0
Driver City           0
Gender                0
Race                  0
Violation Type        0
Color                 0
Model                 0
Make                  0
Year                  0
VehicleType           0
State                 0
Month_Stop            0
Work Zone             0
Commercial Vehicle    0
Commercial License    0
Fatal                 0
Property Damage       0
Personal Injury       0
Belts                 0
Accident              0
Longitude             0
Latitude              0
Description           0
SubAgency             0
Agency                0
Time Of Stop          0
Alcohol               0
Day_Stop              0
Location              4
dtype: int64


In [4]:
# Print a summary of the frame (index range, column names and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1879902 entries, 0 to 1879911
Data columns (total 33 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Date Of Stop        datetime64[ns]
 1   Time Of Stop        object        
 2   Agency              object        
 3   SubAgency           object        
 4   Description         object        
 5   Location            object        
 6   Latitude            float64       
 7   Longitude           float64       
 8   Accident            object        
 9   Belts               object        
 10  Personal Injury     object        
 11  Property Damage     object        
 12  Fatal               object        
 13  Commercial License  object        
 14  Commercial Vehicle  object        
 15  Alcohol             object        
 16  Work Zone           object        
 17  State               object        
 18  VehicleType         object        
 19  Year                float64       
 20  Make   

In [5]:
import nltk
import string
from nltk.corpus import stopwords
# Make sure the NLTK 'stopwords' corpus is available locally before it is used below.
nltk.download('stopwords')

# Translation table that deletes every ASCII punctuation character; built once
# because it is identical for every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# English stop words, loaded lazily on first use and then reused.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text, stop_words=None):
    """Lower-case ``text``, strip punctuation and drop stop words.

    Parameters
    ----------
    text : str
        The text to clean.
    stop_words : collection of str, optional
        Lower-case words to remove. When omitted, NLTK's English stop-word
        list is used; it is loaded once and cached, rather than re-read on
        every call, which matters when this function is applied row by row.

    Returns
    -------
    str
        The remaining whitespace-separated tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    tokens = text.lower().translate(_PUNCTUATION_TABLE).split()
    return ' '.join(word for word in tokens if word not in stop_words)

# Overwrite each description with its cleaned form (row-by-row call to remove_stopwords).
df['Description'] = df['Description'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishaq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Show the column labels currently present in the frame.
df.columns

Index(['Date Of Stop', 'Time Of Stop', 'Agency', 'SubAgency', 'Description',
       'Location', 'Latitude', 'Longitude', 'Accident', 'Belts',
       'Personal Injury', 'Property Damage', 'Fatal', 'Commercial License',
       'Commercial Vehicle', 'Alcohol', 'Work Zone', 'State', 'VehicleType',
       'Year', 'Make', 'Model', 'Color', 'Violation Type', 'Race', 'Gender',
       'Driver City', 'Driver State', 'DL State', 'Geolocation', 'Year_Stop',
       'Month_Stop', 'Day_Stop'],
      dtype='object')

In [7]:
# Write the processed frame to CSV; index=False leaves the row index out of the file.
df.to_csv('data/Traffic_Violations_Imputed.csv', index=False)