In [1]:
# Initial imports
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline


In [2]:
# Cap how many rows and columns pandas renders when a frame is displayed
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)

In [3]:
# Read the pre-processed traffic violations CSV.
# The path is relative, so the file must sit in the notebook's working directory.
file_path = "Traffic_Violations_Processed.csv"
df_traffic = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df_traffic.head(5)

Unnamed: 0,Description,Location,Latitude,Longitude,Accident,Belts,Personal Injury,Property Damage,Fatal,Alcohol,Work Zone,Search Conducted,Search Disposition,Search Outcome,Search Reason,Search Type,Search Arrest Reason,Year,Make,Color,Violation Type,Race,Gender,Year of Stop,Month of Stop,Driver State Category,License Plate State Category,Hour of Stop,Color Type
0,Registration Violation,KEMP MILL ROAD @ ALPERT LANE,39.053038,-77.024637,No,No,No,No,No,No,No,No,,Citation,,,,2013,HYUNDAI,RED,Citation,WHITE,F,2023,4,in state,in state,23,Colorful
1,NEGLIGENT DRIVING VEHICLE IN CARELESS AND IMPR...,JONES MILL ROAD/ LE VELLE DRIVE,38.99148,-77.097113,No,No,No,No,No,No,No,No,,Citation,,,,2019,NISSAN,GRAY,Citation,BLACK,F,2023,4,in state,in state,4,Neutral
2,Reckless Driving,JONES MILL ROAD/ LE VELLE DRIVE,38.99148,-77.097113,No,No,No,No,No,No,No,No,,Citation,,,,2019,NISSAN,GRAY,Citation,BLACK,F,2023,4,in state,in state,4,Neutral
3,FAILURE TO DRIVE VEHICLE ON RIGHT HALF OF ROAD...,JONES MILL ROAD/ LE VELLE DRIVE,38.99148,-77.097113,No,No,No,No,No,No,No,No,,Citation,,,,2019,NISSAN,GRAY,Citation,BLACK,F,2023,4,in state,in state,4,Neutral
4,DUI,JONES MILL ROAD/ LE VELLE DRIVE,38.99148,-77.097113,No,No,No,No,No,No,No,No,,Citation,,,,2019,NISSAN,GRAY,Citation,BLACK,F,2023,4,in state,in state,4,Neutral


In [4]:
# List every column label in the loaded frame before choosing the model inputs
df_traffic.columns

Index(['Description', 'Location', 'Latitude', 'Longitude', 'Accident', 'Belts',
       'Personal Injury', 'Property Damage', 'Fatal', 'Alcohol', 'Work Zone',
       'Search Conducted', 'Search Disposition', 'Search Outcome',
       'Search Reason', 'Search Type', 'Search Arrest Reason', 'Year', 'Make',
       'Color', 'Violation Type', 'Race', 'Gender', 'Year of Stop',
       'Month of Stop', 'Driver State Category',
       'License Plate State Category', 'Hour of Stop', 'Color Type'],
      dtype='object')

In [5]:
 # Get required columns
column_list = [
    'Description',
    'Accident',
    'Alcohol',
    'Search Outcome',
    'Violation Type',
    'License Plate State Category',
]

# Keep only the columns used for modelling (features plus the 'Violation Type' target)
df_traffic_trimmed = df_traffic.loc[:, column_list]
df_traffic_trimmed.head(5)

Unnamed: 0,Description,Accident,Alcohol,Search Outcome,Violation Type,License Plate State Category
0,Registration Violation,No,No,Citation,Citation,in state
1,NEGLIGENT DRIVING VEHICLE IN CARELESS AND IMPR...,No,No,Citation,Citation,in state
2,Reckless Driving,No,No,Citation,Citation,in state
3,FAILURE TO DRIVE VEHICLE ON RIGHT HALF OF ROAD...,No,No,Citation,Citation,in state
4,DUI,No,No,Citation,Citation,in state


In [7]:
# Check the number of records.
# count() reports the non-null values per column, so a column whose total is
# lower than the others contains missing values.
df_traffic_trimmed.count()

Description                     1665836
Accident                        1665836
Alcohol                         1665836
Search Outcome                  1006148
Violation Type                  1665836
License Plate State Category    1665836
dtype: int64

In [8]:
# Replace every missing value with the literal string "None" (mainly affects the
# Search Outcome column) so missing entries become a category of their own.
df_traffic_clean = df_traffic_trimmed.fillna(value="None")

In [9]:
# Re-count the non-null values per column to confirm the fill left no gaps
df_traffic_clean.count()

Description                     1665836
Accident                        1665836
Alcohol                         1665836
Search Outcome                  1665836
Violation Type                  1665836
License Plate State Category    1665836
dtype: int64

In [10]:
# One-hot encoding of categorical columns
columns_to_encode = ['Description','Accident','Alcohol','Search Outcome','Violation Type','License Plate State Category']

# `columns` has to be passed by keyword: the second positional parameter of
# pd.get_dummies is `prefix`, so the previous positional call only worked because
# every selected column holds strings and the list length happened to match.
# dtype is pinned to uint8 so the indicators stay 0/1 integers even on pandas
# versions whose default indicator dtype is bool.
traffic_dummies = pd.get_dummies(
    df_traffic_clean[columns_to_encode],
    columns=columns_to_encode,
    dtype="uint8",
)
traffic_dummies.head()

Unnamed: 0,Description_DRIVER ENTERING INTERSECTION AT FLASHING RED TRAFFIC SIGNAL WITHOUT STOPPING,Description_DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC CONTROL DEVICE INSTRUCTIONS,Description_DRIVING VEH. W/O ADEQUATE REAR REG. PLATE ILLUMINATION,Description_DUI,Description_FAILURE TO DRIVE VEHICLE ON RIGHT HALF OF ROADWAY WHEN REQUIRED,Description_Failure to Yield,Description_Failure to stop at different circumstances,Description_Improper Equipment,Description_Improper Stop,Description_Involved in Accident,Description_Lane Violation,Description_Learners Permit,Description_License Violation,"Description_NEGLIGENT DRIVING VEHICLE IN CARELESS AND IMPRUDENT MANNER ENDANGERING PROPERTY, LIFE AND PERSON",Description_Non-Moving Violation,Description_Obstructed View,Description_Other,Description_Reckless Driving,Description_Registration Violation,Description_SeatBelt - Not restrained,Description_Speeding,Description_Suspended License or Registration,Description_Uninsured Vehicle,Description_Unsafe Turns,Description_Using Phone,Accident_No,Accident_Yes,Alcohol_No,Alcohol_Yes,Search Outcome_Arrest,Search Outcome_Citation,Search Outcome_None,Search Outcome_Recovered Evidence,Search Outcome_SERO,Search Outcome_Warning,Violation Type_Citation,Violation Type_Warning,License Plate State Category_in state,License Plate State Category_out of state
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0


In [11]:
# Check the columns after one-hot encoding
traffic_dummies.columns

Index(['Description_DRIVER ENTERING INTERSECTION AT FLASHING RED TRAFFIC SIGNAL WITHOUT STOPPING',
       'Description_DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC CONTROL DEVICE INSTRUCTIONS',
       'Description_DRIVING VEH. W/O ADEQUATE REAR REG. PLATE ILLUMINATION',
       'Description_DUI',
       'Description_FAILURE TO DRIVE VEHICLE ON RIGHT HALF OF ROADWAY WHEN REQUIRED',
       'Description_Failure to Yield',
       'Description_Failure to stop at different circumstances',
       'Description_Improper Equipment', 'Description_Improper Stop',
       'Description_Involved in Accident', 'Description_Lane Violation',
       'Description_Learners Permit', 'Description_License Violation',
       'Description_NEGLIGENT DRIVING VEHICLE IN CARELESS AND IMPRUDENT MANNER ENDANGERING PROPERTY, LIFE AND PERSON',
       'Description_Non-Moving Violation', 'Description_Obstructed View',
       'Description_Other', 'Description_Reckless Driving',
       'Description_Registration Violation',

In [12]:
# Feature matrix: every dummy column except the two target indicators.
# drop() returns a new frame, so traffic_dummies itself is left untouched.
# NOTE(review): the 'Search Outcome_*' dummies (Citation / Warning) may leak the
# label - confirm they are legitimate predictors.
X = traffic_dummies.drop(columns=['Violation Type_Citation', 'Violation Type_Warning'])
X.head()

Unnamed: 0,Description_DRIVER ENTERING INTERSECTION AT FLASHING RED TRAFFIC SIGNAL WITHOUT STOPPING,Description_DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC CONTROL DEVICE INSTRUCTIONS,Description_DRIVING VEH. W/O ADEQUATE REAR REG. PLATE ILLUMINATION,Description_DUI,Description_FAILURE TO DRIVE VEHICLE ON RIGHT HALF OF ROADWAY WHEN REQUIRED,Description_Failure to Yield,Description_Failure to stop at different circumstances,Description_Improper Equipment,Description_Improper Stop,Description_Involved in Accident,Description_Lane Violation,Description_Learners Permit,Description_License Violation,"Description_NEGLIGENT DRIVING VEHICLE IN CARELESS AND IMPRUDENT MANNER ENDANGERING PROPERTY, LIFE AND PERSON",Description_Non-Moving Violation,Description_Obstructed View,Description_Other,Description_Reckless Driving,Description_Registration Violation,Description_SeatBelt - Not restrained,Description_Speeding,Description_Suspended License or Registration,Description_Uninsured Vehicle,Description_Unsafe Turns,Description_Using Phone,Accident_No,Accident_Yes,Alcohol_No,Alcohol_Yes,Search Outcome_Arrest,Search Outcome_Citation,Search Outcome_None,Search Outcome_Recovered Evidence,Search Outcome_SERO,Search Outcome_Warning,License Plate State Category_in state,License Plate State Category_out of state
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0


In [13]:
# Check the number of Citation and Warning records (class balance of the target)
df_traffic_clean['Violation Type'].value_counts()

Citation    756296
Name: Violation Type, dtype: int64

In [14]:
# Define the outcome column as a 1-D NumPy array.
# Series.ravel() is deprecated in recent pandas (a Series is already 1-D);
# to_numpy() is the supported way to obtain the underlying array.
y = traffic_dummies['Violation Type_Warning'].to_numpy()

# Output 0 represents Citation and 1 represents Warning
y[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

In [15]:
# Hold out 25% of the rows for testing (the train_test_split default, made
# explicit here); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=35,
)

## Fitting the Random Forest Model

In [16]:
# Create a random forest classifier with 200 trees.
# n_jobs=-1 trains (and later predicts with) the trees on all available CPU
# cores; the fixed random_state keeps the forest reproducible.
rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=35,
    n_jobs=-1,
)

In [17]:
 # Fitting the model
# Train the forest on the training split and keep the fitted estimator in rf_model
rf_model = rf_model.fit(X=X_train, y=y_train)

## Making Predictions using the Random Forest Model

In [18]:
# Predict the class (0 = Citation, 1 = Warning) for every row of the test split
predictions = rf_model.predict(X=X_test)

## Model Evaluation - Random Forest Model

In [19]:
# Overall share of test rows the random forest classified correctly
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

# Confusion matrix, wrapped in a labelled frame:
# rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual Citation", "Actual Warning"],
    columns=["Predicted Citation", "Predicted Warning"],
)

In [20]:
# Show the random forest results: confusion matrix, accuracy, per-class metrics
print("Confusion Matrix - Random Forest")
display(cm_df)

print(f"Accuracy Score : {acc_score}")

print("Classification Report - Random Forest")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix - Random Forest


Unnamed: 0,Predicted Citation,Predicted Warning
Actual Citation,137386,52045
Actual Warning,24895,202133


Accuracy Score : 0.8152519215577043
Classification Report - Random Forest
              precision    recall  f1-score   support

           0       0.85      0.73      0.78    189431
           1       0.80      0.89      0.84    227028

    accuracy                           0.82    416459
   macro avg       0.82      0.81      0.81    416459
weighted avg       0.82      0.82      0.81    416459



## Logistic Regression Model

In [21]:
# Logistic regression using the SAG solver, capped at 200 iterations,
# with a fixed seed so the fit is reproducible
lr_model = LogisticRegression(random_state=1, max_iter=200, solver='sag')
lr_model


### Fitting Logistic Regression Model

In [22]:
# Train the logistic regression on the same training split as the random forest
lr_model.fit(X=X_train, y=y_train)

### Making Predictions using Logistic Regression Model

In [23]:
# Predict the class (0 = Citation, 1 = Warning) for every row of the test split
predictions_lr = lr_model.predict(X=X_test)

### Model Evaluation - Logistic Regression

In [24]:
# Overall share of test rows the logistic regression classified correctly
acc_score_lr = accuracy_score(y_true=y_test, y_pred=predictions_lr)

# Confusion matrix, wrapped in a labelled frame:
# rows are the actual classes, columns the predicted ones.
cm_lr = confusion_matrix(y_true=y_test, y_pred=predictions_lr)
cm_df_lr = pd.DataFrame(
    data=cm_lr,
    index=["Actual Citation", "Actual Warning"],
    columns=["Predicted Citation", "Predicted Warning"],
)

In [25]:
# Show the logistic regression results: confusion matrix, accuracy, per-class metrics
print("Confusion Matrix - Logistic Regression Model")
display(cm_df_lr)

print(f"Accuracy Score : {acc_score_lr}")

print("Classification Report - Logistic Regression Model")
print(classification_report(y_true=y_test, y_pred=predictions_lr))

Confusion Matrix - Logistic Regression Model


Unnamed: 0,Predicted Citation,Predicted Warning
Actual Citation,137537,51894
Actual Warning,25865,201163


Accuracy Score : 0.8132853414141609
Classification Report - Logistic Regression Model
              precision    recall  f1-score   support

           0       0.84      0.73      0.78    189431
           1       0.79      0.89      0.84    227028

    accuracy                           0.81    416459
   macro avg       0.82      0.81      0.81    416459
weighted avg       0.82      0.81      0.81    416459

