<a href="https://colab.research.google.com/github/ishaqmarashy/red-light-traffic-violation-classification/blob/main/decision%20tree%20classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Location of the traffic-violations CSV (a local copy under ./data).
csv_url = "./data/Traffic_Violations.csv"
# Alternative remote source for the same file, kept for reference:
##csv_url = "https://data.montgomerycountymd.gov/api/views/4mse-ku6q/rows.csv?accessType=DOWNLOAD"

# Parse the CSV into a pandas DataFrame; every later cell works from `df`.
df = pd.read_csv(filepath_or_buffer=csv_url)

In [2]:
# Show every column label of the raw frame as a plain Python list.
df.columns.tolist()

['SeqID',
 'Date Of Stop',
 'Time Of Stop',
 'Agency',
 'SubAgency',
 'Description',
 'Location',
 'Latitude',
 'Longitude',
 'Accident',
 'Belts',
 'Personal Injury',
 'Property Damage',
 'Fatal',
 'Commercial License',
 'HAZMAT',
 'Commercial Vehicle',
 'Alcohol',
 'Work Zone',
 'Search Conducted',
 'Search Disposition',
 'Search Outcome',
 'Search Reason',
 'Search Reason For Stop',
 'Search Type',
 'Search Arrest Reason',
 'State',
 'VehicleType',
 'Year',
 'Make',
 'Model',
 'Color',
 'Violation Type',
 'Charge',
 'Article',
 'Contributed To Accident',
 'Race',
 'Gender',
 'Driver City',
 'Driver State',
 'DL State',
 'Arrest Type',
 'Geolocation']

In [3]:
# The dataset contains every kind of traffic violation, so keywords in the
# free-text 'Description' column are crucial for determining the violation type.
# The keywords were found by inspecting the unique 'Description' values; the
# initial strategy was Ctrl+F in Notepad++, searching for "red" and "stop".
#
# A row is kept only when its description contains ALL three keywords
# (case-insensitively). The spaces inside the patterns are part of the match,
# so e.g. ' red ' does not match "RED" at the very end of a description.
#
# na=False makes rows with a missing description count as "no match"
# explicitly, instead of producing NaN entries in the boolean mask.
#
# NOTE(review): a later cell rebinds `df_filtered` from the unfiltered `df`,
# so this filtered frame is only used for the value counts shown here —
# confirm that is intended.
description = df['Description']
red_light_mask = (
    description.str.contains(' stop ', case=False, na=False)
    & description.str.contains(' red ', case=False, na=False)
    & description.str.contains(' fail', case=False, na=False)
)
df_filtered = df[red_light_mask]
df_filtered['Description'].value_counts()

Description
DRIVER FAILURE TO STOP AT STEADY CIRCULAR RED SIGNAL                                                 21716
DRIVER FAIL TO STOP AT FLASHING RED TRAFFIC SIGNAL STOP LINE                                         18277
DRIVER FAIL TO STOP AT RED TRAFFIC SIGNAL BEFORE RIGHT TURN                                           8121
DRIVER FAIL TO STOP AT STEADY RED ARROW SIGNAL                                                        4290
DRIVER FAIL TO STOP AT STEADY CIRCULAR RED SIGNAL                                                     1981
DRIVER FAIL TO STOP AT STEADY CIRCULAR RED SIGNAL OR AT STEADY RED ARROW SIGNAL                        585
DRIVER FAILING TO STOP AT RED SIGNAL BEFORE LEFT TURN                                                  295
DRIVER FAIL TO STOP AT RED TRAFFIC SIGNAL BEFORE ANY OTHER TURN                                        216
DRIVER FAILING TO STOP AT RED SIGNAL BEFORE LEFT TURN, MAKING IMPROPER LEFT TURN AT RED SIGNAL         212
DRIVER FAILING TO STOP AT

In [4]:
# Remove columns that do not contribute to the outcome or are incomplete.
# 'Latitude' and 'Longitude' are not in this list, so they are kept; they may
# make a nice visual later.
#
# NOTE(review): the drop below starts again from the full `df`, so the keyword
# filter applied to `df_filtered` in the previous cell is not carried over, and
# 'Description' is removed here as well — confirm that is intended.
columns_to_drop = [
    'SeqID',
    'Agency',
    'SubAgency',
    'Commercial License',
    'HAZMAT',
    'Commercial Vehicle',
    'Search Conducted',
    'Search Disposition',
    'Search Outcome',
    'Search Type',
    'Article',
    'Contributed To Accident',
    'Driver City',
    'Driver State',
    'Arrest Type',
    'Violation Type',
    'Search Reason',
    'Search Reason For Stop',
    'Charge',
    'Search Arrest Reason',
    'Date Of Stop',
    'Time Of Stop',
    'Description',
]

# Drop the specified columns. errors='ignore' means a label that is absent
# from `df` is skipped silently rather than raising a KeyError.
df_filtered = df.drop(columns=columns_to_drop, errors='ignore')
list(df_filtered)

['Location',
 'Latitude',
 'Longitude',
 'Accident',
 'Belts',
 'Personal Injury',
 'Property Damage',
 'Fatal',
 'Alcohol',
 'Work Zone',
 'State',
 'VehicleType',
 'Year',
 'Make',
 'Model',
 'Color',
 'Race',
 'Gender',
 'DL State',
 'Geolocation']

In [8]:
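# Check the features here for columns that may be useful.
# Unique entries are a good way to get an idea of what the features mean.
# (The example below stays commented out: 'Date Of Stop' is dropped from
# df_filtered in an earlier cell, so it would have to be run against `df`.)
#df_filtered['Date Of Stop'].value_counts()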

In [9]:
# Print a summary of the remaining columns and their dtypes.
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1879912 entries, 0 to 1879911
Data columns (total 20 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Location         object 
 1   Latitude         float64
 2   Longitude        float64
 3   Accident         object 
 4   Belts            object 
 5   Personal Injury  object 
 6   Property Damage  object 
 7   Fatal            object 
 8   Alcohol          object 
 9   Work Zone        object 
 10  State            object 
 11  VehicleType      object 
 12  Year             float64
 13  Make             object 
 14  Model            object 
 15  Color            object 
 16  Race             object 
 17  Gender           object 
 18  DL State         object 
 19  Geolocation      object 
dtypes: float64(3), object(17)
memory usage: 286.9+ MB


In [10]:
# Drop every row that has a missing value in any of the remaining columns.
# The result is reassigned instead of using inplace=True, so the step reads
# as an explicit transformation and can be chained with other calls.
# The original row labels are kept (the index is not reset).
df_filtered = df_filtered.dropna()
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1857187 entries, 0 to 1879911
Data columns (total 20 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Location         object 
 1   Latitude         float64
 2   Longitude        float64
 3   Accident         object 
 4   Belts            object 
 5   Personal Injury  object 
 6   Property Damage  object 
 7   Fatal            object 
 8   Alcohol          object 
 9   Work Zone        object 
 10  State            object 
 11  VehicleType      object 
 12  Year             float64
 13  Make             object 
 14  Model            object 
 15  Color            object 
 16  Race             object 
 17  Gender           object 
 18  DL State         object 
 19  Geolocation      object 
dtypes: float64(3), object(17)
memory usage: 297.6+ MB


In [11]:
# WHITE and BLACK are the most frequent values of 'Race' in this data.
# No conclusion can be drawn from that alone: the area the data was collected
# from may simply have more White and Black individuals than other races,
# which would over-represent those groups in the counts below.
df_filtered['Race'].value_counts()

Race
WHITE              632776
BLACK              589878
HISPANIC           411268
OTHER              114611
ASIAN              105306
NATIVE AMERICAN      3348
Name: count, dtype: int64

In [12]:
# Count how many rows fall under each distinct 'Gender' value.
df_filtered['Gender'].value_counts()

Gender
M    1250305
F     604051
U       2831
Name: count, dtype: int64