<a href='https://colab.research.google.com/github/ishaqmarashy/red-light-traffic-violation-classification/blob/main/decision%20tree%20classifier.ipynb' target='_parent'><img src='https://colab.research.google.com/assets/colab-badge.svg' alt='Open In Colab'/></a>

## Preprocess Data

In [271]:
import pandas as pd
import os
import numpy as np

csv_filename1 = "data/tv_noaa.csv"
df = pd.read_csv(csv_filename1)

In [272]:
df.drop(columns=['Accident','Violation Type','Charge','Article','Contributed To Accident','Article','HAZMAT','Personal Injury','Fatal'],inplace=True)

In [273]:
df.drop(columns=['Latitude', 'Longitude'],inplace=True)

In [274]:
df.isna().sum()

Belts                 0
Property Damage       0
Commercial License    0
Commercial Vehicle    0
Alcohol               0
Work Zone             0
State                 0
VehicleType           0
Year                  1
Make                  0
Model                 0
Color                 5
Race                  0
Gender                0
Driver City           0
Driver State          0
DL State              1
DV                    0
DATE                  0
LATITUDE              0
LONGITUDE             0
AWND                  0
PGTM                  0
PRCP                  0
TMAX                  0
WSF2                  0
WSF5                  0
TMIN                  0
WDF2                  0
WDF5                  0
dtype: int64

In [275]:
import pandas as pd
from sklearn.impute import SimpleImputer


data_for_imputation = df.copy()

categorical_imputer = SimpleImputer(strategy='most_frequent')

imputed_values = categorical_imputer.fit_transform(data_for_imputation)

imputed_df = pd.DataFrame(imputed_values,columns=df.columns)

df= imputed_df

df


Unnamed: 0,Belts,Property Damage,Commercial License,Commercial Vehicle,Alcohol,Work Zone,State,VehicleType,Year,Make,...,LONGITUDE,AWND,PGTM,PRCP,TMAX,WSF2,WSF5,TMIN,WDF2,WDF5
0,No,No,No,No,No,No,MD,02 - Automobile,2020.0,LINC,...,-75.12284,2.2,2301.0,41.1,15.6,9.8,16.1,10.0,300.0,320.0
1,No,No,No,No,No,No,MD,02 - Automobile,2023.0,MAZDA,...,-75.12284,3.4,1356.0,0.5,12.8,7.6,11.6,6.1,230.0,220.0
2,No,Yes,No,No,No,No,MD,02 - Automobile,2023.0,VOLK,...,-75.12284,2.9,1452.0,0.0,26.7,6.7,10.7,11.1,230.0,230.0
3,No,No,No,No,No,No,MD,02 - Automobile,2012.0,HONDA,...,-75.12284,4.0,1231.0,0.0,18.9,8.1,11.6,7.2,10.0,360.0
4,No,Yes,No,No,No,No,MD,02 - Automobile,2018.0,HOND,...,-75.12284,0.9,2212.0,4.3,20.6,4.5,6.7,11.1,320.0,320.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1781,No,No,No,No,No,No,MD,02 - Automobile,2005.0,HOND,...,-75.12284,1.5,932.0,0.0,22.2,5.4,8.1,13.3,30.0,30.0
1782,No,No,No,No,No,No,MD,02 - Automobile,1997.0,MITS,...,-75.12284,1.0,1613.0,0.0,22.2,2.2,3.1,8.3,20.0,360.0
1783,No,No,No,No,No,No,MD,02 - Automobile,2005.0,HOND,...,-75.12284,0.8,1159.0,0.0,23.3,3.6,6.3,6.7,160.0,200.0
1784,No,No,No,No,No,No,MD,02 - Automobile,1996.0,ISUZU,...,-75.12284,3.8,1519.0,0.0,9.4,8.9,13.0,1.1,290.0,310.0


In [276]:
df['VehicleType'].value_counts()

VehicleType
02 - Automobile              1580
05 - Light Duty Truck          90
28 - Other                     40
03 - Station Wagon             31
01 - Motorcycle                18
06 - Heavy Duty Truck          15
10 - Transit Bus                6
07 - Truck/Road Tractor         3
08 - Recreational Vehicle       1
20 - Commercial Rig             1
29 - Unknown                    1
Name: count, dtype: int64

In [277]:
def classify_vehicle(vehicle_type):
    if any(category in vehicle_type for category in ['01 - Motorcycle']):
        return 'Light'
    elif any(category in vehicle_type for category in ['28 - Other','29 - Unknown', '03 - Station Wagon', '02 - Automobile']):
        return 'Medium'
    else:
        return 'Heavy'
    
df['VehicleType'] = df['VehicleType'].apply(classify_vehicle)

In [278]:
df['Year']=df['Year'].astype(int)
df['Vehicle'] = df[[ 'Year', 'Make', 'Model']].astype(str).apply(lambda x: ', '.join(x), axis=1)
df.drop(columns=['Year', 'Make', 'Model'],inplace=True)

In [279]:
df['DATE'] = pd.to_datetime(df['DATE'])
df['Weekend'] = df['DATE'].dt.dayofweek.apply(lambda x: 1 if x in [5, 6] else 0)
df['Time'] = df['DATE'].dt.time
df['Year'] = df['DATE'].dt.year
df['Month'] = df['DATE'].dt.month
df['Day of Week'] = pd.to_datetime(df['DATE']).dt.dayofweek + 1

In [280]:
def categorize_time(time):
    if 5 < time.hour <= 9:
        return 'am-peak'
    elif 16 < time.hour <= 20:
        return 'pm-peak'
    else:
        return 'off-peak'
    
df['TimeCategory'] = df['Time'].apply(categorize_time)

In [281]:
def categorize_time(time):
    if 0 < time.hour <= 12:
        return 'AM'
    else:
        return 'PM'
    
df['TimeCategory_AMPM'] = df['Time'].apply(categorize_time)

In [282]:
def categorize_time_custom(time):
    if 0 < time.hour <= 6:
        return 'Night'
    elif 6 < time.hour <= 12:
        return 'Morning'
    elif 12 < time.hour <= 18:
        return 'Afternoon'
    else:
        return 'Evening'

df['TimeCategory_NMAE'] = df['Time'].apply(categorize_time_custom)


In [283]:
df['Day of Week'] = np.sin((2 * np.pi *pd.to_datetime(df['DATE']).dt.dayofweek + 1)/ (7))

In [284]:
def sine_of_time(time):
    return np.sin ((time.hour ) / (24))

df['SinOfTime'] = df['Time'].apply(sine_of_time)

In [285]:
def time_of_day(time):
    return time.hour 

df['Time'] = df['Time'].apply(time_of_day)

In [286]:
    
df['TimeCategory'] 

0       off-peak
1       off-peak
2       off-peak
3       off-peak
4       off-peak
          ...   
1781    off-peak
1782    off-peak
1783    off-peak
1784    off-peak
1785    off-peak
Name: TimeCategory, Length: 1786, dtype: object

In [287]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder()

columns_to_label_encode = ['Belts', 'Property Damage', 'Commercial License', 'TimeCategory_AMPM',
                           'Commercial Vehicle', 'Alcohol', 'Gender', 'Work Zone']

columns_to_onehot_encode = []

for column in columns_to_label_encode:
    df[column] = label_encoder.fit_transform(df[column])

onehot_encoded = onehot_encoder.fit_transform(df[columns_to_onehot_encode]).toarray()
onehot_columns = onehot_encoder.get_feature_names_out(columns_to_onehot_encode)
df_onehot = pd.DataFrame(onehot_encoded, columns=onehot_columns)

df = pd.concat([df, df_onehot], axis=1)

df = df.drop(columns=columns_to_onehot_encode)
df

Unnamed: 0,Belts,Property Damage,Commercial License,Commercial Vehicle,Alcohol,Work Zone,State,VehicleType,Color,Race,...,Vehicle,Weekend,Time,Year,Month,Day of Week,TimeCategory,TimeCategory_AMPM,TimeCategory_NMAE,SinOfTime
0,0,0,0,0,0,0,MD,Medium,WHITE,BLACK,...,"2020, LINC, UT",1,3,2023,4,-0.685100,off-peak,0,Night,0.124675
1,0,0,0,0,0,0,MD,Medium,"BLUE, LIGHT",HISPANIC,...,"2023, MAZDA, CX-5",0,12,2023,5,0.862634,off-peak,0,Morning,0.479426
2,0,1,0,0,0,0,MD,Medium,WHITE,BLACK,...,"2023, VOLK, ATLAS",0,10,2023,5,0.862634,off-peak,0,Morning,0.404715
3,0,0,0,0,0,0,MD,Medium,GRAY,WHITE,...,"2012, HONDA, ODYSSEY",0,14,2023,5,0.933316,off-peak,1,Afternoon,0.550809
4,0,1,0,0,0,0,MD,Medium,BLACK,HISPANIC,...,"2018, HOND, SW",1,10,2023,5,-0.996677,off-peak,0,Morning,0.404715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1781,0,0,0,0,0,0,MD,Medium,GREEN,ASIAN,...,"2005, HOND, 4S",0,2,2013,9,0.862634,off-peak,0,Night,0.083237
1782,0,0,0,0,0,0,MD,Medium,BLACK,WHITE,...,"1997, MITS, 4S",0,2,2013,9,0.933316,off-peak,0,Night,0.083237
1783,0,0,0,0,0,0,MD,Medium,BLACK,WHITE,...,"2005, HOND, 4S",0,2,2013,9,0.301191,off-peak,0,Night,0.083237
1784,0,0,0,0,0,0,MD,Medium,WHITE,WHITE,...,"1996, ISUZU, RODEO",0,23,2014,12,0.933316,off-peak,1,Evening,0.818235


In [288]:
    
df['TimeCategory'] 

0       off-peak
1       off-peak
2       off-peak
3       off-peak
4       off-peak
          ...   
1781    off-peak
1782    off-peak
1783    off-peak
1784    off-peak
1785    off-peak
Name: TimeCategory, Length: 1786, dtype: object

In [289]:
df.columns

Index(['Belts', 'Property Damage', 'Commercial License', 'Commercial Vehicle',
       'Alcohol', 'Work Zone', 'State', 'VehicleType', 'Color', 'Race',
       'Gender', 'Driver City', 'Driver State', 'DL State', 'DV', 'DATE',
       'LATITUDE', 'LONGITUDE', 'AWND', 'PGTM', 'PRCP', 'TMAX', 'WSF2', 'WSF5',
       'TMIN', 'WDF2', 'WDF5', 'Vehicle', 'Weekend', 'Time', 'Year', 'Month',
       'Day of Week', 'TimeCategory', 'TimeCategory_AMPM', 'TimeCategory_NMAE',
       'SinOfTime'],
      dtype='object')

In [290]:
df['AWND'] = pd.to_numeric(df['AWND'], errors='coerce')
df['PGTM'] = pd.to_numeric(df['PGTM'], errors='coerce')
df['PRCP'] = pd.to_numeric(df['PRCP'], errors='coerce')
df['TMAX'] = pd.to_numeric(df['TMAX'], errors='coerce')
df['WSF2'] = pd.to_numeric(df['WSF2'], errors='coerce')
df['WSF5'] = pd.to_numeric(df['WSF5'], errors='coerce')
df['TMIN'] = pd.to_numeric(df['TMIN'], errors='coerce')
df['WDF2'] = pd.to_numeric(df['WDF2'], errors='coerce')
df['WDF5'] = pd.to_numeric(df['WDF5'], errors='coerce')
df['DV']   = pd.to_numeric(df['DV'], errors='coerce')

x=df.drop(columns=['DV'])
y=df['DV']

In [291]:
df['DV'].value_counts()

DV
0    1227
1     559
Name: count, dtype: int64

In [292]:
import pandas as pd


result = df.groupby(['Year', 'DV']).size().reset_index(name='Count')

result


Unnamed: 0,Year,DV,Count
0,2012,0,131
1,2012,1,62
2,2013,0,133
3,2013,1,57
4,2014,0,131
5,2014,1,40
6,2015,0,103
7,2015,1,53
8,2016,0,122
9,2016,1,56


In [293]:
result = df.groupby(['Weekend', 'DV']).size().reset_index(name='Count')
result

Unnamed: 0,Weekend,DV,Count
0,0,0,926
1,0,1,412
2,1,0,301
3,1,1,147


In [294]:
result = df.groupby(['TimeCategory', 'DV']).size().reset_index(name='Count')
result


Unnamed: 0,TimeCategory,DV,Count
0,am-peak,0,207
1,am-peak,1,130
2,off-peak,0,824
3,off-peak,1,336
4,pm-peak,0,196
5,pm-peak,1,93


In [295]:
result = df.groupby(['Belts', 'DV']).size().reset_index(name='Count')
result


Unnamed: 0,Belts,DV,Count
0,0,0,1123
1,0,1,450
2,1,0,104
3,1,1,109


In [296]:
result = df.groupby(['Gender', 'DV']).size().reset_index(name='Count')
result


Unnamed: 0,Gender,DV,Count
0,0,0,473
1,0,1,241
2,1,0,754
3,1,1,316
4,2,1,2


In [297]:
result = df.groupby(['TimeCategory_NMAE', 'DV']).size().reset_index(name='Count')
result


Unnamed: 0,TimeCategory_NMAE,DV,Count
0,Afternoon,0,346
1,Afternoon,1,191
2,Evening,0,268
3,Evening,1,76
4,Morning,0,340
5,Morning,1,213
6,Night,0,273
7,Night,1,79


In [298]:
result = df.groupby(['VehicleType', 'DV']).size().reset_index(name='Count')
result


Unnamed: 0,VehicleType,DV,Count
0,Heavy,0,69
1,Heavy,1,47
2,Light,0,11
3,Light,1,7
4,Medium,0,1147
5,Medium,1,505


In [299]:
result = df.groupby(['TimeCategory_AMPM', 'DV']).size().reset_index(name='Count')
result


Unnamed: 0,TimeCategory_AMPM,DV,Count
0,0,0,613
1,0,1,292
2,1,0,614
3,1,1,267


In [300]:
result = df.groupby(['Race', 'DV']).size().reset_index(name='Count')
result


Unnamed: 0,Race,DV,Count
0,ASIAN,0,108
1,ASIAN,1,54
2,BLACK,0,282
3,BLACK,1,140
4,HISPANIC,0,316
5,HISPANIC,1,124
6,NATIVE AMERICAN,0,2
7,NATIVE AMERICAN,1,1
8,OTHER,0,88
9,OTHER,1,43


In [301]:
result = df.groupby(['Alcohol', 'DV']).size().reset_index(name='Count')
result


Unnamed: 0,Alcohol,DV,Count
0,0,0,1225
1,0,1,559
2,1,0,2


In [None]:
result = df.groupby(['Alcohol', 'DV']).size().reset_index(name='Count')
result
