In [1]:
import pandas as pd
from pathlib import Path
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
big_path = "Traffic_Violations.csv"
big_df = pd.read_csv(big_path, low_memory=False)

In [3]:
big_df_smaller = big_df.drop(["Agency", "Geolocation", "VehicleType", "HAZMAT", 
                                "Commercial Vehicle", "Commercial License", "Article", "Charge", 
                                "SeqID", "SubAgency", "Model", "Contributed To Accident", "DL State", 
                                "Driver City", "Arrest Type", "Search Reason For Stop"], axis = 1)

In [4]:
# Drop rows that do not result in a citation or warning
big_df_smaller.drop(big_df_smaller[big_df_smaller['Violation Type'] == 'ESERO'].index, inplace=True)
big_df_smaller.drop(big_df_smaller[big_df_smaller['Violation Type'] == 'SERO'].index, inplace=True)

In [5]:
# Clean the Year column
# Drop rows with null year
big_df_smaller = big_df_smaller.dropna(subset=["Year"])
# Change year to integer
big_df_smaller["Year"]= big_df_smaller["Year"].astype(int)
# Get years in a list
years = big_df_smaller["Year"].unique()
# Make a list of garbage years
years_to_remove = [0, 6338, 1005, 1196, 2912, 1009, 2088, 1007, 2102, 2109, 2105, 2997]
# Clean the list of years
good_years = [value for value in years if value not in years_to_remove]
# Keep only rows with good years
big_df_smaller = big_df_smaller[big_df_smaller["Year"].isin(good_years)]
big_df_smaller.shape

(1790375, 27)

In [6]:
big_df_smaller["Search Reason"].value_counts(dropna=False)

NaN                       1707686
Incident to Arrest          47315
Probable Cause              20678
Consensual                  11540
K-9                          1616
Other                        1029
Exigent Circumstances         500
Probable Cause for CDS          4
Arrest/Tow                      3
plain view marijuana            3
DUI                             1
Name: Search Reason, dtype: int64

In [7]:
# Get value counts including null values
value_counts_result = big_df_smaller['Driver State'].value_counts(dropna=False).reset_index()

# Rename the columns for clarity
value_counts_result.columns = ['Driver State', 'Count']

In [8]:
big_df_smaller.fillna('None', inplace=True)

In [9]:
big_df_smaller['Date Of Stop'] = pd.to_datetime(big_df_smaller['Date Of Stop'])
big_df_smaller['Year Date'] = big_df_smaller['Date Of Stop'].dt.year
big_df_smaller['Month'] = big_df_smaller['Date Of Stop'].dt.month
big_df_smaller['Day'] = big_df_smaller['Date Of Stop'].dt.day

In [10]:
rf_df = big_df_smaller.drop(['Time Of Stop', 'Location', 'Latitude',
       'Longitude', 'Belts', 'Personal Injury', 'Property Damage',
       'Fatal', 'Alcohol', 'Work Zone', 'Search Conducted',
       'Search Disposition', 'Search Outcome', 'Search Type',
       'Search Arrest Reason', 'State', 'Year','Description'], axis = 1)

In [11]:
random_forest_df = rf_df.drop(['Date Of Stop'], axis = 1)

In [12]:
random_forest_df.columns

Index(['Accident', 'Search Reason', 'Make', 'Color', 'Violation Type', 'Race',
       'Gender', 'Driver State', 'Year Date', 'Month', 'Day'],
      dtype='object')

In [13]:
# Check if there are any null values in the entire DataFrame
if random_forest_df.isnull().values.any():
    print("There are null values in the DataFrame.")
else:
    print("There are no null values in the DataFrame.")

There are no null values in the DataFrame.


In [14]:
# Convert categorical data to numeric with `pd.get_dummies`
# Specify the columns you want to one-hot encode
columns_to_encode = ['Accident', 'Search Reason', 'Race','Gender', 'Driver State','Make']

# Use get_dummies to convert the specified columns into dummy variables
random_dummies = pd.get_dummies(random_forest_df[columns_to_encode], prefix=columns_to_encode, drop_first=True)

# Concatenate the dummy variables with the original DataFrame
random_forest_df = pd.concat([random_forest_df, random_dummies], axis=1)

# Drop the original categorical columns if needed
random_forest_df = random_forest_df.drop(columns_to_encode, axis=1)

# Print the resulting DataFrame
print(random_forest_df)

MemoryError: Unable to allocate 7.48 GiB for an array with shape (4485, 1790375) and data type uint8

In [None]:
#Beginning Random Forest Modeling
X=random_forest_df.copy()
X.drop("Violation Type",axis=1, inplace=True)
y=random_forest_df['Violation Type'].ravel()

In [None]:
X.dtypes

Make               object
Color              object
Year Date           int64
Month               int64
Day                 int64
                    ...  
Driver State_WA     uint8
Driver State_WI     uint8
Driver State_WV     uint8
Driver State_WY     uint8
Driver State_XX     uint8
Length: 90, dtype: object

In [None]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [None]:
#Fitting the Random Forest Model
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=7)
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

ValueError: could not convert string to float: 'CHEV'

In [None]:
#Making Predictions
predictions = rf_model.predict(X_test)
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual citation", "Actual warning"], columns=["Predicted citation", "Predicted warning"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

In [None]:
# Visualize the features by importance
importances_df = pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
importances_df.set_index(importances_df[1], inplace=True)
importances_df.drop(columns=1, inplace=True)
importances_df.rename(columns={0: 'Feature Importances'}, inplace=True)
importances_sorted = importances_df.sort_values(by='Feature Importances')
importances_sorted.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False)