In [33]:
# Initial imports
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline
import os
from scipy.sparse import csr_matrix


In [11]:
# Set the limit for max rows and columns to display
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50

In [12]:
# Loading data
#file_path = "data\Traffic_Violations_Processed.csv"
file_path = r"Traffic_Violations_Processed.csv"


In [16]:
# Assuming 'data\Traffic_Violations_Processed.csv' is the correct path
processed_file_path = r"data\Traffic_Violations_Processed.csv"
df_traffic = pd.read_csv(processed_file_path)

In [17]:
print("Current Working Directory:", os.getcwd())
print("Files in Current Directory:", os.listdir())

Current Working Directory: c:\class_work\Project_4
Files in Current Directory: ['.git', '.gitignore', 'combined_data_cleaning.ipynb', 'data', 'DataExploration_Findings.md', 'data_cleaning.ipynb', 'data_cleaning_binning.ipynb', 'DESCRIPTIONS.docx', 'drop_columns.ipynb', 'dv_data_cleaning.ipynb', 'dv_random_forest copy.ipynb', 'dv_random_forest.ipynb', 'Haley_data_cleaning.ipynb', 'image-1.png', 'image.png', 'README.md', 'Sample_data.csv', 'sm_random_forest.ipynb', 'sm_state_make_cleaning.ipynb', 'tables.ipynb', 'Traffic_Violations.csv']


In [18]:
df_traffic.columns

Index(['Date Of Stop', 'Time Of Stop', 'Description', 'Location', 'Latitude',
       'Longitude', 'Accident', 'Belts', 'Personal Injury', 'Property Damage',
       'Fatal', 'Alcohol', 'Work Zone', 'Search Conducted',
       'Search Disposition', 'Search Outcome', 'Search Reason', 'Search Type',
       'Search Arrest Reason', 'State', 'Year', 'Make', 'Color',
       'Violation Type', 'Race', 'Gender', 'Driver State', 'Year of Stop',
       'Month of Stop', 'Driver State Category',
       'License Plate State Category', 'Hour of Stop'],
      dtype='object')

In [49]:
 # Get required columns
column_list = ['Race','Gender','Make','Violation Type']
df_traffic_trimmed = df_traffic[column_list]
df_traffic_trimmed.head()

Unnamed: 0,Race,Gender,Make,Violation Type
0,WHITE,F,HYUNDAI,Citation
1,BLACK,F,NISSAN,Citation
2,BLACK,F,NISSAN,Citation
3,BLACK,F,NISSAN,Citation
4,BLACK,F,NISSAN,Citation


In [51]:
df_traffic_trimmed.columns

Index(['Race', 'Gender', 'Make', 'Violation Type'], dtype='object')

In [52]:
df_traffic['Violation Type'].value_counts(dropna=False)

Citation    756296
Name: Violation Type, dtype: int64

In [53]:
# Assuming 'Violation Type' is the column with null values
df_traffic['Violation Type'] = df_traffic['Violation Type'].fillna("None")

In [54]:
# Fill null values in the entire DataFrame with "None"
df_traffic_clean = df_traffic.fillna("None")

In [55]:
df_traffic_clean.count()

Date Of Stop                    1665836
Time Of Stop                    1665836
Description                     1665836
Location                        1665836
Latitude                        1665836
Longitude                       1665836
Accident                        1665836
Belts                           1665836
Personal Injury                 1665836
Property Damage                 1665836
Fatal                           1665836
Alcohol                         1665836
Work Zone                       1665836
Search Conducted                1665836
Search Disposition              1665836
Search Outcome                  1665836
Search Reason                   1665836
Search Type                     1665836
Search Arrest Reason            1665836
State                           1665836
Year                            1665836
Make                            1665836
Color                           1665836
Violation Type                  1665836
Race                            1665836


In [57]:
# One-hot encoding of categorical columns
traffic_dummies = pd.get_dummies(df_traffic_clean[columns_to_encode], columns=columns_to_encode, sparse=True)


In [58]:
# # Check the columns after one-hot encoding
traffic_dummies.columns

Index(['Race_ASIAN', 'Race_BLACK', 'Race_HISPANIC', 'Race_NATIVE AMERICAN',
       'Race_OTHER', 'Race_WHITE', 'Gender_F', 'Gender_M', 'Gender_U',
       'Make_'NISSAN',
       ...
       'Make_ZUZU', 'Make_ZUZUKI', 'Make_]LEXUS', 'Make_`', 'Make_`DODGE',
       'Make_`LINC', 'Make_`SCION', 'Make_`TOYOTA', 'Violation Type_Citation',
      dtype='object', length=3660)

In [61]:
# Update the columns_to_drop list based on the actual column names
columns_to_drop = ['Violation Type_Citation', 'Violation Type_Warning']

# Drop the specified columns and create a sparse matrix directly
X_sparse = csr_matrix(traffic_dummies.drop(columns_to_drop, axis=1).values)


In [62]:
# Drop the specified columns directly from the original DataFrame
traffic_dummies.drop(['Violation Type_Citation', 'Violation Type_Warning'], axis=1, inplace=True)

In [63]:
# Check the number of Citation and Warning
df_traffic_clean['Violation Type'].value_counts()

Citation    756296
Name: Violation Type, dtype: int64

In [66]:
print(traffic_dummies.columns)

Index(['Race_ASIAN', 'Race_BLACK', 'Race_HISPANIC', 'Race_NATIVE AMERICAN',
       'Race_OTHER', 'Race_WHITE', 'Gender_F', 'Gender_M', 'Gender_U',
       'Make_'NISSAN',
       ...
       'Make_ZR900', 'Make_ZUMMERQ', 'Make_ZUZU', 'Make_ZUZUKI', 'Make_]LEXUS',
       'Make_`', 'Make_`DODGE', 'Make_`LINC', 'Make_`SCION', 'Make_`TOYOTA'],
      dtype='object', length=3658)


In [68]:
# Define the outcome column
y = (df_traffic_clean['Violation Type'] == 'Warning').astype(int).values

# Output 0 represents Citation and 1 represents Warning
print(y[:10])

[0 0 0 0 0 0 0 0 0 0]


In [72]:
# Define the outcome column
x = (df_traffic_clean['Violation Type'] == 'Citation').astype(int).values

# Output 0 represents Citation and 1 represents Warning
print(x[:10])


[1 1 1 1 1 1 1 1 1 1]


In [77]:
# Select relevant columns for the features
feature_columns = ['Race', 'Gender', 'Make'] 

# Create the feature set X and outcome variable y
X = df_traffic_clean[feature_columns]
y = (df_traffic_clean['Violation Type'] == 'Warning').astype(int)

# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=35)

## Fitting the Random Forest Model

In [78]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=200, random_state=35)

In [93]:
# Train the Random Forest model
rf_model.fit(X_train, y_train)

ValueError: could not convert string to float: 'HISPANIC'

In [86]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

ValueError: could not convert string to float: 'HISPANIC'

## Making Predictions using the Random Forest Model

In [90]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test)

ValueError: could not convert string to float: 'BLACK'

## Model Evaluation

In [91]:
# Calculating the confusion matrix 
#index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Citation", "Actual Warning"], columns=["Predicted Citation", "Predicted Warning"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

NameError: name 'predictions' is not defined

In [92]:
# Displaying results
print("Confusion Matrix:")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report:")
print(classification_report(y_test, predictions))

Confusion Matrix:


NameError: name 'cm_df' is not defined