In [52]:
import pandas as pd

# Read the raw training set from disk.
file_path = 'Training_Data.csv'  # Replace with the path to your CSV file
data = pd.read_csv(file_path)

# Columns removed before any encoding happens, one per line.
columns_to_drop = [
    # Pass_* columns
    'Pass_vehiclenumber',
    'Pass_InjuryType',
    'Pass_age',
    'Pass_gender',
    'Pass_Safety_Equip_Used',
    'Pass_Airbag_Deployment',
    'Pass_Ejected_From_Vehicle',
    'Pass_Ems_Transport_Ind',
    # Bike_* columns
    'Bike_VehicleNumber',
    'Bike_InjuryType',
    'Bike_Age',
    'Bike_Gender',
    # Ped_* / PED_* columns
    'Ped_Number',
    'Ped_InjuryType',
    'PED_Age',
    'Ped_Gender',
    'Ped_Action',
    'Ped_Drink',
    'Ped_Cond',
    'Ped_Al_Test',
    'Ped_Drug',
    'Ped_Rflct',
    # Remaining columns
    'OBJECTID',
    'Document_Nbr',
    'Crash_Year',
    'Second_Crash_Event_Cd',
    'Third_Crash_Event_Cd',
    'Fourth_Crash_Event_Cd',
    'Local Case CD',
    'Comm_Cargo_Body_Type_Cd',
    'Comm_Vehicle_Body_Type_Cd',
]

# drop() returns a new frame, so `data` keeps every original column.
# It raises KeyError if any listed column is absent from the CSV.
filtered_data = data.drop(columns=columns_to_drop)


In [53]:
# Letter code -> integer. Cells whose value is not a key here come back from
# .map() as NaN and are then set to 0 by .fillna().
severity_mapping = {
    'A': 4,
    'B': 3,
    'C': 2,
    'O': 1,
    'n/a': 0,
}
filtered_data['Crash_Severity'] = (
    filtered_data['Crash_Severity']
    .map(severity_mapping)
    .fillna(0)
)

# Handle semicolon-separated values
def handle_semicolon_separated(value):
    """Return the first entry of a ';'-separated cell as a float.

    Parameters
    ----------
    value : str, int, float or missing
        A raw cell. Strings may hold several entries separated by ';';
        cells that pandas already parsed as numbers are accepted as well.

    Returns
    -------
    float
        The first entry converted to float, or 0.0 when the cell is missing,
        the first entry is not a number, or the number is negative or not
        finite.
    """
    if pd.isna(value):
        return 0.0
    # str() lets already-numeric cells through the same parsing path instead
    # of failing on the missing .split() attribute.
    first_value = str(value).split(';')[0].strip()
    try:
        number = float(first_value)
    except ValueError:
        # Text entries such as 'n/a' or an empty string.
        return 0.0
    # float() also accepts 'nan' and 'inf'. Those, and negative numbers (which
    # were mapped to 0 before as well), are treated as "no usable value".
    if pd.isna(number) or number < 0 or number == float('inf'):
        return 0.0
    return number

# Columns whose cells may list several ';'-separated values; only the first
# value of each cell is kept and the column is cast to float.
semicolon_columns = [
    'Driver_VehicleNumber',
    'Driver_InjuryType',
    'Driver_Age',
    'Speed_Before',
    'Speed_Posted',
    'Speed_Max_Safe',
]
for col in semicolon_columns:
    filtered_data[col] = (
        filtered_data[col]
        .apply(handle_semicolon_separated)
        .astype(float)
    )

# Map categorical columns to numeric values.
# Each entry is column -> {label: code}. .replace() only touches cells that
# equal a label exactly; every other cell is left as it was.
_categorical_codes = {
    'Driver_Gender': {'Male': 1, 'Female': 0, 'n/a': 0},
    'Driver_Airbag_Deployment': {'Deployed': 1, 'Not Deployed': 0, 'n/a': 0},
    'Driver_Alcohol_Test_Type_Cd': {'No Test': 0, 'Positive': 1, 'Negative': -1, 'n/a': 0},
    'Driver_Condition_Type_Cd': {'No Defects': 0, 'Under Influence': 1, 'n/a': 0},
    'Driver_Drinking_Type_Cd': {'No Drinking': 0, 'Drinking': 1, 'n/a': 0},
    'Driver_Ejected_From_Vehicle': {'Not Ejected': 0, 'Ejected': 1, 'n/a': 0},
}
for _column, _codes in _categorical_codes.items():
    filtered_data[_column] = filtered_data[_column].replace(_codes)

# Handle n/a values
def extract_first_number(value):
    """Return the integer that precedes the first '.' in a cell.

    Parameters
    ----------
    value : str, int, float or missing
        A raw cell such as '12. some label', or a cell that is already
        numeric.

    Returns
    -------
    int
        The leading integer, or 0 when the cell is missing or does not start
        with a whole number.
    """
    if pd.isna(value):
        return 0
    if not isinstance(value, str):
        # Cells that are already numbers have no .split(); keep their value.
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return 0
    first_part = value.split('.')[0].strip()
    # isdecimal() accepts exactly the digit strings int() can parse, whereas
    # isnumeric() also accepts characters such as fractions that make int()
    # raise ValueError.
    if first_part.isdecimal():
        return int(first_part)
    return 0

# Columns reduced to the integer returned by extract_first_number.
crash_event_columns = [
    'First_Crash_Event_Cd',
]
for col in crash_event_columns:
    filtered_data[col] = (
        filtered_data[col]
        .apply(extract_first_number)
    )



In [54]:
# Remove four more columns. Because the result is assigned back to the same
# name, running this statement a second time raises KeyError (the columns are
# already gone).
filtered_data = filtered_data.drop(
    columns=[
        'Vehicle_Model_Nm',
        'Driver_Alcohol_Test_Type_Cd',
        'Vehicle_Year_Nbr',
        'Vehicle_Make_Nm',
    ]
)

def extract_first_gender(gender_string):
    """Encode the first entry of a ';'-separated gender cell.

    Parameters
    ----------
    gender_string : str, int, float or missing
        A raw cell such as 'Male;Female', or a value that is already numeric.

    Returns
    -------
    int or float
        0 for a missing cell; the value itself when it is already a number;
        otherwise 1 when the first entry is 'male' (case-insensitive,
        surrounding whitespace ignored) and 0 for anything else.
    """
    # The missing-value check has to come first: NaN is a float, so the
    # numeric pass-through below would otherwise return it unchanged.
    if pd.isna(gender_string):
        return 0
    # Already encoded as a number: keep as is.
    if isinstance(gender_string, (int, float)):
        return gender_string
    first_gender = gender_string.split(';')[0].strip().lower()
    # 'female' and every unrecognised label both encode as 0.
    return 1 if first_gender == 'male' else 0

# Encode every Driver_Gender cell with extract_first_gender.
filtered_data['Driver_Gender'] = (
    filtered_data['Driver_Gender']
    .apply(extract_first_gender)
)

# Extract the leading number from a coded cell and ignore the words after it
def extract_first_number(value):
    """Return the integer that precedes the first '.' in a cell.

    Parameters
    ----------
    value : str, int, float or missing
        A raw cell such as '3. some label', or a cell that already holds a
        number (for example one rewritten by an earlier .replace()).

    Returns
    -------
    int
        The leading integer, or 0 when the cell is missing or does not start
        with a whole number.
    """
    if pd.isna(value):
        return 0
    if not isinstance(value, str):
        # Numeric cells have no .split(); pass their integer value through
        # instead of raising AttributeError.
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return 0
    first_part = value.split('.')[0].strip()
    # isdecimal() matches exactly what int() can parse; isnumeric() would also
    # admit characters (fractions, superscripts) that make int() raise.
    if first_part.isdecimal():
        return int(first_part)
    return 0

# Run extract_first_number over every coded column listed here, in this order.
_coded_columns = (
    'Driver_Condition_Type_Cd',
    'Driver_Action_Type_Cd',
    'Driver_Airbag_Deployment',
    'Driver_Distraction_Type_Cd',
    'Most_Harmful_Crash_Event_Cd',
    'Vehicle_Maneuver_Type_Cd',
    'Vehicle_Body_Type_Cd',
    'Driver_Drinking_Type_Cd',
    'Driver_Drug_Use_Cd',
    'Driver_Ejected_From_Vehicle',
    'Driver_Safety_Equip_Used',
    'Driver_Vis_Obscured_Type_Cd',
    'Summons_Issued_Cd',
)
for _coded_column in _coded_columns:
    filtered_data[_coded_column] = filtered_data[_coded_column].apply(extract_first_number)





In [55]:
def translate_yes_no(value):
    """Encode the first entry of a ';'-separated yes/no cell as 1 or 0.

    Returns 1 only when the first entry, stripped and lower-cased, is 'yes'.
    Missing cells, empty strings, 'no', 'notprovided' and any other text all
    give 0.
    """
    if pd.isna(value) or value == '':
        return 0
    leading_entry = value.split(';')[0]
    normalized = leading_entry.strip().lower()
    return 1 if normalized == 'yes' else 0
def count_vehicles(value):
    """Count the ';'-separated entries in a cell.

    Missing cells and empty strings count as 0. Otherwise the result is one
    more than the number of ';' characters, so a trailing ';' adds an entry.
    """
    is_blank = pd.isna(value) or value == ''
    return 0 if is_blank else value.count(';') + 1

# Replace each Vehiclenumber cell with the value count_vehicles returns for it.
filtered_data['Vehiclenumber'] = (
    filtered_data['Vehiclenumber']
    .apply(count_vehicles)
)
def extract_first_number(value, sep=';'):
    """Return the leading integer entry of a separator-delimited cell.

    NOTE: binding this name replaces any extract_first_number defined earlier
    in the running kernel. This version splits on ';' by default.

    Parameters
    ----------
    value : str, int, float or missing
        A raw cell such as '3;5', or a cell that is already numeric.
    sep : str, default ';'
        Separator that ends the first entry.

    Returns
    -------
    int
        The first entry as an integer, or 0 when the cell is missing or the
        first entry is not a whole number.
    """
    if pd.isna(value):
        return 0
    if not isinstance(value, str):
        # Numeric cells have no .split(); keep their integer value.
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return 0
    first_part = value.split(sep)[0].strip()
    # isdecimal() matches exactly what int() can parse; isnumeric() would also
    # admit characters (fractions, superscripts) that make int() raise.
    if first_part.isdecimal():
        return int(first_part)
    return 0

# Encode Initial_Veh_Impact_Area_Cd with the extract_first_number currently
# bound in this cell.
filtered_data['Initial_Veh_Impact_Area_Cd'] = (
    filtered_data['Initial_Veh_Impact_Area_Cd']
    .apply(extract_first_number)
)

# Lower-cased direction label -> numeric code used by extract_first_direction.
direction_mapping = {
    'north': 1,
    'east': 2,
    'south': 3,
    'west': 4,
    'n/a': 0,
}

def extract_first_direction(value):
    """Encode the first entry of a ';'-separated direction cell.

    The first entry is stripped and lower-cased, then looked up in
    direction_mapping. Missing cells and labels not in the mapping give 0.
    """
    if not pd.isna(value):
        leading_entry = value.partition(';')[0]
        label = leading_entry.strip().lower()
        if label in direction_mapping:
            return direction_mapping[label]
    return 0

# Encode the direction column and the two indicator columns, each with its
# own encoder, in this order.
_column_encoders = (
    ('Direction_Of_Travel_Cd', extract_first_direction),
    ('Driver_Ems_Transport_Ind', translate_yes_no),
    ('Driver_Fled_Scene_Ind', translate_yes_no),
)
for _column, _encoder in _column_encoders:
    filtered_data[_column] = filtered_data[_column].apply(_encoder)

In [56]:
# Persist the encoded frame; index=False keeps the row index out of the file.
output_file_path = 'Cleaned_Training_Data.csv'  # You can specify a different path or filename if needed
filtered_data.to_csv(
    path_or_buf=output_file_path,
    index=False,
)

In [57]:
!pip install scikit-learn



You should consider upgrading via the 'c:\users\nathan cheng\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [59]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Load the cleaned data and leave the street-name column out of the model.
file_path = 'Cleaned_Training_Data.csv'
data = pd.read_csv(file_path).drop(columns=['Route or Street Name'])

# Target is the 'Rating' column; every other column is a feature.
y = data['Rating']
X = data.drop(columns=['Rating'])

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest with a fixed seed.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Score the held-out rows with mean squared error.
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Pair each feature name with its importance and list them, largest first.
feature_names = X.columns
importances = rf_model.feature_importances_
feature_importances = pd.DataFrame(
    {
        'Feature': feature_names,
        'Importance': importances,
    }
)
print(feature_importances.sort_values(by='Importance', ascending=False))

# Save the fitted model next to the notebook.
import joblib
joblib.dump(rf_model, 'random_forest_model.pkl')

Mean Squared Error: 332.220649
                        Feature  Importance
3                    Driver_Age    0.184793
20                 Speed_Before    0.085149
5         Driver_Action_Type_Cd    0.073559
22               Speed_Max_Safe    0.059704
25   Initial_Veh_Impact_Area_Cd    0.057389
21                 Speed_Posted    0.054320
26       Direction_Of_Travel_Cd    0.050126
19     Vehicle_Maneuver_Type_Cd    0.048681
18         Vehicle_Body_Type_Cd    0.043910
24  Most_Harmful_Crash_Event_Cd    0.040694
6      Driver_Airbag_Deployment    0.039088
0                Crash_Severity    0.033763
8    Driver_Distraction_Type_Cd    0.029892
23         First_Crash_Event_Cd    0.026446
16            Summons_Issued_Cd    0.025480
17                Vehiclenumber    0.024234
4                 Driver_Gender    0.021291
9       Driver_Drinking_Type_Cd    0.021083
15  Driver_Vis_Obscured_Type_Cd    0.019911
10           Driver_Drug_Use_Cd    0.019545
14     Driver_Safety_Equip_Used    0.015222
7

['random_forest_model.pkl']

In [60]:
from sklearn.metrics import r2_score, mean_squared_error

# Predict the held-out rows again and report both metrics.
y_pred = rf_model.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"R-squared: {r2}")
print(f"Mean Squared Error: {mse}")

R-squared: -0.09329717208388844
Mean Squared Error: 332.220649
