In [8]:
import pandas as pd
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

data = pd.read_excel('Road Accident Data.xlsx')
data.head()

Unnamed: 0,Accident_Index,Accident Date,Day_of_Week,Junction_Control,Junction_Detail,Accident_Severity,Latitude,Light_Conditions,Local_Authority_(District),Carriageway_Hazards,...,Number_of_Casualties,Number_of_Vehicles,Police_Force,Road_Surface_Conditions,Road_Type,Speed_limit,Time,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,BS0000001,2021-01-01,Thursday,Give way or uncontrolled,T or staggered junction,Serious,51.512273,Daylight,Kensington and Chelsea,,...,1,2,Metropolitan Police,Dry,One way street,30,15:11:00,Urban,Fine no high winds,Car
1,BS0000002,2021-01-05,Monday,Give way or uncontrolled,Crossroads,Serious,51.514399,Daylight,Kensington and Chelsea,,...,11,2,Metropolitan Police,Wet or damp,Single carriageway,30,10:59:00,Urban,Fine no high winds,Taxi/Private hire car
2,BS0000003,2021-01-04,Sunday,Give way or uncontrolled,T or staggered junction,Slight,51.486668,Daylight,Kensington and Chelsea,,...,1,2,Metropolitan Police,Dry,Single carriageway,30,14:19:00,Urban,Fine no high winds,Taxi/Private hire car
3,BS0000004,2021-01-05,Monday,Auto traffic signal,T or staggered junction,Serious,51.507804,Daylight,Kensington and Chelsea,,...,1,2,Metropolitan Police,Frost or ice,Single carriageway,30,08:10:00,Urban,Other,Motorcycle over 500cc
4,BS0000005,2021-01-06,Tuesday,Auto traffic signal,Crossroads,Serious,51.482076,Darkness - lights lit,Kensington and Chelsea,,...,1,2,Metropolitan Police,Dry,Single carriageway,30,17:25:00,Urban,Fine no high winds,Car


In [30]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
df=data

# Drop unnecessary columns
columns_to_keep = ['Accident_Severity', 'Day_of_Week', 'Junction_Control', 'Light_Conditions', 
                   'Road_Surface_Conditions', 'Road_Type', 'Speed_limit', 'Time',
                   'Urban_or_Rural_Area', 'Weather_Conditions', 'Vehicle_Type']

# Check for column existence
missing_columns = set(columns_to_keep) - set(df.columns)
if missing_columns:
    print(f"Error: Columns {missing_columns} not found in the dataset.")
else:
    # Continue with the code
    df = data[columns_to_keep]


# Handle missing values if any
df.dropna(inplace=True)
# Group 'Vehicle_Type' and 'Light_Conditions' 
df['Vehicle_Type'] = df['Vehicle_Type'].replace({
    'Motorcycle 50cc and under': 'Motorcycle', 
    'Motorcycle over 125cc and up to 500cc': 'Motorcycle',
    'Motorcycle over 500cc': 'Motorcycle', 
    'Motorcycle over 125cc and under': 'Motorcycle',
    "Bus or coach (17 or more pass seats)":"Bus",
    "Minibus (8 - 16 passenger seats)":"Bus",
    "Goods 7.5 tonnes mgw and over":"Van","Goods over 3.5t. and under 7.5t":"Van",
    "Van / Goods 3.5 tonnes mgw or under":"Van","Taxi/Private hire car":"Car"})
df['Light_Conditions'] = df['Light_Conditions'].replace({"Darkness - lights lit":"Darkness",
                                                         "Darkness - lighting unknown":"Darkness",
                                                         "Darkness - no lighting":"Darkness"})

# Change values equal to "Fetal" to "Fatal" in the 'Accident_Severity' column
df['Accident_Severity'] = df['Accident_Severity'].replace({'Fetal': 'Fatal'})

# Extract hour from 'Time'
df['Hour'] = df['Time'].apply(lambda x: x.hour)

df['Accident_Severity'] = df['Accident_Severity'].replace({'Serious': 1, 'Fatal': 1, 'Slight': 0})

# Separate classes
df_slight = df[df['Accident_Severity'] == 0]
df_serious_fatal = df[df['Accident_Severity'] == 1]

# Downsample the majority class ('Slight') to match the size of the minority classes
df_slight_downsampled = df_slight.sample(min(len(df_serious_fatal), len(df_slight)), 
                                         random_state=42)

# Concatenate the downsampled majority class with the minority classes
df_balanced = pd.concat([df_slight_downsampled, df_serious_fatal])

severity_counts = df_balanced['Accident_Severity'].value_counts()

print("Number of rows with Accident_Severity 0:", severity_counts[0])
print("Number of rows with Accident_Severity 1:", severity_counts[1])
df_balanced.head()


Number of rows with Accident_Severity 0: 44691
Number of rows with Accident_Severity 1: 44691


Unnamed: 0,Accident_Severity,Day_of_Week,Junction_Control,Light_Conditions,Road_Surface_Conditions,Road_Type,Speed_limit,Time,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type,Hour
251074,0,Friday,Give way or uncontrolled,Darkness,Dry,Roundabout,30,18:10:00,Urban,Raining no high winds,Car,18
29734,0,Friday,Auto traffic signal,Daylight,Wet or damp,Single carriageway,30,21:27:00,Urban,Fine no high winds,Car,21
78599,0,Wednesday,Data missing or out of range,Daylight,Dry,Single carriageway,30,13:00:00,Urban,Fine no high winds,Car,13
274068,0,Sunday,Data missing or out of range,Daylight,Dry,One way street,30,17:00:00,Urban,Fine no high winds,Car,17
242213,0,Monday,Give way or uncontrolled,Daylight,Dry,Roundabout,30,17:25:00,Rural,Fine no high winds,Car,17


In [33]:
# Drop the original 'Time' column
df_balanced = df_balanced.drop('Time', axis=1)

# Split the data into features and target variable
X = df_balanced.drop('Accident_Severity', axis=1)
y = df_balanced['Accident_Severity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use OneHotEncoder with ColumnTransformer for categorical columns
categorical_columns = ['Day_of_Week', 'Junction_Control', 'Light_Conditions', 'Road_Surface_Conditions', 'Road_Type','Weather_Conditions', 'Vehicle_Type','Urban_or_Rural_Area']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ],
    remainder='passthrough'
)
print(df.head())

# Initialize the model (Random Forest Classifier in this example)
model = RandomForestClassifier()

   Accident_Severity Day_of_Week          Junction_Control Light_Conditions  \
0                  1    Thursday  Give way or uncontrolled         Daylight   
1                  1      Monday  Give way or uncontrolled         Daylight   
2                  0      Sunday  Give way or uncontrolled         Daylight   
3                  1      Monday       Auto traffic signal         Daylight   
4                  1     Tuesday       Auto traffic signal         Darkness   

  Road_Surface_Conditions           Road_Type  Speed_limit  \
0                     Dry      One way street           30   
1             Wet or damp  Single carriageway           30   
2                     Dry  Single carriageway           30   
3            Frost or ice  Single carriageway           30   
4                     Dry  Single carriageway           30   

  Urban_or_Rural_Area  Weather_Conditions Vehicle_Type  Hour  
0               Urban  Fine no high winds          Car    15  
1               Urban  Fin

In [34]:
# Create a pipeline with the preprocessor and model
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the testing set
predictions = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.5349890921295519
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.57      0.55      8941
           1       0.54      0.50      0.52      8936

    accuracy                           0.53     17877
   macro avg       0.54      0.53      0.53     17877
weighted avg       0.54      0.53      0.53     17877



In [45]:
import numpy as np

sample_data = pd.DataFrame({
    'Day_of_Week': np.random.choice(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
                                     'Sunday'], size=1),  # Random values for Day_of_Week
    'Junction_Control': np.random.choice(['Authorised person', 'Auto traffic signal', 'Give way or uncontrolled',
                                          'Not at junction or within 20 metres','Stop sign'], size=1),  # Random values for Junction_Control
    'Light_Conditions': np.random.choice(['Daylight', 'Darkness'], size=1),  # Random values for Light_Conditions
    'Road_Surface_Conditions': np.random.choice(['Dry', 'Wet or damp', 'Frost or ice'], size=1),  # Random values for Road_Surface_Conditions
    'Road_Type': np.random.choice(['Single carriageway', 'Dual carriageway', 'Roundabout','One way street'], size=1),  # Random values for Road_Type
    'Speed_limit': np.random.randint(20, 70, size=1),  # Random values for Speed_limit (20 to 70)
    'Time': pd.to_datetime(np.random.choice(pd.date_range('2022-01-01', '2022-12-31', freq='H'), size=1)),  # Random values for Time
    'Urban_or_Rural_Area': np.random.choice(['Urban', 'Rural'], size=1),  # Random values for Urban_or_Rural_Area
    'Weather_Conditions': np.random.choice(['Fine no high winds', 'Fog or mist', 'Raining + high winds',
                                            'Raining no high winds'], size=1),  # Random values for Weather_Conditions
    'Vehicle_Type': np.random.choice(['Car', 'Van', 'Motorcycle'], size=1),  # Random values for Vehicle_Type
})
print(sample_data)
sample_data['Hour'] = sample_data['Time'].dt.hour

# Drop the original 'Time' column
sample_data = sample_data.drop('Time', axis=1)

# Make predictions
predictions2 = pipeline.predict(sample_data)

# Display predictions
print("Predicted Accident Severity:", predictions2)

  Day_of_Week                     Junction_Control Light_Conditions  \
0      Monday  Not at junction or within 20 metres         Daylight   

  Road_Surface_Conditions         Road_Type  Speed_limit                Time  \
0             Wet or damp  Dual carriageway           51 2022-02-02 22:00:00   

  Urban_or_Rural_Area Weather_Conditions Vehicle_Type  
0               Rural        Fog or mist   Motorcycle  
Predicted Accident Severity: [0]


In [48]:
joblib.dump(pipeline, 'model.pkl')

['model.pkl']

In [49]:
joblib.dump(preprocessor, 'preprocessor.pkl')

['preprocessor.pkl']