In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [7]:
data = pd.read_csv('downloads/accidents.csv')

# Display first few rows to understand the structure
print(data.head())


   Accident_severity     Month     Speed Weather Road_condition  Driver_Age  \
0                  2   January  100km/hr   Rainy            wet          45   
1                  2   January   69km/hr   Sunny            dry          33   
2                  1  February   50km/hr   Sunny            dry          33   
3                  3  February  150km/hr   Rainy            wet          33   
4                  2     March  120km/hr   Rainy            wet          33   

   Gender Alcohol_level  
0    Male          0.58  
1  Female          0.78  
2  Female           0.3  
3    Male         0..04  
4  Female          0.48  


In [8]:
# Check the column names in the dataset
print(data.columns)

# Define dependent and independent variables (adjust based on available columns)
X = data[['Month', 'Speed', 'Weather', 'Driver_Age', 'Gender']]  # Removed 'Road_Condition' and 'Alcohol_Level'
y = data['Accident_severity']  # Dependent variable

# Encode categorical variables (e.g., 'Weather', 'Gender')
# Assuming 'Weather' and 'Gender' are categorical
X = pd.get_dummies(X, drop_first=True)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Print shapes of the training and test sets to verify the split
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)



Index(['Accident_severity', 'Month', 'Speed', 'Weather', 'Road_condition',
       'Driver_Age', 'Gender', 'Alcohol_level'],
      dtype='object')
X_train shape: (3, 9)
X_test shape: (2, 9)
y_train shape: (3,)
y_test shape: (2,)


In [9]:
# Print the number of samples in the test set to check if the size is too small
print(f'Number of samples in test set: {len(X_test)}')


Number of samples in test set: 2


In [10]:
# Create a linear regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.12715474757935627
R-squared: 0.0


In [11]:
import joblib

# Save the model
joblib.dump(model, 'road_accident_severity_model.pkl')

['road_accident_severity_model.pkl']

In [17]:
# Hypothetical data (make sure this structure matches the original data used for training)
hypothetical_data = pd.DataFrame({
    'Month_January': [1],
    'Month_March': [0],
    'Speed_120km/hr': [0], 
    'Speed_150km/hr': [0], 
    'Speed_70km/hr': [1],
    'Weather_Sunny': [1],    # Assuming clear weather was encoded
    'Weather_Rainy': [0],    # Add this to match the columns
    'Driver_Age': [35],
    'Gender_Male': [1],      # Assuming male was encoded
})

# Re-align the columns with the training data (adding any missing columns with 0)
missing_cols = set(X_train.columns) - set(hypothetical_data.columns)
for col in missing_cols:
    hypothetical_data[col] = 0

# Ensure the columns are in the same order as the training data
hypothetical_data = hypothetical_data[X_train.columns]

# Make prediction
severity_prediction = model.predict(hypothetical_data)
print(f'Predicted Accident Severity: {severity_prediction[0]}')


Predicted Accident Severity: 1.9125214408233282
