In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle

# Read the raw accident records into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path (including the
# spaces inside the file name); the cell will only run where this exact file
# exists. Consider a configurable data directory.
df = pd.read_csv(r"C:\joy\accident_ data _joy.csv")

# Show the available columns so the lists below can be checked against them.
print(df.columns)

# Columns that are one-hot encoded by the preprocessing step further down.
# The order is kept as originally written.
categorical_columns = [
    'Age',
    'Sex',
    'Education',
    'Vehicle',
    'Lanes',
    'Junction',
    'Road',
    'Time',
    'Weather_conditions',
    'Collision_type',
    'Vehicle_movement',
    'Pedestrian_movement',
    'Cause_of_accident',
    'Driving_experience',
]

# The target is the severity column; every other column is a feature.
y = df['Accident_severity']
X = df.drop(columns='Accident_severity')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One-hot encode the listed categorical columns. handle_unknown='ignore' keeps
# prediction from failing on categories that were not seen while fitting.
# Any column not listed would be passed through untouched; for the column set
# printed by this cell every feature is in categorical_columns, so nothing is
# actually passed through.
encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', encoder, categorical_columns)],
    remainder='passthrough',
)

# Bundle preprocessing and the regressor so both are fitted (and later
# persisted) as a single object.
# NOTE(review): severity is modelled as a continuous value with linear
# regression — confirm that is intended if the column holds class codes.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression()),
    ]
)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report the error on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Single source of truth for the artifact location, so the save and the reload
# below cannot drift apart.
MODEL_PATH = 'road_accident_severity_model.pkl'

# Persist the whole fitted pipeline (preprocessing + model) so raw rows can be
# scored later without re-fitting the encoder.
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(pipeline, model_file)

# Reload the artifact that was just written.
# SECURITY: unpickling can execute arbitrary code contained in the file, so
# pickle.load must only ever be pointed at trusted artifacts such as this one.
with open(MODEL_PATH, 'rb') as model_file:
    reloaded_pipeline = pickle.load(model_file)

# Round-trip check: the reloaded pipeline must reproduce the held-out
# predictions (within pandas' default float tolerance) before it is allowed to
# replace the in-memory pipeline. Raises AssertionError on mismatch.
pd.testing.assert_series_equal(
    pd.Series(reloaded_pipeline.predict(X_test)),
    pd.Series(y_pred),
)

# Keep the original behaviour: `pipeline` refers to the reloaded object.
pipeline = reloaded_pipeline


Index(['Age', 'Sex', 'Education', 'Vehicle', 'Driving_experience', 'Lanes',
       'Junction', 'Road', 'Time', 'Weather_conditions', 'Collision_type',
       'Vehicle_movement', 'Pedestrian_movement', 'Cause_of_accident',
       'Accident_severity'],
      dtype='object')
Mean Squared Error: 0.33009160171000795
