In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer

In [2]:
dataset = pd.read_csv('CO22334 dataset_traffic_accident_prediction1.csv')
print("Dataset Loaded:\n", dataset)

Dataset Loaded:
     Weather   Road_Type Time_of_Day  Traffic_Density  Speed_Limit  \
0     Rainy   City Road     Morning              1.0        100.0   
1     Clear  Rural Road       Night              NaN        120.0   
2     Rainy     Highway     Evening              1.0         60.0   
3     Clear   City Road   Afternoon              2.0         60.0   
4     Rainy     Highway     Morning              1.0        195.0   
..      ...         ...         ...              ...          ...   
835   Clear     Highway       Night              2.0         30.0   
836   Rainy  Rural Road     Evening              2.0         60.0   
837   Foggy     Highway     Evening              NaN         30.0   
838   Foggy     Highway   Afternoon              2.0         60.0   
839   Clear     Highway   Afternoon              1.0         60.0   

     Number_of_Vehicles  Driver_Alcohol Accident_Severity      Road_Condition  \
0                   5.0             0.0               NaN                

In [3]:
dataset.fillna(dataset.mean(numeric_only=True), inplace=True)
for col in dataset.select_dtypes(include=['object']).columns:
    dataset[col] = dataset[col].fillna(dataset[col].mode()[0])
print(dataset)

    Weather   Road_Type Time_of_Day  Traffic_Density  Speed_Limit  \
0     Rainy   City Road     Morning         1.000000        100.0   
1     Clear  Rural Road       Night         1.001253        120.0   
2     Rainy     Highway     Evening         1.000000         60.0   
3     Clear   City Road   Afternoon         2.000000         60.0   
4     Rainy     Highway     Morning         1.000000        195.0   
..      ...         ...         ...              ...          ...   
835   Clear     Highway       Night         2.000000         30.0   
836   Rainy  Rural Road     Evening         2.000000         60.0   
837   Foggy     Highway     Evening         1.001253         30.0   
838   Foggy     Highway   Afternoon         2.000000         60.0   
839   Clear     Highway   Afternoon         1.000000         60.0   

     Number_of_Vehicles  Driver_Alcohol Accident_Severity      Road_Condition  \
0                   5.0             0.0               Low                 Wet   
1        

In [4]:
categorical_features = dataset.select_dtypes(include=['object']).columns.tolist()
numerical_features = dataset.select_dtypes(exclude=['object']).columns.tolist()
numerical_features.remove(dataset.columns[-1])
print(categorical_features)

['Weather', 'Road_Type', 'Time_of_Day', 'Accident_Severity', 'Road_Condition', 'Vehicle_Type', 'Road_Light_Condition']


In [5]:
target_column = dataset.columns[-1]
X = dataset.iloc[:, :-1].values 
y = dataset.iloc[:, -1].values 
print(X)
print(y)

[['Rainy' 'City Road' 'Morning' ... 51.0 48.0 'Artificial Light']
 ['Clear' 'Rural Road' 'Night' ... 49.0 43.0 'Artificial Light']
 ['Rainy' 'Highway' 'Evening' ... 54.0 52.0 'Artificial Light']
 ...
 ['Foggy' 'Highway' 'Evening' ... 43.2593984962406 34.0
  'Artificial Light']
 ['Foggy' 'Highway' 'Afternoon' ... 25.0 19.0 'Artificial Light']
 ['Clear' 'Highway' 'Afternoon' ... 29.0 21.0 'Artificial Light']]
[0.         0.         0.         0.         1.         0.
 1.         1.         0.         0.         0.29949875 1.
 0.         1.         0.         0.         0.         0.
 0.         1.         0.         0.         0.         0.
 1.         0.         1.         0.         1.         0.
 0.         1.         0.         1.         0.         0.
 0.         0.         1.         1.         1.         0.
 0.         0.         0.         0.         0.         1.
 0.         0.         0.         1.         0.         1.
 0.         0.         1.         0.29949875 0.         1.

In [6]:
from sklearn.pipeline import Pipeline
import joblib

ct = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', ct),
    ('model', DecisionTreeRegressor())
])
 
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]
pipeline.fit(X, y)

# Save the pipeline
joblib.dump(pipeline, 'traffic_accident_pipeline.pkl')

['traffic_accident_pipeline.pkl']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

In [8]:
# Assuming dataset is your original DataFrame
X = dataset.drop(columns=['Accident'])
y = dataset['Accident']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Now apply ColumnTransformer
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)


In [9]:
# Reshape after converting to NumPy array
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train).flatten()
y_test = sc_y.transform(y_test).flatten()


print(y_train)
print(y_test)

[ 1.56984065e+00 -6.72196358e-01  1.56984065e+00 -6.72196358e-01
 -6.72196358e-01 -7.09082001e-04 -6.72196358e-01  1.56984065e+00
  1.56984065e+00 -6.72196358e-01 -6.72196358e-01 -6.72196358e-01
 -6.72196358e-01  1.56984065e+00 -6.72196358e-01 -6.72196358e-01
  1.56984065e+00 -6.72196358e-01 -6.72196358e-01 -6.72196358e-01
  1.56984065e+00 -6.72196358e-01  1.56984065e+00  1.56984065e+00
  1.56984065e+00  1.56984065e+00 -6.72196358e-01  1.56984065e+00
  1.56984065e+00 -7.09082001e-04 -6.72196358e-01 -6.72196358e-01
  1.56984065e+00 -6.72196358e-01  1.56984065e+00  1.56984065e+00
  1.56984065e+00 -6.72196358e-01 -6.72196358e-01 -6.72196358e-01
 -7.09082001e-04 -6.72196358e-01 -6.72196358e-01 -6.72196358e-01
 -6.72196358e-01 -6.72196358e-01 -6.72196358e-01 -6.72196358e-01
  1.56984065e+00 -6.72196358e-01 -6.72196358e-01 -6.72196358e-01
 -6.72196358e-01 -6.72196358e-01 -6.72196358e-01  1.56984065e+00
 -6.72196358e-01  1.56984065e+00 -6.72196358e-01 -6.72196358e-01
 -6.72196358e-01  1.56984

In [10]:
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

In [11]:
y_pred = regressor.predict(X_test)

In [12]:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 1.9537128898143197


In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error , median_absolute_error 

In [14]:
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse}')

median_error = median_absolute_error(y_test, y_pred)
print(f'Median Absolute Error: {median_error}')


Mean Absolute Error: 0.9117189060084434
Root Mean Squared Error: 1.3977527999665462
Median Absolute Error: 2.220446049250313e-16


In [15]:
import joblib

# Save the trained model
joblib.dump(regressor, 'traffic_accident_model.pkl')




['traffic_accident_model.pkl']