In [3]:
# Print the notebook's welcome banner.
print(
    "Welcome to the "
    "Road Accident Prediction Assignment"
)

Welcome to the Road Accident Prediction Assignment


In [14]:

# Step 1: Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [15]:
# Step 2: Load dataset
# Read the accident records CSV (path is relative to the working directory)
# into the DataFrame `df` that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer="accident_prediction_india.csv")

# Preview the first five records as plain text under a heading.
print("First 5 rows:", df.head(5), sep="\n")


First 5 rows:
          State Name City Name  Year    Month Day of Week Time of Day  \
0  Jammu and Kashmir   Unknown  2021      May      Monday        1:46   
1      Uttar Pradesh   Lucknow  2018  January   Wednesday       21:30   
2       Chhattisgarh   Unknown  2023      May   Wednesday        5:37   
3      Uttar Pradesh   Lucknow  2020     June    Saturday        0:31   
4             Sikkim   Unknown  2021   August    Thursday       11:21   

  Accident Severity  Number of Vehicles Involved Vehicle Type Involved  \
0           Serious                            5                 Cycle   
1             Minor                            5                 Truck   
2             Minor                            5            Pedestrian   
3             Minor                            3                   Bus   
4             Minor                            5                 Cycle   

   Number of Casualties  ...         Road Type      Road Condition  \
0                     0  ...  Na

In [6]:
# Check missing values
# ----------------------------
# Count the null entries in every column and print the per-column totals.
missing_per_column = df.isna().sum()
print("\nMissing values in each column:", missing_per_column, sep="\n")


Missing values in each column:
State Name                       0
City Name                        0
Year                             0
Month                            0
Day of Week                      0
Time of Day                      0
Accident Severity                0
Number of Vehicles Involved      0
Vehicle Type Involved            0
Number of Casualties             0
Number of Fatalities             0
Weather Conditions               0
Road Type                        0
Road Condition                   0
Lighting Conditions              0
Traffic Control Presence       716
Speed Limit (km/h)               0
Driver Age                       0
Driver Gender                    0
Driver License Status          975
Alcohol Involvement              0
Accident Location Details        0
dtype: int64


In [16]:
# Step 3: Create Binary Target
# Convert Accident Severity into numeric codes: Minor = 0, Serious = 1.
# Any label that is not in the mapping becomes NaN and its row is dropped below.
severity_codes = {
    'Minor': 0,
    'Serious': 1
}

severity = df['Accident Severity']

# Only map while the column still holds text labels. Mapping an already
# encoded column would turn every value into NaN and the dropna below would
# then empty the frame, so this guard keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(severity):
    severity = severity.map(severity_codes)

# Build a new frame instead of assigning into the loaded one, remove rows
# without a usable severity code, and store the target as integers so that
# reports and predictions show 0/1 rather than 0.0/1.0.
df = (
    df.assign(**{'Accident Severity': severity})
      .dropna(subset=['Accident Severity'])
      .astype({'Accident Severity': 'int64'})
)


In [8]:
# Set the target column
# Name of the column the model is trained to predict.
target = "Accident Severity"
print(f"Target Column: {target}")

Target Column: Accident Severity


In [17]:
# Step 4: Select important features
# Columns used as model inputs; the order here is the column order of X.
features = [
    'Speed Limit (km/h)',
    'Driver Age',
    'Number of Vehicles Involved',
    'Number of Casualties',
    'Number of Fatalities',
    'Alcohol Involvement'
]

# Convert Alcohol Yes/No to 1/0.
# Only map while the column still holds text: mapping an already encoded
# column would produce NaN everywhere and the dropna below would then remove
# every row, so this guard keeps the cell safe to re-run.
alcohol = df['Alcohol Involvement']
if not pd.api.types.is_numeric_dtype(alcohol):
    alcohol = alcohol.map({
        'Yes': 1,
        'No': 0
    })

# Keep only the feature columns plus the target and attach the encoded
# alcohol flag via assign(), which returns a new frame. This avoids assigning
# into a selection of `df` (the SettingWithCopyWarning pattern).
# Finally remove any remaining rows with missing values.
df = (
    df[features + ['Accident Severity']]
    .assign(**{'Alcohol Involvement': alcohol})
    .dropna()
)


In [18]:
# Step 5: Define X and y
# y is the severity column; X is every other column of df.
y = df['Accident Severity']
X = df.drop(columns=['Accident Severity'])


In [19]:
# Step 6: Train-Test Split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)


In [20]:
# Step 7: Feature Scaling
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [21]:
# Step 8: Train Logistic Regression Model
# Fit on the scaled training features with an iteration cap of 1000.
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Trailing expression so the cell still displays the fitted estimator.
model


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [22]:
# Step 9: Predictions
# Predict a severity label for every row of the scaled test features.
y_pred = model.predict(X=X_test_scaled)


In [23]:
# Step 10: Evaluation
# Compute each metric from the held-out labels and the predictions first,
# then print them in the order accuracy, confusion matrix, per-class report.
accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\nModel Accuracy:", accuracy)
print("\nConfusion Matrix:\n", matrix)
print("\nClassification Report:\n", report)



Model Accuracy: 0.4838709677419355

Confusion Matrix:
 [[145  56]
 [152  50]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.49      0.72      0.58       201
         1.0       0.47      0.25      0.32       202

    accuracy                           0.48       403
   macro avg       0.48      0.48      0.45       403
weighted avg       0.48      0.48      0.45       403



In [24]:
# Step 11: Sample Prediction
# One hand-written record, with values in the same order as `features`:
# Speed, Age, Vehicles, Casualties, Fatalities, Alcohol
sample = np.array([[80, 30, 2, 1, 0, 1]])

# The scaler was fitted on a DataFrame, so pass it a DataFrame with the same
# column names. A bare array triggers scikit-learn's "X does not have valid
# feature names" warning and depends on the column order matching by accident.
sample_frame = pd.DataFrame(sample, columns=features)

sample_scaled = scaler.transform(sample_frame)
prediction = model.predict(sample_scaled)

# Cast to int so the printed label matches the 1/0 legend in the message
# even when the model's class labels are stored as floats.
print("\nPredicted Severity (1=Serious, 0=Minor):", int(prediction[0]))


Predicted Severity (1=Serious, 0=Minor): 0.0


