In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("merged.csv")

In [4]:
data.info


<bound method DataFrame.info of         Latitude  Longitude            Timestamp Weather  Age Type_of_Vehicle  \
0          11.06      76.80  2006-06-09 05:41:46   Clear   32           Truck   
1          11.06      76.80  2006-06-09 05:41:46   Clear   32           Truck   
2          11.06      76.80  2006-06-09 05:41:46   Clear   32           Truck   
3          11.06      76.80  2006-06-09 05:41:46   Clear   32           Truck   
4          11.06      76.80  2006-06-09 05:41:46   Clear   32           Truck   
...          ...        ...                  ...     ...  ...             ...   
705571     11.20      76.92  2014-12-29 20:37:35  Stormy   38             Bus   
705572     11.20      76.92  2014-12-29 20:37:35  Stormy   38             Bus   
705573     11.20      76.92  2014-12-29 20:37:35  Stormy   38             Bus   
705574     11.20      76.92  2014-12-29 20:37:35  Stormy   38             Bus   
705575     11.20      76.92  2014-12-29 20:37:35  Stormy   38             Bus

In [5]:
data.columns

Index(['Latitude', 'Longitude', 'Timestamp', 'Weather', 'Age',
       'Type_of_Vehicle', 'Road_Type', 'Time_of_Day', 'Traffic_Density',
       'Speed_Limit', 'Number_of_Vehicles', 'Driver_Alcohol',
       'Accident_Severity', 'Road_Condition', 'Vehicle_Type', 'Driver_Age',
       'Driver_Experience', 'Road_Light_Condition', 'Accident'],
      dtype='object')

In [6]:
data.drop_duplicates(subset=['Latitude', 'Longitude'], inplace=True)
data.info

<bound method DataFrame.info of         Latitude  Longitude            Timestamp Weather  Age Type_of_Vehicle  \
0          11.06      76.80  2006-06-09 05:41:46   Clear   32           Truck   
334        10.88      77.16  2019-01-28 17:28:02  Stormy   33   Auto-rickshaw   
374        11.00      77.05  2024-03-25 01:13:09   Windy   18   Auto-rickshaw   
375        10.97      77.07  2022-12-19 19:49:42  Stormy   27             Car   
415        10.93      76.81  2003-08-23 19:49:18   Rainy   55           Truck   
...          ...        ...                  ...     ...  ...             ...   
697878     11.11      76.88  2006-02-23 15:43:15   Windy   50             Bus   
698321     11.14      77.16  1992-03-28 19:31:26   Rainy   58             Bus   
698912     11.13      77.15  2013-07-19 17:17:30  Stormy   27           Truck   
700537     11.19      77.02  1998-08-02 23:55:51   Foggy   23            Bike   
705536     11.20      76.92  2014-12-29 20:37:35  Stormy   38             Bus

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocessing the dataset
# Encode the target variable
data['Accident'] = LabelEncoder().fit_transform(data['Accident'])

# Convert the Timestamp column to datetime and extract meaningful features
data['Timestamp'] = pd.to_datetime(data['Timestamp'])
data['Hour'] = data['Timestamp'].dt.hour
data['Day'] = data['Timestamp'].dt.day
data['Month'] = data['Timestamp'].dt.month
data['DayOfWeek'] = data['Timestamp'].dt.dayofweek

# Define features and target
features = [
    'Latitude', 'Longitude', 'Weather', 'Age', 'Type_of_Vehicle', 'Road_Type', 'Time_of_Day', 
    'Traffic_Density', 'Speed_Limit', 'Number_of_Vehicles', 'Driver_Alcohol', 'Accident_Severity', 
    'Road_Condition', 'Vehicle_Type', 'Driver_Age', 'Driver_Experience', 'Road_Light_Condition', 
    'Hour', 'Day', 'Month', 'DayOfWeek'
]
X = data[features]
y = data['Accident']

# Preprocessing for categorical and numerical features
categorical_features = [
    'Weather', 'Type_of_Vehicle', 'Road_Type', 'Time_of_Day', 'Traffic_Density', 'Driver_Alcohol', 
    'Accident_Severity', 'Road_Condition', 'Vehicle_Type', 'Road_Light_Condition'
]
numerical_features = [
    'Latitude', 'Longitude', 'Age', 'Speed_Limit', 'Number_of_Vehicles', 'Driver_Age', 
    'Driver_Experience', 'Hour', 'Day', 'Month', 'DayOfWeek'
]

# Preprocessor for the pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Decision Tree Classifier with limited depth
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(
        max_depth=1,  # Limiting tree depth to underfit
        min_samples_split=5,  # Larger split for underfitting
        random_state=42
    ))
])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predictions and evaluation
y_pred = pipeline.predict(X_test)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output results
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", report)


Accuracy: 79.87%

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       194
           1       0.00      0.00      0.00        63
           2       0.47      1.00      0.64        56

    accuracy                           0.80       313
   macro avg       0.49      0.67      0.55       313
weighted avg       0.70      0.80      0.73       313



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
