In [24]:
import pandas as pd
import numpy as np

In [18]:
data = pd.read_csv("merged.csv")

In [25]:
data.info


<bound method DataFrame.info of         Latitude  Longitude           Timestamp Weather  Age Type_of_Vehicle  \
0          11.06      76.80 2006-06-09 05:41:46   Clear   32           Truck   
334        10.88      77.16 2019-01-28 17:28:02  Stormy   33   Auto-rickshaw   
374        11.00      77.05 2024-03-25 01:13:09   Windy   18   Auto-rickshaw   
375        10.97      77.07 2022-12-19 19:49:42  Stormy   27             Car   
415        10.93      76.81 2003-08-23 19:49:18   Rainy   55           Truck   
...          ...        ...                 ...     ...  ...             ...   
697878     11.11      76.88 2006-02-23 15:43:15   Windy   50             Bus   
698321     11.14      77.16 1992-03-28 19:31:26   Rainy   58             Bus   
698912     11.13      77.15 2013-07-19 17:17:30  Stormy   27           Truck   
700537     11.19      77.02 1998-08-02 23:55:51   Foggy   23            Bike   
705536     11.20      76.92 2014-12-29 20:37:35  Stormy   38             Bus   

       

In [20]:
data.columns

Index(['Latitude', 'Longitude', 'Timestamp', 'Weather', 'Age',
       'Type_of_Vehicle', 'Road_Type', 'Time_of_Day', 'Traffic_Density',
       'Speed_Limit', 'Number_of_Vehicles', 'Driver_Alcohol',
       'Accident_Severity', 'Road_Condition', 'Vehicle_Type', 'Driver_Age',
       'Driver_Experience', 'Road_Light_Condition', 'Accident'],
      dtype='object')

In [26]:
data.drop_duplicates(subset=['Latitude', 'Longitude'], inplace=True)
data.info

<bound method DataFrame.info of         Latitude  Longitude           Timestamp Weather  Age Type_of_Vehicle  \
0          11.06      76.80 2006-06-09 05:41:46   Clear   32           Truck   
334        10.88      77.16 2019-01-28 17:28:02  Stormy   33   Auto-rickshaw   
374        11.00      77.05 2024-03-25 01:13:09   Windy   18   Auto-rickshaw   
375        10.97      77.07 2022-12-19 19:49:42  Stormy   27             Car   
415        10.93      76.81 2003-08-23 19:49:18   Rainy   55           Truck   
...          ...        ...                 ...     ...  ...             ...   
697878     11.11      76.88 2006-02-23 15:43:15   Windy   50             Bus   
698321     11.14      77.16 1992-03-28 19:31:26   Rainy   58             Bus   
698912     11.13      77.15 2013-07-19 17:17:30  Stormy   27           Truck   
700537     11.19      77.02 1998-08-02 23:55:51   Foggy   23            Bike   
705536     11.20      76.92 2014-12-29 20:37:35  Stormy   38             Bus   

       

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('merged.csv')  # Replace with your dataset path

# Encode the target variable
data['Accident'] = LabelEncoder().fit_transform(data['Accident'])

# Convert the Timestamp column to datetime and extract meaningful features
data['Timestamp'] = pd.to_datetime(data['Timestamp'])
data['Hour'] = data['Timestamp'].dt.hour
data['Day'] = data['Timestamp'].dt.day
data['Month'] = data['Timestamp'].dt.month
data['DayOfWeek'] = data['Timestamp'].dt.dayofweek

# Define features and target
features = [
    'Latitude', 'Longitude', 'Weather', 'Age', 'Type_of_Vehicle', 'Road_Type', 'Time_of_Day', 
    'Traffic_Density', 'Speed_Limit', 'Number_of_Vehicles', 'Driver_Alcohol', 'Accident_Severity', 
    'Road_Condition', 'Vehicle_Type', 'Driver_Age', 'Driver_Experience', 'Road_Light_Condition', 
    'Hour', 'Day', 'Month', 'DayOfWeek'
]
X = data[features]
y = data['Accident']

# Preprocessing for categorical and numerical features
categorical_features = [
    'Weather', 'Type_of_Vehicle', 'Road_Type', 'Time_of_Day', 'Traffic_Density', 'Driver_Alcohol', 
    'Accident_Severity', 'Road_Condition', 'Vehicle_Type', 'Road_Light_Condition'
]
numerical_features = [
    'Latitude', 'Longitude', 'Age', 'Speed_Limit', 'Number_of_Vehicles', 'Driver_Age', 
    'Driver_Experience', 'Hour', 'Day', 'Month', 'DayOfWeek'
]

# Preprocessor for the pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Decision Tree Classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(
        max_depth=5,  # Adjust depth for better accuracy
        min_samples_split=5,
        random_state=42
    ))
])

# Train the model on the entire dataset
pipeline.fit(X, y)

# Function to predict accident occurrence
def predict_accident(lat, long):
    # Ensure all features are included in the input
    input_data = {
        'Latitude': [lat],
        'Longitude': [long]
    }
    
    # Ensure all required features are present
    required_features = [
        'Latitude', 'Longitude', 'Weather', 'Age', 'Type_of_Vehicle', 'Road_Type', 'Time_of_Day',
        'Traffic_Density', 'Speed_Limit', 'Number_of_Vehicles', 'Driver_Alcohol', 
        'Accident_Severity', 'Road_Condition', 'Vehicle_Type', 'Driver_Age', 'Driver_Experience', 
        'Road_Light_Condition', 'Hour', 'Day', 'Month', 'DayOfWeek'
    ]
    
    # Check for missing features
    for feature in required_features:
        if feature not in input_data:
            raise ValueError(f"Missing required feature: {feature}")
    
    # Convert to DataFrame
    input_df = pd.DataFrame(input_data)
    
    # Predict using the pipeline
    prediction = pipeline.predict(input_df)
    
    # Convert prediction to Yes/No
    return "Yes" if prediction[0] == 1 else "No"


# Example usage
lat = 12.971598
long = 77.594566


result = predict_accident(lat, long)
print(f"Accident occurred? {result}")


ValueError: Missing required feature: Weather