In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("data.csv")

In [4]:
data['Accident'] = data['Accident'].replace(np.nan, 0)

In [5]:
data.columns

Index(['Latitude', 'Longitude', 'Timestamp', 'Weather', 'Age',
       'Type_of_Vehicle', 'Road_Type', 'Time_of_Day', 'Traffic_Density',
       'Speed_Limit', 'Number_of_Vehicles', 'Driver_Alcohol',
       'Accident_Severity', 'Road_Condition', 'Vehicle_Type', 'Driver_Age',
       'Driver_Experience', 'Road_Light_Condition', 'Accident'],
      dtype='object')

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle  # For saving the model

# Preprocessing the dataset
# Encode the target variable
data['Accident'] = LabelEncoder().fit_transform(data['Accident'])
print("Unique values in 'Accident':", data['Accident'].unique())

# Convert the Timestamp column to datetime and extract meaningful features
data['Timestamp'] = pd.to_datetime(data['Timestamp'])
data['Hour'] = data['Timestamp'].dt.hour
data['Day'] = data['Timestamp'].dt.day
data['Month'] = data['Timestamp'].dt.month
data['DayOfWeek'] = data['Timestamp'].dt.dayofweek

# Define features and target
features = [
    'Latitude', 'Longitude', 'Weather', 'Age', 'Type_of_Vehicle', 'Road_Type', 'Time_of_Day', 
    'Traffic_Density', 'Speed_Limit', 'Number_of_Vehicles', 'Driver_Alcohol', 'Accident_Severity', 
    'Road_Condition', 'Vehicle_Type', 'Driver_Age', 'Driver_Experience', 'Road_Light_Condition', 
    'Hour', 'Day', 'Month', 'DayOfWeek'
]
X = data[features]
y = data['Accident']

# Preprocessing for categorical and numerical features
categorical_features = [
    'Weather', 'Type_of_Vehicle', 'Road_Type', 'Time_of_Day', 'Traffic_Density', 'Driver_Alcohol', 
    'Accident_Severity', 'Road_Condition', 'Vehicle_Type', 'Road_Light_Condition'
]
numerical_features = [
    'Latitude', 'Longitude', 'Age', 'Speed_Limit', 'Number_of_Vehicles', 'Driver_Age', 
    'Driver_Experience', 'Hour', 'Day', 'Month', 'DayOfWeek'
]

# Preprocessor for the pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Decision Tree Classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(
        max_depth=3,  # Adjust as needed for better performance
        min_samples_split=5,  # Adjust as needed for better performance
        random_state=42
    ))
])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predictions and evaluation
y_pred = pipeline.predict(X_test)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output results
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", report)

# Save the pipeline as a .pkl file using pickle
with open('Model.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

print("Model saved as 'Model.pkl'")

Unique values in 'Accident': [0 1]
Accuracy: 72.84%

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.99      0.84    101810
           1       0.73      0.04      0.08     39306

    accuracy                           0.73    141116
   macro avg       0.73      0.52      0.46    141116
weighted avg       0.73      0.73      0.63    141116

Model saved as 'Model.pkl'


In [9]:
import pickle
import pandas as pd

# Load the saved pipeline
with open('Model.pkl', 'rb') as file:
    loaded_pipeline = pickle.load(file)

# Example input data
random_data = {
    'Latitude': 12.9716,              
    'Longitude': 77.5946,             
    'Weather': 'Rainy',               
    'Age': 5,                         
    'Type_of_Vehicle': 'Car',         
    'Road_Type': 'City Road',         
    'Time_of_Day': 'Night',           
    'Traffic_Density': 1.0,      
    'Speed_Limit': 60,                
    'Number_of_Vehicles': 3,          
    'Driver_Alcohol': 0.0,          
    'Accident_Severity': 'High',      
    'Road_Condition': 'Wet',          
    'Vehicle_Type': 'Car',            
    'Driver_Age': 35,                 
    'Driver_Experience': 10,          
    'Road_Light_Condition': 'Daylight',
    'Hour': 22,                       
    'Day': 15,                        
    'Month': 1,                       
    'DayOfWeek': 5                    
}

# Convert the input data to a DataFrame
user_input = pd.DataFrame([random_data])

# Make predictions using the loaded pipeline
prediction = loaded_pipeline.predict(user_input)

# Map the prediction to a human-readable label
label_mapping = {0: "No Accident", 1: "Accident"}
predicted_label = label_mapping[prediction[0]]

print("\nPrediction for the input:")
print(f"The model predicts: {predicted_label}")

# Predict probabilities
prediction_proba = loaded_pipeline.predict_proba(user_input)
confidence = max(prediction_proba[0]) * 100
print(f"Confidence: {confidence:.2f}%")


Prediction for the input:
The model predicts: Accident
Confidence: 80.14%
