In [None]:
import kagglehub

# Fetch the latest version of the dataset through kagglehub and report the
# path the call returns.
dataset_handle = "xavierberge/road-accident-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Load dataset. A copy in the working directory is used when present;
# otherwise fall back to the kagglehub download directory (`path`, set by the
# download cell above) so the notebook runs without copying the file by hand.
# NOTE(review): assumes the CSV sits at the top level of the download
# directory under this name -- confirm against the downloaded files.
csv_file = Path("road_accident_dataset.csv")  # Adjust filename if needed
if not csv_file.exists():
    csv_file = Path(path) / csv_file.name
df = pd.read_csv(csv_file)

# Display basic info and first few rows.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the summary.
df.info()
print(df.head())

# Handling missing values
df.dropna(inplace=True)  # Drop rows with missing values (modify as necessary)

### 1. Frequency of Accidents Over Time ###
# Parse the Date column once, then derive the calendar fields plotted below.
df['Date'] = pd.to_datetime(df['Date'])
date_parts = df['Date'].dt
for column, attribute in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('DayOfWeek', 'dayofweek'),
    ('Hour', 'hour'),
):
    df[column] = getattr(date_parts, attribute)

# Total number of accidents (one row per record after cleaning)
print("Total Accidents:", df.shape[0])

# Visualization: one count plot per derived field, laid out on a 2x2 grid
# in row-major order (Year, Month / DayOfWeek, Hour).
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
for axis, column in zip(axes.flat, ('Year', 'Month', 'DayOfWeek', 'Hour')):
    sns.countplot(x=column, data=df, ax=axis)
plt.tight_layout()
plt.show()

### 2. Geographical Distribution ###
# Keep the first ten entries of the per-location frequency table.
location_counts = df['Location'].value_counts().iloc[:10]
location_axis = sns.barplot(
    x=location_counts.index,
    y=location_counts.values,
)
plt.xticks(rotation=45)
location_axis.set_title("Top 10 Accident-Prone Locations")
plt.show()

### 3. Accident Severity Analysis ###
# Frequency of each severity level, printed as a table and drawn as bars.
severity_counts = df['Severity'].value_counts()
print("Accident Severity Distribution:", severity_counts, sep="\n")
sns.barplot(
    x=severity_counts.index,
    y=severity_counts.values,
)
plt.title("Accident Severity Distribution")
plt.show()

### 4. Demographic Insights ###
# Age: 30-bin histogram with a kernel density estimate overlaid.
sns.histplot(data=df, x='Age', bins=30, kde=True)
plt.title("Age Distribution of Individuals Involved in Accidents")
plt.show()

# Gender: one bar per distinct value, height = number of rows.
gender_counts = df['Gender'].value_counts()
sns.barplot(
    x=gender_counts.index,
    y=gender_counts.values,
)
plt.title("Gender Distribution in Accidents")
plt.show()

### 5. Environmental and Road Conditions ###
# The three plots share the same recipe, so drive them from a
# (column, title) table. Bars are ordered by the column's value_counts index.
condition_plots = (
    ('Weather', "Accidents Under Different Weather Conditions"),
    ('Road_Type', "Accidents on Different Road Types"),
    ('Lighting', "Impact of Lighting Conditions on Accidents"),
)
for condition_column, plot_title in condition_plots:
    bar_order = df[condition_column].value_counts().index
    sns.countplot(x=condition_column, data=df, order=bar_order)
    plt.xticks(rotation=45)
    plt.title(plot_title)
    plt.show()

### 6. Vehicle and Driver Information ###
# First ten entries of the per-vehicle-type frequency table.
vehicle_counts = df['Vehicle_Type'].value_counts().iloc[:10]
vehicle_axis = sns.barplot(
    x=vehicle_counts.index,
    y=vehicle_counts.values,
)
plt.xticks(rotation=45)
vehicle_axis.set_title("Most Frequently Involved Vehicles in Accidents")
plt.show()

### 7. Temporal Patterns ###
# DayOfWeek comes from pandas' dt.dayofweek (Monday=0 ... Sunday=6), so
# values 5 and 6 are the weekend and everything below 5 is a weekday.
is_weekend = df['DayOfWeek'] >= 5
weekday_counts = df.loc[~is_weekend, 'DayOfWeek'].value_counts()
weekend_counts = df.loc[is_weekend, 'DayOfWeek'].value_counts()
print("Weekday vs Weekend Accidents:")
weekday_total = weekday_counts.sum()
weekend_total = weekend_counts.sum()
print("Weekdays:", weekday_total, "Weekends:", weekend_total)

### 8. Contributing Factors ###
# First ten entries of the contributing-factor frequency table.
contributing_factors = df['Contributing_Factor'].value_counts().iloc[:10]
factor_axis = sns.barplot(
    x=contributing_factors.index,
    y=contributing_factors.values,
)
plt.xticks(rotation=45)
factor_axis.set_title("Top Contributing Factors in Accidents")
plt.show()

### 9. Injury and Fatality Analysis ###
# One bar per injury-severity value, height = number of rows.
injury_counts = df['Injury_Severity'].value_counts()
sns.barplot(
    x=injury_counts.index,
    y=injury_counts.values,
)
plt.title("Injury Severity Distribution")
plt.show()

### 10. Comparative Analysis ###
# Count of non-null Accident_ID values per region, drawn as a bar chart.
accidents_by_region = df.groupby('Region')['Accident_ID']
region_comparison = accidents_by_region.count()
region_comparison.plot.bar(figsize=(10, 5))
plt.title("Comparison of Accidents Across Regions")
plt.xticks(rotation=45)
plt.show()

# Persist the cleaned frame (including the derived date columns) without the
# row index.
cleaned_csv = "cleaned_accident_data.csv"
df.to_csv(cleaned_csv, index=False)
