In [None]:
# Load the aviation incidents data set
import pandas as pd

# Relative path to the incidents CSV file.
incidents_csv = "aviation_incidents.csv"
df = pd.read_csv(incidents_csv)

# Preview the first five rows to sanity-check the load.
df.head(5)


In [None]:
# Data cleaning, step 1: standardize the column names
## trim surrounding whitespace, lower-case, then turn spaces into underscores
trimmed_names = df.columns.str.strip()
lowered_names = trimmed_names.str.lower()
df.columns = lowered_names.str.replace(' ', '_')

- Remove duplicates
- Fix inconsistent formats (e.g., dates, locations)
- Handle missing values


In [None]:
# Fix inconsistent formats
## Parse the date column into datetimes; with errors='coerce' any value
## that cannot be parsed becomes NaT instead of raising.
raw_dates = df['date']
df['date'] = pd.to_datetime(raw_dates, errors='coerce')

In [None]:
# Remove fully duplicated rows in place: all columns are compared and the
# first occurrence of each duplicate group is kept.
df.drop_duplicates(subset=None, keep='first', inplace=True)


In [None]:
# Handle missing values: start by counting the missing entries per column
df.isna().sum()


In [None]:
# Fill missing aircraft model with 'Unknown'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['aircraft_model']: that chained in-place
# call acts on a column selection, raises a FutureWarning on pandas 2.x and
# does not update df at all once Copy-on-Write is enabled.
df['aircraft_model'] = df['aircraft_model'].fillna('Unknown')



In [None]:
# Fill missing engine type with the most frequent engine type.
# mode() skips missing values and returns an empty Series when every entry is
# missing, so indexing it with [0] unconditionally would raise; guard that case
# and leave the column untouched when there is nothing to impute from.
engine_type_modes = df['engine_type'].mode()
if not engine_type_modes.empty:
    # Assign back to the column instead of fillna(..., inplace=True) on
    # df['engine_type']: the chained in-place call acts on a column selection,
    # warns on pandas 2.x and is a no-op on df under Copy-on-Write.
    df['engine_type'] = df['engine_type'].fillna(engine_type_modes.iloc[0])


In [None]:
# Fill missing severity with 'Substantial' (if domain knowledge supports it).
# NOTE(review): imputing a severity label changes the severity distribution
# used by the later plots -- confirm this default is appropriate for the data.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['severity']: that chained in-place call acts
# on a column selection, raises a FutureWarning on pandas 2.x and does not
# update df at all once Copy-on-Write is enabled.
df['severity'] = df['severity'].fillna('Substantial')


In [None]:
# Exploratory Data Analysis (EDA)
## Which aircraft models are most frequently involved in fatal incidents?
# Select the rows whose severity is exactly 'Fatal', count incidents per
# aircraft model, and keep the ten most frequent models for the bar chart.
is_fatal = df['severity'] == 'Fatal'
fatal_counts = df.loc[is_fatal, 'aircraft_model'].value_counts().head(10)
fatal_counts.plot.bar(title='Top Aircraft Models in Fatal Incidents')


In [None]:
# How do incidents trend over time?
# Derive the calendar year from the parsed date column and store it on df,
# then count the rows per year and draw the counts as a line chart.
incident_years = df['date'].dt.year
df['year'] = incident_years
incident_trend = df.groupby('year').size()
incident_trend.plot.line(title='Incidents Over Time')


In [None]:
# Which engine types are associated with higher severity?
import seaborn as sns

# One group of bars per engine type, split (coloured) by severity level.
sns.countplot(
    x='engine_type',
    hue='severity',
    data=df,
)