<a href="https://colab.research.google.com/github/itsayaanpatel/NTSB-Analysis/blob/main/Practicum_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np

# Load the NTSB aviation data CSV directly from the project's GitHub repository.
url = "https://raw.githubusercontent.com/itsayaanpatel/NTSB-Analysis/main/Downloads/NTSBdata/NTSBAviationData.csv"
# encoding='latin1' maps every possible byte to a character, so decoding cannot
# fail on non-UTF-8 bytes. low_memory=False makes pandas infer each column's
# dtype from the whole file rather than chunk by chunk.
df = pd.read_csv(url, encoding='latin1', low_memory=False)

# To read a local copy instead of the URL, pass "NTSBAviationData.csv" as the
# first argument to pd.read_csv and keep the same keyword arguments.

# Normalise column names: trim surrounding whitespace, lowercase, and replace
# spaces and dots with underscores (e.g. " Event.Date" -> "event_date").
new_cols = [
    raw_name.strip().lower().replace(" ", "_").replace(".", "_")
    for raw_name in df.columns
]
df.columns = new_cols

# Injury-count columns handled by the cleaning steps below.
injury_cols = [
    "total_fatal_injuries",
    "total_serious_injuries",
    "total_minor_injuries",
    "total_uninjured",
]

# Parse event dates; values that cannot be parsed become NaT instead of raising.
df["event_date"] = pd.to_datetime(df["event_date"], errors="coerce")

# Coerce whichever injury columns are present to numeric; values that cannot
# be parsed become NaN instead of raising.
for name in [c for c in injury_cols if c in df.columns]:
    df[name] = pd.to_numeric(df[name], errors="coerce")

# Discard rows whose event_date is missing or failed to parse.
df = df.dropna(subset=["event_date"])

# Cap extreme values: anything above a column's own 99th percentile is replaced
# by that percentile. NaN entries compare False against the cap and stay NaN.
for name in [c for c in injury_cols if c in df.columns]:
    values = df[name]
    cap = values.quantile(0.99)
    df[name] = np.where(values > cap, cap, values)

# Summarise only the injury columns that actually exist. Selecting a list that
# contains a missing label raises KeyError, whereas the cleaning steps above
# deliberately tolerate absent injury columns. Skip the summary entirely when
# none are present, since describe() rejects a frame with no columns.
present_injury_cols = [col for col in injury_cols if col in df.columns]
if present_injury_cols:
    print(df[present_injury_cols].describe())
# check shape (rows, columns)
print("Shape of data:", df.shape)

# check first few rows
print("\nFirst 5 rows:")
print(df.head())

# check missing values per column
print("\nMissing values per column:")
print(df.isna().sum())

# check data types
print("\nData types:")
print(df.dtypes)

# check unique values for key categorical columns (missing ones are skipped)
cat_cols = ["investigation_type","injury_severity","aircraft_category","make","weather_condition"]
for col in cat_cols:
    if col in df.columns:
        print(f"\nUnique values in {col}:")
        print(df[col].unique())

# check basic stats for numeric columns
print("\nNumeric summary:")
print(df.describe())


       total_fatal_injuries  total_serious_injuries  total_minor_injuries  \
count          77488.000000            76379.000000          76956.000000   
mean               0.435332                0.238351              0.300327   
std                0.973694                0.587873              0.701695   
min                0.000000                0.000000              0.000000   
25%                0.000000                0.000000              0.000000   
50%                0.000000                0.000000              0.000000   
75%                0.000000                0.000000              0.000000   
max                5.000000                3.000000              4.000000   

       total_uninjured  
count     82977.000000  
mean          4.491112  
std          19.884282  
min           0.000000  
25%           0.000000  
50%           1.000000  
75%           2.000000  
max         151.000000  
Shape of data: (88889, 31)

First 5 rows:
         event_id investigation_type ac