# In [None]:
import pandas as pd
import seaborn as sns

# Load the dataset
df = pd.read_csv('airplane crashes.csv')

# 2. Determine the number of rows and columns in the dataset
num_rows, num_cols = df.shape
print(f"Number of rows: {num_rows}, Number of columns: {num_cols}")

# 3. Display the last 75 rows in the dataset
# pandas truncates any printed frame that is longer than display.max_rows
# (60 by default), collapsing it to a few rows around an ellipsis. The limit
# is raised for this print only so that all 75 rows are actually shown; the
# global display options are restored when the `with` block exits.
with pd.option_context('display.max_rows', 75):
    print(df.tail(75))

# 4. Methods for treating missing data in each column (except the first one)
# For every column after the first, report how many values are missing and
# suggest an imputation strategy based on whether the column is numeric.
#
# The numeric check uses pandas' own dtype helper instead of comparing the
# dtype against 'object': text columns are not always reported as 'object'
# (e.g. dedicated string dtypes), and categorical or datetime columns are not
# 'object' either. None of those can be meaningfully imputed with a
# mean/median, so anything non-numeric gets the mode/placeholder advice.
for col in df.columns[1:]:
    num_missing = df[col].isnull().sum()
    dtype = df[col].dtype
    if pd.api.types.is_numeric_dtype(dtype):
        method = "Impute with mean/median"
        justification = "Numerical data; mean or median preserves distribution."
    else:
        method = "Impute with mode or 'Unknown'"
        justification = "Categorical data; mode or a placeholder is appropriate."
    print(f"Column: {col}, Missing: {num_missing}, Method: {method}, Justification: {justification}")

# 5. Build the 'fatality_locations' dataframe as an independent copy of the
#    four columns used by the remaining steps
location_columns = ['Date', 'Location', 'Aboard', 'Fatalities']
fatality_locations = df.loc[:, location_columns].copy()

# 6. Date of the highest number of recorded fatalities
# idxmax() gives the index label of the first row holding the maximum value;
# that label is then used to look up the matching 'Date'.
fatalities = fatality_locations['Fatalities']
max_fatalities_idx = fatalities.idxmax()
date_max_fatalities = fatality_locations.loc[max_fatalities_idx, 'Date']
print(f"Date with highest fatalities: {date_max_fatalities}")

# 7. Count the crashes in which nobody died (Fatalities equal to zero)
zero_fatalities = fatality_locations[fatalities.eq(0)]
num_zero_fatalities = len(zero_fatalities)
print(f"Number of crashes with zero fatalities: {num_zero_fatalities}")

# 8. Split 'Location' into 'Region' and 'State/Country'
# The split is made on the LAST comma so that the trailing component (the
# state or country) is what lands in 'State/Country'. Splitting on the first
# comma would turn "Victoria, British Columbia, Canada" into the group
# "British Columbia, Canada", fragmenting any per-country aggregation.
# Locations without a comma keep their full text in 'Region' and get a
# missing 'State/Country'.
location_split = fatality_locations['Location'].str.rsplit(',', n=1, expand=True)
# Both parts are stripped so stray spaces around the comma do not create
# distinct-looking values.
fatality_locations['Region'] = location_split[0].str.strip()
# If no location contains a comma the expanded frame has a single column, so
# guard the access to column 1.
fatality_locations['State/Country'] = (
    location_split[1].str.strip() if location_split.shape[1] > 1 else None
)

# 9. Order by fatalities and select top 100
# Descending sort; rows with missing fatalities are placed last by default.
top_100 = fatality_locations.sort_values(by='Fatalities', ascending=False).head(100)

# 10. Pie chart of top 25 fatalities per country/state
import matplotlib.pyplot as plt

# Sum fatalities for every 'State/Country' value, then keep the 25 largest
# totals in descending order.
fatalities_by_place = fatality_locations.groupby('State/Country')['Fatalities'].sum()
ranked_places = fatalities_by_place.sort_values(ascending=False)
top_25 = ranked_places.iloc[:25]

# Draw the pie on a square figure; each wedge is labelled with its share.
plt.figure(figsize=(10, 10))
top_25.plot(kind='pie', autopct='%1.1f%%', startangle=140)
plt.ylabel('')  # hide the default axis label that pandas takes from the series name
plt.title('Top 25 Fatalities by Country/U.S. State')
plt.show()