In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from geopy.exc import GeocoderServiceError
from geopy.geocoders import Nominatim
%matplotlib inline

In [None]:
# Load the raw aviation records. The encoding is given explicitly as latin-1
# (presumably the file is not valid UTF-8 — confirm against the data source).
raw_csv_path = 'data/AviationData.csv'
df = pd.read_csv(raw_csv_path, encoding='latin-1')

# Number of records per aircraft category, most frequent first.
aircraft_category_counts = df['Aircraft.Category'].value_counts()
aircraft_category_counts

In [None]:
# Preview the first five raw rows to check that the columns parsed as expected.
df.head(5)

In [None]:
# Distinct values found in the Investigation.Type column.
accident_types = df['Investigation.Type'].unique()

# Keep only the rows categorised as 'Airplane' and index them by Event.Id.
is_airplane = df['Aircraft.Category'].eq('Airplane')
airplane_df = df.loc[is_airplane].set_index(['Event.Id'])

In [None]:
# Per-Make totals of every numeric column.
records_by_make = airplane_df.groupby('Make')
group_by_airplane = records_by_make.sum(numeric_only=True)

# Number of airplane records per Make, most frequent first.
accidents_by_make = airplane_df.Make.value_counts()
accidents_by_make

In [None]:
# Distinct, non-null location strings from the full dataset.
locations_list = df['Location'].dropna().unique()

# Nominatim geocoding client, identified to the service by this user agent.
geolocator = Nominatim(user_agent='phase1_project')

In [None]:
# How many distinct locations there are.
n_locations = len(locations_list)
n_locations

In [None]:
# Geocoding helpers. Defining them is cheap and makes no network calls; *calling*
# geo_df_generator sends one geocoding request per location, so only do that deliberately.
def coordinate_finder(location: str):
    """Geocode a single location string with the notebook-level ``geolocator``.

    Args:
        location: Free-text place name to look up.

    Returns:
        Whatever ``geolocator.geocode`` returns for ``location``; the caller
        treats ``None`` as "no match found".
    """
    return geolocator.geocode(location)


def geo_df_generator(locations: list) -> dict:
    """Collect coordinates for every location that can be geocoded.

    Each location is looked up independently: a failed lookup is reported with
    ``print`` and skipped, so one bad location no longer discards the results
    gathered for all the others.

    Args:
        locations: Iterable of location strings.

    Returns:
        A dict with the parallel lists ``'Location'``, ``'Longitude_city'`` and
        ``'Latitude_city'``, suitable for passing to ``pd.DataFrame``. Locations
        with no match or a failed lookup are omitted.
    """
    # Named `coordinates` rather than `df` so the notebook's DataFrame is not shadowed.
    coordinates = {'Location': [],
                   'Longitude_city': [],
                   'Latitude_city': []}
    for city in locations:
        try:
            get_info = coordinate_finder(city)
        except (AttributeError, GeocoderServiceError) as err:
            print(err, city)
            continue
        if get_info is not None:
            coordinates['Location'].append(city)
            coordinates['Longitude_city'].append(get_info.longitude)
            coordinates['Latitude_city'].append(get_info.latitude)
    return coordinates


In [None]:
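# DO NOT RUN as part of a normal pass: this geocodes the first 1,000 locations,
# one geocoder request per location. Uncomment only when the coordinates are needed.
# coordinates_df = pd.DataFrame(geo_df_generator(locations_list[:1000]))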

## Location cleaning

In [None]:
# Normalise the free-text location: missing values become 'UNK', text is upper-cased,
# and leading/trailing commas are removed (whitespace is left untouched).
airplane_df['Location'] = (
    airplane_df['Location']
    .fillna('UNK')
    .str.upper()
    .str.strip(',')
)

# Create the State column from the last two characters of the location string
# (assumes US locations end in a two-letter state code, e.g. "CITY, ST" — TODO confirm).
airplane_df['State'] = airplane_df['Location'].str[-2:]

# Only US records keep a state; every other row (including rows with no country) is 'UNK'.
# A single .loc assignment is used because the chained form
# `airplane_df['State'].loc[mask] = ...` writes to a temporary object and does not
# update airplane_df when pandas Copy-on-Write is active.
airplane_df.loc[airplane_df['Country'] != 'United States', 'State'] = 'UNK'

In [None]:
# (state code, number of records) pairs, most frequent state first.
state_counts = list(airplane_df['State'].value_counts().items())
state_counts

In [None]:
# Print one example location per derived state code so odd codes can be checked by eye.
# The seed is fixed so the same examples are printed on every run.
SAMPLE_SEED = 42
for state, _count in state_counts:
    example_location = (
        airplane_df.loc[airplane_df['State'] == state, 'Location']
        .sample(1, random_state=SAMPLE_SEED)
    )
    print(example_location)

Abbreviations used in the `State` column for locations that are not one of the US states:

- ATL: Atlantic Ocean
- NMI: Northern Mariana Islands
- USVI: US Virgin Islands
- GU: Guam
- GM: Gulf of Mexico

In [None]:
# Map stray two-letter location suffixes onto the region abbreviations used in this notebook.
# Entries are grouped by the abbreviation they are rewritten to.
state_code_fixes = {
    'AO': 'ATL',
    'AN': 'NMI',
    'PO': 'GU', 'NG': 'GU', 'NA': 'GU', 'TA': 'GU',
    'IX': 'USVI', 'IE': 'USVI', 'ED': 'USVI', 'AS': 'USVI',
}
airplane_df['State'] = airplane_df['State'].replace(state_code_fixes)

## Airport codes cleaning

In [None]:
# Flag the rows that have no airport code.
airport_code_missing = airplane_df['Airport.Code'].isna()

# Count the missing values and express them as a share of all rows.
missing_values_count = airport_code_missing.sum()
total_entries = len(airport_code_missing)
# Guard the division so an empty frame reports 0% instead of dividing by zero.
percentage_missing = (missing_values_count / total_entries) * 100 if total_entries else 0.0

print(f"Number of missing values: {missing_values_count}")
print(f"Percentage of missing values: {percentage_missing:.2f}%")

In [None]:
# Upper-case every airport code so differently-cased spellings compare equal.
airport_codes_upper = airplane_df['Airport.Code'].str.upper()
airplane_df['Airport.Code'] = airport_codes_upper


In [None]:
# Show the first rows with every column visible. option_context restores the previous
# display.max_columns value when the block exits (even if rendering raises), whereas
# set_option/reset_option would leave the option changed on error and always fall back
# to the pandas default rather than the value that was set before.
with pd.option_context('display.max_columns', None):
    display(airplane_df.head())



In [None]:
# Summary statistics for the airplane subset (pandas' default describe() output).
airplane_summary = airplane_df.describe()
airplane_summary

In [None]:
# Restrict to the ten makes with the most records.
top_makes = airplane_df['Make'].value_counts().index[:10]
in_top_makes = airplane_df['Make'].isin(top_makes)
top_makes_df = airplane_df[in_top_makes]

# Compare the distribution of fatal injuries per record across those makes.
sns.boxplot(data=top_makes_df, x='Make', y='Total.Fatal.Injuries')
plt.show()