Overview?

Business Problem

# **Importing Packages**

---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

# **Importing Project Data**

---

In [2]:
df = pd.read_csv('data/Aviation_Data.csv', encoding='latin-1', low_memory=False)

In [3]:
df.head()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.922223,-81.878056,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980


In [4]:
df.tail()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
90343,20221227106491,Accident,ERA23LA093,2022-12-26,"Annapolis, MD",United States,,,,,...,Personal,,0.0,1.0,0.0,0.0,,,,29-12-2022
90344,20221227106494,Accident,ERA23LA095,2022-12-26,"Hampton, NH",United States,,,,,...,,,0.0,0.0,0.0,0.0,,,,
90345,20221227106497,Accident,WPR23LA075,2022-12-26,"Payson, AZ",United States,341525N,1112021W,PAN,PAYSON,...,Personal,,0.0,0.0,0.0,1.0,VMC,,,27-12-2022
90346,20221227106498,Accident,WPR23LA076,2022-12-26,"Morgan, UT",United States,,,,,...,Personal,MC CESSNA 210N LLC,0.0,0.0,0.0,0.0,,,,
90347,20221230106513,Accident,ERA23LA097,2022-12-29,"Athens, GA",United States,,,,,...,Personal,,0.0,1.0,0.0,1.0,,,,30-12-2022


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90348 entries, 0 to 90347
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                88889 non-null  object 
 1   Investigation.Type      90348 non-null  object 
 2   Accident.Number         88889 non-null  object 
 3   Event.Date              88889 non-null  object 
 4   Location                88837 non-null  object 
 5   Country                 88663 non-null  object 
 6   Latitude                34382 non-null  object 
 7   Longitude               34373 non-null  object 
 8   Airport.Code            50249 non-null  object 
 9   Airport.Name            52790 non-null  object 
 10  Injury.Severity         87889 non-null  object 
 11  Aircraft.damage         85695 non-null  object 
 12  Aircraft.Category       32287 non-null  object 
 13  Registration.Number     87572 non-null  object 
 14  Make                    88826 non-null

## **Cleaning Data**

---

In [None]:
# Clean column names, replacing . to _ and making them lowercase
df = df.rename(columns={c: c.lower().replace('.', '_') for c in df.columns})

In [None]:
# Convert event_date column to datetime format
df['event_date'] = pd.to_datetime(df['event_date'])

# We will be looking at data from 2001 to 2022
usa_df = df[df['event_date'] > '2001-11-19']

#dropping the columns we will not be using
usa_df = df[['location','investigation_type','event_date','country','injury_severity','airport_name','aircraft_category','make', 'model',
  'number_of_engines', 'engine_type','injury_severity','total_fatal_injuries', 'total_uninjured', 'total_serious_injuries',
  'total_minor_injuries', 'latitude','longitude', 'amateur_built']]

In [10]:
#Creating a new dataframe with data from the US
usa_df = df[df['country'] == 'United States']

# Split location column into city and state columns
usa_df[['city', 'state']] = usa_df['location'].str.split(', ', n=1, expand=True)

#Cleaning city names
usa_df['city'].str.title()

#We are only interested in airplanes in aircraft category
usa_df = usa_df.loc[~usa_df['aircraft_category'].isin(usa_df['aircraft_category'].value_counts().index[1:])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [11]:
#Cleaning the injury severity column
usa_df['injury_severity'] = usa_df['injury_severity'].replace(regex=r'Fatal\(\d+\)', value='Fatal')

In [None]:
# Import necessary libraries
from geopy.geocoders import Nominatim
import pandas as pd

# Create geolocator object
geolocator = Nominatim(user_agent="my_app")

# Loop through each row in dataframe
for index, row in usa_df.iterrows():
    # Check if latitude and longitude are missing
    if pd.isnull(row['latitude']) or pd.isnull(row['longitude']):
        # Get location string
        location = str(row['airport_name']) + ', ' + str(row['country'])
        # Use geolocator to get latitude and longitude
        try:
            location = geolocator.geocode(location)
            usa_df.at[index, 'latitude'] = location.latitude
            usa_df.at[index, 'longitude'] = location.longitude
        except:
            pass

In [None]:
usa_df.loc[usa['total_fatal_injuries'].isna(), 'total_fatal_injuries']=0

minor_injuries=usa_df.loc[(usa_df['total_uninjured'].notna()) & (usa_df['total_minor_injuries'].notna())]
minor_injuries_ratio=minor_injuries['total_minor_injuries'].sum()/minor_injuries['total_uninjured'].sum()

serious_injuries=usa_df.loc[(usa_df['total_uninjured'].notna()) & (usa_df['total_serious_injuries'].notna())]
serious_injuries_ratio=serious_injuries['total_serious_injuries'].sum()/serious_injuries['total_uninjured'].sum()

usa_df.loc[usa_df['total_uninjured'].isna(), 'total_uninjured']=usa_df['total_uninjured'].median()
usa_df.loc[us_dfa['total_minor_injuries'].isna(), 'total_minor_injuries']=round(minor_injuries_ratio*usa_df['total_uninjured'], 0)
usa_df.loc[usa_df['total_serious_injuries'].isna(), 'total_serious_injuries']=round(serious_injuries_ratio*usa_df['total_uninjured'], 0)

**Private Plane Analysis**

In [None]:
#Creating a figure showing investigation occurance by Airplane Make
fig, ax=plt.subplots(figsize=(16,9))
sns.set_style('darkgrid')
makes=sns.barplot(data=private_planes, x=private_planes['make'].value_counts().index[:20], y=private_planes['make'].value_counts().values[:20])
makes.set_title('20 Most Frequent Airplane Makes Found in Investigations')
makes.set_xlabel('Make', fontsize=15)
makes.set_ylabel('Number of Occurrences', fontsize=15)
makes.set_xticklabels(private_planes['make'].value_counts().index[:20], rotation=-45, ha='left');
plt.show()

#Showing fatalities by Airplane Make
fatalities=private_planes.groupby('make')['total_fatal_injuries'].sum().sort_values(ascending=False)
fig, ax=plt.subplots(figsize=(16,9))
sns.set_style('darkgrid')
f=sns.barplot(data=private_planes, x=fatalities.index[:20], y=fatalities.values[:20])
f.set_title('Fatalities by Airplane Make', fontsize=15)
f.set_xlabel('Make', fontsize=15)
f.set_ylabel('Fatalities', fontsize=15)
f.set_xticklabels(fatalities.index[:20], rotation=-45, ha='left');
plt.show()


#Creating a figure showing investigation frequnecy by Airplane Model
fig, ax=plt.subplots(figsize=(16,9))
sns.set_style('darkgrid')
makes=sns.barplot(data=private_planes, x=private_planes['model'].value_counts().index[:20], y=private_planes['model'].value_counts().values[:20])
makes.set_title('20 Most Frequent Airplane Models Found in Accidents', fontsize=15)
makes.set_xlabel('Model', fontsize=15)
makes.set_ylabel('Number of Occurrences', fontsize=15)
makes.set_xticklabels(private_planes['model'].value_counts().index[:20], rotation=-45, ha='center');
plt.show()

#Showing fatalities by Airplane Model
fatalities=private_planes.groupby('model')['total_fatal_injuries'].sum().sort_values(ascending=False)
fig, ax=plt.subplots(figsize=(16,9))
sns.set_style('darkgrid')
f=sns.barplot(data=private_planes, x=fatalities.index[:20], y=fatalities.values[:20])
f.set_title('Fatalities by Airplane Model', fontsize=15)
f.set_xlabel('Model', fontsize=15)
f.set_ylabel('Fatalities', fontsize=15)
f.set_xticklabels(fatalities.index[:20], rotation=-45, ha='left');
plt.show()

**Commerical Plane Analysis**

**Location of Operation Analysis**