# General Overview

In [None]:
# Basic Libraries
import re

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # apply Seaborn's default style to every figure drawn after this point

plt.rcParams["axes.grid"] = False # turn the axes grid off by default; set after sb.set() so this setting is the one in effect

In [None]:
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly import tools
# Activate inline plotting in the notebook so py.iplot(...) renders figures in cell output.
# NOTE(review): connected=False presumably embeds plotly.js instead of loading it from a CDN - confirm against the plotly.offline docs.
py.init_notebook_mode(connected = False)

In [None]:
# Load the dataset from the working directory; the file is decoded as ISO-8859-1.
aviationData = pd.read_csv('AviationData.csv', encoding = 'ISO-8859-1')
# Show the first rows as a quick sanity check of the load.
aviationData.head()

In [None]:
# Summarise column names, non-null counts and dtypes of the raw frame.
aviationData.info()

In [None]:
# Split 'Event.Date' on '/' into three string columns.
# NOTE(review): the parts are assigned in day / month / year order, exactly as
# before - confirm the CSV really is day-first; a month-first export would
# swap the meaning of the 'day' and 'month' columns.
date_parts = (
    aviationData['Event.Date']
    .astype(str)
    .str.split('/', n=2, expand=True)
    .reindex(columns=range(3))  # malformed dates yield missing parts instead of raising
)
aviationData['day'] = date_parts[0]
aviationData['month'] = date_parts[1]
aviationData['year'] = date_parts[2]

# Derive two columns from 'Injury.Severity':
#   Fatal_Counts - 0 for 'Non-Fatal', the number embedded in the text when one
#                  is present (e.g. 'Fatal(2)' -> 2), NaN otherwise.
#   Fatal_Bool   - 'None' for 'Non-Fatal', 'Fatal' when a number was found,
#                  'Unknown' for everything else (including missing values).
severity = aviationData['Injury.Severity'].astype(str)
is_non_fatal = severity.eq('Non-Fatal')

# Strip every non-digit character; an empty result means no count was given.
severity_digits = severity.str.replace(r'\D', '', regex=True)
parsed_counts = pd.to_numeric(severity_digits, errors='coerce')
has_count = parsed_counts.notna() & ~is_non_fatal

aviationData['Fatal_Counts'] = parsed_counts.where(has_count).mask(is_non_fatal, 0)
aviationData['Fatal_Bool'] = np.select(
    [is_non_fatal, has_count],
    ['None', 'Fatal'],
    default='Unknown',
)

In [None]:
# List the column labels, which now include the derived date and fatality columns.
print(aviationData.columns)

## Univariate Plots Section

### Accidents by Year and Month

Plot frequency histograms for the year, month, and weekday of accidents in the dataset.

The majority of the observations in the dataframe date from the early 1980s onwards, so let’s generate a plot from 1980 to 2019.

In [None]:
# Preview the first five 'Accident' records of every year.
is_accident = aviationData["Investigation.Type"] == "Accident"
aviationData.loc[is_accident].groupby('year').head()

#### Year of Record Accident/Incident

In [None]:
investTypeYear = aviationData[["Investigation.Type", "year"]]

# Count records per (year, investigation type) and pivot the types into columns.
counts_by_year = (
    investTypeYear
    .groupby(['year', 'Investigation.Type'])['Investigation.Type']
    .count()
    .unstack()
)

# One bar group per year: accidents in grey, incidents in yellow.
counts_by_year[['Accident', 'Incident']].plot(
    kind='bar',
    color=['xkcd:grey', 'xkcd:yellow'],
    figsize=(15, 6),
)

plt.xticks(rotation=60)
plt.title('Year of Record Accident/Incident (1948-2019)')


The number of accidents has overall decreased by approx. 59% between 1982 and 2017 from approx. 3400 observations to approx. 1400 observations.

#### Month of Record Accident/Incident

In [None]:
investTypeMonth = aviationData[["Investigation.Type", "month"]]

# Count records per (month, investigation type) and pivot the types into columns.
counts_by_month = (
    investTypeMonth
    .groupby(['month', 'Investigation.Type'])['Investigation.Type']
    .count()
    .unstack()
)

# One bar group per month: accidents in grey, incidents in yellow.
counts_by_month[['Accident', 'Incident']].plot(
    kind='bar',
    color=['xkcd:grey', 'xkcd:yellow'],
    figsize=(15, 6),
)

plt.xticks(rotation=60)
plt.title('Month of Record Accident/Incident (1948-2019)')

Within a given year, the highest number of accidents in the dataset occurs during the northern-hemisphere summer (Jun-Jul-Aug). This is likely to be correlated with the increased number of flights during the summer holiday period.

### Total Fatal Injuries

In [None]:
def fatalGroup(item):
    """Map a fatality count to the label of the range bucket it falls in.

    Counts 0 through 5 each get their own bucket ('0' ... '5'). Larger counts
    are grouped into the inclusive ranges '6-10', '11-20', '21-50', '51-100',
    '101-150', '151-200', '201-250' and '251-300'; anything above 300 is
    labelled '301+'.

    Parameters
    ----------
    item : int or float
        Number of fatalities.

    Returns
    -------
    str or None
        The bucket label, or None when ``item`` is NaN or negative (every
        comparison below is False for those values).
    """
    # Single-value buckets.
    for exact in range(6):
        if item == exact:
            return str(exact)

    # (exclusive lower bound, inclusive upper bound, label). The bounds are
    # contiguous so that no count between 6 and 300 falls through a gap.
    bounded_ranges = (
        (5, 10, '6-10'),
        (10, 20, '11-20'),
        (20, 50, '21-50'),
        (50, 100, '51-100'),
        (100, 150, '101-150'),
        (150, 200, '151-200'),
        (200, 250, '201-250'),
        (250, 300, '251-300'),
    )
    for lower, upper, label in bounded_ranges:
        if lower < item <= upper:
            return label

    # Open-ended top bucket so very large events are not silently dropped.
    if item > 300:
        return '301+'
    return None

In [None]:
# Build the working frame from a filled *copy* of the column, so that
# aviationData['Fatal_Counts'] keeps its NaNs for the later plots.
# Missing counts are treated as 0 here.
fatal_count = aviationData['Fatal_Counts'].fillna(0).to_frame()
fatal_count['fatal_range'] = fatal_count['Fatal_Counts'].map(fatalGroup)

# Rows: range label; columns: exact fatality count; values: number of records.
rearrange = (
    fatal_count
    .groupby(['fatal_range', 'Fatal_Counts'])['Fatal_Counts']
    .count()
    .unstack()
    .fillna(0)
)
rearrange.head()


### Engine Types

In [None]:
# One-column frame of engine types with missing values labelled "Unknown".
# fillna returns a new Series, so aviationData['Engine.Type'] is left untouched.
engine_types = aviationData['Engine.Type'].fillna("Unknown").to_frame()
engine_types.head(10)

In [None]:
# Pass the vector by keyword: newer seaborn releases no longer accept it positionally.
sb.countplot(x=engine_types['Engine.Type'], color='blue')
plt.xticks(rotation=60)
plt.title('Engine Type in Observations (1948-2019)')

According to the plot above, the bulk of the reported accidents involve Reciprocating engines.

### Weather Conditions

VMC: conditions are such that pilots have sufficient visibility to fly the aircraft maintaining visual separation from terrain and other aircraft.

IMC: conditions require pilots to fly primarily by reference to instruments.


In [None]:
# One-column frame of weather conditions with missing values labelled "Unknown".
# fillna returns a new Series, so aviationData['Weather.Condition'] is left untouched.
weather_c = aviationData['Weather.Condition'].fillna("Unknown").to_frame()

# Pass the vector by keyword: newer seaborn releases no longer accept it positionally.
sb.countplot(x=weather_c['Weather.Condition'], color='blue')
plt.xticks(rotation=60)
plt.title('Weather Condition in Observations (1948-2019)')

The bulk of accidents in the dataset take place in VMC weather conditions, which are favourable conditions for flying, as VMC requires greater visibility and cloud clearance than IMC.

### Broad Phases of Flight

In [None]:
# One-column frame of flight phases with missing values labelled "Unknown".
# fillna returns a new Series, so aviationData['Broad.Phase.of.Flight'] is left untouched.
phase = aviationData['Broad.Phase.of.Flight'].fillna("Unknown").to_frame()

# Pass the vector by keyword: newer seaborn releases no longer accept it positionally.
sb.countplot(x=phase['Broad.Phase.of.Flight'], color='blue')
plt.xticks(rotation=60)
plt.title('Broad Phase of Flight in Observations (1948-2019)')

According to the plot, the bulk of accidents took place during landing or take-off. It is well known in the industry that these are high-risk.

## Bivariate Plots Section

This section explores pairs of variables that could show an interesting relationship.

### Engine Types and Total Fatal Injuries

In [None]:
# Two side-by-side scatter plots of fatalities per engine type:
# left = all records, right = records with at most 10 fatalities.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Engine Types Vs. Total Fatal Injuries',
                    'Engine Types Vs. Total Fatal Injuries (<10)'),
)

p1 = go.Scatter(
        x = aviationData['Engine.Type'],
        y = aviationData['Fatal_Counts'],
        mode = 'markers', showlegend = False)
fig.add_trace(p1, row=1, col=1)

# Filter the rows once so x and y stay paired; filtering only y would match
# each remaining count with the engine type of an unrelated record.
low_fatal = aviationData[aviationData['Fatal_Counts'] <= 10]
p2 = go.Scatter(
        x = low_fatal['Engine.Type'],
        y = low_fatal['Fatal_Counts'],
        mode = 'markers', showlegend = False)
fig.add_trace(p2, row=1, col=2)

fig.update_layout(height=400, width=800)
py.iplot(fig)

The first plot shows that the Turbo-Fan engine has more outliers with a higher number of fatalities than other engines. From the second plot, the bulk of the records with fewer than 10 fatalities involve the Reciprocating engine type.

### Weather Conditions Vs. Total Fatal Injuries

In [None]:
# Two side-by-side scatter plots of fatalities per weather condition:
# left = all records, right = records with at most 10 fatalities.
# (This cell previously held only a stray `c`, which raised a NameError.)
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Weather Conditions Vs. Total Fatal Injuries',
                    'Weather Conditions Vs. Total Fatal Injuries (<10)'),
)

p1 = go.Scatter(
        x = aviationData['Weather.Condition'],
        y = aviationData['Fatal_Counts'],
        mode = 'markers', showlegend = False)
fig.add_trace(p1, row=1, col=1)

# Filter the rows once so x and y stay paired.
low_fatal = aviationData[aviationData['Fatal_Counts'] <= 10]
p2 = go.Scatter(
        x = low_fatal['Weather.Condition'],
        y = low_fatal['Fatal_Counts'],
        mode = 'markers', showlegend = False)
fig.add_trace(p2, row=1, col=2)

fig.update_layout(height=400, width=800)
py.iplot(fig)

As previously noted, weather conditions do not show a particularly strong relationship with total fatal injuries. The bulk of the distribution is associated with VMC weather conditions. However, that is likely due to the fact that the vast majority of flights are flown in VMC conditions.

### Phase of Flight and Total Fatal Injuries

In [None]:
# Two side-by-side scatter plots of fatalities per phase of flight:
# left = all records, right = records with at most 10 fatalities.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Phase of Flight Vs. Total Fatal Injuries',
                    'Phase of Flight Vs. Total Fatal Injuries (<10)'),
)

p1 = go.Scatter(
        x = aviationData['Broad.Phase.of.Flight'],
        y = aviationData['Fatal_Counts'],
        mode = 'markers', showlegend = False)
fig.add_trace(p1, row=1, col=1)

# Filter the rows once so x and y stay paired; filtering only y would match
# each remaining count with the flight phase of an unrelated record.
low_fatal = aviationData[aviationData['Fatal_Counts'] <= 10]
p2 = go.Scatter(
        x = low_fatal['Broad.Phase.of.Flight'],
        y = low_fatal['Fatal_Counts'],
        mode = 'markers', showlegend = False)
fig.add_trace(p2, row=1, col=2)

fig.update_layout(height=400, width=800)
py.iplot(fig)

The plots show that Take-Off and Approach are associated with outliers with high number of fatalities. As previously noted, these two phases of flight are often referred to as “critical phases of flight” for that particular reason.

### Broad Phase of Flight and Weather Conditions

In [None]:
# Single scatter plot marking which (phase of flight, weather condition)
# combinations occur in the data.
fig = make_subplots(
    rows=1, cols=1,
    # The trailing comma makes this a one-element tuple; a bare string would be
    # indexed per character and only its first letter used as the title.
    subplot_titles=('Broad Phase of Flight Vs. Weather Conditions',),
)

p1 = go.Scatter(
        x = aviationData['Broad.Phase.of.Flight'],
        y = aviationData['Weather.Condition'],
        mode = 'markers', showlegend = False)
fig.add_trace(p1, row=1, col=1)

fig.update_layout(height=400, width=800)
py.iplot(fig)

The plot indicates that there is a higher frequency of recorded observations for certain combinations of weather and phase of flight — for example, IMC flying conditions during the “cruise” or “approach” phases of flight.

### Longitude and Latitude of Recorded Accidents

Plotting the Latitude vs Longitude of the accidents essentially gives us the map of the US. The plot also indicates that the coastal states are more heavily impacted than the mid-western states and most of Alaska. This can be explained by the volume of flights to and from destinations in those areas of the US.

## Multivariate Plots Section