### National Transportation Safety Board Accident/Incident Data

In [19]:
import pandas as pd
import xml.etree.ElementTree as ET  #XML parsing (https://docs.python.org/2/library/xml.etree.elementtree.html)
import numpy as np
import re #regex
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
#Load data
xmltree = ET.parse('AviationData.xml')

data = []
for el in xmltree.iterfind('./*'):
    for i in el.iterfind('*'):
        data.append(dict(i.items()))

#Convert list to dataframe
df = pd.DataFrame(data)

In [21]:
df.head()

Unnamed: 0,AccidentNumber,AirCarrier,AircraftCategory,AircraftDamage,AirportCode,AirportName,AmateurBuilt,BroadPhaseOfFlight,Country,EngineType,...,PublicationDate,PurposeOfFlight,RegistrationNumber,ReportStatus,Schedule,TotalFatalInjuries,TotalMinorInjuries,TotalSeriousInjuries,TotalUninjured,WeatherCondition
0,GAA15CA244,,,,,,,,United States,,...,,,N786AB,Preliminary,,,,,,
1,CEN15LA392,,Airplane,Substantial,IWS,WEST HOUSTON,No,LANDING,United States,,...,09/04/2015,Instructional,N452CS,Preliminary,,,,,2.0,VMC
2,GAA15CA246,,,,,,,,United States,,...,,,N112RP,Preliminary,,,,,,
3,GAA15CA243,,,,,,,,United States,,...,,,N8404X,Preliminary,,,,,,
4,GAA15CA242,,,,,,,,United States,,...,,,N5255G,Preliminary,,,,,,


In [20]:
tree.getroot().tag

'{http://www.ntsb.gov}DATA'

In [19]:
tree.getroot().attrib

{}

In [34]:
def clean_xml_data(df):
    
    #Subset to US fatalities, since that is the primary interest of the NTSB data
    df_subset = df[df['Country'] == 'United States']
    
    #Replace blank string values with NaN
    df_copy = df_subset.replace('', np.nan)
    
    #Clean AirCarrier Names
    df_copy['AirCarrier'] = df_subset['AirCarrier'].str.title()  #convert to lowercase to deal with companies captalized differently
    #TODO: Use "DBA" name if available
    
    #Split States/Cities in Location Field
    df_copy['City'] = df_subset.Location.str.title().str.split(', ', expand=True)[0]
    df_copy['State'] = df_subset.Location.str.upper().str.split(', ', expand=True)[1]
    
    #Strip Number from Injury Serverity Field
    df_copy['fatalities'] = df_subset['InjurySeverity'].str.extract(r'(\d+)')
    
    
    return(df_copy)
    
df_clean = clean_xml_data(df)
df_clean.head()

Unnamed: 0,AccidentNumber,AirCarrier,AircraftCategory,AircraftDamage,AirportCode,AirportName,AmateurBuilt,BroadPhaseOfFlight,Country,EngineType,...,ReportStatus,Schedule,TotalFatalInjuries,TotalMinorInjuries,TotalSeriousInjuries,TotalUninjured,WeatherCondition,City,State,fatalities
0,GAA15CA244,,,,,,,,United States,,...,Preliminary,,,,,,,Truckee,CA,
1,CEN15LA392,,Airplane,Substantial,IWS,WEST HOUSTON,No,LANDING,United States,,...,Preliminary,,,,,2.0,VMC,Houston,TX,
2,GAA15CA246,,,,,,,,United States,,...,Preliminary,,,,,,,New Milford,CT,
3,GAA15CA243,,,,,,,,United States,,...,Preliminary,,,,,,,New Enterprise,PA,
4,GAA15CA242,,,,,,,,United States,,...,Preliminary,,,,,,,Midlothian,TX,


In [48]:
print('There are {} incidents in the data, with {} columns. The columns available are: {}'.format(df.shape[0], df.shape[1], df.columns.to_list()))

There are 77257 incidents in the data, with 31 columns. The columns available are: ['AccidentNumber', 'AirCarrier', 'AircraftCategory', 'AircraftDamage', 'AirportCode', 'AirportName', 'AmateurBuilt', 'BroadPhaseOfFlight', 'Country', 'EngineType', 'EventDate', 'EventId', 'FARDescription', 'InjurySeverity', 'InvestigationType', 'Latitude', 'Location', 'Longitude', 'Make', 'Model', 'NumberOfEngines', 'PublicationDate', 'PurposeOfFlight', 'RegistrationNumber', 'ReportStatus', 'Schedule', 'TotalFatalInjuries', 'TotalMinorInjuries', 'TotalSeriousInjuries', 'TotalUninjured', 'WeatherCondition']


In [30]:
#Count missing values 
df_clean.isna().sum()

AccidentNumber              0
AirCarrier                  0
AircraftCategory        58199
AircraftDamage           1856
AirportCode             30266
AirportName             26618
AmateurBuilt              273
BroadPhaseOfFlight       2815
Country                     0
EngineType               1143
EventDate                   2
EventId                     0
FARDescription          58053
InjurySeverity            106
InvestigationType           0
Latitude                50507
Location                   33
Longitude               50517
Make                       23
Model                      40
NumberOfEngines          1507
PublicationDate         12462
PurposeOfFlight          2001
RegistrationNumber        148
ReportStatus                0
Schedule                63612
TotalFatalInjuries      19945
TotalMinorInjuries      19904
TotalSeriousInjuries    21026
TotalUninjured           9389
WeatherCondition          373
City                        0
State                      33
fatalities

In [136]:
df_clean.describe()

Unnamed: 0,AccidentNumber,AirCarrier,AircraftCategory,AircraftDamage,AirportCode,AirportName,AmateurBuilt,BroadPhaseOfFlight,Country,EngineType,...,RegistrationNumber,ReportStatus,Schedule,TotalFatalInjuries,TotalMinorInjuries,TotalSeriousInjuries,TotalUninjured,WeatherCondition,City,State
count,77257,77257.0,16520,74873,43477,47331.0,76603,71030,76747,74048,...,74501,77257,11379,55791,54733,53744,65862,75268,77257,76671
unique,77257,2720.0,12,3,9488,22283.0,2,12,173,14,...,67492,4,3,117,62,40,363,3,14542,329
top,LAX86LA318,,Airplane,Substantial,NONE,,No,LANDING,United States,Reciprocating,...,NONE,Probable Cause,UNK,0,0,0,1,VMC,Anchorage,CA
freq,1,73439.0,14210,55420,1457,1210.0,69198,18553,73076,63016,...,363,72264,4099,40363,40342,42955,22029,68764,501,8027


In [67]:
df_clean.AirCarrier.value_counts()

                                                          73439
united airlines                                              60
american airlines                                            50
delta air lines inc                                          26
continental airlines                                         26
usair                                                        24
american airlines, inc.                                      23
continental airlines, inc.                                   21
southwest airlines co                                        21
delta air lines                                              17
united air lines inc                                         14
us airways inc                                               13
delta airlines                                               13
simmons airlines (dba: american eagle)                       12
american airlines inc                                        12
northwest airlines                      

In [55]:
df_clean.AircraftCategory.value_counts()

Airplane             14210
Helicopter            1775
Glider                 261
Balloon                120
Gyrocraft               62
Unknown                 40
Weight-Shift            20
Ultralight              18
Powered Parachute       10
Powered-Lift             2
Blimp                    1
Rocket                   1
Name: AircraftCategory, dtype: int64

In [60]:
df_clean['AircraftDamage'].value_counts()[0:20]

Substantial    55420
Destroyed      17003
Minor           2450
Name: AircraftDamage, dtype: int64

In [115]:
df_clean['City'].value_counts()

ANCHORAGE          372
MIAMI              190
CHICAGO            169
HOUSTON            164
ALBUQUERQUE        164
FAIRBANKS          138
JACKSONVILLE       135
Anchorage          129
COLUMBUS           118
ENGLEWOOD          114
ORLANDO            114
LAS VEGAS          107
TUCSON             107
ATLANTA            102
COLUMBIA           101
PHOENIX            100
DENVER              98
RENO                93
Houston             92
Phoenix             89
LOS ANGELES         89
SPRINGFIELD         88
CHANDLER            87
SEATTLE             87
SAN JOSE            85
JACKSON             85
SAN DIEGO           84
                    84
FORT WORTH          83
Albuquerque         81
                  ... 
Lake George          1
BELLEAIR             1
MANGROVE CAY         1
WEDOWEE              1
Ellenville           1
LAVA HOT SPRING      1
Isom                 1
MYAKKA               1
Ship Shoal 208H      1
Mountain Ranch       1
WINTERPORT           1
St Elmo              1
PUUNENE    

In [63]:
for col in df.columns:
    
    print(df_clean[col].value_counts()[0:20])

LAX86LA318     1
SEA02LA170     1
CEN13CA144     1
CEN15LA233     1
BFO93LA086     1
LAX03LA112     1
DEN00FA086     1
NYC91LA054     1
LAX00LA167     1
ANC98LA130     1
MKC90LA021     1
LAX98LA302A    1
FTW02LA074     1
SEA05LA065     1
ERA12FA175     1
ATL83LA162     1
FTW89LA056     1
CHI93LA064     1
ERA10LA184     1
LAX91FA344     1
Name: AccidentNumber, dtype: int64
UNITED AIRLINES                           49
AMERICAN AIRLINES                         41
CONTINENTAL AIRLINES                      25
USAIR                                     24
DELTA AIR LINES INC                       23
AMERICAN AIRLINES, INC.                   22
SOUTHWEST AIRLINES CO                     21
CONTINENTAL AIRLINES, INC.                19
UNITED AIR LINES INC                      14
AMERICAN AIRLINES INC                     12
SIMMONS AIRLINES (DBA: AMERICAN EAGLE)    12
US AIRWAYS INC                            12
TRANS WORLD AIRLINES                      11
DELTA AIRLINES                          

In [148]:
df_clean.InjurySeverity.value_counts()

Non-Fatal      58499
Fatal(1)        7598
Fatal(2)        4494
Incident        3050
Fatal(3)        1407
Fatal(4)         977
Fatal(5)         301
Unavailable      193
Fatal(6)         190
Fatal(7)          79
Fatal(8)          64
Fatal(10)         40
Fatal(9)          35
Fatal(14)         17
Fatal(11)         17
Fatal(13)         11
Fatal(12)         10
Fatal(20)          6
Fatal(15)          5
Fatal(17)          5
Fatal(18)          5
Fatal(19)          5
Fatal(25)          4
Fatal(21)          4
Fatal(23)          4
Fatal(24)          3
Fatal(154)         3
Fatal(65)          3
Fatal(34)          3
Fatal(16)          3
               ...  
Fatal(189)         1
Fatal(141)         1
Fatal(107)         1
Fatal(145)         1
Fatal(256)         1
Fatal(143)         1
Fatal(72)          1
Fatal(57)          1
Fatal(110)         1
Fatal(138)         1
Fatal(114)         1
Fatal(265)         1
Fatal(26)          1
Fatal(132)         1
Fatal(144)         1
Fatal(127)         1
Fatal(121)   

In [None]:
#Plot number of incidents over time

