In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import difflib # To take closest matches to countries

Have to import module **`us`** to filter data of incidents because if the incident happened in the USA, it will be registred by `county` and `State`<br>
[US MODULE](https://github.com/unitedstates/python-us)

In [2]:
import us

Also I have to import **`pycounrty`** module to check countries

In [3]:
import pycountry
# List of countries
countrylist=[str(country.name) for country in pycountry.countries]

In [4]:
# USA States names in lowercase
USstates=[item.name for item in us.STATES]
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
"SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

In [5]:
def month_number_to_string(string):
    m = {
        '01':'jan','02':'feb','03':'mar','04':'apr','05':'may','06':'jun',
         '07':'jul','08':'aug','09':'sep','10':'oct','11':'nov','12':'dec'
        }
    s = string.strip()[:3].lower()
    try:
        out = m[s]
        return out
    except:
        raise ValueError('Not a month')

## Data set: Airplane Crashes and Fatalities since 1908
https://opendata.socrata.com/Government/Airplane-Crashes-and-Fatalities-Since-1908/q2te-8cvq

In [6]:
crashes_raw=pd.read_csv('Airplane_Crashes_and_Fatalities_Since_1908.csv')
# Drop Description Attribute
crashes_raw=crashes_raw.drop('Summary',axis=1)

# Cleaning dataframe attibute date. I will select month and year for its further use
dates=[[month_number_to_string(item.split('/')[0]),item.split('/')[2]] for item in crashes_raw['Date']]
crashes_1=pd.concat((crashes_raw,pd.DataFrame(dates,columns=['Month','Year'])),axis=1)
colist=crashes_1.columns.tolist()[-2:]+crashes_1.columns.tolist()[2:-2]

In [7]:
crashes_1.head(5)

Unnamed: 0,Date,Time,Location,Operator,Flight #,Route,Type,Registration,cn/In,Aboard,Fatalities,Ground,Month,Year
0,09/17/1908,17:18,"Fort Myer, Virginia",Military - U.S. Army,,Demonstration,Wright Flyer III,,1.0,2.0,1.0,0.0,sep,1908
1,07/12/1912,06:30,"AtlantiCity, New Jersey",Military - U.S. Navy,,Test flight,Dirigible,,,5.0,5.0,0.0,jul,1912
2,08/06/1913,,"Victoria, British Columbia, Canada",Private,-,,Curtiss seaplane,,,1.0,1.0,0.0,aug,1913
3,09/09/1913,18:30,Over the North Sea,Military - German Navy,,,Zeppelin L-1 (airship),,,20.0,14.0,0.0,sep,1913
4,10/17/1913,10:30,"Near Johannisthal, Germany",Military - German Navy,,,Zeppelin L-2 (airship),,,30.0,30.0,0.0,oct,1913


In [8]:
listcountries=[]
for item in crashes_1['Location']:
    if str(item).split(',')[-1].strip() not in USstates and str(item).split(',')[-1].strip() in countrylist:
        listcountries.append(str(item).split(',')[-1].strip())
    elif str(item).split(',')[-1].strip() in USstates:
        listcountries.append('United States')
    else:
        ap=[]
        for e in re.findall('\w+',str(item).replace(',','')):
            al=difflib.get_close_matches(str(e),countrylist,n=1)
            if al!=[]:
                ap.append(al[0])
            else:
                al1=difflib.get_close_matches(str(e).capitalize(),USstates,n=1)
                al2=difflib.get_close_matches(str(e).capitalize(),states,n=1,cutoff=0.5)
                if al1!=[] or al2!=[]:
                    ap.append('United States')                    

        if ap!=[]:
            listcountries.append(ap[0])
        else:
            listcountries.append(str(item))
    
airtype=[]
for item in crashes_1['Operator']:
    if 'Military' in str(item):
        airtype.append('Military')
    else:
        airtype.append('Civil')

crashes_2=crashes_1[['Month','Year','Aboard','Fatalities','Ground']].copy()
crashes_2['Country']=listcountries
crashes_2['Type']=airtype
crashes_2=crashes_2[['Year','Month','Country','Type','Aboard','Fatalities','Ground']]


In [9]:
# Clean Data Frame for crashes for locations out of each country, Water Crashes
crashes_L=crashes_2[crashes_2['Country'].isin(countrylist)].reset_index(drop=True)

# Data Frame for only crashes out of land
crashes_W=crashes_2[~crashes_2['Country'].isin(countrylist)].reset_index(drop=True)

print("")
print("Land crashes")
display(crashes_L.head(5))
print("")
print("Water crashes")
display(crashes_W.head(5))


Land crashes


Unnamed: 0,Year,Month,Country,Type,Aboard,Fatalities,Ground
0,1908,sep,United States,Military,2.0,1.0,0.0
1,1912,jul,United States,Military,5.0,5.0,0.0
2,1913,aug,Canada,Civil,1.0,1.0,0.0
3,1913,sep,Serbia,Military,20.0,14.0,0.0
4,1913,oct,Germany,Military,30.0,30.0,0.0



Water crashes


Unnamed: 0,Year,Month,Country,Type,Aboard,Fatalities,Ground
0,1918,apr,Over the Mediterranean,Military,23.0,23.0,0.0
1,1919,oct,English Channel,Civil,,,
2,1924,apr,Over the English Channel,Civil,3.0,3.0,0.0
3,1926,may,English Channel,Civil,1.0,1.0,0.0
4,1926,oct,English Channel,Civil,12.0,0.0,0.0


In [11]:
len(crashes_L),len(crashes_W)

(5200, 68)

In [13]:
crashes_L.pivot_table(index=['Year'],aggfunc=({'Country':'count','Fatalities':,'Aboard':'sum'}))

Unnamed: 0_level_0,Aboard,Country
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1908,2.0,1
1912,5.0,1
1913,51.0,3
1915,60.0,2
1916,109.0,5
1917,124.0,6
1918,42.0,3
1919,5.0,5
1920,31.0,17
1921,69.0,13
