# DATA ANALYSIS WITH PANDAS - DAY 19 - WEEK 4

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import difflib # To take closest matches to countries

Have to import module **`us`** to filter data of incidents because if the incident happened in the USA, it will be registred by `county` and `State`<br>
[US MODULE](https://github.com/unitedstates/python-us)

In [2]:
import us
# USA States names in lowercase - Using US module
USstates=[item.name for item in us.STATES]
states = list(us.states.mapping('name', 'abbr').values())

Also I have to import **`pycounrty`** module to check countries

In [3]:
import pycountry
# List of countries
countrylist=[str(country.name) for country in pycountry.countries]

In [4]:
def month_number_to_string(string):
    m = {
        '01':'jan','02':'feb','03':'mar','04':'apr','05':'may','06':'jun',
         '07':'jul','08':'aug','09':'sep','10':'oct','11':'nov','12':'dec'
        }
    s = string.strip()[:3].lower()
    try:
        out = m[s]
        return out
    except:
        raise ValueError('Not a month')

## Data set: Airplane Crashes and Fatalities since 1908
I have retrieved the data from the link below<br>
https://opendata.socrata.com/Government/Airplane-Crashes-and-Fatalities-Since-1908/q2te-8cvq

In [5]:
crashes_raw=pd.read_csv('Airplane_Crashes_and_Fatalities_Since_1908.csv')
# Drop Description Attribute
crashes_raw=crashes_raw.drop('Summary',axis=1)

# Cleaning dataframe attibute date. I will select month and year for its further use
dates=[[month_number_to_string(item.split('/')[0]),item.split('/')[2]] for item in crashes_raw['Date']]
crashes_1=pd.concat((crashes_raw,pd.DataFrame(dates,columns=['Month','Year'])),axis=1)
colist=crashes_1.columns.tolist()[-2:]+crashes_1.columns.tolist()[2:-2]
crashes_1=crashes_1[colist]
crashes_1['Year']=crashes_1['Year'].astype('int64')

In [6]:
info=crashes_1.describe(include='all').T
info['Data Type']=crashes_1.dtypes
display(info.T)

Unnamed: 0,Month,Year,Location,Operator,Flight #,Route,Type,Registration,cn/In,Aboard,Fatalities,Ground
count,5268,5268,5248,5250,1069,3562,5241,4933,4040,5246,5256,5246
unique,12,,4303,2476,724,3244,2446,4905,3707,,,
top,dec,,"Moscow, Russia",Aeroflot,-,Training,Douglas DC-3,49,178,,,
freq,517,,15,179,67,81,334,3,6,,,
mean,,1971.3,,,,,,,,27.5545,20.0683,1.60884
std,,22.3875,,,,,,,,43.0767,33.2,53.9878
min,,1908,,,,,,,,0,0,0
25%,,1954,,,,,,,,5,3,0
50%,,1973,,,,,,,,13,9,0
75%,,1990,,,,,,,,30,23,0


Now I am going to normalize data from location and transport type ('Civilian' or 'Military')

In [7]:
# First by countries
# Had to use weighted comparisions because country and states name weren't correct
# Done with module difflib and function get_close_matches - Very handy
listcountries=[]
for item in crashes_1['Location']:
    if str(item).split(',')[-1].strip() not in USstates and str(item).split(',')[-1].strip() in countrylist:
        listcountries.append(str(item).split(',')[-1].strip())
    elif str(item).split(',')[-1].strip() in USstates:
        listcountries.append('United States')
    else:
        ap=[]
        for e in re.findall('\w+',str(item).replace(',','')):
            al=difflib.get_close_matches(str(e),countrylist,n=1)
            if al!=[]:
                ap.append(al[0])
            else:
                al1=difflib.get_close_matches(str(e).capitalize(),USstates,n=1)
                al2=difflib.get_close_matches(str(e).capitalize(),states,n=1,cutoff=0.5)
                if al1!=[] or al2!=[]:
                    ap.append('United States')                    

        if ap!=[]:
            listcountries.append(ap[0])
        else:
            listcountries.append(str(item))
    
airtype=[]
for item in crashes_1['Operator']:
    if 'Military' in str(item):
        airtype.append('Military')
    else:
        airtype.append('Civil')

crashes_2=crashes_1[['Month','Year','Aboard','Fatalities','Ground']].copy()
crashes_2['Country']=listcountries
crashes_2['Type']=airtype
crashes_2=crashes_2[['Year','Month','Country','Type','Aboard','Fatalities','Ground']]


In [8]:
# Clean Data Frame for crashes for locations out of each country, Water Crashes
crashes_L=crashes_2[crashes_2['Country'].isin(countrylist)].reset_index(drop=True)

# Data Frame for only crashes out of land
crashes_W=crashes_2[~crashes_2['Country'].isin(countrylist)].reset_index(drop=True)

print("")
print("Land crashes")
display(crashes_L.head(5))
print("")
print("Water crashes")
display(crashes_W.head(5))


Land crashes


Unnamed: 0,Year,Month,Country,Type,Aboard,Fatalities,Ground
0,1908,sep,United States,Military,2.0,1.0,0.0
1,1912,jul,United States,Military,5.0,5.0,0.0
2,1913,aug,Canada,Civil,1.0,1.0,0.0
3,1913,sep,Serbia,Military,20.0,14.0,0.0
4,1913,oct,Germany,Military,30.0,30.0,0.0



Water crashes


Unnamed: 0,Year,Month,Country,Type,Aboard,Fatalities,Ground
0,1918,apr,Over the Mediterranean,Military,23.0,23.0,0.0
1,1919,oct,English Channel,Civil,,,
2,1924,apr,Over the English Channel,Civil,3.0,3.0,0.0
3,1926,may,English Channel,Civil,1.0,1.0,0.0
4,1926,oct,English Channel,Civil,12.0,0.0,0.0


Pivoting the table for land crashes

In [9]:
# Pivoted_1 contains info of how many fatalities and people aboard were in all flights for eache year
# Also counts the number of incidents per year
pivoted_1=(crashes_L.pivot_table(index=['Year'],aggfunc=({'Country':'count','Fatalities':'sum','Aboard':'sum'}))).reset_index()
pivoted_1['Year']=pivoted_1['Year'].astype('int64')
pivoted_1.columns=['Year','People aboard','# Crashes','Fatalities']
pivoted_1=pivoted_1[['Year','# Crashes','People aboard','Fatalities']]
display(pivoted_1.dtypes)
display(pivoted_1.head())
display(pivoted_1.tail())

Year               int64
# Crashes          int64
People aboard    float64
Fatalities       float64
dtype: object

Unnamed: 0,Year,# Crashes,People aboard,Fatalities
0,1908,1,2.0,1.0
1,1912,1,5.0,5.0
2,1913,3,51.0,45.0
3,1915,2,60.0,40.0
4,1916,5,109.0,108.0


Unnamed: 0,Year,# Crashes,People aboard,Fatalities
93,2005,49,2151.0,1293.0
94,2006,49,1413.0,1136.0
95,2007,54,1364.0,931.0
96,2008,61,1445.0,802.0
97,2009,24,891.0,581.0


___

### CASE #1
Hypothesis: There has to exists a linear relation between `people aboard` a plane and the number of `fatalities`<br>
This is common sense driven, when planes crash while flying almost everybody die.<br>
Of course, the `# of crashes` has also a strong relationship with `fatalities` and `people aboard`<br>

Lets correlate this info<br>
First: **Pearson Correlation**<br>
* ```Let's check if our variables have any linear relation with each other```

In [10]:
pivoted_1.corr()

Unnamed: 0,Year,# Crashes,People aboard,Fatalities
Year,1.0,0.75825,0.778644,0.734575
# Crashes,0.75825,1.0,0.909307,0.919632
People aboard,0.778644,0.909307,1.0,0.971843
Fatalities,0.734575,0.919632,0.971843,1.0


Second: **Spearman Correlation Coeficients**<br>
* ```Let's check if our variables have any non-linear relation with each other```

In [11]:
pivoted_1.corr(method='spearman')

Unnamed: 0,Year,# Crashes,People aboard,Fatalities
Year,1.0,0.711885,0.778939,0.734006
# Crashes,0.711885,1.0,0.914795,0.921137
People aboard,0.778939,0.914795,1.0,0.966652
Fatalities,0.734006,0.921137,0.966652,1.0


Second: **Kendall Correlation Coeficients**<br>
* ```Let's check if our variables have any non-parametric relation with each other```

In [12]:
pivoted_1.corr(method='kendall')

Unnamed: 0,Year,# Crashes,People aboard,Fatalities
Year,1.0,0.536793,0.608796,0.551405
# Crashes,0.536793,1.0,0.756193,0.761772
People aboard,0.608796,0.756193,1.0,0.883907
Fatalities,0.551405,0.761772,0.883907,1.0


### Conclusions - 1

Kendall is the worst one for this dataset. So we can depict it.<br>
Spearman and Pearson shows good correlation coeficients for variables within the dataframe<br>
Our hipothesis is confirmed by the results driven by spearman and pearson coeficients.<br>
There are a strong relationship between all variables except for the year. This one does not reach a value over <br>
0.8 so It is not good enough for us.<br>

___

### CASE #2
Now lets check the same correlations between years, people aboard, ground fatalities and plane fatalities <br>
for the following data frame (Land crashes). These could be related with each other

In [13]:
crashes_L['Year']=crashes_L['Year'].astype('int64')
crashes_L=crashes_L.sort_values('Year',ascending=True).reset_index(drop=True)
display(crashes_L.head())
display(crashes_L.tail())

Unnamed: 0,Year,Month,Country,Type,Aboard,Fatalities,Ground
0,1908,sep,United States,Military,2.0,1.0,0.0
1,1912,jul,United States,Military,5.0,5.0,0.0
2,1913,aug,Canada,Civil,1.0,1.0,0.0
3,1913,sep,Serbia,Military,20.0,14.0,0.0
4,1913,oct,Germany,Military,30.0,30.0,0.0


Unnamed: 0,Year,Month,Country,Type,Aboard,Fatalities,Ground
5195,2009,jan,United States,Civil,155.0,0.0,0.0
5196,2009,jan,Niger,Civil,9.0,8.0,0.0
5197,2009,jun,Canada,Civil,1.0,1.0,0.0
5198,2009,mar,United States,Civil,14.0,14.0,0.0
5199,2009,jun,India,Military,13.0,13.0,0.0


For this one I am not going to group or pivot the table<br><br>
Lets see those correlation coeficients!

In [14]:
print('')
print('Pearson Coeficients')
display(crashes_L.corr())
print('------------------------------')
print('')
print('Spearman Coeficients:')
display(crashes_L.corr(method='spearman'))
print('------------------------------')
print('')
print('Kendall Coeficients:')
display(crashes_L.corr(method='kendall'))


Pearson Coeficients


Unnamed: 0,Year,Aboard,Fatalities,Ground
Year,1.0,0.152727,0.109285,0.029286
Aboard,0.152727,1.0,0.772225,0.023751
Fatalities,0.109285,0.772225,1.0,0.035177
Ground,0.029286,0.023751,0.035177,1.0


------------------------------

Spearman Coeficients:


Unnamed: 0,Year,Aboard,Fatalities,Ground
Year,1.0,0.060092,0.017315,0.042142
Aboard,0.060092,1.0,0.776131,0.058626
Fatalities,0.017315,0.776131,1.0,0.018721
Ground,0.042142,0.058626,0.018721,1.0


------------------------------

Kendall Coeficients:


Unnamed: 0,Year,Aboard,Fatalities,Ground
Year,1.0,0.049186,0.016933,0.03429
Aboard,0.049186,1.0,0.728307,0.048396
Fatalities,0.016933,0.728307,1.0,0.015587
Ground,0.03429,0.048396,0.015587,1.0


### Conclusions - 2
Results says that there is no more relationships between variables. Just `Fatalities` and People `Aboard`<br>
No Special relation between variables as far as we have already observed in the case before

___

Lets check **sea crashes** > crashes_W

In [15]:
# Pivoted_1 contains info of how many fatalities and people aboard were in all flights for eache year
# Also counts the number of incidents per year
pivoted_3=(crashes_L.pivot_table(index=['Year'],aggfunc=({'Country':'count','Fatalities':'sum','Aboard':'sum'}))).reset_index()
pivoted_3['Year']=pivoted_3['Year'].astype('int64')
pivoted_3.columns=['Year','People aboard','# Crashes','Fatalities']
pivoted_3=pivoted_3[['Year','# Crashes','People aboard','Fatalities']]
display(pivoted_3.head())
display(pivoted_3.tail())

Unnamed: 0,Year,# Crashes,People aboard,Fatalities
0,1908,1,2.0,1.0
1,1912,1,5.0,5.0
2,1913,3,51.0,45.0
3,1915,2,60.0,40.0
4,1916,5,109.0,108.0


Unnamed: 0,Year,# Crashes,People aboard,Fatalities
93,2005,49,2151.0,1293.0
94,2006,49,1413.0,1136.0
95,2007,54,1364.0,931.0
96,2008,61,1445.0,802.0
97,2009,24,891.0,581.0


### CASE #3
Hypothesis: It is supossed to have the same relations between variables than Case 1 (pivoted_1 for land crashes)<br>
Even there might be the possibility that the relation between people aboard and fatalities raises because is <br>
harder to land in the sea than in land

In [16]:
print('')
print('Pearson Coeficients')
display(pivoted_3.corr())
print('------------------------------')
print('')
print('Spearman Coeficients:')
display(pivoted_3.corr(method='spearman'))
print('------------------------------')
print('')
print('Kendall Coeficients:')
display(pivoted_3.corr(method='kendall'))


Pearson Coeficients


Unnamed: 0,Year,# Crashes,People aboard,Fatalities
Year,1.0,0.75825,0.778644,0.734575
# Crashes,0.75825,1.0,0.909307,0.919632
People aboard,0.778644,0.909307,1.0,0.971843
Fatalities,0.734575,0.919632,0.971843,1.0


------------------------------

Spearman Coeficients:


Unnamed: 0,Year,# Crashes,People aboard,Fatalities
Year,1.0,0.711885,0.778939,0.734006
# Crashes,0.711885,1.0,0.914795,0.921137
People aboard,0.778939,0.914795,1.0,0.966652
Fatalities,0.734006,0.921137,0.966652,1.0


------------------------------

Kendall Coeficients:


Unnamed: 0,Year,# Crashes,People aboard,Fatalities
Year,1.0,0.536793,0.608796,0.551405
# Crashes,0.536793,1.0,0.756193,0.761772
People aboard,0.608796,0.756193,1.0,0.883907
Fatalities,0.551405,0.761772,0.883907,1.0


Lets display the difference of coeficients for CASE #1 and CASE #2

In [17]:
print('')
print('Pearson Coeficients')
display(pivoted_3.corr()-pivoted_1.corr())
print('------------------------------')
print('')
print('Spearman Coeficients:')
display(pivoted_3.corr(method='spearman')-pivoted_1.corr(method='spearman'))
print('------------------------------')
print('')
print('Kendall Coeficients:')
display(pivoted_3.corr(method='kendall')-pivoted_1.corr(method='kendall'))


Pearson Coeficients


Unnamed: 0,Year,# Crashes,People aboard,Fatalities
Year,0.0,0.0,0.0,0.0
# Crashes,0.0,0.0,0.0,0.0
People aboard,0.0,0.0,0.0,0.0
Fatalities,0.0,0.0,0.0,0.0


------------------------------

Spearman Coeficients:


Unnamed: 0,Year,# Crashes,People aboard,Fatalities
Year,0.0,0.0,0.0,0.0
# Crashes,0.0,0.0,0.0,0.0
People aboard,0.0,0.0,0.0,0.0
Fatalities,0.0,0.0,0.0,0.0


------------------------------

Kendall Coeficients:


Unnamed: 0,Year,# Crashes,People aboard,Fatalities
Year,0.0,0.0,0.0,0.0
# Crashes,0.0,0.0,0.0,0.0
People aboard,0.0,0.0,0.0,0.0
Fatalities,0.0,0.0,0.0,0.0


### Conclusions - 3

Going back to conclusion 1, [Conclusions 1](#Conclusions-\--1) because CASE 3 is the same than CASE 1 > Differences <br> between both cases give us 0 so are the same

___