In [2]:
# Import the necessary packages and import the html file. Open the file with Beautiful Soup
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
# Page holding the airline-safety dataset.
# NOTE(review): this is the GitHub "blob" page (HTML), not the raw CSV; the
# table is later read out of that HTML through its <th>/<tr> tags.
airline_data_url="https://github.com/fivethirtyeight/data/blob/master/airline-safety/airline-safety.csv"

# Make a GET request to fetch the raw HTML content. Use a timeout so a stalled
# connection cannot hang the notebook, and raise on an HTTP error status so an
# error page is never parsed as if it were the data page.
airline_response = requests.get(airline_data_url, timeout=30)
airline_response.raise_for_status()
airline_content = airline_response.text

# Parse the html content
airline_soup = BeautifulSoup(airline_content, "lxml")

In [3]:
# Use the text of every <th> element in the parsed page as the column names.
airline_headers = airline_soup.find_all("th")
airline_col_header = [header_cell.text for header_cell in airline_headers]
airline_col_header

['airline',
 'avail_seat_km_per_week',
 'incidents_85_99',
 'fatal_accidents_85_99',
 'fatalities_85_99',
 'incidents_00_14',
 'fatal_accidents_00_14',
 'fatalities_00_14']

In [4]:
# One string per <tr>: the concatenated text of the whole table row.
air_data = airline_soup.find_all("tr")
airline_data = [table_row.text for table_row in air_data]

airline_df = pd.DataFrame(airline_data)
airline_df.columns = ['Data']

# Split each row string on newlines into separate columns, discard split
# positions 0, 1 and 10, and drop the row labelled 0; the columns that remain
# are then labelled with the scraped header names.
airline_df = (
    airline_df['Data']
    .str.split("\n", expand=True)
    .drop(columns=[0, 1, 10])
    .drop(index=[0])
)
airline_df.columns = airline_col_header
airline_df

Unnamed: 0,airline,avail_seat_km_per_week,incidents_85_99,fatal_accidents_85_99,fatalities_85_99,incidents_00_14,fatal_accidents_00_14,fatalities_00_14
1,Aer Lingus,320906734,2,0,0,0,0,0
2,Aeroflot*,1197672318,76,14,128,6,1,88
3,Aerolineas Argentinas,385803648,6,0,0,1,0,0
4,Aeromexico*,596871813,3,1,64,5,0,0
5,Air Canada,1865253802,2,0,0,2,0,0
6,Air France,3004002661,14,4,79,6,2,337
7,Air India*,869253552,2,1,329,4,1,158
8,Air New Zealand*,710174817,3,0,0,5,1,7
9,Alaska Airlines*,965346773,5,0,0,5,1,88
10,Alitalia,698012498,7,2,50,4,0,0


In [5]:
# Persist the airline table; the row index is written as the first column.
airline_df.to_csv(path_or_buf='airline_data.csv', index=True)

In [6]:
# Page from which the vehicle fatality-rate table is scraped.
veh_url="https://www-fars.nhtsa.dot.gov/Main/index.aspx"

# Make a GET request to fetch the raw HTML content. Use a timeout so a stalled
# connection cannot hang the notebook, and raise on an HTTP error status so an
# error page is never parsed as if it were the data page.
veh_response = requests.get(veh_url, timeout=30)
veh_response.raise_for_status()
veh_content = veh_response.text

# Parse the html content
veh_soup = BeautifulSoup(veh_content, "lxml")

In [7]:
# The first <tr> with class "hdr01_home" is treated as the header row.
veh_headers = veh_soup.find("tr", attrs={"class": "hdr01_home"})

# Take the stripped text of each child of that row, then discard the first
# entry (presumably the row-label cell rather than a year - TODO confirm).
vehicle_header = [child.text.strip() for child in veh_headers][1:]
vehicle_header

['2018',
 '2017',
 '2016',
 '2015',
 '2014',
 '2013',
 '2012',
 '2011',
 '2010',
 '2009',
 '2008',
 '2007',
 '2006',
 '2005',
 '2004',
 '2003',
 '2002',
 '2001',
 '2000',
 '1999',
 '1998',
 '1997',
 '1996',
 '1995',
 '1994']

In [8]:
# Select the <td> cells whose "headers" attribute matches the hdr21 pattern
# and keep their text.
fatality_data = veh_soup.find_all("td", attrs={"headers": re.compile("hdr21 hdrCol.*")})
fatality_rate = [cell.text for cell in fatality_data]

# Single-column frame of rates, paired positionally with the scraped headers.
df_fr = pd.DataFrame(fatality_rate)
df_fr.columns = ['Fatalities per 100 Million Vehicle Miles Traveled']
df_fr = df_fr.assign(Year=vehicle_header)
df_fr

Unnamed: 0,Fatalities per 100 Million Vehicle Miles Traveled,Year
0,1.13,2018
1,1.17,2017
2,1.19,2016
3,1.15,2015
4,1.08,2014
5,1.1,2013
6,1.14,2012
7,1.1,2011
8,1.11,2010
9,1.15,2009


In [9]:
# Persist the fatality-rate table; the row index is written as the first column.
df_fr.to_csv(path_or_buf='car_fatality_data.csv', index=True)

In [10]:
# Page with the system-total revenue passenger miles table.
url_total_rev="http://web.mit.edu/airlinedata/www/2019%2012%20Month%20Documents/Traffic%20and%20Capacity/System%20Total/Total%20System%20Revenue%20Passenger%20Miles.htm"

# Fetch the page. Use a timeout so a stalled connection cannot hang the
# notebook, and raise on an HTTP error status so an error page is never parsed
# as if it were the data page.
total_rev_response = requests.get(url_total_rev, timeout=30)
total_rev_response.raise_for_status()
total_rev_html_content = total_rev_response.text

# Parse the html content
soup_total_rev = BeautifulSoup(total_rev_html_content, "lxml")

In [11]:
# Text of every <td> with class "xl728262"; used later as the Year column.
total_rev_headers = soup_total_rev.find_all("td", attrs={"class": "xl728262"})
total_rev_header = [cell.text for cell in total_rev_headers]

In [12]:
# Text of every <td> with class "xl778262" on the system-total page.
total_rev = soup_total_rev.find_all("td", attrs={"class": "xl778262"})
total_revenue = [cell.text for cell in total_rev]

In [13]:
df_tr = pd.DataFrame(total_revenue)
df_tr.columns = ['Total Revenue Passenger Miles']

# Keep the matched cells at positions 114..138, split each on newlines and
# discard the first fragment; the one remaining column gets its label back.
df_tr = (
    df_tr.iloc[114:139]['Total Revenue Passenger Miles']
    .str.split("\n", expand=True)
    .drop(columns=[0])
)
df_tr.columns = ['Total Revenue Passenger Miles']

# Renumber the rows from 0 and discard the old positional labels.
df_tr = df_tr.reset_index(drop=True)
df_tr

Unnamed: 0,Total Revenue Passenger Miles
0,557234
1,598203
2,622234
3,635795
4,670779
5,708762
6,725027
7,655250
8,673997
9,751593


In [14]:
# Page with the domestic revenue passenger miles table.
url_dom_rev="http://web.mit.edu/airlinedata/www/2019%2012%20Month%20Documents/Traffic%20and%20Capacity/Domestic/Domestic%20Revenue%20Passenger%20Miles.htm"

# Fetch the page. Use a timeout so a stalled connection cannot hang the
# notebook, and raise on an HTTP error status so an error page is never parsed
# as if it were the data page.
dom_rev_response = requests.get(url_dom_rev, timeout=30)
dom_rev_response.raise_for_status()
dom_rev_html_content = dom_rev_response.text

# Parse the html content
soup_dom_rev = BeautifulSoup(dom_rev_html_content, "lxml")

In [15]:
# Text of every <td> with class "xl774871" on the domestic page.
total_dom_rev = soup_dom_rev.find_all("td", attrs={"class": "xl774871"})
dom_revenue = [cell.text for cell in total_dom_rev]

df_dr = pd.DataFrame(dom_revenue)
df_dr.columns = ['Total Domestic Revenue Passenger Miles']

# Keep the matched cells from position 125 onward, split each on newlines and
# discard the first fragment; the one remaining column gets its label back.
df_dr = (
    df_dr.iloc[125:]['Total Domestic Revenue Passenger Miles']
    .str.split("\n", expand=True)
    .drop(columns=[0])
)
df_dr.columns = ['Total Domestic Revenue Passenger Miles']

# Attach as a plain list so the values are placed positionally rather than
# aligned on df_dr's index, which still starts at 125.
df_tr['Total Domestic Revenue Passenger Miles'] = df_dr['Total Domestic Revenue Passenger Miles'].to_list()
df_tr

Unnamed: 0,Total Revenue Passenger Miles,Total Domestic Revenue Passenger Miles
0,557234,402776
1,598203,436647
2,622234,453023
3,635795,463618
4,670779,489655
5,708762,515703
6,725027,503471
7,655250,483253
8,673997,505420
9,751593,557936


In [16]:
# Page with the international revenue passenger miles table.
url_intl_rev="http://web.mit.edu/airlinedata/www/2019%2012%20Month%20Documents/Traffic%20and%20Capacity/International/International%20Revenue%20Passenger%20Miles.htm"

# Fetch the page. Use a timeout so a stalled connection cannot hang the
# notebook, and raise on an HTTP error status so an error page is never parsed
# as if it were the data page.
intl_rev_response = requests.get(url_intl_rev, timeout=30)
intl_rev_response.raise_for_status()
intl_rev_html_content = intl_rev_response.text

# Parse the html content
soup_intl_rev = BeautifulSoup(intl_rev_html_content, "lxml")

In [17]:
# Text of every <td> with class "xl775725" on the international page.
total_intl_rev = soup_intl_rev.find_all("td", attrs={"class": "xl775725"})
intl_revenue = [cell.text for cell in total_intl_rev]

df_ir = pd.DataFrame(intl_revenue)
df_ir.columns = ['Total International Revenue Passenger Miles']

# Keep the matched cells from position 125 onward, split each on newlines and
# discard the first fragment; the one remaining column gets its label back.
df_ir = (
    df_ir.iloc[125:]['Total International Revenue Passenger Miles']
    .str.split("\n", expand=True)
    .drop(columns=[0])
)
df_ir.columns = ['Total International Revenue Passenger Miles']

# Attach as a plain list so the values are placed positionally rather than
# aligned on df_ir's index, which still starts at 125.
df_tr['Total International Revenue Passenger Miles'] = df_ir['Total International Revenue Passenger Miles'].to_list()

# The header cells scraped from the system-total page supply the Year column.
df_tr['Year'] = total_rev_header
df_tr

Unnamed: 0,Total Revenue Passenger Miles,Total Domestic Revenue Passenger Miles,Total International Revenue Passenger Miles,Year
0,557234,402776,151032,1995
1,598203,436647,156796,1996
2,622234,453023,164848,1997
3,635795,463618,166962,1998
4,670779,489655,175316,1999
5,708762,515703,187404,2000
6,725027,503471,173617,2001
7,655250,483253,167341,2002
8,673997,505420,161066,2003
9,751593,557936,185009,2004


In [18]:
# Reshape the wide df_tr table into long form: one (Revenue, Year, Category)
# row per year and category, stacked in the order Total, Domestic, International.
#
# Changes from the earlier version of this cell:
#   * pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
#   * The category label is broadcast as a scalar, so the cell no longer assumes
#     df_tr has exactly 25 rows.
#   * The three copy-pasted stanzas are driven from a single mapping.
category_sources = [
    ('Total', 'Total Revenue Passenger Miles'),
    ('Domestic', 'Total Domestic Revenue Passenger Miles'),
    ('International', 'Total International Revenue Passenger Miles'),
]

# Each piece keeps df_tr's original row labels, as the appended frames did.
df_tr_area = pd.concat(
    [
        df_tr[[source_column, 'Year']]
        .rename(columns={source_column: 'Revenue'})
        .assign(Category=category_label)
        for category_label, source_column in category_sources
    ]
)

# Display only: the sorted view is not assigned back to df_tr_area.
df_tr_area.sort_values(by=["Year","Category"])

Unnamed: 0,Revenue,Year,Category
0,402776,1995,Domestic
0,151032,1995,International
0,557234,1995,Total
1,436647,1996,Domestic
1,156796,1996,International
...,...,...,...
23,283189,2018,International
23,1016918,2018,Total
24,762889,2019,Domestic
24,295009,2019,International


In [19]:
# Persist both the wide and the long revenue tables; the row index is written
# as the first column of each file.
df_tr.to_csv(path_or_buf='total_rev_data.csv', index=True)
df_tr_area.to_csv(path_or_buf='total_rev_data_area.csv', index=True)

In [20]:
# Excel workbook hosted on bts.gov.
us_url = "https://www.bts.gov/sites/bts.dot.gov/files/table_02_09_121819.xlsx"

# Skip the first spreadsheet row so that the row after it supplies the header.
us_df = pd.read_excel(io=us_url, skiprows=1)
us_df.head(n=10)

Unnamed: 0.1,Unnamed: 0,1960,1965,1970,1975,1980,1985,1990,1991,1992,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Total fatalities,499,261,146.0,124.0,1.0,526.0,39.0,50.0,33.0,...,52.0,2.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,1.0
1,Total seriously injured persons,N,N,107.0,81.0,19.0,30.0,29.0,26.0,22.0,...,26.0,17.0,21.0,18.0,9.0,14.0,24.0,18.0,19.0,25.0
2,Total accidents,90,83,55.0,37.0,19.0,21.0,24.0,26.0,18.0,...,30.0,30.0,33.0,26.0,23.0,31.0,29.0,30.0,32.0,30.0
3,Fatal accidents,17,9,8.0,3.0,1.0,7.0,6.0,4.0,4.0,...,2.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0
4,Aircraft-miles (millions),1130,1536,2685.0,2478.0,2924.0,3631.0,4947.832,4824.824,5039.435,...,7465.598,7598.128,7713.557,7660.072,7672.511158,7690.949524,7822.203105,8016.961206,8154.971756,8473.725686
5,Rates per 100 million aircraft-miles,,,,,,,,,,...,,,,,,,,,,
6,Fatalities,44.1593,16.9922,5.43762,5.00404,0.0342,14.486367,0.788224,1.036307,0.654835,...,0.696528,0.026322,0.0,0.0,0.117302,0.0,0.0,0.0,0.0,0.011801
7,Seriously injured persons,N,N,3.9851,3.26877,0.649795,0.826219,0.586115,0.53888,0.436557,...,0.348264,0.223739,0.272248,0.234985,0.117302,0.182032,0.306819,0.224524,0.232987,0.29503
8,Total accidents,7.9646,5.40365,2.04842,1.49314,0.649795,0.578353,0.485061,0.53888,0.357183,...,0.401843,0.394834,0.427818,0.339422,0.299771,0.403071,0.37074,0.374207,0.392399,0.354036
9,"Total accidents, fatal",1.50442,0.585938,0.297952,0.121065,0.0342,0.192784,0.121265,0.082905,0.079381,...,0.02679,0.013161,0.0,0.0,0.026067,0.0,0.0,0.0,0.0,0.011801


In [21]:
# Every column label except the first one (the label column) is a year.
us_years = us_df.columns.tolist()[1:]

# Pull three rows by position, dropping the leading label cell of each:
#   row 8 -> "Total accidents" rate per 100 million aircraft-miles
#   row 9 -> "Total accidents, fatal" rate per 100 million aircraft-miles
#   row 4 -> "Aircraft-miles (millions)"
us_acc_rate, us_fatal_rate, us_miles_flown = (
    us_df.iloc[row_position].tolist()[1:] for row_position in (8, 9, 4)
)

In [23]:
# Assemble the per-year table from the extracted rows, then scale Miles_Flown
# by 1,000,000 (the source row is labelled in millions of aircraft-miles).
us_data = (
    pd.DataFrame(us_df.columns.tolist()[1:], columns=["Years"])
    .assign(
        Accident_Rate=us_acc_rate,
        Fatality_Rate=us_fatal_rate,
        Miles_Flown=us_miles_flown,
    )
    .assign(Miles_Flown=lambda frame: frame['Miles_Flown'] * 1000000)
)
us_data

Unnamed: 0,Years,Accident_Rate,Fatality_Rate,Miles_Flown
0,1960,7.964602,1.504425,1130000000.0
1,1965,5.403646,0.585938,1536000000.0
2,1970,2.048417,0.297952,2685000000.0
3,1975,1.49314,0.121065,2478000000.0
4,1980,0.649795,0.0342,2924000000.0
5,1985,0.578353,0.192784,3631000000.0
6,1990,0.485061,0.121265,4947832000.0
7,1991,0.53888,0.082905,4824824000.0
8,1992,0.357183,0.079381,5039435000.0
9,1993,0.43814,0.01905,5249469000.0


In [24]:
# Persist the US accident table; the row index is written as the first column.
us_data.to_csv(path_or_buf='us_accident_data.csv', index=True)

In [37]:
# Page with the annual passenger totals table.
flights_url = 'https://www.bts.dot.gov/annual-passengers-all-us-scheduled-airline-flights-domestic-international-and-foreign-airline'

# Fetch the page. Use a timeout so a stalled connection cannot hang the
# notebook, and raise on an HTTP error status so an error page is never parsed
# as if it were the data page.
flights_response = requests.get(flights_url, timeout=30)
flights_response.raise_for_status()
flights_html_content = flights_response.text

# Parse the html content
soup_flights = BeautifulSoup(flights_html_content, "lxml")

In [112]:
# Read the first <table> on the page into a four-column frame.
flights_table = soup_flights.find_all("table")[0]

# Collect every row's cell texts first and build the frame once.
# The earlier loop stored each append result under a different name
# (flight_df), so flight_results itself stayed empty and the four-name column
# assignment could not succeed on a fresh kernel. It also relied on
# DataFrame.append, which no longer exists in pandas 2.x.
flight_records = []
for table_row in flights_table.find_all('tr'):
    cell_texts = [cell.text for cell in table_row.find_all('td')]
    # Rows with no <td> at all (e.g. a header row built from <th>) hold no data.
    if cell_texts:
        flight_records.append(cell_texts)

flight_results = pd.DataFrame(
    flight_records,
    columns=['Year', 'Total', 'Domestic', 'International'],
)
flight_results

Unnamed: 0,Year,Total,Domestic,International
0,2003,700.9,583.3,117.6
1,2004,763.7,629.8,133.9
2,2005,800.8,657.3,143.6
3,2006,808.1,658.4,149.7
4,2007,835.5,679.2,156.3
5,2008,809.8,651.7,158.1
6,2009,767.8,618.1,149.7
7,2010,787.5,629.5,157.9
8,2011,802.1,638.2,163.9
9,2012,813.1,642.3,170.8


In [113]:
# Persist the passenger totals; the row index is written as the first column.
flight_results.to_csv(path_or_buf='flight_totals.csv', index=True)