In [68]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# retrieve html contents of the wikipedia page that contains aircraft fuel consumption data
planeUrl = 'https://en.wikipedia.org/wiki/Fuel_economy_in_aircraft'
# NOTE(review): tableClass is not referenced by any visible cell; the table lookup
# matches on the 'wikitable' class alone - confirm whether this is still needed
tableClass = 'wikitable sortable jquery-tablesorter'
# a timeout keeps a stalled connection from hanging the kernel indefinitely
response = requests.get(planeUrl, timeout=30)
# stop here on an HTTP error status instead of parsing an error page further down
response.raise_for_status()

In [69]:
# parse the downloaded page, then collect every <table> element carrying the 'wikitable' class
pageHtml = response.text
planeSoup = BeautifulSoup(pageHtml, 'html.parser')
planeTables = planeSoup.find_all('table', class_='wikitable')

In [70]:
# convert tables to a list of dataframes; the markup is wrapped in StringIO because
# handing literal HTML text straight to read_html is deprecated in recent pandas releases
planedfList = pd.read_html(StringIO(str(planeTables)))

In [71]:
# display the dataframe at index 1 to inspect how read_html parsed that table
planedfList[1]

Unnamed: 0,Model,First flight,Seats,Sector,Fuel burn,Fuel efficiency per seat
0,Airbus A319neo,2015,144,"600 nmi (1,100 km)",3.37 kg/km (11.94 lb/mi),2.92 L/100 km (80.6 mpg‑US)[66]
1,Airbus A319neo,2015,124,"660 nmi (1,220 km)",2.82 kg/km (10 lb/mi),2.82 L/100 km (83.5 mpg‑US)[67]
2,Airbus A320neo,2015,154,"660 nmi (1,220 km)",2.79 kg/km (9.9 lb/mi),2.25 L/100 km (104.7 mpg‑US)[67]
3,Airbus A321neo,2015,192,"660 nmi (1,220 km)",3.30 kg/km (11.7 lb/mi),2.19 L/100 km (107.4 mpg‑US)[67]
4,Antonov An-148,2004,89,"684 nmi (1,267 km)",2.89 kg/km (10.3 lb/mi),4.06 L/100 km (57.9 mpg‑US)[54]
5,Antonov An-158,2010,99,"684 nmi (1,267 km)",3 kg/km (11 lb/mi),3.79 L/100 km (62.1 mpg‑US)[54]
6,Boeing 737-300,1984,126,507 nmi (939 km),3.49 kg/km (12.4 lb/mi),3.46 L/100 km (68 mpg‑US)[68]
7,Boeing 737-600,1998,110,500 nmi (930 km),3.16 kg/km (11.2 lb/mi),3.59 L/100 km (65.5 mpg‑US)[69]
8,Boeing 737-700,1997,126,500 nmi (930 km),3.21 kg/km (11.4 lb/mi),3.19 L/100 km (74 mpg‑US)[69]
9,Boeing 737 MAX 7,2017,128,"660 nmi (1,220 km)",2.85 kg/km (10.1 lb/mi),2.77 L/100 km (84.8 mpg‑US)[67]


In [72]:
# pair each flight-class label with an independent copy of its raw dataframe;
# the label at position i is matched to the i-th parsed table
planeClasses = ['Commuter', 'Regional', 'Short-Haul', 'Medium-Haul', 'Long-Haul']
allPlanes = {label: planedfList[position].copy()
             for position, label in enumerate(planeClasses)}

In [74]:
# assign one fixed sector string to every row of these two tables
fixedSectors = {'Commuter': '300 nmi (560 km)',
                'Short-Haul': '1,000 nmi (1,900 km)'}
for planeClass, sectorText in fixedSectors.items():
    allPlanes[planeClass]['Sector'] = sectorText

# bring the column headers of these tables in line with the names used downstream
headerFixes = {'Short-Haul': {'Fuel Burn': 'Fuel burn', 'Fuel efficiency per seat': 'Fuel per seat'},
               'Regional': {'Fuel efficiency per seat': 'Fuel per seat'}}
for planeClass, renames in headerFixes.items():
    allPlanes[planeClass] = allPlanes[planeClass].rename(columns=renames)

In [75]:
def _cleanPlaneTable(rawTable, className):
    """Return a cleaned copy of one flight-class table.

    Parameters
    ----------
    rawTable : pandas.DataFrame
        Table with string columns 'Fuel per seat' and 'Sector' and a
        'Fuel burn' column.
    className : str
        Label written to the new 'Class' column of every row.

    Returns
    -------
    pandas.DataFrame
        New frame without 'Fuel burn' / 'Fuel per seat', with 'Sector'
        replaced by the integer in front of ' km' and a float column
        'Fuel Efficiency (L/100km)' holding the number in front of ' L'.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If a 'Sector' entry has no '<number> km' part (the integer cast
        cannot represent the resulting missing value).
    """
    # raw strings keep \s a regex token rather than an invalid string escape;
    # capturing only the digits removes the need for a follow-up replace, and
    # expand=False yields a Series rather than a one-column DataFrame
    litresPerSeat = rawTable['Fuel per seat'].str.extract(r'([0-9.]+)\sL', expand=False)
    sectorKm = rawTable['Sector'].str.extract(r'([0-9.,]+)\skm', expand=False)

    cleaned = rawTable.drop(columns=['Fuel burn', 'Fuel per seat'])
    cleaned['Fuel Efficiency (L/100km)'] = litresPerSeat.astype('float')
    # strip thousands separators before the single integer cast
    cleaned['Sector'] = sectorKm.str.replace(',', '', regex=False).astype('int')
    cleaned['Class'] = str(className)
    return cleaned


# Clean every class table and stack them with a single concat. Concatenating
# onto an empty template frame inside the loop is quadratic and relies on
# deprecated handling of empty entries.
# The cleaned frame is stored back into allPlanes, so this cell still needs the
# dictionary cell re-run before it can be executed a second time.
cleanedTables = []
for j in allPlanes:
    allPlanes[j] = _cleanPlaneTable(allPlanes[j], j)
    cleanedTables.append(allPlanes[j])
planeList = pd.concat(cleanedTables)
planeList.to_csv('planes.csv', index=False)

In [None]:
# MANUALLY OVERRIDE COMMUTER & SHORT-HAUL ITEMS THAT INCLUDE DISTANCE IN PLANE NAME

In [77]:
carUrl = 'https://www.nrcan.gc.ca/sites/nrcan/files/oee/files/csv/MY2022%20Fuel%20Consumption%20Ratings.csv'

# source columns to keep, in the order they appear in the exported file
carColumns = ['Model', 'Make', 'Model.1', 'Vehicle Class', 'Engine Size', 'Cylinders',
              'Transmission', 'Fuel', 'Fuel Consumption', 'Unnamed: 9', 'Unnamed: 10',
              'Unnamed: 11', 'CO2 Emissions', 'CO2']

# descriptive replacements for the ambiguous or unnamed source headers
carColumnNames = {
    'Model': 'Model Year',
    'Model.1': 'Model',
    'Engine Size': 'Engine Size (L)',
    'Fuel': 'Fuel Type',
    'Fuel Consumption': 'Fuel Consumption: City (L/100km)',
    'Unnamed: 9': 'Fuel Consumption: Highway (L/100km)',
    'Unnamed: 10': 'Fuel Consumption: Combined (L/100km)',
    'Unnamed: 11': 'Fuel Consumption: Combined (mpg)',
    'CO2 Emissions': 'CO2 Emissions (g/km)',
    'CO2': 'CO2 Ratings',
}

# download, select and rename in one chain
dfAllCars = (
    pd.read_csv(carUrl, encoding_errors='ignore', low_memory=False)[carColumns]
    .rename(columns=carColumnNames)
)

# keep only rows whose model year is all digits; comparing with True also turns
# missing values into False (presumably sub-header / footnote rows - TODO confirm)
hasNumericYear = dfAllCars['Model Year'].str.isnumeric() == True
dfAllCars = dfAllCars[hasNumericYear]
dfAllCars.to_csv('cars.csv', index=False)

In [78]:
# show the first rows of the combined plane table as a quick sanity check
planeList.head()

Unnamed: 0,Model,First flight,Seats,Sector,Fuel Efficiency (L/100km),Class
0,Antonov An-148 (241 nmi),2004,89,560,5.95,Commuter
1,Antonov An-158 (241 nmi),2010,99,560,5.47,Commuter
2,ATR 42-500,1995,48,560,3.15,Commuter
3,ATR 72-500,1997,70,560,2.53,Commuter
4,Beechcraft 1900D (226 nm),1982,19,560,6.57,Commuter
