In [79]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Download the Wikipedia page that holds the aircraft fuel-consumption tables.
planeUrl = 'https://en.wikipedia.org/wiki/Fuel_economy_in_aircraft'
# NOTE(review): tableClass is not referenced by the cells shown here; it is kept
# in case a later cell relies on it.
tableClass = 'wikitable sortable jquery-tablesorter'
# A timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(planeUrl, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the article.
response.raise_for_status()

In [80]:
# parse the downloaded page, then collect every <table> tagged with the 'wikitable' class
pageHtml = response.text
planeSoup = BeautifulSoup(pageHtml, 'html.parser')
planeTables = planeSoup.find_all('table', class_='wikitable')

In [81]:
# Convert the extracted tables to a list of dataframes (one per <table>).
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
planedfList = pd.read_html(StringIO(str(planeTables)))

In [82]:
# key each raw dataframe by its class label; labels are listed in the same
# order as the tables appear in planedfList
planeClasses = ['Commuter', 'Regional', 'Short-Haul', 'Medium-Haul', 'Long-Haul']
allPlanes = {planeClass: planedfList[position]
             for position, planeClass in enumerate(planeClasses)}

In [83]:
# give the two tables below a constant 'Sector' value, then align the column
# names of the listed tables on 'Fuel burn' / 'Fuel per seat'
fixedSectors = {
    'Commuter': '300 nmi (560 km)',
    'Short-Haul': '1,000 nmi (1,900 km)',
}
for planeClass, sectorText in fixedSectors.items():
    allPlanes[planeClass]['Sector'] = sectorText

columnFixes = {
    'Short-Haul': {'Fuel Burn': 'Fuel burn', 'Fuel efficiency per seat': 'Fuel per seat'},
    'Regional': {'Fuel efficiency per seat': 'Fuel per seat'},
}
for planeClass, renames in columnFixes.items():
    # renamed in place so the frames held in allPlanes themselves are updated
    allPlanes[planeClass].rename(columns=renames, inplace=True)

In [84]:
# Combine the per-class tables into one frame (planeList) and save it as planes.csv.
# Every table is cleaned on a copy (drop/assign return new frames), so the raw
# frames in allPlanes are left untouched and this cell can be re-run safely.

# Column layout is seeded from the Commuter table, as before. That list still
# contains 'Fuel burn' and 'Fuel per seat', so both columns remain in planeList
# (always empty) and planes.csv keeps its existing header.
leadingColumns = list(allPlanes['Commuter'].columns)

cleanedTables = []
for planeClass, rawTable in allPlanes.items():
    # first number directly followed by whitespace + "L" (a cell starting "5.95 L" -> 5.95)
    litresPer100km = rawTable['Fuel per seat'].str.extract(r'([0-9.]+)\sL', expand=False)
    # first number directly followed by whitespace + "km". Commas are part of the
    # match and stripped afterwards, so '1,000 nmi (1,900 km)' gives 1900
    # (a comma-less pattern would stop at the separator and give 900).
    sectorKm = (rawTable['Sector']
                .str.extract(r'([0-9,.]+)\skm', expand=False)
                .str.replace(',', '', regex=False))
    cleanedTables.append(
        rawTable
        .drop(columns=['Fuel burn', 'Fuel per seat'])
        .assign(**{'Fuel Efficiency (L/100km)': litresPer100km.astype('float'),
                   'Sector': sectorKm.astype('int'),
                   'Class': str(planeClass)})
    )

# Concatenate once instead of growing the frame inside the loop.
planeList = pd.concat(cleanedTables)
# Seeded columns first, then any other column in order of first appearance.
extraColumns = [col for col in planeList.columns if col not in leadingColumns]
planeList = planeList.reindex(columns=leadingColumns + extraColumns)
planeList.to_csv('planes.csv')

In [85]:
# Download the MY2022 fuel consumption ratings CSV, keep the columns of
# interest under descriptive names, and save the result as cars.csv.
carUrl = 'https://www.nrcan.gc.ca/sites/nrcan/files/oee/files/csv/MY2022%20Fuel%20Consumption%20Ratings.csv'

# columns to keep, in output order (names as they come out of read_csv)
carSourceColumns = [
    'Model', 'Make', 'Model.1', 'Vehicle Class', 'Engine Size', 'Cylinders',
    'Transmission', 'Fuel', 'Fuel Consumption', 'Unnamed: 9', 'Unnamed: 10',
    'Unnamed: 11', 'CO2 Emissions', 'CO2',
]
# raw header -> descriptive header
carColumnNames = {
    'Model': 'Model Year',
    'Model.1': 'Model',
    'Engine Size': 'Engine Size (L)',
    'Fuel': 'Fuel Type',
    'Fuel Consumption': 'Fuel Consumption: City (L/100km)',
    'Unnamed: 9': 'Fuel Consumption: Highway (L/100km)',
    'Unnamed: 10': 'Fuel Consumption: Combined (L/100km)',
    'Unnamed: 11': 'Fuel Consumption: Combined (mpg)',
    'CO2 Emissions': 'CO2 Emissions (g/km)',
    'CO2': 'CO2 Ratings',
}

dfAllCars = (
    pd.read_csv(carUrl, encoding_errors='ignore', low_memory=False)
    [carSourceColumns]
    .rename(columns=carColumnNames)
    # keep only rows whose 'Model Year' is an all-digit string; comparing with
    # True also turns missing values into False so they are filtered out
    .loc[lambda cars: cars['Model Year'].str.isnumeric() == True]
)
dfAllCars.to_csv('cars.csv')

In [86]:
# preview the first rows of the combined aircraft table
planeList.head()

Unnamed: 0,Model,First flight,Seats,Fuel burn,Fuel per seat,Sector,Fuel Efficiency (L/100km),Class
0,Antonov An-148 (241 nmi),2004,89,,,560,5.95,Commuter
1,Antonov An-158 (241 nmi),2010,99,,,560,5.47,Commuter
2,ATR 42-500,1995,48,,,560,3.15,Commuter
3,ATR 72-500,1997,70,,,560,2.53,Commuter
4,Beechcraft 1900D (226 nm),1982,19,,,560,6.57,Commuter
