Resources:

- http://aerotransport.org/php/go.php?action=help_findSubfleet
- https://www.planespotters.net/airline/Endeavor-Air
- https://www.airfleets.net/flottecie/American%20Airlines.htm

In [721]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import time

In [722]:
# Load ../data/ORD_OTP.csv, using its "Unnamed: 0" column as the row index
# (presumably the index written out by an earlier DataFrame.to_csv -- TODO confirm).
ORD_OTP = pd.read_csv('../data/ORD_OTP.csv', index_col = "Unnamed: 0")

  mask |= (ar1 == a)


In [723]:
# Count how many rows each OP_UNIQUE_CARRIER value has (most frequent first).
ORD_OTP['OP_UNIQUE_CARRIER'].value_counts()

UA    483352
AA    413998
OO    361638
MQ    358550
EV    248383
DL     58473
NK     50266
YX     32262
F9     19261
US     18900
B6     17044
AS     15951
YV      9548
VX      8161
9E      7450
OH      2870
Name: OP_UNIQUE_CARRIER, dtype: int64

In [724]:
base_url = 'https://en.wikipedia.org'

# Explicit carrier-code -> Wikipedia page mapping.
# Pairing codes with pages by key (instead of zipping a sorted code list
# against a hand-ordered URL list) means a new or missing carrier in the data
# can no longer silently shift every later airline onto the wrong page.
wiki_page_by_code = {
    '9E': '/wiki/Endeavor_Air',
    'AA': '/wiki/American_Airlines_fleet',
    'AS': '/wiki/Alaska_Airlines', # need wrangle
    'B6': '/wiki/JetBlue',
    'DL': '/wiki/Delta_Air_Lines_fleet',
    'EV': '/wiki/ExpressJet',
    'F9': '/wiki/Frontier_Airlines',
    'MQ': '/wiki/Envoy_Air',
    'NK': '/wiki/Spirit_Airlines',
    'OH': '/wiki/PSA_Airlines',
    'OO': '/wiki/SkyWest_Airlines',
    'UA': '/wiki/United_Airlines_fleet',
    'US': '/wiki/US_Airways_fleet',
    'VX': '/wiki/Virgin_America',
    'YV': '/wiki/Mesa_Airlines',
    'YX': '/wiki/Republic_Airways',
}

# Carrier codes actually present in the data, in sorted order.
my_codes = ORD_OTP['OP_UNIQUE_CARRIER'].unique().tolist()
my_codes.sort()

# Surface carriers we have no page for rather than mis-mapping them.
unmapped_codes = [code for code in my_codes if code not in wiki_page_by_code]
if unmapped_codes:
    print("No Wikipedia page configured for carrier codes: {}".format(unmapped_codes))

# code -> relative URL, restricted to carriers present in the data (sorted by code).
my_airlines = {code: wiki_page_by_code[code]
               for code in my_codes if code in wiki_page_by_code}

# Kept for backward compatibility: the URLs in the same order as my_airlines.
my_urls = list(my_airlines.values())

In [725]:
def caption_hunter(tag):
    """Filter for ``soup.find``: match the caption of a current-fleet table.

    Parameters
    ----------
    tag : object
        Candidate element; expected to expose ``name`` and ``text`` attributes
        (as BeautifulSoup tags do). Anything else is simply rejected.

    Returns
    -------
    bool
        True when ``tag`` is a ``<caption>`` whose text mentions "fleet" and
        does not mention "retired"; False otherwise.

    Notes
    -----
    Both keyword checks are case-insensitive, so a caption such as
    "Retired fleet" is rejected instead of being mistaken for the current
    fleet table.
    """
    if getattr(tag, "name", None) != "caption":
        return False

    caption_text = getattr(tag, "text", None)
    if not isinstance(caption_text, str):
        return False

    lowered = caption_text.lower()
    return "fleet" in lowered and "retired" not in lowered

In [726]:
def table_hunter(tag):
    """Filter for ``soup.find``: match a fleet table that has no usable caption.

    Parameters
    ----------
    tag : object
        Candidate element; expected to behave like a BeautifulSoup tag
        (``name``, ``text`` and ``get("class")``). Anything else is rejected.

    Returns
    -------
    bool
        True when ``tag`` is a ``<table>`` carrying the ``wikitable`` or
        ``toccolours`` class whose text contains both "Aircraft" and
        "Passengers"; False otherwise (including tables with no ``class``
        attribute at all).
    """
    if getattr(tag, "name", None) != "table":
        return False

    try:
        # .get() returns None for a missing attribute instead of raising,
        # so no blanket exception handler is needed for class-less tables.
        css_classes = tag.get("class") or []
        table_text = tag.text
        has_fleet_class = "wikitable" in css_classes or "toccolours" in css_classes
        return has_fleet_class and "Aircraft" in table_text and "Passengers" in table_text
    except (AttributeError, TypeError):
        # Not tag-like (no .get/.text) or attributes of an unexpected type.
        return False

In [727]:
def _column_values(frame, column):
    """Return everything stored under ``column`` of ``frame`` as a flat list."""
    return frame[column].values.flatten().tolist()


def _fleet_table(soup):
    """Locate and parse the fleet table of one airline page.

    Tries the table owning a caption accepted by ``caption_hunter`` first and
    falls back to the first table accepted by ``table_hunter``. The top header
    level is dropped when the table has multi-level headers, and the last row
    is removed. Raises if no table can be parsed.
    """
    try:
        captioned_table = soup.find(caption_hunter).find_parent("table")
        fleet = pd.read_html(str(captioned_table))[0]
    except Exception:
        # No matching caption (or unparsable table): use the class/text heuristic.
        fleet = pd.read_html(str(soup.find(table_hunter)))[0]

    if isinstance(fleet.columns, pd.MultiIndex) and fleet.columns.nlevels > 1:
        fleet.columns = fleet.columns.droplevel(0)

    return fleet.iloc[0:-1, :]


def _in_service_counts(fleet):
    """Return the aircraft-count column, trying the known header spellings in order."""
    for header in ('In service', 'Active', 'Fleet Size', 'In Service'):
        if header in fleet.columns:
            return _column_values(fleet, header)
    if 'Total' in fleet.columns:
        # Relies on 'Total' appearing more than once; the first one is taken as the count.
        return fleet['Total'].iloc[:, 0].values.flatten().tolist()
    raise KeyError("no in-service count column found in {}".format(list(fleet.columns)))


def _passenger_counts(fleet):
    """Return the passenger-count column ('Total', else 'Passengers')."""
    try:
        if fleet.columns.tolist().count('Total') > 1:
            # Two 'Total' columns: the second one is taken as the passenger total.
            return fleet['Total'].iloc[:, 1].values.flatten().tolist()
        return _column_values(fleet, 'Total')
    except Exception:
        return _column_values(fleet, 'Passengers')


airline = []
aircraft = []
in_service = []
pass_count = []

for code, url in my_airlines.items():
    # Bounded wait so a stalled connection cannot hang the notebook forever.
    response = requests.get(base_url + url, timeout=30)
    airline_soup = BeautifulSoup(response.text, 'html.parser')
    try:
        airline_fleet = _fleet_table(airline_soup)
        row_count = airline_fleet.shape[0]

        # Stage every column first; nothing is appended to the shared lists
        # until all of them were extracted and line up row-for-row.
        fleet_aircraft = _column_values(airline_fleet, 'Aircraft')
        fleet_in_service = _in_service_counts(airline_fleet)
        fleet_pass_count = _passenger_counts(airline_fleet)

        staged_lengths = {len(fleet_aircraft), len(fleet_in_service), len(fleet_pass_count)}
        if staged_lengths != {row_count}:
            raise ValueError(
                "extracted columns do not line up with the {} table rows".format(row_count))
    except Exception as err:
        # Best effort: skip this airline, but say so instead of failing silently.
        print("Skipping {} ({}): {}: {}".format(code, url, type(err).__name__, err))
    else:
        airline.extend([code] * row_count)
        aircraft.extend(fleet_aircraft)
        in_service.extend(fleet_in_service)
        pass_count.extend(fleet_pass_count)
    time.sleep(0.05)

seats = pd.DataFrame({"airline": airline,
                      "aircraft": aircraft,
                      "in_service": in_service,
                      "pass_count": pass_count})

In [730]:
# Remove any remaining title rows
# (na=False: a missing aircraft name is not a title row; such rows are
# dropped by the final dropna instead of breaking the boolean mask)
flags_for_removal = ["Category", "Up to", "fleet"]
is_title_row = seats["aircraft"].str.contains("|".join(flags_for_removal), na=False)
seats = seats[~is_title_row]

# Remove Cargo and SkyWest fleet from Alaska Airlines
AS_planes_for_removal = ["Boeing 737-700F", "Bombardier Q400", "Embraer 175"]
is_excluded_AS_plane = (
    seats["aircraft"].str.contains("|".join(AS_planes_for_removal), na=False)
    & (seats["airline"] == "AS")
)
seats = seats[~is_excluded_AS_plane]

# Remove brackets
# (assign returns a new frame, so the cleaned values are guaranteed to land in
# `seats` rather than in a temporary copy of the filtered slice)
seats = seats.assign(
    in_service=seats['in_service'].replace(regex=True, to_replace=r'\[.*\]', value=r''),
    pass_count=seats['pass_count'].replace(regex=True, to_replace=r'\[.*\]', value=r''),
)

# Remove airplanes not in service
seats = seats[seats['in_service'] != '—']

# Drop nans
seats = seats.dropna()

In [731]:
# Display the seat table for a visual check.
seats

Unnamed: 0,airline,aircraft,in_service,pass_count
0,9E,Bombardier CRJ-200,42,50
1,9E,Bombardier CRJ-700,14,69
2,9E,Bombardier CRJ-900,3,70
3,9E,Bombardier CRJ-900,116,76
4,AA,Airbus A319-100,133,128
...,...,...,...,...
179,YX,Embraer E170,22,69
180,YX,Embraer E170,37,70
182,YX,Embraer E175,85,76
183,YX,Embraer E175,37,76


In [732]:
# Write the seat table to ../data/seat_counts_wiki.csv; with the default
# settings the DataFrame index is written as the first (unnamed) column.
seats.to_csv("../data/seat_counts_wiki.csv")