In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

### Web Scraping

In [10]:
# scrape the airline website

base_url = "https://www.airlinequality.com/airline-reviews/british-airways"

df = pd.DataFrame(columns=['text','aircraft','traveller_type','seat_type','route','date_flown','rec'])

pages_scraped = 10
reviews_per_page = 10

for page in range(1, pages_scraped+1):

    page_url = f"{base_url}/page/{page}/?sortby=post_date%3ADesc&pagesize={reviews_per_page}"

    response = requests.get(page_url)
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')

    for full_review in parsed_content.find_all("div", {"class": "tc_mobile"}):
        
        # first find the component with the corresponding class. the text we want is 
        # in the component that follows that one
        text = full_review.find("div", {"class": "text_content"}).get_text()
        aircraft = full_review.find("td", {"class": "aircraft"}).find_next().get_text() if full_review.find("td", {"class": "aircraft"}) else None
        traveller_type = full_review.find("td", {"class": "type_of_traveller"}).find_next().get_text() if full_review.find("td", {"class": "type_of_traveller"}) else None
        seat_type = full_review.find("td", {"class": "cabin_flown"}).find_next().get_text() if full_review.find("td", {"class": "cabin_flown"}) else None
        route = full_review.find("td", {"class": "route"}).find_next().get_text() if full_review.find("td", {"class": "route"}) else None
        date_flown = full_review.find("td", {"class": "date_flown"}).find_next().get_text() if full_review.find("td", {"class": "date_flown"}) else None
        rec = full_review.find("td", {"class": "recommended"}).find_next().get_text() if full_review.find("td", {"class": "recommended"}) else None

        df.loc[len(df)] = [text, aircraft, traveller_type, seat_type, route, date_flown, rec]

df.head()

Unnamed: 0,text,aircraft,traveller_type,seat_type,route,date_flown,rec
0,✅ Trip Verified | Boarding was difficult caus...,A320,Solo Leisure,Business Class,London Heathrow to Brussels,March 2024,yes
1,✅ Trip Verified | Boarding started with a del...,Boeing 777,Solo Leisure,Business Class,Barbados to London heathrow,March 2024,no
2,✅ Trip Verified | Absolutely horrible custome...,,Family Leisure,Economy Class,Toronto to Mumbai via London,February 2024,no
3,Not Verified | BA is not what it used to be! ...,,Family Leisure,Economy Class,Copenhagen to Port of Spain via London,February 2024,yes
4,"✅ Trip Verified | BA First, it's not even the...",Boeing 777-300ER,Solo Leisure,First Class,Los Angeles to London,March 2024,no


In [11]:
# split the text column into two columns, one with the text that only includes the review text (without
# the "verified" text), and another with "verified" as a boolean
  
split_df = pd.DataFrame()
split_df[["verified", "text"]] = df['text'].str.split('|', n=1, expand=True)
split_df["verified"] = np.where(split_df["verified"].str.contains("Not"), False, True)

df.drop(columns=['text'], inplace=True)
df = pd.concat([df, split_df], axis=1)
df

Unnamed: 0,aircraft,traveller_type,seat_type,route,date_flown,rec,verified,text
0,A320,Solo Leisure,Business Class,London Heathrow to Brussels,March 2024,yes,True,Boarding was difficult caused by vast majori...
1,Boeing 777,Solo Leisure,Business Class,Barbados to London heathrow,March 2024,no,True,Boarding started with a delay of some 20 min...
2,,Family Leisure,Economy Class,Toronto to Mumbai via London,February 2024,no,True,Absolutely horrible customer service - will ...
3,,Family Leisure,Economy Class,Copenhagen to Port of Spain via London,February 2024,yes,False,BA is not what it used to be! As much as I l...
4,Boeing 777-300ER,Solo Leisure,First Class,Los Angeles to London,March 2024,no,True,"BA First, it's not even the best business cl..."
...,...,...,...,...,...,...,...,...
95,,Couple Leisure,Economy Class,Gatwick to Venice,September 2023,no,True,Caught up in the Gatwick cancellation fiasc...
96,A321,Solo Leisure,Economy Class,Berlin to London,October 2023,no,True,BA has a real problem with boarding it's fl...
97,,Couple Leisure,Economy Class,Atlanta to Glasgow via London,September 2023,no,True,Our connecting flight from London to Glasgow ...
98,Boeing 787,Couple Leisure,Economy Class,Singapore to Sydney,October 2023,no,True,The worst airline I have ever flown with. A...


In [12]:
# split the route column into two columns, one with the origin and one with the destination
  
split_df = pd.DataFrame()
split_df[["origin", "dest"]] = df['route'].str.split('to', n=1, expand=True)

df.drop(columns=['route'], inplace=True)
df = pd.concat([df, split_df], axis=1)
df

Unnamed: 0,aircraft,traveller_type,seat_type,date_flown,rec,verified,text,origin,dest
0,A320,Solo Leisure,Business Class,March 2024,yes,True,Boarding was difficult caused by vast majori...,London Heathrow,Brussels
1,Boeing 777,Solo Leisure,Business Class,March 2024,no,True,Boarding started with a delay of some 20 min...,Barbados,London heathrow
2,,Family Leisure,Economy Class,February 2024,no,True,Absolutely horrible customer service - will ...,Toron,to Mumbai via London
3,,Family Leisure,Economy Class,February 2024,yes,False,BA is not what it used to be! As much as I l...,Copenhagen,Port of Spain via London
4,Boeing 777-300ER,Solo Leisure,First Class,March 2024,no,True,"BA First, it's not even the best business cl...",Los Angeles,London
...,...,...,...,...,...,...,...,...,...
95,,Couple Leisure,Economy Class,September 2023,no,True,Caught up in the Gatwick cancellation fiasc...,Gatwick,Venice
96,A321,Solo Leisure,Economy Class,October 2023,no,True,BA has a real problem with boarding it's fl...,Berlin,London
97,,Couple Leisure,Economy Class,September 2023,no,True,Our connecting flight from London to Glasgow ...,Atlanta,Glasgow via London
98,Boeing 787,Couple Leisure,Economy Class,October 2023,no,True,The worst airline I have ever flown with. A...,Singapore,Sydney


### EDA

In [13]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [14]:
m_name =  f"distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(m_name)
m = AutoModelForSequenceClassification.from_pretrained(m_name)
classifier = pipeline("sentiment-analysis", model=m, tokenizer=tokenizer)

In [15]:
# TODO: summarize all text for 1 star, 2 star, 3 star, etc. OR for rec/no rec
# maybe seperate by month
# just gotta do EDA i guess lolz

# first do sentiment analysis on all. group by sentiment. Then summarizee each group 

# df = pd.DataFrame(columns=['text','aircraft','traveller_type','seat_type','origin', 'dest','date_flown','rec'])


In [58]:
# iternate through all of the reviews and make a dataframe containing the sentiments of each review
sent_df = pd.DataFrame(columns=['sentiment_label', 'sentiment_score'])

for index, row in df.iterrows():
    text = row['text'][:512] # truncating the string to 512 chars. This is the limit for the model
    result = classifier(text) # classifier returns a list of length 1 containing a dict with two keys ('label' and 'score')
    sent_df.loc[len(sent_df)] = [result[0]['label'], result[0]['score']] 

sent_df.head()

Unnamed: 0,sentiment_label,sentiment_score
0,NEGATIVE,0.968747
1,NEGATIVE,0.995446
2,NEGATIVE,0.999461
3,NEGATIVE,0.996025
4,NEGATIVE,0.999621


In [59]:
# add this new df to our original df. now we have a df where we know the sentiment of sll the review! 
df = pd.concat([df, sent_df], axis=1)
df

Unnamed: 0,aircraft,traveller_type,seat_type,date_flown,rec,verified,text,origin,dest,sentiment_label,sentiment_score
0,A320,Solo Leisure,Business Class,March 2024,yes,True,Boarding was difficult caused by vast majori...,London Heathrow,Brussels,NEGATIVE,0.968747
1,Boeing 777,Solo Leisure,Business Class,March 2024,no,True,Boarding started with a delay of some 20 min...,Barbados,London heathrow,NEGATIVE,0.995446
2,,Family Leisure,Economy Class,February 2024,no,True,Absolutely horrible customer service - will ...,Toron,to Mumbai via London,NEGATIVE,0.999461
3,,Family Leisure,Economy Class,February 2024,yes,False,BA is not what it used to be! As much as I l...,Copenhagen,Port of Spain via London,NEGATIVE,0.996025
4,Boeing 777-300ER,Solo Leisure,First Class,March 2024,no,True,"BA First, it's not even the best business cl...",Los Angeles,London,NEGATIVE,0.999621
...,...,...,...,...,...,...,...,...,...,...,...
95,,Couple Leisure,Economy Class,September 2023,no,True,Caught up in the Gatwick cancellation fiasc...,Gatwick,Venice,NEGATIVE,0.999747
96,A321,Solo Leisure,Economy Class,October 2023,no,True,BA has a real problem with boarding it's fl...,Berlin,London,NEGATIVE,0.998161
97,,Couple Leisure,Economy Class,September 2023,no,True,Our connecting flight from London to Glasgow ...,Atlanta,Glasgow via London,NEGATIVE,0.999502
98,Boeing 787,Couple Leisure,Economy Class,October 2023,no,True,The worst airline I have ever flown with. A...,Singapore,Sydney,NEGATIVE,0.999785
