In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [85]:
# Scrape British Airways reviews from airlinequality.com (newest first, per the
# `sortby=post_date:Desc` query parameter) into `df`, one row per review.

base_url = "https://www.airlinequality.com/airline-reviews/british-airways"

review_columns = ['text','aircraft','traveller_type','seat_type','route','date_flown','rec']

pages_scraped = 10
reviews_per_page = 10
request_timeout = 30  # seconds; without a timeout requests.get can block forever

# Output column -> CSS class of the <td> that labels that field in a review.
# The value we want is in the element that follows the label cell.
label_classes = {
    'aircraft': 'aircraft',
    'traveller_type': 'type_of_traveller',
    'seat_type': 'cabin_flown',
    'route': 'route',
    'date_flown': 'date_flown',
    'rec': 'recommended',
}


def text_after_label(review, css_class):
    """Return the text of the element following the `<td class=css_class>` label.

    Returns None when the review has no such label cell.
    """
    label = review.find("td", {"class": css_class})
    return label.find_next().get_text() if label is not None else None


# Collect plain dicts and build the frame once at the end instead of appending
# to the DataFrame row by row.
records = []

for page in range(1, pages_scraped+1):

    page_url = f"{base_url}/page/{page}/?sortby=post_date%3ADesc&pagesize={reviews_per_page}"

    response = requests.get(page_url, timeout=request_timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page as "no reviews".
    response.raise_for_status()
    parsed_content = BeautifulSoup(response.content, 'html.parser')

    for full_review in parsed_content.find_all("div", {"class": "tc_mobile"}):

        # A review without a body is recorded with text=None, like any other
        # missing field, rather than aborting the whole scrape.
        body = full_review.find("div", {"class": "text_content"})
        record = {'text': body.get_text() if body is not None else None}

        for column, css_class in label_classes.items():
            record[column] = text_after_label(full_review, css_class)

        records.append(record)

# Passing `columns` fixes the column order and keeps all columns even when no
# review was found.
df = pd.DataFrame(records, columns=review_columns)

df.head()

Unnamed: 0,text,aircraft,traveller_type,seat_type,route,date_flown,rec
0,✅ Trip Verified | Boarding was difficult caus...,A320,Solo Leisure,Business Class,London Heathrow to Brussels,March 2024,yes
1,✅ Trip Verified | Boarding started with a del...,Boeing 777,Solo Leisure,Business Class,Barbados to London heathrow,March 2024,no
2,✅ Trip Verified | Absolutely horrible custome...,,Family Leisure,Economy Class,Toronto to Mumbai via London,February 2024,no
3,Not Verified | BA is not what it used to be! ...,,Family Leisure,Economy Class,Copenhagen to Port of Spain via London,February 2024,yes
4,"✅ Trip Verified | BA First, it's not even the...",Boeing 777-300ER,Solo Leisure,First Class,Los Angeles to London,March 2024,no


In [86]:
# Split the scraped "text" column into two columns: "verified" (boolean) and
# "text" (the review body without the leading verification marker).
#
# Scraped text looks like "<marker> | <body>", where the marker is e.g.
# "✅ Trip Verified" or "Not Verified".

# Guard: after the first run "text" holds only the body, so splitting again
# would misread the body as a marker. Skipping makes the cell safe to re-run.
if "verified" not in df.columns:
    # partition() always yields three columns (before, separator, after), even
    # when a row has no "|", so the column count never depends on the data.
    parts = df["text"].str.partition("|")
    has_marker = parts[1].eq("|")

    # Verified only if a marker is present and it does not say "Not ...".
    # Rows without any marker are treated as not verified.
    verified = has_marker & ~parts[0].str.contains("Not", na=False)

    # Rows without a separator keep their whole text as the body instead of
    # losing it; strip the whitespace left around the separator.
    body = parts[2].where(has_marker, parts[0]).str.strip()

    # Build a new frame rather than dropping in place; column order stays
    # (..., verified, text).
    df = df.drop(columns=["text"]).assign(verified=verified, text=body)

df

Unnamed: 0,aircraft,traveller_type,seat_type,route,date_flown,rec,verified,text
0,A320,Solo Leisure,Business Class,London Heathrow to Brussels,March 2024,yes,True,Boarding was difficult caused by vast majori...
1,Boeing 777,Solo Leisure,Business Class,Barbados to London heathrow,March 2024,no,True,Boarding started with a delay of some 20 min...
2,,Family Leisure,Economy Class,Toronto to Mumbai via London,February 2024,no,True,Absolutely horrible customer service - will ...
3,,Family Leisure,Economy Class,Copenhagen to Port of Spain via London,February 2024,yes,False,BA is not what it used to be! As much as I l...
4,Boeing 777-300ER,Solo Leisure,First Class,Los Angeles to London,March 2024,no,True,"BA First, it's not even the best business cl..."
...,...,...,...,...,...,...,...,...
95,,Couple Leisure,Economy Class,Gatwick to Venice,September 2023,no,True,Caught up in the Gatwick cancellation fiasc...
96,A321,Solo Leisure,Economy Class,Berlin to London,October 2023,no,True,BA has a real problem with boarding it's fl...
97,,Couple Leisure,Economy Class,Atlanta to Glasgow via London,September 2023,no,True,Our connecting flight from London to Glasgow ...
98,Boeing 787,Couple Leisure,Economy Class,Singapore to Sydney,October 2023,no,True,The worst airline I have ever flown with. A...
