In [222]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
from urllib.parse import urljoin  # Fix for NameError


In [223]:
# First page of the British Airways review listing; the scraping loop further
# down starts from this URL.
url = 'https://www.airlinequality.com/airline-reviews/british-airways'
# requests applies no timeout by default, so an unresponsive server would
# block the kernel forever; give up after 30 seconds instead.
response = requests.get(url, timeout=30)
if response.status_code==200:
    print("Connection Succesfull!")
else:
    print ("Failed to Connect!")


Connection Succesfull!


In [224]:
# 1-based number of the listing page the scraper will fetch next.
page_count = 1

# One accumulator per output column. They live outside the scraping loop so
# that rows collected on every page end up in the same lists. Each name is
# bound to its own fresh list (the generator yields a new [] per target).

# Free-text and categorical fields
review_list, jet_type_list, traveller_type_list, Route, Date = (
    [] for _ in range(5)
)

# Star ratings and the yes/no recommendation
(
    Seat_Comfort,
    Cabin_Staff_Service,
    Food,
    Inflight_Entertainment,
    Ground_Service,
    Wifi,
    Value_For_Money,
    Recommended,
) = ([] for _ in range(8))

In [225]:
# Helper function to get star ratings
def get_star_rating(parent_table, field_class):
    """Count the filled stars shown for one rating row of a ratings table.

    Args:
        parent_table: Element searched for a ``td`` whose class attribute is
            ``"review-rating-header <field_class>"``.
        field_class: Class suffix identifying the rating row,
            e.g. ``"seat_comfort"``.

    Returns:
        The number of ``span`` elements with class ``"star fill"`` inside the
        first ``td`` with class ``"review-rating-stars stars"`` found after
        the header cell, or the string ``"N/A"`` when either the header cell
        or the stars cell is missing.
    """
    header_cell = parent_table.find("td", class_="review-rating-header " + field_class)
    if not header_cell:
        return "N/A"

    stars_cell = header_cell.find_next("td", class_="review-rating-stars stars")
    if not stars_cell:
        return "N/A"

    # Every filled star is rendered as its own span.
    return len(stars_cell.find_all("span", class_="star fill"))

# Loop through pages
MAX_PAGES = 391  # Safety cap on how many listing pages a single run may fetch
REQUEST_TIMEOUT = 30  # Seconds to wait on a request before giving up


def clean_review_text(raw_text):
    """Strip the leading verification badge and collapse whitespace.

    The badge is the short label in front of the first "|" (for example
    "✅ Trip Verified |"). Only that prefix is removed, so words such as
    "Trip" or "Verified" that occur inside the review itself are kept.

    Args:
        raw_text: Text of one review container, badge included.

    Returns:
        The review text with the badge removed and every run of whitespace
        replaced by a single space.
    """
    badge, separator, body = raw_text.partition("|")
    # Treat the text before the first "|" as a badge only when it is a short
    # label mentioning "verified"; anything else is genuine review text.
    # NOTE(review): wordings other than "Trip Verified" (e.g. "Not Verified")
    # are assumed to follow the same "<label> | <review>" shape -- confirm.
    if separator and len(badge.split()) <= 4 and "verified" in badge.lower():
        raw_text = body
    return " ".join(raw_text.split())


def get_review_value(parent_table, field_class):
    """Return the text value shown for one row of a ratings table.

    Args:
        parent_table: Element searched for a ``td`` whose class attribute is
            ``"review-rating-header <field_class>"``.
        field_class: Class suffix identifying the row, e.g. ``"route"``.

    Returns:
        The stripped text of the first ``td`` with class ``"review-value"``
        found after the header cell, or ``"N/A"`` when the header cell or the
        value cell is missing.
    """
    header_td = parent_table.find("td", class_=f"review-rating-header {field_class}")
    if not header_td:
        return "N/A"
    value_td = header_td.find_next("td", class_="review-value")
    return value_td.get_text(strip=True) if value_td else "N/A"


# `url`, `page_count` and the accumulator lists are defined in earlier cells
# and are updated here, so re-running only this cell continues from wherever
# the previous run stopped. Re-run those earlier cells first for a clean scrape.
while url and page_count <= MAX_PAGES:
    print(f"Scraping Page {page_count}: {url}")

    # Fetch the current page; stop cleanly on network errors instead of
    # hanging (no timeout) or dumping a traceback mid-scrape.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to fetch page {page_count}: {exc}")
        break
    if response.status_code != 200:
        print(f"Failed to fetch page {page_count}")
        break

    # Parse the current page
    soup = BeautifulSoup(response.text, "lxml")

    # Review bodies are the divs whose class contains "text_content"
    reviews = soup.find_all("div", class_=lambda x: x and "text_content" in x)

    for review in reviews:
        cleaned_review = clean_review_text(review.get_text(separator=" ", strip=True))

        # The ratings table is the first one that follows the review's
        # enclosing div; guard against a review div without a parent div.
        container = review.find_parent("div")
        parent_table = container.find_next("table", class_="review-ratings") if container else None

        # Skip entries with no text or no ratings table so that all column
        # lists keep growing in lockstep.
        if not (cleaned_review and parent_table):
            continue

        # Append one value per column for this review
        review_list.append(cleaned_review)
        jet_type_list.append(get_review_value(parent_table, "aircraft"))
        traveller_type_list.append(get_review_value(parent_table, "type_of_traveller"))
        Route.append(get_review_value(parent_table, "route"))
        Date.append(get_review_value(parent_table, "date_flown"))
        Seat_Comfort.append(get_star_rating(parent_table, "seat_comfort"))
        Cabin_Staff_Service.append(get_star_rating(parent_table, "cabin_staff_service"))
        Food.append(get_star_rating(parent_table, "food_and_beverages"))
        Inflight_Entertainment.append(get_star_rating(parent_table, "inflight_entertainment"))
        Ground_Service.append(get_star_rating(parent_table, "ground_service"))
        Wifi.append(get_star_rating(parent_table, "wifi_and_connectivity"))
        Value_For_Money.append(get_star_rating(parent_table, "value_for_money"))
        Recommended.append(get_review_value(parent_table, "recommended"))

    # Find the "Next" button (`string=` replaces the deprecated `text=` keyword)
    next_page = soup.find("a", string=">>")
    if next_page and "href" in next_page.attrs:
        url = urljoin("https://www.airlinequality.com", next_page["href"])  # Combine base URL with relative href
        page_count += 1
    else:
        print("No 'Next' button found or pagination ended.")
        break

# Debugging list lengths: every column should report the same count
for label, column in (
    ("Reviews:", review_list),
    ("Jet Type:", jet_type_list),
    ("Traveller Type:", traveller_type_list),
    ("Route:", Route),
    ("Date:", Date),
    ("Seat Comfort:", Seat_Comfort),
    ("Cabin Staff Service:", Cabin_Staff_Service),
    ("Food:", Food),
    ("Inflight Entertainment:", Inflight_Entertainment),
    ("Ground Service:", Ground_Service),
    ("WiFi:", Wifi),
    ("Value for Money:", Value_For_Money),
    ("Recommended:", Recommended),
):
    print(label, len(column))

Scraping Page 1: https://www.airlinequality.com/airline-reviews/british-airways


  next_page = soup.find("a", text=">>")


Scraping Page 2: https://www.airlinequality.com/airline-reviews/british-airways/page/2/
Scraping Page 3: https://www.airlinequality.com/airline-reviews/british-airways/page/3/
Scraping Page 4: https://www.airlinequality.com/airline-reviews/british-airways/page/4/
Scraping Page 5: https://www.airlinequality.com/airline-reviews/british-airways/page/5/
Scraping Page 6: https://www.airlinequality.com/airline-reviews/british-airways/page/6/
Scraping Page 7: https://www.airlinequality.com/airline-reviews/british-airways/page/7/
Scraping Page 8: https://www.airlinequality.com/airline-reviews/british-airways/page/8/
Scraping Page 9: https://www.airlinequality.com/airline-reviews/british-airways/page/9/
Scraping Page 10: https://www.airlinequality.com/airline-reviews/british-airways/page/10/
Scraping Page 11: https://www.airlinequality.com/airline-reviews/british-airways/page/11/
Scraping Page 12: https://www.airlinequality.com/airline-reviews/british-airways/page/12/
Scraping Page 13: https://

In [228]:
import pandas as pd

# Row count for the table: one entry per collected review. The rating lists
# further down in this cell are padded up to this length.
max_length = len(review_list)

# Pad lists with 'N/A' if any are shorter
def pad_list(lst, target_length):
    """Extend ``lst`` in place with "N/A" until it holds ``target_length`` items.

    A list that is already at least ``target_length`` long is left unchanged.

    Args:
        lst: List to pad; it is modified in place.
        target_length: Minimum number of items the list should contain.

    Returns:
        The same list object that was passed in.
    """
    shortfall = target_length - len(lst)
    # Multiplying by a non-positive count yields an empty list, so nothing is
    # added when the list is already long enough.
    lst.extend(["N/A"] * shortfall)
    return lst

# Pad every rating column to max_length and rebind each name to the result,
# all in a single unpacking assignment.
(
    Seat_Comfort,
    Cabin_Staff_Service,
    Food,
    Inflight_Entertainment,
    Ground_Service,
    Wifi,
    Value_For_Money,
    Recommended,
) = (
    pad_list(column, max_length)
    for column in (
        Seat_Comfort,
        Cabin_Staff_Service,
        Food,
        Inflight_Entertainment,
        Ground_Service,
        Wifi,
        Value_For_Money,
        Recommended,
    )
)

# Create DataFrame: review text plus the rating columns, in display order
column_names = [
    "Reviews",
    "Seat Comfort",
    "Cabin Staff Service",
    "Food & Beverages",
    "Inflight Entertainment",
    "Ground Service",
    "WiFi & Connectivity",
    "Value for Money",
    "Recommended",
]
column_values = [
    review_list,
    Seat_Comfort,
    Cabin_Staff_Service,
    Food,
    Inflight_Entertainment,
    Ground_Service,
    Wifi,
    Value_For_Money,
    Recommended,
]
df = pd.DataFrame(dict(zip(column_names, column_values)))

# Display the DataFrame
df



Unnamed: 0,Reviews,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,WiFi & Connectivity,Value for Money,Recommended
0,This was the first time I flew British Airways...,4,3,1,5,1,4,1,no
1,Pretty good flight but still some small things...,5,5,5,5,4,,5,yes
2,"Check in was fine, but no priority/fast track ...",5,5,3,3,4,4,4,yes
3,British Airways is absolute rubbish. I had to ...,1,1,,,1,,1,no
4,The flight time was changed at the last minute...,2,2,1,1,1,,1,no
...,...,...,...,...,...,...,...,...,...
3896,Flew LHR - VIE return operated by bmi but BA a...,5,5,5,0,,,4,yes
3897,LHR to HAM. Purser addresses all club passenge...,4,5,4,0,,,3,yes
3898,My son who had worked for British Airways urge...,,,,,,,4,yes
3899,London City-New York JFK via Shannon on A318 b...,1,3,5,0,,,1,no


In [230]:
# Create DataFrame: same table as before, extended with the flight details
# (jet type, traveller type, route, date) placed right after the review text
column_names = [
    "Reviews",
    "Jet Type",
    "Traveller Type",
    "Route",
    "Date",
    "Seat Comfort",
    "Cabin Staff Service",
    "Food & Beverages",
    "Inflight Entertainment",
    "Ground Service",
    "WiFi & Connectivity",
    "Value for Money",
    "Recommended",
]
column_values = [
    review_list,
    jet_type_list,
    traveller_type_list,
    Route,
    Date,
    Seat_Comfort,
    Cabin_Staff_Service,
    Food,
    Inflight_Entertainment,
    Ground_Service,
    Wifi,
    Value_For_Money,
    Recommended,
]
df = pd.DataFrame(dict(zip(column_names, column_values)))

# Display the DataFrame
df



Unnamed: 0,Reviews,Jet Type,Traveller Type,Route,Date,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,WiFi & Connectivity,Value for Money,Recommended
0,This was the first time I flew British Airways...,Boeing 777 / A350,Business,Washington to London,December 2024,4,3,1,5,1,4,1,no
1,Pretty good flight but still some small things...,Boeing 777,Solo Leisure,Cape Town to London,December 2024,5,5,5,5,4,,5,yes
2,"Check in was fine, but no priority/fast track ...",Boeing 787,Solo Leisure,Mexico City to London Heathrow,November 2024,5,5,3,3,4,4,4,yes
3,British Airways is absolute rubbish. I had to ...,,Solo Leisure,London to Amsterdam,December 2024,1,1,,,1,,1,no
4,The flight time was changed at the last minute...,A320,Solo Leisure,Marrakesh to London,August 2024,2,2,1,1,1,,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3896,Flew LHR - VIE return operated by bmi but BA a...,,,,,5,5,5,0,,,4,yes
3897,LHR to HAM. Purser addresses all club passenge...,,,,,4,5,4,0,,,3,yes
3898,My son who had worked for British Airways urge...,,,,,,,,,,,4,yes
3899,London City-New York JFK via Shannon on A318 b...,,,,,1,3,5,0,,,1,no


In [231]:
# Save to CSV (written without the positional index column)
csv_path = "british_airways_reviews_extended.csv"
df.to_csv(csv_path, index=False)
print(f"DataFrame saved to '{csv_path}'")


DataFrame saved to 'british_airways_reviews_extended.csv'
