In [1]:
def scrape(url, timeout=None):
    """Download ``url`` and return its HTML parsed as a BeautifulSoup tree.

    The request is sent with a desktop Firefox ``User-Agent`` header.
    Connection failures and timeouts are retried every 5 seconds until the
    server answers; any other error is raised to the caller.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or None, optional
        Seconds to wait for the server before an attempt counts as failed
        and is retried. The default ``None`` waits indefinitely.

    Returns
    -------
    bs4.BeautifulSoup or None
        The parsed document, or ``None`` when the server answers with a
        status code other than 200.
    """
    import requests
    from bs4 import BeautifulSoup
    from time import sleep

    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })

    while True:
        try:
            # ``headers`` must be passed by keyword: the second positional
            # argument of ``requests.get`` is ``params`` (the query string),
            # so a positional dict would never be sent as HTTP headers.
            r = requests.get(url, headers=headers, timeout=timeout)
            break
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # Only network failures are retried. A bare ``except`` would also
            # swallow KeyboardInterrupt and make this loop impossible to stop.
            print("Connection refused by the server.. Sleep for 5 seconds")
            sleep(5)

    if r.status_code != 200:
        return None
    return BeautifulSoup(r.content, 'html.parser')

In [2]:
def scrape_airline_df(results_page):
    """Convert a parsed airlinequality.com review page into a review dataframe.

    Parameters
    ----------
    results_page : bs4.BeautifulSoup or None
        Parsed review listing. ``None`` (a page that could not be
        downloaded) yields an empty dataframe instead of an error.

    Returns
    -------
    pandas.DataFrame
        One row per ``<article itemprop="review">`` element, with the columns
        rating, id, text_header, review_count, name, country, date_published,
        text, traveller_type, seat_type, route, date_flown, seat_comfort,
        cabin_service, ground_service, value_for_money and recommended.
        Fields missing from a review are NaN, except ``review_count`` which
        defaults to 0.

    Raises
    ------
    ValueError
        If a publication date is not ``YYYY-MM-DD`` or a flight date is not
        ``<month name> <year>``.
    """
    import numpy as np
    import pandas as pd
    from datetime import datetime

    # Every detailed rating row is a <td> whose class starts with this prefix.
    rating_header = "review-rating-header "

    def safe_get(tag):
        """Return the tag's text, or NaN when the tag is missing."""
        return tag.get_text() if tag is not None else np.nan

    def next_sibling(tag):
        """Return the element following ``tag``; None when ``tag`` is missing."""
        return tag.find_next_sibling() if tag is not None else None

    def safe_get_next(tag):
        """Return the text of the element following ``tag``, or NaN."""
        return safe_get(next_sibling(tag))

    def star_count(tag):
        """Count the filled stars in the element following ``tag``, or NaN."""
        sibling = next_sibling(tag)
        if sibling is None:
            return np.nan
        return len(sibling.find_all("span", class_="star fill"))

    def parse_date(value, fmt):
        """Parse ``value`` with ``fmt``; anything that is not a string is NaN."""
        return datetime.strptime(value, fmt) if isinstance(value, str) else np.nan

    if results_page is None:
        return pd.DataFrame()

    reviews = results_page.find_all("article", itemprop="review", itemscope="")
    review_list = list()

    for review in reviews:
        def rating_cell(field, review=review):
            """Find the header cell of one detailed rating row of this review."""
            return review.find("td", class_=rating_header + field)

        rating = review.find("div", class_="rating-10")
        id_ = review.find("div", class_="body")
        review_count = review.find("span", class_="userStatusReviewCount")
        date_published = review.find("time")

        # The country is taken from the second-to-last text node of the <h3>
        # header, with two characters trimmed from each end
        # (presumably the " (" and ") " around the country name -- TODO confirm).
        sub_header = review.find("h3")
        header_strings = sub_header.find_all(string=True) if sub_header is not None else []
        country = header_strings[-2][2:-2] if len(header_strings) >= 2 else np.nan

        recommended = safe_get_next(rating_cell("recommended"))

        review_dict = {
            "rating": safe_get(rating),
            "id": id_.get("id") if id_ is not None else np.nan,
            "text_header": safe_get(review.find("h2", class_="text_header")),
            # The last 8 characters are dropped before the integer conversion
            # (presumably a " reviews" suffix -- verify against the page markup).
            "review_count": int(review_count.get_text()[:-8]) if review_count is not None else 0,
            "name": safe_get(review.find("span", itemprop="name")),
            "country": country,
            "date_published": parse_date(date_published.get("datetime"), "%Y-%m-%d")
                              if date_published is not None else np.nan,
            "text": safe_get(review.find("div", class_="text_content")),
            "traveller_type": safe_get_next(rating_cell("type_of_traveller")),
            "seat_type": safe_get_next(rating_cell("cabin_flown")),
            "route": safe_get_next(rating_cell("route")),
            "date_flown": parse_date(safe_get_next(rating_cell("date_flown")), "%B %Y"),
            "seat_comfort": star_count(rating_cell("seat_comfort")),
            "cabin_service": star_count(rating_cell("cabin_staff_service")),
            "ground_service": star_count(rating_cell("ground_service")),
            "value_for_money": star_count(rating_cell("value_for_money")),
            # 1 for "yes", 0 for any other answer, NaN when the row is absent.
            "recommended": (1 if recommended == "yes" else 0)
                           if isinstance(recommended, str) else np.nan,
        }
        review_list.append(review_dict)

    return pd.DataFrame(review_list)

In [3]:
# Get all airline_name and airline_code from airlinequality.com.
import pandas as pd

url_all = "https://www.airlinequality.com/review-pages/a-z-airline-reviews/"
# The airline code is whatever follows this prefix in each link's href.
AIRLINE_HREF_PREFIX = "/airline-reviews/"

AZweb = scrape(url_all)
if AZweb is None:
    # scrape() returns None for any non-200 answer; fail with a clear message
    # instead of an AttributeError on the next line.
    raise RuntimeError("Could not download the airline index page: " + url_all)

tabs_content = AZweb.find("div", class_="tabs-content")
if tabs_content is None:
    raise RuntimeError("No 'tabs-content' section found on " + url_all)

airlines = tabs_content.select("a")
airline_list = list()

for airline in airlines:
    href = airline.get("href")
    # Skip anchors that are not airline review links; slicing them blindly
    # would produce a garbage airline_code.
    if not href or not href.startswith(AIRLINE_HREF_PREFIX):
        continue
    airline_name = airline.get_text()
    airline_code = href[len(AIRLINE_HREF_PREFIX):]
    airline_list.append((airline_name, airline_code))

airline_df = pd.DataFrame(airline_list, columns=["airline_name", "airline_code"])


In [4]:
# Get all raw reviews data from airlinequality.com.
from IPython.display import clear_output

airline_raw_list = list()
review_frames = list()    # one dataframe per airline, concatenated once after the loop
failed_airlines = list()  # airline codes whose review page could not be downloaded
total_airlines = len(airline_df)
counter = 0
for (airline_name, airline_code) in zip(airline_df["airline_name"], airline_df["airline_code"]):
    # A single page with a very large pagesize is requested, newest reviews first.
    url = "https://www.airlinequality.com/airline-reviews/" + airline_code + \
                                "/page/1/?sortby=post_date%3ADesc&pagesize=50000"
    soup = scrape(url)
    airline_raw_dict = {
        "airline_name": airline_name,
        "airline_code": airline_code,
        "airline_raw_data": soup
    }
    airline_raw_list.append(airline_raw_dict)
    if soup is None:
        # scrape() returns None on a non-200 answer; remember the airline and
        # carry on rather than aborting the whole download.
        failed_airlines.append(airline_code)
    else:
        new_df = scrape_airline_df(soup)
        new_df["airline_name"] = airline_name
        new_df["airline_code"] = airline_code
        review_frames.append(new_df)
    clear_output()
    counter += 1
    percent_to_be_done = round(counter / total_airlines * 100)
    print(counter, " out of ", total_airlines, "is done: ", percent_to_be_done, "%")

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; a single concat is linear. Each airline keeps its own
# 0-based index, exactly as the appended frames did.
reviews_raw = pd.concat(review_frames) if review_frames else pd.DataFrame()
if failed_airlines:
    print("Could not download reviews for:", ", ".join(failed_airlines))


496  out of  496 is done:  100 %


In [5]:
# Preview the first rows only; rendering the whole frame bloats the notebook.
reviews_raw.head(10)

Unnamed: 0,cabin_service,country,date_flown,date_published,ground_service,id,name,rating,recommended,review_count,route,seat_comfort,seat_type,text,text_header,traveller_type,value_for_money,airline_name,airline_code
0,2.0,United Kingdom,2019-06-01 00:00:00,2019-06-25,1.0,anchor666859,Gyan Fernando,\n1/10\n,0.0,20,Moroni to Anjouan,2.0,Economy Class,✅ Trip Verified | Moroni to Anjouan. It is a v...,"""Not a good airline""",Solo Leisure,2.0,AB Aviation,ab-aviation
1,1.0,United Kingdom,2019-06-01 00:00:00,2019-06-25,1.0,anchor666802,Gyan Fernando,\n1/10\n,0.0,20,Anjouan to Dzaoudzi,2.0,Economy Class,✅ Trip Verified | Anjouan to Dzaoudzi. A very...,"""flight was fortunately short""",Solo Leisure,2.0,AB Aviation,ab-aviation
0,4.0,Germany,2018-10-01 00:00:00,2018-10-12,5.0,anchor612920,M Jager,\n8/10\n,1.0,32,Ljubljana to Munich,4.0,Economy Class,✅ Trip Verified | Ljubljana to Munich. The hom...,"""the crew was nice""",Family Leisure,5.0,Adria Airways,adria-airways
1,1.0,Germany,2018-10-01 00:00:00,2018-10-05,1.0,anchor611417,Giulia Rossi,\n1/10\n,0.0,0,Zurich to Ljubljana,2.0,Economy Class,Not Verified | Zurich to Ljubljana. Very poor...,"""Very bad experience overall""",Business,1.0,Adria Airways,adria-airways
2,1.0,United States,2018-07-01 00:00:00,2018-07-29,4.0,anchor595049,Galya Slavov,\n1/10\n,0.0,0,Vienna to Sofia,4.0,Economy Class,✅ Trip Verified | Vienna to Sofia. The flight...,"""bad customer service""",Family Leisure,1.0,Adria Airways,adria-airways
3,3.0,France,2018-05-01 00:00:00,2018-07-19,3.0,anchor592506,Loic Jouan,\n2/10\n,0.0,0,Paris to Skopje via Ljubljana,3.0,Economy Class,✅ Trip Verified | We were traveling from Pari...,"""overall very poor""",Solo Leisure,2.0,Adria Airways,adria-airways
4,2.0,Slovenia,2018-06-01 00:00:00,2018-06-30,2.0,anchor587586,P Gamirj,\n2/10\n,0.0,0,Ljubljana to Munich,1.0,Economy Class,✅ Trip Verified | Ljubljana to Munich. Adria'...,"""Would not fly again""",Business,1.0,Adria Airways,adria-airways
5,3.0,Czech Republic,2018-06-01 00:00:00,2018-06-24,1.0,anchor586201,B Haruz,\n3/10\n,0.0,0,Ljubljana to Prague,3.0,Economy Class,✅ Trip Verified | A very unpleasant experienc...,"""very unpleasant experience""",Couple Leisure,1.0,Adria Airways,adria-airways
6,5.0,Slovenia,2018-04-01 00:00:00,2018-05-04,5.0,anchor567813,Michel Zombra,\n10/10\n,1.0,0,Frankfurt to Ljubljana,5.0,Economy Class,✅ Trip Verified | Frankfurt to Ljubljana. Fli...,"""Flight was very comfortable""",Business,5.0,Adria Airways,adria-airways
7,1.0,Germany,2018-03-01 00:00:00,2018-03-11,1.0,anchor554731,S Hanarosic,\n1/10\n,0.0,0,Ljubljana to Frankfurt,2.0,Economy Class,✅ Trip Verified | Ljubljana to Frankfurt. Fli...,"""delayed for more than 2 hours""",Solo Leisure,1.0,Adria Airways,adria-airways


In [6]:
# One row per airline: name, code and the parsed page stored during the download loop.
airline_raw = pd.DataFrame(airline_raw_list)

In [9]:
#reviews_raw.to_csv("reviews_raw.csv", index=False)

In [10]:
#airline_raw.to_csv("airline_raw.csv", index=False)
