<a href="https://colab.research.google.com/github/kumpaten/masters-thesis-code/blob/main/Trustpilot_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install selenium

In [None]:
# Step 1: Mount Google Drive at /content/drive.
# Later cells read from and write to paths below this mount point, so this
# cell has to run before them.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import csv
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from google.colab import files

def _read_attribute(article, css_selector, attribute):
    """Return `attribute` of the first match for `css_selector` in `article`.

    Returns None when the element cannot be located or read, so that one
    malformed review card does not abort the whole page.
    """
    # Imported locally so this helper needs no additional file-level import.
    from selenium.common.exceptions import WebDriverException

    try:
        element = article.find_element(By.CSS_SELECTOR, css_selector)
        return element.get_attribute(attribute)
    except WebDriverException:
        # Best effort: Selenium-level failures (e.g. element not found) are
        # treated as "value missing". Unlike a bare `except`, this lets
        # KeyboardInterrupt and programming errors propagate.
        return None


def scrape_reviews_selenium(url):
    """Scrape the rating and date of every review shown on one page.

    Args:
        url: Address of the review page to load.

    Returns:
        A list of dicts with the keys "rating" and "date", both holding the
        raw attribute strings read from the page. Reviews for which either
        value is missing are skipped, so the list may be empty.

    Raises:
        Whatever Selenium raises while starting the browser, loading the page
        or locating the review elements. The browser is shut down in every
        case.
    """
    # 1. Set up headless Chrome options.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    try:
        # 2. Load the URL.
        driver.get(url)

        # 3. Wait for dynamic content to load (adjust if needed).
        time.sleep(3)

        # 4. Find all <article> elements under the main content container.
        #    Each <article> typically represents a single review.
        #    NOTE(review): the "styles_mainContent__d9oos" class looks
        #    build-generated; verify the selector if pages come back empty.
        article_elements = driver.find_elements(
            By.CSS_SELECTOR,
            "#__next > div > div > main > div > div.styles_mainContent__d9oos > section article"
        )

        reviews = []
        for article in article_elements:
            # 4a. Date from <time datetime="...">
            review_date = _read_attribute(article, "time[datetime]", "datetime")

            # 4b. Rating from <div data-service-review-rating="...">
            rating = _read_attribute(
                article,
                "div[data-service-review-rating]",
                "data-service-review-rating",
            )

            # Keep only reviews for which both values were found.
            if rating and review_date:
                reviews.append({
                    "rating": rating,
                    "date": review_date
                })

        return reviews
    finally:
        # Always release the browser, even if loading or parsing failed;
        # otherwise every failed page leaves a Chrome process behind.
        driver.quit()

def save_reviews_to_csv(reviews, filename="reviews.csv"):
    """Write the review dicts to a CSV file with a header row.

    Args:
        reviews: Iterable of dicts whose keys are drawn from "company",
            "rating" and "date"; a missing key yields an empty cell.
        filename: Path of the CSV file to create or overwrite.
    """
    columns = ["company", "rating", "date"]
    with open(filename, "w", newline="", encoding="utf-8") as out:
        table = csv.DictWriter(out, fieldnames=columns)
        table.writeheader()
        table.writerows(reviews)

if __name__ == "__main__":
    # List of (company_name, total_pages) tuples. Replace every
    # "total number of pages" placeholder with the current integer page count
    # shown on https://www.trustpilot.com/review/www.{company}.com?languages=all
    companies = [("google", "total number of pages"), ("facebook", "total number of pages"), ("microsoft", "total number of pages"), ("apple", "total number of pages")]

    all_reviews = []
    empty_pages = []  # Store (company, page_num) for pages that return zero reviews

    # One end page per company, in the same order as `companies`. Each value is
    # the (exclusive) stop of range(): the page number that marks the cut of the
    # time span you are working with (in my case it was the year 2022).
    endpages = ["x", "y", "z", "m"]

    # Fail early, before any browser is started, if the configuration is
    # inconsistent or the placeholders above have not been replaced yet.
    if len(endpages) != len(companies):
        raise ValueError("'endpages' needs exactly one entry per company.")
    for (company, total_pages), endpage in zip(companies, endpages):
        if not isinstance(total_pages, int) or not isinstance(endpage, int):
            raise TypeError(
                f"Page numbers for {company!r} must be integers; replace the "
                f"placeholders (got {total_pages!r} and {endpage!r})."
            )

    def fetch_tagged_reviews(company, page_num, action):
        """Scrape one review page and tag each review with its company."""
        url = f"https://www.trustpilot.com/review/{company}.com?languages=all&page={page_num}"
        print(f"{action} URL: {url}")
        page_reviews = scrape_reviews_selenium(url)
        for review in page_reviews:
            review["company"] = company
        return page_reviews

    # Main pass: each company is walked once, from its last page down to (but
    # not including) its own end page.
    for (company, total_pages), endpage in zip(companies, endpages):
        for num in range(total_pages, endpage, -1):
            page_reviews = fetch_tagged_reviews(company, num, "Scraping")

            if page_reviews:
                all_reviews.extend(page_reviews)
                print("Latest review scraped:", all_reviews[-1])
            else:
                # If no reviews found, store page for re-checking later
                empty_pages.append((company, num))

            # Sleep randomly between 1 to 3 seconds
            time.sleep(random.uniform(1, 3))

    # After the main loop, re-check the empty pages until either all of them
    # yielded reviews or a whole retry round made no progress.
    retry_count = 0

    while empty_pages:
        retry_count += 1
        print(f"\nRe-checking {len(empty_pages)} empty pages (attempt {retry_count}) "
              f"after a 60-second pause...\n")
        time.sleep(60)  # Wait a bit before retrying

        # Pages that remain empty after this retry round
        still_empty = []

        for company, page_num in empty_pages:
            page_reviews = fetch_tagged_reviews(company, page_num, "Re-checking")

            if page_reviews:
                # We found reviews this time, so add them to all_reviews
                all_reviews.extend(page_reviews)
                print("Latest review scraped (retry):", all_reviews[-1])
            else:
                # Still empty, keep it for the next retry
                still_empty.append((company, page_num))

        made_progress = len(still_empty) < len(empty_pages)
        empty_pages = still_empty

        # If we made no progress, break to avoid infinite loops
        if not made_progress:
            print("\nNo progress made on retry, stopping.\n")
            break

    # Report the retry outcome once, and only if a retry actually took place.
    if retry_count:
        if empty_pages:
            print(f"\nEven after retries, {len(empty_pages)} pages had zero reviews:\n{empty_pages}\n")
        else:
            print("\nAll previously empty pages yielded reviews on retry!\n")

    # Finally, save to CSV
    if all_reviews:
        save_reviews_to_csv(all_reviews)
        print(f"Scraped {len(all_reviews)} reviews. Data saved to 'reviews.csv'.")
        files.download("reviews.csv")
    else:
        print("No reviews found at all.")

    # Print a sample of the scraped reviews
    print("Sample reviews:", all_reviews[:10])


In [None]:
import pandas as pd
import numpy as np

# 1. Read the review CSV (the file downloaded earlier, possibly renamed) from Drive
df = pd.read_csv("/content/drive/MyDrive/Trustpilot_reviews.csv")

# 2. Parse 'date' into datetimes (unparseable values are coerced to NaT),
#    then derive the calendar year of each review
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["year"] = df["date"].dt.year

# 3. One row per (company, year): mean rating and number of reviews
grouped = (
    df.groupby(["company", "year"])
    .agg(
        avg_rating=pd.NamedAgg(column="rating", aggfunc="mean"),
        review_count=pd.NamedAgg(column="rating", aggfunc="size"),
    )
    .reset_index()
)

# 4. Prior m_c: per company, the mean of its yearly average ratings,
#    broadcast back onto every row of that company
grouped["company_avg"] = (
    grouped.groupby("company")["avg_rating"].transform("mean")
)

# 5. Helper for the company-specific median review count (firm_C)
def median_nonzero(x):
    """Return the median of the positive entries of `x`, truncated to int.

    Falls back to 1 when `x` contains no positive entry.
    """
    positive = x[x > 0]
    if len(positive) == 0:
        return 1
    return int(np.median(positive))

# Attach each company's firm_C (via median_nonzero) to all of its rows
grouped["firm_C"] = (
    grouped.groupby("company")["review_count"].transform(median_nonzero)
)

# 6. Bayesian smoothing with the firm-specific C:
#    (n * yearly_avg + C * company_avg) / (n + C)
grouped["smoothed_rating"] = (
    grouped["review_count"].mul(grouped["avg_rating"])
    .add(grouped["firm_C"].mul(grouped["company_avg"]))
    .div(grouped["review_count"].add(grouped["firm_C"]))
)

# 7. Order the rows by company, then year, for readability
grouped.sort_values(["company", "year"], inplace=True)

# 8. (Optional) Save to a new CSV
# grouped.to_csv("Trustpilot_reviews_yearly_smoothed.csv", index=False)

print("Done! The output CSV has columns: company, year, avg_rating, review_count, company_avg, firm_C, and smoothed_rating.")
print(grouped.head())

In [None]:
import pandas as pd

import os

# 1. Keep only the rows whose year is between 2008 and 2021 (inclusive).
#    .copy() makes the result independent of `grouped`.
grouped_filtered = grouped[
    (grouped["year"] >= 2008) & (grouped["year"] <= 2021)
].copy()

# 2. (Optional) Sort again by (company, year) and renumber the index
grouped_filtered.sort_values(["company", "year"], inplace=True, ignore_index=True)

# 3. Save the filtered DataFrame to the working directory
output_file_path = "Trustpilot_yearly_ratings.csv"
grouped_filtered.to_csv(output_file_path, index=False)

print("Done! 'grouped_filtered' now only has rows from 2008 to 2021.")
print(grouped_filtered.head())

# 4. Save to google drive
output_path = '/content/drive/My Drive/Preprocessed_data/Trustpilot_reviews_updated.csv'
output_dir = os.path.dirname(output_path)
# Create only the target sub-folder if it is missing. os.mkdir (unlike
# os.makedirs) does not create parents, so an unmounted Drive still raises
# instead of the file being written silently to a local path.
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
grouped_filtered.to_csv(output_path, index=False)
print("File saved to google drive")

