# Web Scraping 

---

### Singulier


---


Install the required packages if they are not installed yet.

In [None]:
!pip3 install requests lxml pandas

# Import the needed packages

In [None]:
import requests
from lxml import html
from pprint import pprint
from urllib.parse import urljoin

In [None]:
# URL of the page you want to scrape
url = "https://www.trustpilot.com/review/tripmate.com"

# Send the HTTP GET request for the page.
# A timeout is set because, by default, requests waits forever when the
# server never answers, which would leave the cell hanging.
http_request = requests.get(url, timeout=30)

# Raw body of the response
page_content = http_request.content

# Parse the HTML into an lxml tree that can be queried with XPath
page_html = html.fromstring(page_content)

In [None]:
def clean_text(text):
    """
    Merge text fragments into a single trimmed string.

    `text` is an iterable of strings (typically the list returned by an
    XPath query; a plain string works too). The pieces are concatenated
    without any separator, then leading and trailing whitespace
    (spaces, tabs, newlines) is removed.
    Returns the resulting string.
    """
    return "".join(text).strip()

In [None]:
def parse_review(review_block):
    """
    Extract the title, content, rating and date of a single review.

    Takes the HTML element wrapping one review and returns a dictionary with
    the keys "title", "content", "rating" and "date", each holding a cleaned
    string. A field that is not found in the block is stored as an empty string.
    """
    # The text fields all follow the same recipe: run the XPath relative to
    # the review block, then merge and trim the fragments with clean_text.
    xpaths = {
        "title": ".//h2//text()",
        "content": ".//p[@class='review-content__text']//text()",
        # alt text of every <img> found in the block, concatenated
        "rating": ".//img/@alt",
    }
    info = {
        field: clean_text(review_block.xpath(path))
        for field, path in xpaths.items()
    }

    # The dates are stored as text inside an embedded <script> element;
    # the publication date is sliced out of that text.
    xpath_date = ".//script[@data-initial-state='review-dates']//text()"
    info_dates = clean_text(review_block.xpath(xpath_date))
    key = "publishedDate"
    key_index = info_dates.find(key)
    if key_index == -1:
        # str.find returns -1 when the key is absent; slicing from that offset
        # would store unrelated characters as the date.
        info["date"] = ""
    else:
        # Skip the key plus the 3 characters between it and its value
        # (assumed to be '":"' -- TODO confirm against the page), then keep
        # 10 characters (presumably a YYYY-MM-DD date).
        date_start = key_index + len(key) + len('":"')
        info["date"] = info_dates[date_start:date_start + 10]
    return info

In [None]:
def parse_page(page_html):
    """
    Collect the information of every review found on one page.

    Takes the parsed HTML of a page and returns a list holding one
    dictionary per review, as produced by parse_review.
    """
    # Every <div> whose class attribute contains 'review-card' is treated as a review
    review_blocks = page_html.xpath("//div[contains(@class, 'review-card')]")
    return [parse_review(block) for block in review_blocks]

In [None]:
def get_next_link(url, page_html):
    """
    Return the absolute URL of the next page, or None when there is none.

    url: address of the current page, used as the base to resolve the link.
    page_html: parsed HTML of the current page.
    """
    # href of the "next page" anchor(s)
    hrefs = page_html.xpath("//a[@data-page-number='next-page']/@href")
    if not hrefs:
        return None
    # Only the first match is used: joining every match would glue several
    # hrefs into one invalid URL when the anchor appears more than once.
    href = hrefs[0].strip()
    if not href:
        # An empty href would resolve to the current page and make the caller loop forever
        return None
    # Resolve a relative href against the current URL
    return urljoin(url, href)

In [None]:
def scrap_all_reviews(url, timeout=30):
    """
    Scrape the reviews of every page, starting at `url` and following the
    "next page" links.

    url: address of the first page of reviews.
    timeout: seconds to wait for each HTTP response (passed to requests.get).
    Returns the list of review dictionaries of all the visited pages, in
    visiting order. If a page answers with an HTTP error status, the crawl
    stops there and the reviews collected so far are returned.
    """
    all_reviews = []
    # Pages already requested, so the loop ends if the pagination ever links back
    visited_urls = set()
    # The url parameter is left untouched; next_url walks through the pages
    next_url = url
    while next_url is not None and next_url not in visited_urls:
        visited_urls.add(next_url)
        # Without a timeout, requests can wait forever on an unresponsive server
        http_request = requests.get(next_url, timeout=timeout)
        if not http_request.ok:
            # Report the failure instead of silently parsing an error page
            print(f"Stopped at {next_url}: HTTP status {http_request.status_code}")
            break
        # Parse the HTML of the page
        page_html = html.fromstring(http_request.content)
        # Add the reviews of this page to the overall list
        all_reviews += parse_page(page_html)
        # Display a message to show completion
        print(f"Done with {next_url}")
        # Move on to the next page (None when there is no next link)
        next_url = get_next_link(next_url, page_html)
    return all_reviews

In [None]:
# Start from the first review page; scrap_all_reviews follows the pagination from there
url = "https://www.trustpilot.com/review/tripmate.com"
all_reviews = scrap_all_reviews(url)
# Number of reviews collected over all the visited pages
print(f"Scrapped {len(all_reviews)} reviews")

Check that the total number of reviews scraped matches the total number of reviews mentioned on the website. If that is not the case, try to investigate why. For instance, go to the last page scraped and check whether there are other reviews available in other languages that are not displayed, etc.

## Customer Reviews Analysis

In [None]:
# Package to handle the data (tables)
import pandas as pd

#Packages to display graphs
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(rc={'figure.figsize':(10, 5)})
# In the general case, avoid putting imports in the middle of the code
# All imports must be at the top of the file
# However, this is a training file, so that's ok

In [None]:
# Create a DataFrame (= basically a table) from the list of review dictionaries:
# one row per review, one column per dictionary key
df = pd.DataFrame(all_reviews)
# Display the first 10 rows
df.head(10)

### Let's save our results!

In [None]:
from google.colab import drive
# Mount Google Drive under the local folder 'drive'.
# You will be asked to authenticate to confirm that you are the owner of this Drive.
drive.mount('drive')

In [None]:
# Path of the file where the reviews will be stored.
# The folder must already exist: create it first if that is not the case.
filepath = "drive/My Drive/Training - Scraping/customer_reviews_TripMate.xlsx"
# Write the DataFrame to that Excel file
df.to_excel(filepath)

### Let's see how many reviews per rating the company got

In [None]:
# Count the reviews for each rating value; the result is indexed by rating
df_rating = df.groupby("rating")["content"].count()
df_rating

In [None]:
# Turn the rating index back into a regular column, so it can be used by name when plotting
df_rating = df_rating.reset_index()
df_rating

In [None]:
# Bar chart: one bar per rating, its height being the number of reviews
sns.barplot(data=df_rating, x="rating", y="content")
plt.show()

### Let's see how many reviews per month the company got

In [None]:
# First transform the date into a readable format

def get_year_month(date):
    """
    Reduce a date to its year and month.

    Takes a date string whose first 7 characters are the year and the month
    (e.g. '2020-03-15') and returns the pandas datetime parsed from those
    7 characters.
    """
    year_month = date[:7]
    return pd.to_datetime(year_month)

# Add a column holding the year-month of each review, computed from its date
df["date_year_month"] = df["date"].apply(get_year_month)
df.head()

In [None]:
# Count the reviews of each month...
df_year_month = (
    df.groupby("date_year_month")["content"]
    .count()
    .reset_index()
)
# ...and draw the evolution over time as a line
sns.lineplot(data=df_year_month, x="date_year_month", y="content")
plt.show()

In [None]:
# Count the reviews for each (month, rating) pair
df_year_month_rating = (
    df.groupby(["date_year_month", "rating"])["content"]
    .count()
    .reset_index()
)

# One line per rating
sns.lineplot(
    data=df_year_month_rating,
    x="date_year_month",
    y="content",
    hue="rating",
)
plt.show()

In [None]:
# Bounds of the period to look at: the start is included, the end is excluded
start_date = pd.to_datetime("2020-01-01")
end_date = pd.to_datetime("2020-10-01")


def is_within_select_period(date):
    """Tell whether `date` is on or after start_date and strictly before end_date."""
    return start_date <= date < end_date

# Keep only the rows whose month falls within the selected period
df_select_period = df_year_month_rating[df_year_month_rating["date_year_month"].apply(is_within_select_period)]
df_select_period

In [None]:
# Reviews per month and per rating, restricted to the selected period
sns.lineplot(
    data=df_select_period,
    x="date_year_month",
    y="content",
    hue="rating",
)
plt.show()

In [None]:
# Fill the gaps so that every (month, rating) pair of the period has a row:
# 1. set_index + unstack: pivot the ratings into columns (one row per month);
#    pairs without any review get 0
# 2. asfreq('MS'): re-index on a month-start frequency, so months missing
#    from the data are added with 0
# 3. stack + sort_index + reset_index: go back to one row per (month, rating),
#    ordered by month
df_year_month_rating = df_select_period.set_index(["date_year_month", "rating"]).unstack(
                                fill_value=0
                            ).asfreq(
                                'MS', fill_value=0
                            ).stack().sort_index(level=0).reset_index()

df_year_month_rating

In [None]:
# Plot the table of review counts per month and per rating, one line per rating
sns.lineplot(
    data=df_year_month_rating,
    x="date_year_month",
    y="content",
    hue="rating",
)
plt.show()

In [None]:
# Package to create wordclouds
from wordcloud import WordCloud

In [None]:
# Concatenate the text of all the reviews, separated by spaces
all_text_reviews = " ".join(df["content"])

# Build the word cloud from that text
wordcloud = WordCloud(
    width=400,
    height=600,
    background_color="white",
).generate(all_text_reviews)

# Display the word cloud on a 10x10 figure, without axes
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()