# **Scraping Covid19 related articles from TheJournal.ie**

# **Libraries**

In [None]:
# global purpose libraries
from datetime import date, datetime, timedelta
import random
import re
import time
import pandas as pd
import numpy as np
from collections import Counter

# scraping libraries
from bs4 import BeautifulSoup
import urllib
import requests

# **Scraping Covid19 related articles on TheJournal.ie**

The structure of the Covid-related pages on TheJournal is quite simple: each article has a title, the date on which it was posted, and the number of views and likes on Facebook.


We're simply going to write a script that scrapes all the articles on one page, and then iterate through a fixed number of pages to collect as much data as we can.

Each field will be appended to a list, and those lists will then be used to create a Pandas dataframe.

In [None]:
# it's clearer if we create a class
class Journal:
  """Scraper for the Covid-19 news listing pages of TheJournal.ie.

  Every scraped field is accumulated in its own list (Articles, Published,
  Views, Comments); getDataframe() turns those lists into a DataFrame.

  Parameters:
    pages: iterable of page numbers to scrape (defaults to pages 1 to 39).
    timeout: seconds to wait for each HTTP request before giving up.
  """

  # build the listing urls from the page numbers, and initialise the lists that collect each field
  def __init__(self, pages=range(1, 40), timeout=30):
    self.Urls = [f"https://www.thejournal.ie/covid19-facts/news/page/{i}/" for i in pages]
    self.Timeout = timeout
    self.Articles = []
    self.Published = []
    self.Comments = []
    self.Views = []

  # download one listing page and return its parsed HTML
  def getRequest(self,url):
    # without a timeout, requests waits forever on a stalled connection
    page = requests.get(url, timeout=self.Timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    return soup

  # extract the titles, dates, views and comments of one listing page into the lists
  def getContent(self,url):
    soup = self.getRequest(url)
    # article titles are read from the links whose title attribute starts with "Comment on"
    for s in soup.find_all("a"):
      if str(s.get('title')).startswith("Comment on"):
        self.Articles.append(s.get('title').replace("Comment on ",""))
    for s in soup.find_all("span", class_="published-at"):
      self.Published.append(s.text.strip())
    # one pass over the "interactions" spans: the first space-separated token
    # is stored as the views, the fourth one as the comments
    for s in soup.find_all("span", class_="interactions"):
      tokens = s.text.split(" ")
      self.Views.append(tokens[0].replace("\n",""))
      self.Comments.append(tokens[3])

  # scrape every url, then build a dataframe whose columns are our lists
  def getDataframe(self):
    """Scrape all urls in self.Urls and return the collected fields as a DataFrame.

    The columns are "date", "views", "comments" and "article". Results are
    appended to the instance lists, so calling this method again on the same
    instance adds the rows a second time. pandas raises a ValueError if the
    four lists end up with different lengths.
    """
    for u in self.Urls:
      self.getContent(u)
    df = {"date": self.Published,
          "views": self.Views,
          "comments": self.Comments,
          "article": self.Articles
          }
    return pd.DataFrame(df)

# making sure it worked: instantiate the scraper, collect every listing page into
# a dataframe (this performs the HTTP requests), and preview the first ten rows
journal = Journal()
df = journal.getDataframe()
df.head(10)

Unnamed: 0,date,views,comments,article
0,Updated\n ...,62220,53,'It's a little bit shambolic': Hundreds turned...
1,Updated\n ...,33547,25,Initial data on Omicron 'doesn't indicate it i...
2,Tue 6:29 PM,1532,0,"Debunked: No, 80% of Covid deaths during Octob..."
3,Tue 7:44 AM,38876,71,"New Covid-19 measures for hospitality, nightcl..."
4,Updated\n ...,28617,29,Government confirms details of reopened PUP sc...
5,Updated\n ...,27290,60,Unmasked students should not be excluded from ...
6,Updated\n ...,55551,61,New travel test rules take effect for inbound ...
7,Sun 1:08 PM,71650,111,"Coronavirus: 5,156 new cases confirmed in Ireland"
8,Sun 12:40 PM,12497,23,Australia approves Pfizer Covid-19 vaccine for...
9,"Dec 4th 2021, 1:45 PM",91049,152,"Coronavirus: 5,622 new cases confirmed in Ireland"


In [None]:
# function to clean one entry of the date series
# weekday abbreviations as they show up in the scraped dates, in date.weekday() order (Monday == 0)
_WEEKDAY_ABBREVIATIONS = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

def getCleanDates(serie):
  """Normalise one scraped publication date.

  Parameters:
    serie: the raw date text of a single article, e.g. "Tue 6:29 PM",
      "Updated\\n Sun 1:08 PM", "2 hours ago" or "Dec 4th 2021, 1:45 PM".

  Returns:
    a datetime.date for relative entries (weekday name or "... ago"),
    otherwise the text before the first comma (e.g. "Dec 4th 2021").
  """
  # sometimes the date shows as "Updated XXX": drop the prefix first, so that
  # updated entries go through exactly the same rules as the other ones
  text = serie.replace("Updated","").strip()
  tokens = text.split()
  firstToken = tokens[0] if tokens else ""
  # entries starting with a day name refer to the most recent such weekday (today included)
  if firstToken in _WEEKDAY_ABBREVIATIONS:
    today = date.today()
    daysBack = (today.weekday() - _WEEKDAY_ABBREVIATIONS.index(firstToken)) % 7
    return today - timedelta(days=daysBack)
  # "X hours ago" is always for entries posted on the current day
  if "ago" in text.lower():
    return date.today()
  # absolute dates: keep the day part and drop the time after the comma
  return text.split(",")[0]

# normalise the raw date strings first, then let pandas parse the result into datetimes
cleaned_dates = df["date"].apply(getCleanDates)
df["date"] = cleaned_dates
df["date"] = pd.to_datetime(df["date"])

# views and comments are scraped as text: strip the commas and cast both columns to integers
for numeric_column in ("views", "comments"):
  df[numeric_column] = df[numeric_column].apply(lambda x: str(x).replace(",","")).astype(int)

In [None]:
# adding in a function that tags articles depending on their topic
def getTags(serie):
  """Return the topic tag of one article title.

  The title is lower-cased and checked against the keyword rules below in
  order; the first matching rule wins, and titles that match none of them
  are tagged "General news".

  Parameters:
    serie: the article title (a string).

  Returns:
    one of "Vaccine", "Daily press announcement", "Lockdown", "Travel",
    "Hotel quarantine", "Conspiracy debunking" or "General news".
  """
  s = serie.lower()
  if "vaccin" in s:
    return "Vaccine"
  elif "new case" in s:
    return "Daily press announcement"
  elif "lockdown" in s:
    return "Lockdown"
  elif re.search("flight|airport|travel|passenger",s):
    return "Travel"
  elif "quarantine" in s:
    return "Hotel quarantine"
  elif re.search("doherty|waters|protest|conspiracy|dolores cahill",s):
    # the tag has to be returned: a bare string expression leaves the function returning None
    return "Conspiracy debunking"
  else:
    return "General news"

# tagging every article title with its topic
df["tag"] = df["article"].apply(getTags)

# no need for tokenization / lemmatization, as the article titles are short: we can jump straight into sentiment evaluation
# the score stored for each title is the polarity attribute of TextBlob's sentiment
# NOTE(review): TextBlob is not imported in the library cell visible above - presumably
# `from textblob import TextBlob` is required before this line can run; confirm
df["sentiment_score"] = df["article"].apply(lambda x: TextBlob(x).sentiment.polarity)

# also creating categorical bins for sentiment
def getSentiment(serie):
  """Bin one sentiment score into a category.

  Scores strictly above 0.15 are "Positive", scores strictly below 0 are
  "Negative", and everything else is "Neutral".
  """
  if serie > 0.15:
    return "Positive"
  return "Negative" if serie < 0 else "Neutral"

# binning every sentiment score into its Positive / Negative / Neutral category
df["sentiment_tag"] = df["sentiment_score"].apply(getSentiment)

# creating a copy of the dataset, that we can export later to a csv file
df_journal = df.copy()

# this is what the dataset now looks like (first five rows)
df.head()

Unnamed: 0,date,views,comments,article,tag,sentiment_score,sentiment_tag
2,2021-12-07,1532,0,"Debunked: No, 80% of Covid deaths during Octob...",Vaccine,0.0,Neutral
3,2021-12-07,38876,71,"New Covid-19 measures for hospitality, nightcl...",General news,0.136364,Neutral
7,2021-12-05,71650,111,"Coronavirus: 5,156 new cases confirmed in Ireland",Daily press announcement,0.268182,Positive
8,2021-12-05,12497,23,Australia approves Pfizer Covid-19 vaccine for...,Vaccine,-0.1,Negative
9,2021-12-04,91049,152,"Coronavirus: 5,622 new cases confirmed in Ireland",Daily press announcement,0.268182,Positive
