# Scraping All Headlines + Other Data

From 
- https://www.cbc.ca/news
- https://www.ctvnews.ca/canada
- https://globalnews.ca/canada/

In [33]:
from datetime import date
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser

## Get ISO Date
- Format that is sortable by Pandas

In [34]:
# from datetime import date

def get_today_iso():
    """Return today's local date as an ISO 8601 string (YYYY-MM-DD)."""
    # str() on a datetime.date yields the same text as .isoformat()
    return str(date.today())

# Example usage: prints today's date in YYYY-MM-DD form
print(get_today_iso())

2023-08-16


## Robust Date Parsing
- Can parse date from strings of text -- useful for checking if something is published today (else ignore)

In [35]:
# from dateutil import parser

def robust_parse_date(date_string):
    """Extract a date from free-form text and return it as an ISO string (YYYY-MM-DD).

    Parsing is delegated to dateutil's parser with fuzzy matching switched on;
    any time-of-day component of the parsed value is discarded.
    """
    return parser.parse(date_string, fuzzy=True).date().isoformat()

# Example usage: the same timestamp written without and with a timezone abbreviation;
# both calls print the same ISO date
date_string_1 = "Tue Aug 15 20:38:14 2023"
date_string_2 = "Tue Aug 15 20:38:14 EDT 2023"
print(robust_parse_date(date_string_1))
print(robust_parse_date(date_string_2))

2023-08-15
2023-08-15


## Converting Date back to Str
- Used specifically for the final plot. Df must be sorted according to numeric date, then this is used for plotting along x-axis (more interpretable than the numeric date)

In [36]:
from datetime import datetime

def get_date_str(date_str):
    """Turn an ISO date string ('YYYY-MM-DD') into a display label such as 'Feb 29, 2020'.

    Raises ValueError (from datetime.strptime) when the input does not match '%Y-%m-%d'.
    """
    return datetime.strptime(date_str, '%Y-%m-%d').strftime('%b %d, %Y')

# Example: a leap-day date
date_str = "2020-02-29"
print(get_date_str(date_str))  # Outputs: Feb 29, 2020

Feb 29, 2020


## CBC Scraping
https://www.cbc.ca/news

In [37]:
# import requests
# from bs4 import BeautifulSoup

In [38]:
# Scrape (headline, url) pairs from the CBC news front page.
url = "https://www.cbc.ca/news"

# a timeout keeps a stalled connection from hanging the notebook indefinitely
response = requests.get(url, timeout = 30)
print(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")

links = soup.find_all("a")
cbc_headlines = []
for link in links:
    headline = link.h3
    # .get() yields None for anchors without an href, where link['href'] would raise KeyError
    href = link.get('href')

    # keep only anchors that wrap an <h3> headline and actually point somewhere
    if (headline and href):
        # urljoin prefixes site-relative paths with the CBC root and leaves absolute URLs untouched
        cbc_headlines.append((headline.text, urljoin("https://www.cbc.ca", href)))

cbc_headlines[:5]

200


[('Taxpayers are spending hours on hold with the CRA — despite repeated promises of faster service',
  'https://www.cbc.ca/news/politics/cra-call-centre-wait-times-two-hours-1.6937280'),
 ('N.W.T. declares state of emergency, with Yellowknife now threatened by wildfire',
  'https://www.cbc.ca/news/canada/north/nwt-premier-update-wildfires-1.6937511'),
 ("1,200 homes on prime farmland: What's known about the plan to develop the Greenbelt in Pickering, Ont.",
  'https://www.cbc.ca/news/canada/toronto/duffins-rouge-agricultural-preserve-1.6937144'),
 ("China snubs Canada on its approved travel spots, setting back tourism's post-COVID recovery",
  'https://www.cbc.ca/news/politics/china-canada-tourism-impact-1.6936627'),
 ('High winds in forecast pose looming fire threat, B.C. Wildfire Service says, as heat wave peaks',
  'https://www.cbc.ca/news/canada/british-columbia/what-you-need-to-know-about-wildfires-aug-16-1.6937767')]

## CTV Scraping
https://www.ctvnews.ca/canada

In [39]:
# Scrape (headline, url) pairs from the CTV News Canada page.
url = "https://www.ctvnews.ca/canada"

# a timeout keeps a stalled connection from hanging the notebook indefinitely
response = requests.get(url, timeout = 30)
print(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")

# headlines are <h3 class="c-list__item__title"> elements wrapping an <a>
links = soup.find_all("h3", attrs = {'class' : 'c-list__item__title'})
ctv_headlines = []
for link in links:

    # get url and text
    if link.a:
        # default to '' so an anchor with no href is skipped below instead of raising KeyError
        href = link.a.get('href', '').strip()
        text = link.a.text.strip()

        # make sure neither is empty
        if (text and href):
            ctv_headlines.append((text, href))

ctv_headlines[:5]

200


[("Why gas prices are high and why an expert says they won't drop until the winter",
  'https://www.ctvnews.ca/business/why-gas-prices-are-high-and-why-an-expert-says-they-won-t-drop-until-the-winter-1.6521349'),
 ('Visitors asked to stop bringing flowers to Kemptville, Ont. hospital',
  'https://ottawa.ctvnews.ca/visitors-asked-to-stop-bringing-flowers-to-kemptville-ont-hospital-1.6520639'),
 ('How to avoid compromising nutrition when grocery shopping at dollar stores',
  'https://www.ctvnews.ca/health/how-to-avoid-compromising-nutrition-when-grocery-shopping-at-dollar-stores-1.6520932'),
 ('Northwest Territories declares territorial state of emergency amid wildfires',
  'https://www.ctvnews.ca/canada/northwest-territories-declares-territorial-state-of-emergency-amid-wildfires-1.6520856'),
 ('Mounties resume enforcement against B.C. old-growth logging activists',
  'https://vancouverisland.ctvnews.ca/mounties-resume-enforcement-against-b-c-old-growth-logging-activists-1.6520235')]

## Global News Scraping
https://globalnews.ca/canada/

In [40]:
# Scrape (headline, url) pairs from the Global News Canada page.
url = "https://globalnews.ca/canada/"

# a timeout keeps a stalled connection from hanging the notebook indefinitely
response = requests.get(url, timeout = 30)
print(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")

# each story is an <a class="c-posts__inner"> whose headline text sits in a <span>
links = soup.find_all("a", attrs = {'class' : 'c-posts__inner'})

global_headlines = []
for link in links:

    # get span element -- text is here
    if link.span:
        # default to '' so an anchor with no href is skipped below instead of raising KeyError
        href = link.get('href', '').strip()
        text = link.span.text.strip()

        # make sure neither is empty
        if (text and href):
            global_headlines.append((text, href))

global_headlines[:5]

200


[('Fort Fright returns to Kingston alongside Pumpkinferno this fall',
  'https://globalnews.ca/news/9899491/fort-fright-kingston-pumpkinferno-2023/'),
 ('Feds blamed AFN for delays, slow progress on First Nations policing bill: documents',
  'https://globalnews.ca/news/9899650/feds-blamed-afn-delays-first-nations-policing-bill/'),
 ('Province begins to establish new Sask. Marshals Service, critics concerned about oversight',
  'https://globalnews.ca/news/9899575/saskatchewan-marshals-service-oversight/'),
 ('Online bettors set to wager on King’s Plate horse race for 1st time',
  'https://globalnews.ca/news/9899629/online-bettors-kings-plate/'),
 ('Planning a budget for back-to-school expenses',
  'https://globalnews.ca/video/9899611/planning-a-budget-for-back-to-school-expenses/')]

## National Post Scraping
https://nationalpost.com/category/news/canada/
- Currently doesn't work. The GET request is denied (403 error) -- they probably don't like programmatic access

In [41]:
# url = "https://nationalpost.com/category/news/canada/"

# response = requests.get(url)
# print(response.status_code)
# # soup = BeautifulSoup(response.content, "html.parser")

# links = soup.find_all("a", attrs = {'class' : 'c-posts__inner'})

# global_headlines = []
# for link in links:

#     # get span element -- text is here
#     if link.span:
#         href = link['href'].strip()
#         text = link.span.text.strip()

#         # make sure both are not None
#         if (text and href):
#             global_headlines.append((text, href))

# global_headlines[:5]

# Sentiment Analysis

In [42]:
from transformers import pipeline
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli model;
# SA_headlines calls it as classifier(text, labels) and reads the 'labels' and 'scores' of the result
classifier = pipeline("zero-shot-classification", model = "facebook/bart-large-mnli")

In [43]:
from tqdm import tqdm

This has never worked, but I'm leaving it in case it ever starts working

In [44]:
# import requests

# hf_token = open("../.hf_token").read().strip()
# print(hf_token[:10])

# API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
# headers = {"Authorization": f"Bearer {hf_token}"}

# def query(payload):
# 	response = requests.post(API_URL, headers = headers, json = payload)
# 	return response.json()

# output = query({
#     "inputs": "Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!",
#     "parameters": {"candidate_labels": ["refund", "legal", "faq"]},
# })
# output

In [45]:
def SA_headlines(headlines_urls: list, labels: list, source: str = None):
    """Score every headline against `labels` with the module-level zero-shot `classifier`.

    Parameters
    ----------
    headlines_urls : list
        (headline_text, url) tuples. Exact duplicate tuples are dropped, keeping
        the first occurrence so the row order is reproducible.
    labels : list
        Candidate labels handed to the classifier; each one becomes a score column.
    source : str, optional
        Name of the news outlet; stored (stripped) on every row and used as the
        progress-bar caption. May be None.

    Returns
    -------
    dict
        Column name -> list, one entry per unique headline, with the keys
        'source', 'date', 'headline', 'url', 'chosen_label', 'chosen_label_val'
        and one key per label. When 'positive', 'negative' AND 'neutral' are all
        among `labels`, a 'compound' column is added, computed as
        (positive - negative) * (1 - neutral); 'chosen_label_val' then holds the
        same value and 'chosen_label' is derived from it with +/-0.1 thresholds.
        Otherwise the first label/score returned by the classifier is used.
    """
    # fixed metadata columns first, then an empty score list for each passed label
    responses_dict = {
        'source' : [],
        'date' : [],
        "headline" : [],
        'url' : [],
        "chosen_label" : [],
        'chosen_label_val' : [],
        **{lab: [] for lab in labels},
        }

    # One flag drives both the creation and the filling of the 'compound' column.
    # (`('negative' and 'positive') in labels` only ever tested the last string,
    # so the two original checks could disagree and leave ragged columns.)
    use_compound = all(lab in labels for lab in ('positive', 'negative', 'neutral'))
    if use_compound:
        responses_dict['compound'] = []

    today = get_today_iso()

    # source defaults to None, so only strip when there is something to strip
    source_name = source.strip() if source is not None else None

    # get rid of any duplicates in headlines list while keeping first-seen order
    unique_headlines = list(dict.fromkeys(headlines_urls))

    for head_text, url in tqdm(unique_headlines, desc = source):

        # add non-SA things to dict
        responses_dict['source'].append(source_name)
        responses_dict['date'].append(today)
        responses_dict['headline'].append(head_text.strip())
        responses_dict['url'].append(url)

        # perform SA
        response = classifier(head_text, labels)

        # the classifier returns parallel 'labels'/'scores' lists; index them by label
        score_by_label = dict(zip(response['labels'], response['scores']))
        for lab in labels:
            responses_dict[lab].append(score_by_label[lab])

        if use_compound:
            # signed valence, damped by how neutral the headline is judged to be
            compound = (score_by_label['positive'] - score_by_label['negative']) * (1 - score_by_label['neutral'])
            responses_dict['compound'].append(compound)
            responses_dict['chosen_label_val'].append(compound) # redundant but conceptually correct

            # creating SA labels from compound score (a bit more robust than model's labels)
            if compound >= 0.1:
                responses_dict['chosen_label'].append("positive")
            elif compound <= -0.1:
                responses_dict['chosen_label'].append('negative')
            else:
                responses_dict['chosen_label'].append("neutral")

        # without the full pos/neg/neutral trio, fall back to the classifier's first entry
        # (assumes the pipeline lists its best label first -- same assumption as the original code)
        else:
            responses_dict['chosen_label_val'].append(response['scores'][0])
            responses_dict['chosen_label'].append(response['labels'][0])

    return responses_dict

# labels = ['positive', 'negative', 'neutral']
# news_source = "Global"

# sa_results = SA_headlines(headlines_urls = headlines, labels = labels, source = news_source)

# Testing Pipeline on all 3 Sources

In [46]:
labels = ['positive', 'negative', 'neutral']

# (outlet name, scraped headlines) pairs, in processing order
sources_headlines = [
    ("Global", global_headlines),
    ('CTV', ctv_headlines),
    ('CBC', cbc_headlines)
]

master_results = {}

for source, headlines in sources_headlines:

    # a scraping error can leave an outlet with no headlines -- skip it rather than classify nothing
    if not (source and headlines):
        continue

    sa_results = SA_headlines(headlines_urls = headlines, labels = labels, source = source)

    if master_results:
        # later outlets: append their rows onto the matching column lists
        for key, val in sa_results.items():
            master_results[key].extend(val)
    else:
        # first outlet with results: its dict becomes the accumulator
        master_results = sa_results

Global: 100%|██████████| 36/36 [00:16<00:00,  2.17it/s]
CTV: 100%|██████████| 95/95 [00:51<00:00,  1.85it/s]
CBC: 100%|██████████| 29/29 [00:14<00:00,  1.98it/s]


In [47]:
master_df = pd.DataFrame(master_results)

# Persist today's per-headline results
master_df.to_csv(f"../data/{get_today_iso()}_SA_full.csv", index = False)

# Average every numeric column per (date, source)
grouped_df = master_df.groupby(['date', 'source']).mean(numeric_only = True).reset_index()

# Label each averaged score with the same +/-0.1 cut-offs used per headline
new_labs = [
    'positive' if val >= 0.1 else 'negative' if val <= -0.1 else 'neutral'
    for val in grouped_df['chosen_label_val']
]

grouped_df['chosen_label'] = new_labs

# Persist today's per-outlet summary
grouped_df.to_csv(f"../data/{get_today_iso()}_SA_grouped.csv", index = False)
grouped_df

Unnamed: 0,date,source,chosen_label_val,positive,negative,neutral,compound,chosen_label
0,2023-08-16,CBC,-0.43864,0.210721,0.687448,0.101831,-0.43864,negative
1,2023-08-16,CTV,-0.38403,0.24734,0.651948,0.100713,-0.38403,negative
2,2023-08-16,Global,-0.288941,0.284363,0.590733,0.124903,-0.288941,negative


# Plotting Results

In [48]:
import plotly.express as px

# One line per outlet, averaged sentiment on the y-axis.
# update_layout returns the figure it was called on, so the calls can be chained;
# the layout title replaces the one passed to px.line.
fig = px.line(
    grouped_df,
    x = 'date',
    y = 'chosen_label_val',
    color = 'source',
    title = "Sentiment Valency over Time",
    markers = True,
).update_layout(
    title = "Sentiment of Canadian News Outlets Over Time",
    xaxis_title = "Date",
    yaxis_title = "Sentiment Valence"
)

fig.show()

In [49]:
import plotly.express as px

# NOTE(review): this cell repeats the previous plotting cell verbatim -- candidate for removal.
# Same chart: one line per outlet; the layout title replaces the one passed to px.line.
fig = px.line(
    grouped_df,
    x = 'date',
    y = 'chosen_label_val',
    color = 'source',
    title = "Sentiment Valency over Time",
    markers = True,
).update_layout(
    title = "Sentiment of Canadian News Outlets Over Time",
    xaxis_title = "Date",
    yaxis_title = "Sentiment Valence"
)

fig.show()

## Plotting Multiple Days

In [50]:
import os

def find_csv_files(directory_path):
    """List the names (not full paths) of entries in `directory_path` that end with '.csv'.

    Order follows os.listdir, i.e. it is not sorted.
    """
    matches = []
    for entry in os.listdir(directory_path):
        if entry.endswith('.csv'):
            matches.append(entry)
    return matches

# Example usage
directory_path = '../data/'  # current directory
csv_files = find_csv_files(directory_path)
most_recent_10_days = sorted(csv_files)[::-1][:20]
most_recent_10_days

['2023-08-16_SA_grouped.csv',
 '2023-08-16_SA_full.csv',
 '2023-08-15_SA_grouped.csv',
 '2023-08-15_SA_full.csv']

In [51]:
grouped_dfs = []
full_dfs = []

# Read every selected file once and route it by name: "grouped" files are the
# per-outlet summaries, everything else is a per-headline file
for fpath in most_recent_10_days:
    df = pd.read_csv('../data/' + fpath)
    (grouped_dfs if "grouped" in fpath else full_dfs).append(df)

# Stack the daily summaries and order them chronologically (ISO dates sort correctly as text)
grouped_df_10 = pd.concat(grouped_dfs, axis = 0, ignore_index = True)
grouped_df_10 = grouped_df_10.sort_values(by = 'date', ascending = True).reset_index(drop = True)

# Human-readable label for the plot's x-axis
grouped_df_10['date_str'] = grouped_df_10['date'].apply(get_date_str)
grouped_df_10

Unnamed: 0,date,source,chosen_label_val,positive,negative,neutral,compound,chosen_label,date_str
0,2023-08-15,CBC,-0.298532,0.291011,0.60641,0.102579,-0.298532,negative,"Aug 15, 2023"
1,2023-08-15,CTV,-0.434609,0.222419,0.683478,0.094102,-0.434609,negative,"Aug 15, 2023"
2,2023-08-15,Global,-0.337955,0.26968,0.620905,0.109415,-0.337955,negative,"Aug 15, 2023"
3,2023-08-16,CBC,-0.43864,0.210721,0.687448,0.101831,-0.43864,negative,"Aug 16, 2023"
4,2023-08-16,CTV,-0.38403,0.24734,0.651948,0.100713,-0.38403,negative,"Aug 16, 2023"
5,2023-08-16,Global,-0.288941,0.284363,0.590733,0.124903,-0.288941,negative,"Aug 16, 2023"


In [52]:
# Stack the per-headline files and order them chronologically
full_df_10 = pd.concat(full_dfs, axis = 0, ignore_index = True)
full_df_10 = full_df_10.sort_values(by = 'date', ascending = True).reset_index(drop = True)

# Human-readable date label, matching the grouped frame
full_df_10['date_str'] = full_df_10['date'].apply(get_date_str)
full_df_10.head(1)#['headline'].tolist()

Unnamed: 0,source,date,headline,url,chosen_label,chosen_label_val,positive,negative,neutral,compound,date_str
0,Global,2023-08-15,Alberta bound: thousands of Canadians flock to...,https://globalnews.ca/video/9898946/alberta-bo...,positive,0.479241,0.704233,0.12924,0.166526,0.479241,"Aug 15, 2023"


In [53]:
# Greatest headline per source under string ordering (i.e. an alphabetical max, not a sentiment max).
# The column is selected before aggregating so the other columns are not reduced needlessly
# and cannot make the call fail.
full_df_10.groupby(['source'])['headline'].max().tolist()

["Will electrifying cars and home heating break Canada's grid?",
 '‘Lucky she could escape’: Multiple fire code violations found after fire gutted St. Thomas apartment',
 '‘We’re blessed’: Canadian business owner living in Hawaii feels grief, gratitude']

## Plotting All Grouped

In [54]:
import plotly.express as px

# Multi-day chart: one line per outlet, readable date labels on the x-axis.
# update_layout returns the figure it was called on, so the calls can be chained;
# the layout title replaces the one passed to px.line.
fig = px.line(
    grouped_df_10,
    x = 'date_str',
    y = 'chosen_label_val',
    color = 'source',
    title = "Sentiment Valency over Time",
    markers = True,
).update_layout(
    title = "Sentiment of Canadian News Outlets Over Time",
    xaxis_title = "Date",
    yaxis_title = "Sentiment Valence",
)

fig.show()

In [55]:
import plotly.express as px

# Assuming you already have your grouped_df_10 defined
fig = px.line(grouped_df_10, x = 'date_str', y = 'chosen_label_val', color = 'source',
              color_discrete_map = {"CBC": "#EC1D2D", "CTV": "#0046D4", "Global": "#231F20"},
              title = "Sentiment Valency over Time", markers = True)

# Updating layout with font sizes and title position
fig.update_layout(
    title = {
        'text': "Sentiment of Canadian News Outlets Over Time",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {
            'size': 20  # or any desired font size
        }
    },
    xaxis_title = "Date",
    xaxis_title_font_size = 16,  # or any desired font size
    xaxis_tickfont_size = 14,  # or any desired font size
    yaxis_title = "Sentiment Valence",
    yaxis_title_font_size = 16,  # or any desired font size
    yaxis_tickfont_size = 14   # or any desired font size
)

fig.show()


**test**