# Scraping All Headlines + Other Data

From 
- https://www.cbc.ca/news
- https://www.ctvnews.ca/canada
- https://globalnews.ca/canada/

In [1]:
import pandas as pd
from dateutil import parser
from datetime import date
import requests
from bs4 import BeautifulSoup

## Get ISO Date
- Format that is sortable by Pandas

In [2]:
# from datetime import date

def get_today_iso():
    return date.today().isoformat()

# Example usage:
print(get_today_iso())

2023-08-16


## Robust Date Parsing
- Can parse date from strings of text -- useful for checking if something is published today (else ignore)

In [3]:
# from dateutil import parser

def robust_parse_date(date_string):
    # Attempt to parse the date_string
    dt = parser.parse(date_string, fuzzy=True)
    
    # Return the date in the ISO format
    return dt.date().isoformat()

# Example usage:
date_string_1 = "Tue Aug 15 20:38:14 2023"
date_string_2 = "Tue Aug 15 20:38:14 EDT 2023"
print(robust_parse_date(date_string_1))
print(robust_parse_date(date_string_2))

2023-08-15
2023-08-15


## Converting Date back to Str
- Used specifically for the final plot. Df must be sorted according to numeric date, then this is used for plotting along x-axis (more interpretable than the numeric date)

In [4]:
from datetime import datetime

def get_date_str(date_str):
    # Parse the input date string to a datetime object
    date_obj = datetime.strptime(date_str, '%Y-%m-%d')
    
    # Format the datetime object to the desired output format
    return date_obj.strftime('%b %d, %Y')

# Example
date_str = "2020-02-29"
print(get_date_str(date_str))  # Outputs: Aug 15, 2023

Feb 29, 2020


## CBC Scraping
https://www.cbc.ca/news

In [5]:
# import requests
# from bs4 import BeautifulSoup

In [34]:
url = "https://www.cbc.ca/news"

response = requests.get(url)
print(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")

links = soup.find_all("a")
cbc_headlines = []
for link in links:
    headline = link.h3
    href = link['href']

    # make sure both are not None
    if (headline and href):
        cbc_headlines.append((headline.text, "https://www.cbc.ca" + href))

print(len(cbc_headlines))
cbc_headlines[:5]

200
36


[("Family drove 'through embers' fleeing N.W.T. wildfires as vehicle melted around them",
  'https://www.cbc.ca/news/canada/north/driving-through-embers-hay-river-family-flees-wildfires-in-nwt-1.6937089'),
 ("Family drove 'through embers' fleeing N.W.T. wildfires as vehicle melted around them",
  'https://www.cbc.ca/news/canada/north/driving-through-embers-hay-river-family-flees-wildfires-in-nwt-1.6937089'),
 ('Visit our low-bandwidth CBC North news website',
  'https://www.cbc.cahttps://www.cbc.ca/lite/news/canada/north?sort=editors-picks'),
 ('N.W.T. evacuees relive journeys out of wildfire danger',
  'https://www.cbc.ca/news/canada/edmonton/northwest-territories-evacuees-relive-their-journeys-out-of-wildfire-danger-1.6938157'),
 ("Wildfire could reach Yellowknife's outskirts by weekend",
  'https://www.cbc.ca/news/canada/north/nwt-highway-3-evacuation-order-extended-to-yellowknife-boundary-1.6938032')]

## CTV Scraping
https://www.ctvnews.ca/canada

In [35]:
url = "https://www.ctvnews.ca/canada"

response = requests.get(url)
print(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")

links = soup.find_all("h3", attrs = {'class' : 'c-list__item__title'})
ctv_headlines = []
for link in links:

    # get url and text
    if link.a:
        href = link.a['href'].strip()
        text = link.a.text.strip()

        # make sure both are not None
        if (text and href):
            ctv_headlines.append((text, href))

print(len(ctv_headlines))
ctv_headlines[:5]

200
110


[('Flaring wildfire in southern B.C. traps backcountry campers as only road is cut',
  'https://bc.ctvnews.ca/flaring-wildfire-in-southern-b-c-traps-backcountry-campers-as-only-road-is-cut-1.6521596'),
 ('18-month-old toddler dead in Edmonton; police call circumstances suspicious',
  'https://edmonton.ctvnews.ca/18-month-old-toddler-dead-in-edmonton-police-call-circumstances-suspicious-1.6521826'),
 ('Ex-CRA employee allegedly embezzled almost $20,000 in CERB, CRCB scam: RCMP',
  'https://www.ctvnews.ca/canada/ex-cra-employee-allegedly-embezzled-almost-20-000-in-cerb-crcb-scam-rcmp-1.6521582'),
 ('In photos: Fires rage near N.W.T. communities',
  'https://www.ctvnews.ca/canada/in-photos-fires-rage-near-n-w-t-communities-1.6521657'),
 ("Why gas prices are high and why an expert says they won't drop until the winter",
  'https://www.ctvnews.ca/business/why-gas-prices-are-high-and-why-an-expert-says-they-won-t-drop-until-the-winter-1.6521349')]

## Global News Scraping
https://globalnews.ca/canada/

In [36]:
url = "https://globalnews.ca/canada/"

response = requests.get(url)
print(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")

links = soup.find_all("a", attrs = {'class' : 'c-posts__inner'})

global_headlines = []
for link in links:

    # get span element -- text is here
    if link.span:
        href = link['href'].strip()
        text = link.span.text.strip()

        # make sure both are not None
        if (text and href):
            global_headlines.append((text, href))

print(len(global_headlines))
global_headlines[:5]

200
49


[('More time needed for Saskatoon downtown entertainment district reports',
  'https://globalnews.ca/news/9899843/more-time-needed-saskatoon-downtown-entertainment-district-reports/'),
 ('LCBO warns subscribers of promotional emails after data breach',
  'https://globalnews.ca/news/9900135/lcbo-emails-data-breach/'),
 ('Saskatoon city committee discusses 2022 snow removal debt solution on Wednesday',
  'https://globalnews.ca/news/9899757/saskatoon-city-committee-2022-snow-removal-debt-solution/'),
 ('FSIN First Vice-Chief highlights resource development concerns leading up to AFN election',
  'https://globalnews.ca/video/9899966/fsin-first-vice-chief-highlights-resource-development-concerns-leading-up-to-afn-election/'),
 ('Winnipeg sees second smoky morning on Wednesday',
  'https://globalnews.ca/video/9899832/winnipeg-sees-second-smoky-morning-on-wednesday/')]

## National Post Scraping
https://nationalpost.com/category/news/canada/
- Currently doesn't work. Get request is denied (403 error) -- they probably don't like programatic access

In [9]:
# url = "https://nationalpost.com/category/news/canada/"

# response = requests.get(url)
# print(response.status_code)
# # soup = BeautifulSoup(response.content, "html.parser")

# links = soup.find_all("a", attrs = {'class' : 'c-posts__inner'})

# global_headlines = []
# for link in links:

#     # get span element -- text is here
#     if link.span:
#         href = link['href'].strip()
#         text = link.span.text.strip()

#         # make sure both are not None
#         if (text and href):
#             global_headlines.append((text, href))

# global_headlines[:5]

# Sentiment Analysis

In [10]:
from transformers import pipeline

## general bart large for SA
# classifier = pipeline("zero-shot-classification", model = "facebook/bart-large-mnli")

## roberta for SA specifically trained on news and finance 
classifier = pipeline("text-classification", model = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

2023-08-16 14:56:09.570273: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading (…)lve/main/config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [40]:
for headline, url in cbc_headlines:
    result = classifier(headline, ['positive', 'negative', 'netural'])
    print(headline)
    print(result)
    break

Ignoring args : (['positive', 'negative', 'netural'],)


Family drove 'through embers' fleeing N.W.T. wildfires as vehicle melted around them
[{'label': 'neutral', 'score': 0.9991379976272583}]


In [11]:
from tqdm import tqdm

This hasn't ever worked but I'm leaving it case it ever might start working

In [59]:
# output[0]

In [60]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [95]:
text = "Feds blamed AFN for farty, slow progress on First Nations policing bill: documents"

def sa_model(headline: str):
    encoded_input = tokenizer(headline, return_tensors = 'pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]

    results = {}

    for i in range(scores.shape[0]):
        l = config.id2label[ranking[i]]
        s = scores[ranking[i]]
        results[l] = s

    return results


new_text = "This is my favourite flavour of cake"

sa_model(headline = new_text)

{'positive': 0.972092, 'neutral': 0.022085054, 'negative': 0.0058229654}

In [13]:
# def SA_headlines(headlines_urls: list, labels: list, source: str = None):

#     responses_dict = {lab:[] for lab in labels}

#     responses_dict = {
#         'source' : [],
#         'date' : [],
#         "headline" : [],
#         'url' : [],
#         "chosen_label" : [],
#         'chosen_label_val' : [],
#         **responses_dict, # empty list for each passed label
#         }

#     if ('negative' and 'positive') in labels:
#         responses_dict['compound'] = []

#     date = get_today_iso()

#     # get rid of any duplicates in headlines list
#     headlines_urls = list(set(headlines_urls))

#     for head_text, url in tqdm(headlines_urls, desc = source):
  
#         # add non-SA things to dict
#         responses_dict['source'].append(source.strip())
#         responses_dict['date'].append(date.strip())
#         responses_dict['headline'].append(head_text.strip())
#         responses_dict['url'].append(url)

#         # perform SA
#         response = classifier(head_text, labels)

#         # add SA results to dict
#         for i in range(len(labels)):
#             responses_dict[labels[i]].append(response['scores'][response['labels'].index(labels[i])]) # score of "___" index label

#         # creating compound score from 'negative' and 'positive' and 'neutral' scores
#         if ('negative' and 'positive' and 'neutral') in labels:
#             compound = (responses_dict['positive'][-1] - responses_dict['negative'][-1]) * (1 - responses_dict['neutral'][-1])
#             responses_dict['compound'].append(compound)
#             responses_dict['chosen_label_val'].append(compound) # redundant but conceptually correct

#             # creating SA labels from compound score (a bit more robust than model's labels)
#             if compound >= 0.1:
#                 responses_dict['chosen_label'].append("positive")
#             elif compound <= -0.1:
#                 # responses_dict['chosen_label_val'].append("negative")
#                 responses_dict['chosen_label'].append('negative')
#             else:
#                 responses_dict['chosen_label'].append("neutral")

#         # if no pos + neg, just use model's labels
#         else:
#             responses_dict['chosen_label_val'].append(response['scores'][0])
#             responses_dict['chosen_label'].append(response['labels'][0])

#     return responses_dict

# # labels = ['positive', 'negative', 'neutral']
# # news_source = "Global"

# # sa_results = SA_headlines(headlines_urls = headlines, labels = labels, source = news_source)

In [158]:
def sa_headlines(headlines_urls: list, source: str = None):

    responses_dict = {lab:[] for lab in labels}

    responses_dict = {
        'source' : [],
        'date' : [],
        "headline" : [],
        'url' : [],
        "chosen_label" : [],
        # 'chosen_label_val' : [],
        'positive': [],
        'negative': [],
        'neutral': [],
        'compound' : []
        }

    date = get_today_iso()

    # get rid of any duplicates in headlines list
    headlines_urls = list(set(headlines_urls))

    for head_text, url in tqdm(headlines_urls, desc = source):
  
        # add non-SA things to dict
        responses_dict['source'].append(source.strip())
        responses_dict['date'].append(date.strip())
        responses_dict['headline'].append(head_text.strip())
        responses_dict['url'].append(url)

        # perform SA
        response = sa_model(headline = head_text)

        # # add SA results to dict
        for lab, score in response.items():
            responses_dict[lab].append(score)

        # creating compound score from 'negative' and 'positive' and 'neutral' scores
        if ('negative' and 'positive' and 'neutral') in labels:
            compound = (responses_dict['positive'][-1] - responses_dict['negative'][-1]) * (1 - responses_dict['neutral'][-1])
            responses_dict['compound'].append(compound)
            # responses_dict['chosen_label_val'].append(compound) # redundant but conceptually correct

            # creating SA labels from compound score (a bit more robust than model's labels)
            if compound >= 0.1:
                responses_dict['chosen_label'].append("positive")
            elif compound <= -0.1:
                # responses_dict['chosen_label_val'].append("negative")
                responses_dict['chosen_label'].append('negative')
            else:
                responses_dict['chosen_label'].append("neutral")

        # if no pos + neg, just use model's labels
        else:
            # responses_dict['chosen_label_val'].append(response['scores'][0])
            responses_dict['chosen_label'].append(response['labels'][0])

    return responses_dict

# labels = ['positive', 'negative', 'neutral']
# news_source = "Global"

# sa_results = SA_headlines(headlines_urls = headlines, labels = labels, source = news_source)

# Testing Pipeline on all 3 Sources

In [159]:
# all respective sources and headlines
sources_headlines = [
    ("Global", global_headlines),
    ('CTV', ctv_headlines),
    ('CBC', cbc_headlines)
]

master_results = {}

for source, headlines in sources_headlines:

    # make sure both are not None -- there could be some scraping error and return empty headlines
    if (source and headlines):

        # sa_results = SA_headlines(headlines_urls = headlines, labels = labels, source = source)
        sa_results = sa_headlines(headlines_urls = headlines, source = source)

        # just overwrite the whole dict with the first results
        if not master_results:
            master_results = sa_results

        # afterwards, add everything to each list
        else:
            for key, val in sa_results.items():
                master_results[key].extend(val)

Global: 100%|██████████| 34/34 [00:01<00:00, 24.33it/s]
CTV: 100%|██████████| 94/94 [00:03<00:00, 25.53it/s]
CBC: 100%|██████████| 34/34 [00:01<00:00, 23.90it/s]


In [161]:
master_df = pd.DataFrame(master_results)

### WRITING DAILY DF TO CSV
master_df.to_csv(f"../data/{get_today_iso()}_SA_full.csv", index = False)

grouped_df = master_df.groupby(['date', 'source']).mean(numeric_only = True).reset_index()
new_labs = []

for row in grouped_df.index:
    val = grouped_df.loc[row, 'compound']

    if val >= 0.05:
        new_labs.append('positive')
    elif val <= -0.05:
        new_labs.append('negative')
    else:
        new_labs.append('neutral')

grouped_df['chosen_label'] = new_labs
grouped_df.to_csv(f"../data/{get_today_iso()}_SA_grouped.csv", index = False)
grouped_df

Unnamed: 0,date,source,positive,negative,neutral,compound,chosen_label
0,2023-08-16,CBC,0.067274,0.314027,0.618699,-0.165573,negative
1,2023-08-16,CTV,0.069451,0.393609,0.53694,-0.202504,negative
2,2023-08-16,Global,0.1009,0.280214,0.618886,-0.109817,negative


# Plotting Results

In [163]:
import plotly.express as px

fig = px.line(grouped_df, x = 'date', y = 'compound', color = 'source', title = "Sentiment Valence over Time", markers = True)
fig.update_layout(
    title = "Sentiment of Canadian News Outlets Over Time",
    xaxis_title = "Date",
    yaxis_title = "Sentiment Valence"
)

fig.show()

In [164]:
import plotly.express as px

fig = px.line(grouped_df, x = 'date', y = 'compound', color = 'source', title = "Sentiment Valency over Time", markers = True)
fig.update_layout(
    title = "Sentiment of Canadian News Outlets Over Time",
    xaxis_title = "Date",
    yaxis_title = "Sentiment Valence"
)

fig.show()

## Plotting Multiple Days

In [165]:
import os

def find_csv_files(directory_path):
    """Returns a list of all .csv files in the given directory"""
    return [file for file in os.listdir(directory_path) if file.endswith('.csv')]

# Example usage
directory_path = '../data/'  # current directory
csv_files = find_csv_files(directory_path)
most_recent_10_days = sorted(csv_files)[::-1][:20]
most_recent_10_days

['2023-08-16_SA_grouped.csv',
 '2023-08-16_SA_full.csv',
 '2023-08-15_SA_grouped.csv',
 '2023-08-15_SA_full.csv']

In [166]:
grouped_dfs = []
full_dfs = []

for fpath in most_recent_10_days:
    df = pd.read_csv('../data/' + fpath)

    # grouped dfs
    if "grouped" in fpath:
        grouped_dfs.append(df)

    # full dfs
    else:
        full_dfs.append(df)

grouped_df_10 = pd.concat(grouped_dfs, axis = 0, ignore_index = True).sort_values(by = 'date', ascending = True).reset_index(drop = True)
grouped_df_10['date_str'] = grouped_df_10['date'].apply(lambda x: get_date_str(x))
grouped_df_10

Unnamed: 0,date,source,positive,negative,neutral,compound,chosen_label,chosen_label_val,date_str
0,2023-08-15,CBC,0.082955,0.296434,0.620611,-0.129276,positive,0.706347,"Aug 15, 2023"
1,2023-08-15,CTV,0.082049,0.37805,0.539901,-0.191215,positive,0.690778,"Aug 15, 2023"
2,2023-08-15,Global,0.08839,0.326688,0.584922,-0.164836,positive,0.763416,"Aug 15, 2023"
3,2023-08-16,CBC,0.067274,0.314027,0.618699,-0.165573,negative,,"Aug 16, 2023"
4,2023-08-16,CTV,0.069451,0.393609,0.53694,-0.202504,negative,,"Aug 16, 2023"
5,2023-08-16,Global,0.1009,0.280214,0.618886,-0.109817,negative,,"Aug 16, 2023"


In [105]:
full_df_10 = pd.concat(full_dfs, axis = 0, ignore_index = True).sort_values(by = 'date', ascending = True).reset_index(drop = True)
full_df_10['date_str'] = full_df_10['date'].apply(lambda x: get_date_str(x))
full_df_10.head(1)#['headline'].tolist()

Unnamed: 0,source,date,headline,url,chosen_label,chosen_label_val,positive,negative,neutral,compound,date_str
0,Global,2023-08-15,Why inflation’s latest jump is bad news for th...,https://globalnews.ca/news/9896838/inflation-r...,negative,-0.991469,0.002038,0.995727,0.002235,-0.991469,"Aug 15, 2023"


In [106]:
full_df_10.groupby(['source']).max()['headline'].tolist()

["Will electrifying cars and home heating break Canada's grid?",
 '‘Lucky she could escape’: Multiple fire code violations found after fire gutted St. Thomas apartment',
 '‘We’re blessed’: Canadian business owner living in Hawaii feels grief, gratitude']

## Plotting All Grouped

In [107]:
import plotly.express as px

fig = px.line(grouped_df_10, x = 'date_str', y = 'chosen_label_val', color = 'source', title = "Sentiment Valency over Time", markers = True)
fig.update_layout(
    title = "Sentiment of Canadian News Outlets Over Time",
    xaxis_title = "Date",
    yaxis_title = "Sentiment Valence",
)

fig.show()

In [108]:
import plotly.express as px

# Assuming you already have your grouped_df_10 defined
fig = px.line(grouped_df_10, x = 'date_str', y = 'chosen_label_val', color = 'source',
              color_discrete_map = {"CBC": "#EC1D2D", "CTV": "#0046D4", "Global": "#231F20"},
              title = "Sentiment Valency over Time", markers = True)

# Updating layout with font sizes and title position
fig.update_layout(
    title = {
        'text': "Sentiment of Canadian News Outlets Over Time",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {
            'size': 20  # or any desired font size
        }
    },
    xaxis_title = "Date",
    xaxis_title_font_size = 16,  # or any desired font size
    xaxis_tickfont_size = 14,  # or any desired font size
    yaxis_title = "Sentiment Valence",
    yaxis_title_font_size = 16,  # or any desired font size
    yaxis_tickfont_size = 14   # or any desired font size
)

fig.show()


### Re-running old data on new model

In [110]:
today_date = get_today_iso()
today_date

'2023-08-16'

In [112]:
sa_model(headline = "hello")

{'neutral': 0.5090616, 'positive': 0.4082349, 'negative': 0.08270366}

In [168]:
data_dir = "../data/"
dfs = []
for fpath in os.listdir(data_dir):
    if '2023-08-15' in fpath:
        if 'full' in fpath:
            print(fpath)
            dfs.append(pd.read_csv(data_dir + fpath))

df = dfs[0].copy().drop(columns = 'chosen_label_val')
df.head(5)

2023-08-15_SA_full.csv


Unnamed: 0,source,date,headline,url,chosen_label,positive,negative,neutral,compound
0,Global,2023-08-15,"Inflation rates reverse in July, exiting Bank ...",https://globalnews.ca/video/9898898/inflation-...,neutral,0.026564,0.192263,0.781172,-0.036259
1,Global,2023-08-15,Timelapse of Perseid meteor shower,https://globalnews.ca/video/9891022/timelapse-...,neutral,0.11031,0.015873,0.873817,0.011916
2,Global,2023-08-15,Why inflation’s latest jump is bad news for th...,https://globalnews.ca/news/9896838/inflation-r...,negative,0.013865,0.783747,0.202388,-0.614067
3,Global,2023-08-15,Alberta bound: thousands of Canadians flock to...,https://globalnews.ca/video/9898946/alberta-bo...,neutral,0.122714,0.036794,0.840492,0.013705
4,Global,2023-08-15,Recent Calgary shootings have hallmarks of org...,https://globalnews.ca/video/9898908/recent-cal...,negative,0.008512,0.688372,0.303116,-0.473784


In [169]:
for row in tqdm(df.index, desc = "Headlines"):

    headline = df.loc[row, 'headline']
    
    ### new model SA -- returns dict like 
    response = sa_model(headline)

    compound = (response['positive'] - response['negative']) * (1 - response['neutral'])
    
    if compound >= 0.05:
        df.loc[row, 'chosen_label'] = 'positive'
    elif compound <= -0.05:
        df.loc[row, 'chosen_label'] = 'negative'
    else:
        df.loc[row, 'chosen_label'] = 'neutral'

    df.loc[row, 'compound'] = compound

    for lab, score in response.items():
        df.loc[row, lab] = score

df

Headlines: 100%|██████████| 168/168 [00:07<00:00, 22.10it/s]


Unnamed: 0,source,date,headline,url,chosen_label,positive,negative,neutral,compound
0,Global,2023-08-15,"Inflation rates reverse in July, exiting Bank ...",https://globalnews.ca/video/9898898/inflation-...,neutral,0.026564,0.192263,0.781172,-0.036259
1,Global,2023-08-15,Timelapse of Perseid meteor shower,https://globalnews.ca/video/9891022/timelapse-...,neutral,0.110310,0.015873,0.873817,0.011916
2,Global,2023-08-15,Why inflation’s latest jump is bad news for th...,https://globalnews.ca/news/9896838/inflation-r...,negative,0.013865,0.783747,0.202388,-0.614067
3,Global,2023-08-15,Alberta bound: thousands of Canadians flock to...,https://globalnews.ca/video/9898946/alberta-bo...,neutral,0.122714,0.036794,0.840492,0.013705
4,Global,2023-08-15,Recent Calgary shootings have hallmarks of org...,https://globalnews.ca/video/9898908/recent-cal...,negative,0.008512,0.688372,0.303116,-0.473784
...,...,...,...,...,...,...,...,...,...
163,CBC,2023-08-15,The National,https://www.cbc.ca/player/play/2074756675798,neutral,0.126469,0.062770,0.810761,0.012054
164,CBC,2023-08-15,City of St. John's rejects owner's pleas to te...,https://www.cbc.ca/news/canada/newfoundland-la...,negative,0.006242,0.758321,0.235436,-0.575012
165,CBC,2023-08-15,Federal government has gone more than six mont...,https://www.cbc.ca/news/politics/ethics-commis...,negative,0.017374,0.618263,0.364363,-0.381947
166,CBC,2023-08-15,And that's a wrap: Edmonton restaurant chain w...,https://www.cbc.ca/news/canada/edmonton/donair...,positive,0.226506,0.015125,0.758369,0.051076


In [170]:
### WRITING DAILY DF TO CSV
df.to_csv(f"../data/2023-08-15_SA_full.csv", index = False)

grouped_df = df.groupby(['date', 'source']).mean(numeric_only = True).reset_index()
new_labs = []

for row in grouped_df.index:
    val = grouped_df.loc[row, 'compound']

    if val >= 0.05:
        new_labs.append('positive')
    elif val <= -0.05:
        new_labs.append('negative')
    else:
        new_labs.append('neutral')

grouped_df['chosen_label'] = new_labs
grouped_df.to_csv(f"../data/2023-08-15_SA_grouped.csv", index = False)
grouped_df

Unnamed: 0,date,source,positive,negative,neutral,compound,chosen_label
0,2023-08-15,CBC,0.082955,0.296434,0.620611,-0.129276,negative
1,2023-08-15,CTV,0.082049,0.37805,0.539901,-0.191215,negative
2,2023-08-15,Global,0.08839,0.326688,0.584922,-0.164836,negative
