In [None]:
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
def clean_date(input_str: str) -> datetime:
    """Convert a scraped "published at" label into a naive ``datetime``.

    The literal "Published" prefix and " GMT" suffix (together with the
    newlines and spaces they are written with) are removed when present,
    surrounding whitespace is trimmed, and the remainder is parsed as
    ``day month-name year HH:MM``, e.g. ``"12 January 2020 10:30"``.

    Args:
        input_str: Raw label text, with or without the prefix/suffix.

    Returns:
        The parsed timestamp; no timezone information is attached.

    Raises:
        ValueError: If the remaining text does not match the expected format.
    """
    remainder = input_str
    for boilerplate in ("Published \n \n ", " GMT\n"):
        remainder = remainder.replace(boilerplate, "")

    return datetime.strptime(remainder.strip(), "%d %B %Y %H:%M")

In [None]:
# Scrape the Upstream Online archive listing page by page and collect, for
# every article card found, its date text, headline text and absolute link.
ID = 0  # running key into article_archive; incremented once per stored article
article_archive = {}  # ID -> [date text, headline text, absolute link]

for page in tqdm(range(10000)):
    # Sleep before every request (including ones that fail below) to be respectful to the server
    time.sleep(0.1)

    # NOTE(review): the offset starts at 10 and advances by 10, so offset=0 is never
    # requested — confirm that skipping the first archive page is intended.
    url = f'https://www.upstreamonline.com/archive?offset={page + 1}0&publishdate=01.01.2014-31.12.2023'

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f'Failed to fetch page {url}: {exc}')
        continue

    if response.status_code != 200:
        print(f'Failed to fetch page {url}')
        continue  # do not try to parse an error page

    soup = BeautifulSoup(response.text, 'html.parser')

    articles = soup.find_all('a', class_='card-link text-reset')  # Find all articles
    publish_dates = soup.find_all('span', class_='published-at')  # Find all publish dates

    if len(articles) != len(publish_dates):
        print(f'Found {len(articles)} articles but {len(publish_dates)} dates on {url}; pairing in document order')

    # zip() pairs cards with dates positionally and stops at the shorter list,
    # so a missing date element can no longer raise IndexError.
    for anchor, published in zip(articles, publish_dates):
        href = anchor.get('href')
        if href is None:
            continue  # an anchor without a target cannot be turned into a link

        # Drops the first 14 and last 5 characters of the label text
        # (presumably the "Published" prefix and " GMT" suffix — verify against the page).
        date = published.text[14:-5]

        link = 'https://www.upstreamonline.com' + href
        # Drops the first 3 characters of the card text (presumably leading whitespace — verify).
        text = anchor.text[3:]

        article_archive[ID] = [date, text, link]
        ID += 1


In [None]:
# One row per scraped article; the dictionary keys become the row index and
# each stored [date, headline, link] list fills the three columns.
df = pd.DataFrame.from_dict(article_archive, orient='index', columns=['DATE', 'HEADLINE', 'LINK'])

# Parse the raw date text of every row with clean_date.
df['DATETIME'] = df['DATE'].apply(clean_date)

display(df)

# save as csv
output_path = Path('data/upstreamonline.csv')
# to_csv does not create missing directories, so make sure the target folder exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)


In [None]:
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

# The tokenizers, the stopword list and the sentiment analyzer each need their
# NLTK data package ('punkt' / 'punkt_tab', 'stopwords', 'vader_lexicon') to be
# downloaded beforehand via nltk.download(...).
stop_words = set(stopwords.words('english'))
sia = SentimentIntensityAnalyzer()


def _headline_tokens(headline):
    """Return the non-stopword tokens of every sentence in ``headline`` as one flat list.

    All sentences contribute tokens, and a headline that yields no sentences
    gives an empty list instead of raising IndexError.
    """
    return [
        word
        for sentence in sent_tokenize(headline)
        for word in word_tokenize(sentence)
        if word.lower() not in stop_words
    ]


# Tokenize the headlines and remove stopwords
tokenized = df['HEADLINE'].apply(_headline_tokens)

display(tokenized.head())

# sentiment analysis: one compound score per remaining token of each headline
# NOTE(review): tokens are scored one at a time after stopword removal, so words
# such as "not" are gone before scoring — confirm that per-token scoring (rather
# than scoring the whole headline string) is the intended method.
sentiments = tokenized.apply(lambda words: [sia.polarity_scores(word)['compound'] for word in words])



In [None]:
# Headline score = sum of its per-token compound scores.
# Assigned by position rather than by index label: if this cell is run again,
# df is already indexed by DATETIME and a label-aligned assignment of the
# integer-indexed `sentiments` would fill the column with NaN.
df['SENTIMENT'] = sentiments.apply(sum).to_numpy()

# Move DATETIME into the index only once, so re-running the cell does not raise KeyError.
if 'DATETIME' in df.columns:
    df = df.set_index('DATETIME')

display(df)

In [None]:
import yfinance as yf
import matplotlib.pyplot as plt

# group by date and calculate the mean sentiment
# Only the numeric SENTIMENT column is aggregated: the text columns cannot be
# averaged, and pandas >= 2.0 raises on them instead of silently dropping them.
df_sent = df[['SENTIMENT']].resample('D').mean()

# auto_adjust=False keeps the 'Adj Close' column, which newer yfinance releases
# no longer return by default.
df_crude = yf.download('CL=F', start='2019-09-18', end='2023-12-31', auto_adjust=False)
if isinstance(df_crude.columns, pd.MultiIndex):
    # Some yfinance versions return two-level columns even for a single ticker;
    # presumably the first level holds the price field names — verify for the installed version.
    df_crude.columns = df_crude.columns.get_level_values(0)
df_crude = df_crude[['Adj Close']]

# Two y-axes sharing one time axis: sentiment on the left, price on the right.
fig, ax1 = plt.subplots(figsize=(10, 5))
ax2 = ax1.twinx()

# Exponentially weighted mean (alpha=0.05) smooths the daily sentiment series.
df_sent['SENTIMENT'].ewm(alpha=0.05).mean().plot(kind='line', ax=ax1, title='Sentiment of UpstreamOnline headlines', label='Sentiment Index')
df_crude['Adj Close'].plot(kind='line', color='r', ax=ax2, label='Crude Oil price')

ax1.legend(loc='upper left')
ax2.legend(loc='upper right')
ax1.set_ylabel('Sentiment')
ax2.set_ylabel('Crude Oil price')

plt.show()

