In [None]:
%%capture
%pip install datasets
%pip install yfinance

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset
import numpy as np
import yfinance as yf
from tqdm import tqdm
from google.colab import drive
import os
import datetime
import torch

# Register tqdm's pandas integration so `progress_apply` shows a progress bar
tqdm.pandas()

# Mount Google Drive so files can be saved to and loaded from it
drive.mount('/content/drive')

# Select the compute device: CUDA when torch reports a GPU, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
  print("GPU is available and being used.")
else:
  print("GPU is not available, using CPU instead.")

Mounted at /content/drive
GPU is available and being used.


In [None]:
# Directory on the mounted Google Drive that the save/load helpers in this notebook read from and write to
file_location = '/content/drive/MyDrive/Colab Notebooks/Dissertation/ic_fspml/data'

<h1> Load in datasets </h1>

In [None]:
# Load the ic_fspml stock news dataset as a single dataframe
def load_ic_fspml_dataset():
    """Read the train, validation and test parquet splits and concatenate them.

    Returns:
        One DataFrame holding the rows of all three splits, in the order
        train, validation, test. Each split keeps its own index values.
    """
    dataset_root = "hf://datasets/ic-fspml/stock_news_sentiment/"

    # Parquet file of each split, relative to the dataset root
    split_files = {'train': 'data/train-00000-of-00001-161ce92db66dabb3.parquet',
                   'validation': 'data/validation-00000-of-00001-7588e28a0ed7e31b.parquet',
                   'test': 'data/test-00000-of-00001-d9fc71e4a9b02e72.parquet'}

    # Read the splits in a fixed order and stack them on top of each other
    split_frames = [
        pd.read_parquet(dataset_root + split_files[split_name])
        for split_name in ('train', 'validation', 'test')
    ]
    return pd.concat(split_frames)

# Function to print high level information about a loaded dataset
def get_dataset_stats(df, dataset_name, date_col):
    """Print the number of unique tickers, the date range and the shape of `df`.

    Args:
        df: DataFrame with a 'ticker' column and the column named by `date_col`.
        dataset_name: Name used for the dataset in the printed summary.
        date_col: Name of the column whose min/max are reported as the date span.
    """
    tickers = get_tickers(df)
    # A space now separates the dataset name from "contains" (previously they ran together)
    print(f'Dataset {dataset_name} contains {len(tickers)} unique tickers.')
    print(f'Dataset spans from {df[date_col].min()} to {df[date_col].max()}')
    print(f'Dataset shape: {df.shape}')

In [None]:
# Function to get the list of unique tickers in the dataset
def get_tickers(df):
    """Return the distinct values of df['ticker'] as a plain Python list."""
    unique_values = df['ticker'].unique()
    return unique_values.tolist()


# Function to get sector and type information for a list of tickers using yfinance
def get_stock_info(tickers):
    """Look up the sector and quote type of each ticker via yfinance.

    Args:
        tickers: Iterable of ticker symbols.

    Returns:
        DataFrame with the columns 'ticker', 'sector' and 'type', one row per
        input ticker. A field yfinance does not provide is stored as None.
        The three columns are present even when `tickers` is empty.
    """
    records = []
    for ticker in tickers:
        # Read `.info` once per ticker and reuse it for both fields;
        # fall back to an empty dict if nothing usable comes back.
        info = yf.Ticker(ticker).info or {}
        records.append({'ticker': ticker,
                        'sector': info.get('sector'),
                        'type': info.get('quoteType')})

    # Name the columns explicitly so an empty result still has the expected
    # schema (otherwise later lookups such as stock_info['type'] raise KeyError).
    return pd.DataFrame(records, columns=['ticker', 'sector', 'type'])


# Function to get stock prices for a list of tickers within a specific date range
def get_stock_prices(tickers, start_date, end_date):
    """Download daily prices from yfinance and return them in long format.

    Args:
        tickers: Ticker symbols passed to `yf.download`.
        start_date: First day to include; anything `pd.to_datetime` accepts.
        end_date: Last day to include; anything `pd.to_datetime` accepts.

    Returns:
        DataFrame built by stacking the downloaded 'Open', 'Close',
        'Adj Close' and 'Volume' columns and resetting the index, plus a
        'DayChange' column. Later cells expect the reset index to provide
        'Date' and 'Ticker' columns (multi-ticker download) - TODO confirm
        for single-ticker calls.
    """
    # Strip any time-of-day component so the query covers whole days
    start_date = pd.to_datetime(start_date).date()
    # Ask for one extra day so rows dated `end_date` are included
    # (yfinance documents `end` as exclusive).
    end_date = pd.to_datetime(pd.to_datetime(end_date).date()) + datetime.timedelta(days=1)

    # Query the available daily stock prices from yfinance - note, this is slow, and produces warning output for failed downloads.
    # auto_adjust=False is passed explicitly: when yfinance auto-adjusts prices it does not
    # return an 'Adj Close' column, and the column selection below would raise KeyError.
    df = yf.download(tickers, start=start_date, end=end_date, auto_adjust=False)

    # Select specific columns ('Open', 'Close', 'Adj Close', 'Volume') and stack them
    df = pd.DataFrame(df[['Open', 'Close', 'Adj Close', 'Volume']].stack()).reset_index()

    # NOTE(review): this is open minus *adjusted* close, so a rising day is negative and the
    # two prices are on different adjustment bases - confirm this is the intended definition.
    df['DayChange'] = df['Open'] - df['Adj Close']

    return df


# Function to merge stock prices with additional stock information (sector and type)
def merge_stock_prices_with_info(stock_info, stock_prices):
    """Attach sector/type information to every price row.

    Rows of `stock_info` containing any missing value are discarded first
    (likely delisted stocks). The join is a left join from `stock_prices`
    ('Ticker') to the remaining info rows ('ticker'), so every price row is
    retained whether or not info is available for it.
    """
    complete_info = stock_info.dropna()
    return stock_prices.merge(
        complete_info,
        how='left',
        left_on='Ticker',
        right_on='ticker',
    )


# Function to save the stock prices DF to a CSV file
def save_stock_prices(df, file_source):
    """Write `df`, without its index, to '<file_location>/<file_source>_prices.csv'."""
    output_path = f'{file_location}/{file_source}_prices.csv'
    df.to_csv(output_path, index=False)

<h1> Sentiment Analysis </h1>

In [None]:
# Function to load in the finbert sentiment analysis tokenizer and model
def load_in_bert_model():
    """Load the pretrained FinBERT tokenizer and sequence classification model.

    Reference: https://huggingface.co/transformers/v3.0.2/model_doc/bert.html

    Returns:
        Tuple of (tokenizer, bert_model).
    """
    checkpoint = "ProsusAI/finbert"
    return (
        BertTokenizer.from_pretrained(checkpoint),
        BertForSequenceClassification.from_pretrained(checkpoint),
    )

# Load the tokenizer and model once at notebook level; sentiment_analysis_bert reads both as globals
tokenizer, bert_model = load_in_bert_model()

In [None]:
# Function to use finbert to extract the sentiment in a string
def sentiment_analysis_bert(text):
    """Classify the sentiment of `text` with the globally loaded FinBERT model.

    Args:
        text: The string to classify.

    Returns:
        The label of the highest-probability class, looked up in the model's
        own `config.id2label` mapping.
    """
    # Move the FinBERT model to the device selected earlier - ideally a GPU if available
    bert_model.to(device)

    # Tokenize the input text, and move the tensors to the same device as the model
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Perform inference - no need for gradients as we're not training the FinBERT model. Saves memory.
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Convert the logits to probabilities and bring them back to the CPU as a numpy array
    probs = outputs.logits.softmax(dim=1).detach().cpu().numpy()[0]

    # Map the winning class index to its label through the model config instead of a
    # hand-written list: the index order is defined by the checkpoint, and a hard-coded
    # ['negative', 'neutral', 'positive'] silently mislabels predictions if it differs.
    # NOTE(review): the published ProsusAI/finbert config is believed to order the
    # classes positive, negative, neutral - verify against the loaded config.
    predicted_index = int(np.argmax(probs))
    return bert_model.config.id2label[predicted_index]


# Function to apply sentiment analysis to a DataFrame in batches and save the results
# Useful to ensure output is saved, as limited computer resources were available
def apply_sentiment_analysis_to_df(df, col_to_analyse, file_source, batch_size=50000):
    """Run sentiment analysis over `df` batch by batch, saving each batch to CSV.

    Each batch is written to '<file_location>/<file_source>_<i>_senti.csv',
    where <i> is the zero-based batch number, so completed work is kept if a
    later batch fails. `df` itself is not modified.

    Args:
        df: DataFrame containing the text column.
        col_to_analyse: Name of the column passed to `sentiment_analysis_bert`.
        file_source: Prefix used in the output file names.
        batch_size: Maximum number of rows per batch (default 50,000).

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    total_rows = len(df)

    # Walk over the frame in steps of `batch_size`; `i` is the batch number
    for i, start_idx in enumerate(range(0, total_rows, batch_size)):
        end_idx = min(start_idx + batch_size, total_rows)

        # Work on a copy of the slice so adding the column neither raises
        # SettingWithCopyWarning nor depends on view/copy semantics of `iloc`.
        batch = df.iloc[start_idx:end_idx].copy()

        # Apply sentiment analysis to the batch
        batch['sentiment'] = batch[col_to_analyse].progress_apply(sentiment_analysis_bert)

        # Save the batch to a CSV file
        batch.to_csv(f'{file_location}/{file_source}_{i}_senti.csv', index=False)

        print(f'Completed batch {i} which spans from {start_idx} to {end_idx}')


# Function to combine the per-batch sentiment CSV files of one source into a single DataFrame and save the result
def combine_csv_files(file_source):
    """Concatenate the '<file_source>_<n>_senti.csv' batch files and save the result.

    Only files in `file_location` named '<file_source>_<n>_senti.csv' with a
    numeric batch number <n> are combined, in ascending batch order. The
    combined frame is written to '<file_location>/<file_source>_sentiment.csv'.

    Args:
        file_source: Prefix identifying which batch files to combine.

    Returns:
        The combined DataFrame with a fresh 0..n-1 index (empty if no batch
        file was found).
    """
    folder_path = f'{file_location}/'
    prefix = f'{file_source}_'
    suffix = '_senti.csv'

    # Collect (batch_number, filename) pairs for this source only, so batch files
    # left behind by another source are not mixed in.
    batch_files = []
    for filename in os.listdir(folder_path):
        if filename.startswith(prefix) and filename.endswith(suffix):
            batch_id = filename[len(prefix):-len(suffix)]
            if batch_id.isdigit():
                batch_files.append((int(batch_id), filename))

    # os.listdir returns names in arbitrary order; sort numerically by batch number
    frames = [pd.read_csv(os.path.join(folder_path, filename))
              for _, filename in sorted(batch_files)]

    # Concatenate once rather than growing a frame inside the loop
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save the combined file to a new CSV
    combined_df.to_csv(f'{file_location}/{file_source}_sentiment.csv', index=False)

    return combined_df

<h1> Load & merge the collected and analysed data </h1>

In [None]:
# Function to load the saved csv with tickers, prices and stock info.
def load_prices_df(file_source):
    """Read '<file_location>/<file_source>_prices.csv' into a DataFrame."""
    prices_path = f'{file_location}/{file_source}_prices.csv'
    return pd.read_csv(prices_path)


# Function to load the sentiment DataFrame from a CSV file
def load_sentiment_df(file_source):
    """Read '<file_location>/<file_source>_sentiment.csv' into a DataFrame."""
    sentiment_path = f'{file_location}/{file_source}_sentiment.csv'
    return pd.read_csv(sentiment_path)


# Function to merge the sentiment and stock prices DataFrames
def merge_sentiment_and_prices(sentiment_df, prices_df):
    """Outer-join price rows and sentiment rows on calendar date and ticker.

    Neither input frame is modified.

    Args:
        sentiment_df: Frame with 'article_date', 'ticker', 'type', 'sector'
            and 'label' columns (the last three are dropped before merging).
        prices_df: Frame with 'Date', 'Ticker' and 'ticker' columns (the
            lower-case 'ticker' is dropped before merging).

    Returns:
        DataFrame keyed by 'Date' and 'Ticker' holding every price row and
        every sentiment row; 'Ticker' is populated for rows that exist on
        only one side.
    """
    # Build the merge keys on copies so the caller's frames stay untouched.
    # The sentiment key is renamed to 'Ticker' so both sides share the key names:
    # an outer merge on differently named keys leaves the left key empty for
    # rows found only on the right, which lost the ticker of articles that had
    # no price row for their date.
    sentiment = (
        sentiment_df
        .drop(columns=['type', 'sector', 'label'])
        .assign(Date=pd.to_datetime(sentiment_df['article_date']).dt.date)
        .rename(columns={'ticker': 'Ticker'})
    )
    prices = (
        prices_df
        .drop(columns=['ticker'])
        .assign(Date=pd.to_datetime(prices_df['Date']).dt.date)
    )

    return prices.merge(sentiment, on=['Date', 'Ticker'], how='outer')


# Function to save the combined sentiment and price DataFrame to a CSV file
def save_senti_prices(df, file_source):
    """Write `df`, without its index, to '<file_location>/<file_source>_senti_price.csv'."""
    output_path = f'{file_location}/{file_source}_senti_price.csv'
    df.to_csv(output_path, index=False)

<h1> Run block - Run all to generate dataset from scratch</h1>

In [None]:
# Step 1: load the news dataset, list its distinct tickers and look up each ticker's sector/type
ic_fspml_df = load_ic_fspml_dataset() # Load in dataset
tickers = get_tickers(ic_fspml_df) # Get list of unique tickers
stock_info = get_stock_info(tickers) # Get stock info for all tickers - note, expect a lot of warnings

In [None]:
# Step 2: cut the tickers down to correctly populated equities only
# (quote type 'EQUITY' and a non-missing sector)
tickers = stock_info[(stock_info['type']=='EQUITY') & (stock_info['sector'].notna())]['ticker'].unique().tolist()

# Keep only the articles of the retained tickers, then download prices
# covering the span from the earliest to the latest article date
ic_fspml_df = ic_fspml_df[ic_fspml_df['ticker'].isin(tickers)]
stock_prices = get_stock_prices(tickers, ic_fspml_df['article_date'].min(), ic_fspml_df['article_date'].max())

In [None]:
# Step 3: merge the prices with the info df, and save the result. Saving takes a few mins.
stock_price_and_info = merge_stock_prices_with_info(stock_info, stock_prices)
save_stock_prices(stock_price_and_info, 'ic_fspml')

# Step 4: score every headline batch by batch (each batch is written to its own CSV),
# then read the batch files back and combine them
apply_sentiment_analysis_to_df(ic_fspml_df, 'article_headline', 'ic_fspml') # Apply sentiment analysis to the headlines, save as CSV
sentiment_df = combine_csv_files('ic_fspml') # Load & combine the CSV files into a single DataFrame

In [None]:
# Step 5: merge the price, stock info and sentiment, order the rows by date and ticker, and save.
full_price_and_sentiment_df = merge_sentiment_and_prices(sentiment_df, stock_price_and_info)
full_price_and_sentiment_df = full_price_and_sentiment_df.sort_values(['Date', 'Ticker'])
save_senti_prices(full_price_and_sentiment_df, 'ic_fspml')

In [1]:
# full_price_and_sentiment_df[~full_price_and_sentiment_df['sentiment'].isna()].head()