<a href="https://colab.research.google.com/github/hmezer/dai-project/blob/main/preprocessing/dai_data_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the project repository; the data-loading cell below reads from its sample-data directory.
!git clone https://github.com/hmezer/dai-project

Cloning into 'dai-project'...
remote: Enumerating objects: 116, done.[K
remote: Counting objects: 100% (116/116), done.[K
remote: Compressing objects: 100% (112/112), done.[K
remote: Total 116 (delta 52), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (116/116), 21.59 MiB | 15.97 MiB/s, done.
Resolving deltas: 100% (52/52), done.


In [2]:
import gzip
import json
import glob
import os

from bs4 import BeautifulSoup
import spacy
import string
from spacy.lang.en import English
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

In [3]:
# Path to the sample-data directory
data_dir = "dai-project/sample-data"
json_list = []

# Find all .jsonl.gz files in the directory.
# glob.glob gives no ordering guarantee (it depends on the filesystem), and
# records are later picked out of json_list by position, so sort the paths to
# make the record order reproducible across machines and runs.
for file_path in sorted(glob.glob(os.path.join(data_dir, "*.jsonl.gz"))):
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Blank lines are not valid JSON documents; skip them instead of crashing.
            if line.strip():
                json_list.append(json.loads(line))

# Now json_list contains all the JSON objects from the .jsonl.gz files
print(f"Loaded {len(json_list)} JSON objects from .jsonl.gz files.")

# Print the first JSON object (only if anything was loaded, to avoid an IndexError)
if json_list:
    print(json.dumps(json_list[0], indent=2))
else:
    print(f"No .jsonl.gz records found under {data_dir!r}.")

Loaded 5059 JSON objects from .jsonl.gz files.
{
  "ResultId": "urn:contentItem:6B2J-M2C1-DXY3-003N-00000-00",
  "Date": "2024-01-09T00:00:00Z",
  "Title": "OneStream Software Closes 2023 with Strong Momentum, Positioned for Further Expansion to Support Finance Leaders in 2024 CPM platform provider reports over $450M Annual Recurring Revenue (ARR), up 37% year-over-year",
  "Document": {
    "DocumentId": "/shared/document/news/urn:contentItem:6B2J-M2C1-DXY3-003N-00000-00",
    "DocumentIdType": "DocFullPath",
    "Content": "<entry xmlns=\"http://www.w3.org/2005/Atom\"><id>urn:contentItem:6B2J-M2C1-DXY3-003N-00000-00</id><title>OneStream Software Closes 2023 with Strong Momentum, Positioned for Further Expansion to Support Finance Leaders in 2024</title><published>2024-01-09T00:00:00Z</published><updated>2025-06-11T07:16:02Z</updated><author><name>LexisNexis</name></author><content type=\"application/xml\"><!--Transformation version 1.25--><articleDoc xmlns=\"\" xmlns:xsi=\"http://www

In [4]:
def get_sentiment_scores(sentiment_scores):
    """Aggregate per-sentence sentiment results into two document-level scores.

    Each element of ``sentiment_scores`` is a mapping with a ``'label'`` key and,
    for polarized labels, a ``'score'`` key. ``'Positive'`` entries add their
    score to a running total, ``'Negative'`` entries subtract it, and any other
    label contributes nothing.

    Returns:
        A tuple ``(score_including_neutrals, score_polarized_only)``:
        the signed total divided by the number of all sentences, and the signed
        total divided by the number of Positive/Negative sentences. Either
        value is ``0`` when its denominator is zero.
    """
    signed_total = 0
    polarized_count = 0

    for entry in sentiment_scores:
        label = entry['label']
        if label == 'Positive':
            signed_total += entry['score']
        elif label == 'Negative':
            signed_total -= entry['score']
        else:
            # Non-polarized (e.g. neutral) sentences do not count as polarized
            continue
        polarized_count += 1

    total_count = len(sentiment_scores)

    if total_count:
        score_including_neutrals = signed_total / total_count
    else:
        score_including_neutrals = 0

    if polarized_count:
        score_polarized_only = signed_total / polarized_count
    else:
        score_polarized_only = 0

    return score_including_neutrals, score_polarized_only

In [5]:
# spaCy English pipeline; extract_data uses it to split article text into sentences.
spacy_nlp = spacy.load('en_core_web_sm')

# Load the FinBERT tone model and its tokenizer once, at module level, so that
# every call to extract_data reuses the same sentiment pipeline.
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# truncation/max_length are passed so over-long inputs are cut to 512 tokens
# NOTE(review): confirm these kwargs are forwarded to the tokenizer by the installed transformers version.
sentiment_nlp = pipeline("sentiment-analysis",
                         model=model,
                         tokenizer=tokenizer,
                         truncation=True,
                         max_length=512)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cpu


In [6]:
from datetime import datetime
from bs4 import BeautifulSoup
import re

# Function to extract data from a single press release
# Patterns that mark a short sentence as metadata (datelines, timestamps, ...).
# The month alternation is wrapped in a group so that both word boundaries apply
# to every month name (otherwise e.g. "Mayor" or "Marching" would match).
_METADATA_PATTERNS = [
    re.compile(r"\bEastern Time\b"),
    re.compile(r"\b\d{4}\b"),                         # Year
    re.compile(r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b"),
    re.compile(r"\b\d{1,2}:\d{2}\s*(AM|PM)?\b"),      # Time
]

# Tickers written as "(EXCHANGE: SYMBOL)" or "[EXCHANGE: SYMBOL]"
_TICKER_PATTERN = re.compile(
    r'[\(\[]([A-Za-z ]+): ?([A-Za-z0-9\.\-]+) *[\)\]]', re.IGNORECASE
)

# Contribution of one sentence to a ticker's immediate polarity, by sentiment label
_LABEL_DELTA = {"Positive": 1, "Negative": -1}


def _pattern_clean(text):
    """Rewrite bullets and ".<digits>" runs as sentence-ending periods."""
    # Replace bullet points with periods
    text = re.sub(r'•\s+', '. ', text)
    text = re.sub(r'•', '. ', text)
    # Remove patterns like ".#" or ".# " where # is a number
    text = re.sub(r'\.\d+\s+', '. ', text)
    text = re.sub(r'\.\d+', '. ', text)
    return text


def _is_relevant_sentence(sentence):
    """Return False for very short sentences and short metadata-like sentences."""
    word_count = len(sentence.split())
    # Very short fragments carry no usable sentiment
    if word_count < 4:
        return False
    # Short sentences containing a date/time marker are treated as metadata
    if word_count < 10 and any(p.search(sentence) for p in _METADATA_PATTERNS):
        return False
    return True


# Function to extract data from a single press release
def extract_data(press_release):
    """Extract per-ticker sentiment rows from a single press release record.

    Args:
        press_release: Mapping with a ``"Date"`` ISO-8601 string (a trailing
            ``Z`` is accepted) and a ``"Document"`` mapping whose ``"Content"``
            holds the article markup. The article text is taken from the
            ``nitf:body.content`` element.

    Returns:
        A list with one dict per ticker found in the text, with keys
        ``"ticker"``, ``"date"`` (``YYYY-MM-DD``), ``"polarity_diluted"`` and
        ``"polarity_pure"`` (document-level scores from
        ``get_sentiment_scores``) and ``"polarity_immediate"`` (number of
        Positive minus number of Negative sentences that mention the ticker).
        Returns an empty list when the record has no content, no body element,
        no relevant sentences, or no tickers.

    Raises:
        KeyError: if ``"Date"``, ``"Document"`` or ``"Content"`` is missing.
        ValueError: if the date string cannot be parsed.
    """
    # Extract date and format it as 'YYYY-MM-DD'
    # ('Z' is rewritten because older fromisoformat versions reject it)
    date = datetime.fromisoformat(press_release["Date"].replace('Z', '+00:00'))
    date = date.strftime('%Y-%m-%d')

    # Get the markup of the document and pull the text out of its body element
    html_str = press_release["Document"]["Content"]
    text = ""
    if html_str:
        soup = BeautifulSoup(html_str, 'html.parser')
        body_tag = soup.find('nitf:body.content')
        if body_tag:
            text = body_tag.get_text()

    # Without text there can be no tickers; return early instead of falling
    # through to code that needs the sentiment results.
    if not text:
        return []

    text = _pattern_clean(text)

    # Use spaCy to split the text into sentences and keep only the relevant ones
    doc = spacy_nlp(text)
    sentences = [sent.text for sent in doc.sents if _is_relevant_sentence(sent.text)]
    if not sentences:
        # Nothing to score; avoid calling the pipeline with an empty batch
        return []

    # Get sentiment scores for the relevant sentences using the loaded pipeline
    sentences_sentiment = sentiment_nlp(sentences)

    # Accumulate, per ticker, +1 / -1 / 0 for each sentence that mentions it
    tickers_in_text = {}
    for sentence, sentiment in zip(sentences, sentences_sentiment):
        matches = _TICKER_PATTERN.findall(sentence)
        if not matches:
            continue
        delta = _LABEL_DELTA.get(sentiment["label"], 0)
        for _exchange, ticker in matches:
            tickers_in_text[ticker] = tickers_in_text.get(ticker, 0) + delta

    # Calculate overall sentiment scores (diluted and pure) using the helper function
    polarity_diluted, polarity_pure = get_sentiment_scores(sentences_sentiment)

    # Return a list of dictionaries, each containing ticker, date, and sentiment scores
    return [
        {
            "ticker": ticker,
            "date": date,
            "polarity_diluted": polarity_diluted,
            "polarity_pure": polarity_pure,
            "polarity_immediate": immediate,  # Sentiment score based on sentences containing the ticker
        }
        for ticker, immediate in tickers_in_text.items()
    ]

In [7]:
# Run the extractor on a single sample record (position 108 in json_list) as a quick check.
press_release = json_list[108]
extract_data(press_release)

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

[{'ticker': 'PAR',
  'date': '2024-01-10',
  'polarity_diluted': 0.521927165574041,
  'polarity_pure': 0.8903463412733639,
  'polarity_immediate': 1}]