<a href="https://colab.research.google.com/github/hmezer/dai-project/blob/main/dai_sentiment_analysis-v20250827.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the project repository; later cells read the sample data from dai-project/sample-data.
!git clone https://github.com/hmezer/dai-project

Cloning into 'dai-project'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 58 (delta 21), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (58/58), 19.66 MiB | 24.08 MiB/s, done.
Resolving deltas: 100% (21/21), done.


### Import JSON data

In [2]:
import gzip
import json
import glob
import os

# Directory holding the gzipped JSON-lines sample files (relative to the
# notebook's working directory, i.e. inside the cloned repository).
data_dir = "dai-project/sample-data"
json_list = []

# glob.glob() yields paths in a filesystem-dependent order. Sort them so that
# positional lookups into json_list (json_list[0], json_list[99], ...) refer
# to the same record on every run and on every machine.
for file_path in sorted(glob.glob(os.path.join(data_dir, "*.jsonl.gz"))):
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at the end of a file),
            # which json.loads() would reject with a JSONDecodeError.
            if line.strip():
                json_list.append(json.loads(line))

# Now json_list contains all the JSON objects from the .jsonl.gz files
print(f"Loaded {len(json_list)} JSON objects from .jsonl.gz files.")

# Preview the first record when there is one; indexing an empty list would
# raise IndexError and bury the more useful "Loaded 0" message above.
if json_list:
    print(json.dumps(json_list[0], indent=2))
else:
    print(f"No .jsonl.gz records found under {data_dir!r}.")

Loaded 5059 JSON objects from .jsonl.gz files.
{
  "ResultId": "urn:contentItem:6B5S-9891-JBG1-804X-00000-00",
  "Date": "2024-01-24T00:00:00Z",
  "Title": "Nacha Announces Board of Directors for 2024, Reflecting Expertise in the Payments Industry",
  "Document": {
    "DocumentId": "/shared/document/news/urn:contentItem:6B5S-9891-JBG1-804X-00000-00",
    "DocumentIdType": "DocFullPath",
    "Content": "<entry xmlns=\"http://www.w3.org/2005/Atom\"><id>urn:contentItem:6B5S-9891-JBG1-804X-00000-00</id><title>Nacha Announces Board of Directors for 2024, Reflecting Expertise in the Payments Industry</title><published>2024-01-24T00:00:00Z</published><updated>2025-06-11T07:25:58Z</updated><author><name>LexisNexis</name></author><content type=\"application/xml\"><!--Transformation version 1.25--><articleDoc xmlns=\"\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:noNamespaceSchemaLocation=\"http://www.lexisnexis.com/xmlschemas/content/public/articledoc/1/\" schemaVersion=\"1.8\"

### Load the spaCy and FinBERT models

In [3]:
from bs4 import BeautifulSoup
import spacy
import string
from spacy.lang.en import English
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# English spaCy pipeline; the workflow functions use it to split article text
# into sentences.
spacy_nlp = spacy.load('en_core_web_sm')

# FinBERT tone checkpoint. The tokenizer and the classification model are
# loaded once here, at notebook level, and shared by every later call.
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Options handed to the Hugging Face pipeline: truncation is enabled with a
# 512-token cap so over-long sentences are cut instead of being passed through
# at full length.
_pipeline_options = dict(
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    max_length=512,
)
sentiment_nlp = pipeline("sentiment-analysis", **_pipeline_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cpu


### Functions for the workflow

In [4]:
import re

# Patterns that mark a *short* sentence as dateline / timestamp boilerplate
# rather than article content. Compiled once instead of on every sentence.
_METADATA_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"\bEastern Time\b",
        r"\b\d{4}\b",                         # Year
        # The alternation is wrapped in a group so BOTH word boundaries apply
        # to every month name. Ungrouped, "\bMay" also matched "Maybe" and
        # "\bMarch" matched "Marching".
        r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b",
        r"\b\d{1,2}:\d{2}\s*(AM|PM)?\b",      # Time
    )
)


def _is_relevant_sentence(sentence, verbose=False):
    """Return True when `sentence` looks like article content worth scoring."""
    num_words = len(sentence.split())
    # Short sentences that mention a date or time are treated as metadata.
    if num_words < 10 and any(p.search(sentence) for p in _METADATA_PATTERNS):
        if verbose:
            print(sentence)
        return False
    # Very short sentences carry too little context to classify.
    return num_words >= 4


def analyze_sentiment_from_text(text, verbose=False):
    """
    Splits a text into sentences, drops boilerplate, and scores the rest
    with the FinBERT sentiment pipeline.

    Sentences are produced by the module-level `spacy_nlp` pipeline. A
    sentence is discarded when it has fewer than 4 whitespace-separated
    words, or when it has fewer than 10 words and contains a date/time
    marker (a 4-digit number, a month name, an HH:MM time, "Eastern Time").

    Args:
        text (str): The input text string.
        verbose (bool): When True, print every sentence discarded as
            date/time metadata. Defaults to False.

    Returns:
        tuple: A tuple containing:
            - list: The sentences (str) that were kept, in document order.
            - list: The `sentiment_nlp` result for each kept sentence, in
              the same order. Both lists are empty when no sentence
              survives filtering.
    """
    doc = spacy_nlp(text)
    sentences = [
        sent.text
        for sent in doc.sents
        if _is_relevant_sentence(sent.text, verbose=verbose)
    ]

    # Nothing left to classify: skip the model call entirely.
    if not sentences:
        return [], []

    sentences_sentiment = sentiment_nlp(sentences)

    return sentences, sentences_sentiment



def get_sentiment_scores(sentiment_scores):
    """
    Collapses per-sentence sentiment results into two signed averages.

    Each 'Positive' entry contributes +score and each 'Negative' entry
    contributes -score; entries with any other label contribute nothing.

    Args:
        sentiment_scores (list): Dicts with a 'label' key and, for
            polarized entries, a 'score' key.

    Returns:
        tuple: A tuple containing:
            - The signed sum divided by the number of all entries
              (0 when the list is empty).
            - The signed sum divided by the number of Positive/Negative
              entries only (0 when there are none).
    """
    sign_by_label = {'Positive': 1, 'Negative': -1}
    signed_total = 0
    polarized_count = 0

    for prediction in sentiment_scores:
        sign = sign_by_label.get(prediction['label'])
        if sign is None:
            # Not a polarized label (e.g. Neutral): no contribution.
            continue
        signed_total += sign * prediction['score']
        polarized_count += 1

    sentence_count = len(sentiment_scores)

    if sentence_count:
        score_including_neutrals = signed_total / sentence_count
    else:
        score_including_neutrals = 0

    if polarized_count:
        score_polarized_only = signed_total / polarized_count
    else:
        score_polarized_only = 0

    return score_including_neutrals, score_polarized_only

---

In [5]:
# Position in json_list of the sample article to analyse. Which record sits at
# this position depends on the order in which the data files were loaded.
ARTICLE_INDEX = 99


# PART OF preprocessing the text format
def pattern_clean(text):
    """
    Normalizes bullets and footnote markers so sentences split cleanly.

    Args:
        text (str): Raw article body text.

    Returns:
        str: The text with bullets turned into periods and footnote numbers
        that directly follow a period removed.
    """
    import re  # kept local so this cell does not rely on another cell importing it

    # Replace "• " (bullet with space after) with "."
    text = re.sub(r'•\s+', '.', text)
    # Replace any remaining bullet (sticky case) with ". "
    text = re.sub(r'•', '. ', text)
    # Remove footnote markers glued to a period (".1 Next", ".12Next"),
    # leaving ". ". The lookbehind requires a non-digit before the period so
    # decimals such as "4.25%" are preserved; without it they became "4. %".
    # Trade-off: a footnote marker right after a number ("2024.1") is kept.
    text = re.sub(r'(?<!\d)\.\d+\s*', '. ', text)
    return text


# Pull the article body out of the record's XML payload.
html_str = json_list[ARTICLE_INDEX]["Document"]["Content"]
text = ""
if html_str:
    soup = BeautifulSoup(html_str, 'html.parser')
    body_tag = soup.find('nitf:body.content')
    if body_tag:
        text = body_tag.get_text()

if text:
    text = pattern_clean(text)

    sentences, sentences_sentiment = analyze_sentiment_from_text(text)

    print(get_sentiment_scores(sentences_sentiment))
    print("\nSentiment Analysis per Sentence:")
    # The two lists are parallel: one sentiment result per kept sentence.
    for i, (sentence, sentiment) in enumerate(zip(sentences, sentences_sentiment), start=1):
        print(f"Sentence {i}: {sentence}")
        print(f"Sentiment: {sentiment['label']}, Score: {sentiment['score']:.4f}")
        print("-" * 20)
else:
    # Make the "nothing to analyse" case visible instead of printing nothing.
    print(f"No article body text found for record {ARTICLE_INDEX}.")

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

(0.2083421003818512, 0.7440789299351829)

Sentiment Analysis per Sentence:
Sentence 1: Corebridge Financial today announced the expansion of its annuity lineup for registered investment advisors (RIAs) with the launch of American Pathway AdvisorySM-a multi-year guaranteed annuity (MYGA) that offers growth and principal protection, along with flexibility and convenience.
Sentiment: Positive, Score: 0.9992
--------------------
Sentence 2: American Pathway Advisory is a fixed annuity that brings enhanced flexibility to the MYGA space through innovative renewal options.
Sentiment: Positive, Score: 1.0000
--------------------
Sentence 3: Like many MYGAs, American Pathway Advisory allows consumers to lock in growth for three, five or seven years, but where the Corebridge MYGA stands out is the ability of consumers to maintain or modify their term at renewal without having to fill out a new application or purchase a new contract.
Sentiment: Positive, Score: 0.9971
--------------------
Sentenc