In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import json
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

In [69]:
ROOT_DIR = "/home/ibrahim/stock/data/processed"
YEARS = ["2020", "2021", "2022", "2023", "2024", "2025"]

data = {
    "2020": {
        "high_impact": [],
        "medium_impact": [],
        "low_impact": [],
        "news": []
    },
    "2021": {
        "high_impact": [],
        "medium_impact": [],
        "low_impact": [],
        "news": []
    },
    "2022": {
        "high_impact": [],
        "medium_impact": [],
        "low_impact": [],
        "news": []
    },
    "2023": {
        "high_impact": [],
        "medium_impact": [],
        "low_impact": [],
        "news": []
    },
    "2024": {
        "high_impact": [],
        "medium_impact": [],
        "low_impact": [],
        "news": []
    },
    "2025": {
        "high_impact": [],
        "medium_impact": [],
        "low_impact": [],
        "news": []
    }
}

for year in YEARS:
    PATHS = os.listdir(os.path.join(ROOT_DIR, year))
    
    for path in PATHS:

        with open(os.path.join(ROOT_DIR, year, path), "r") as file:

            if "high" in path:
                data[year]["high_impact"] = json.load(file)
            elif "medium" in path:
                data[year]["medium_impact"] = json.load(file)
            elif "low" in path:
                data[year]["low_impact"] = json.load(file)
            else:
                data[year]["news"] = json.load(file)


In [None]:
PLOTS_DIR = "/home/ibrahim/stock/plots"

corpus = {"2020": {}, "2021": {}, "2022": {}, "2023": {}, "2024": {}, "2025": {}}

for news in data:
    for impact in data[news]:
        if data[news][impact] is None:
            print(f"No data for {news} {impact}")
        if len(data[news][impact]) > 0:
            text = " ".join(data[news][impact])
            text = text.replace("nvidia", " ")

            tokens = word_tokenize(text)

            corpus[news][impact] = text
            
            wordcloud = WordCloud(width=800, height=400, background_color="white", max_words=150).generate(text)

            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation="bilinear")
            plt.axis("off")
            plt.title(f"{news} {impact} Word Cloud")
            plt.savefig(os.path.join(PLOTS_DIR, f"{news}_{impact}.png"))
            plt.close()

In [93]:
import torch

from transformers import Pipeline

class FinBERTPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if "text" in kwargs:
            preprocess_kwargs["text"] = kwargs["text"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, sentence, maybe_arg=2):
        return self.tokenizer(sentence, return_tensors="pt")

    def _forward(self, inputs):
        return self.model(**inputs, output_hidden_states=True)

    def postprocess(self, outputs):
        sentence_embedding = torch.mean(outputs.hidden_states[-1][0], dim=0).numpy()
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        prediction_max_index = int(torch.argmax(predictions))
        label = self.model.config.id2label[prediction_max_index]
        return {'label': label, 'score': predictions[0][prediction_max_index].item(), 'embedding': sentence_embedding}
    


In [94]:
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import AutoModelForSequenceClassification

PIPELINE_REGISTRY.register_pipeline(
   'finbert-pipeline-with-sentence-embedding',
   pipeline_class=FinBERTPipeline,
   pt_model=AutoModelForSequenceClassification,
)

pipe = pipeline('finbert-pipeline-with-sentence-embedding', model='ProsusAI/finbert', device=0)
outputs = pipe('EXAMPLE SENTENCE')

print(outputs['label'], outputs['score'], outputs['embedding'].shape)

Device set to use cuda:0


neutral 0.8927797675132751 (768,)


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

finbert = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")
tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert", device_map="auto")

inputs = tokenizer("EXAMPLE SENTENCE", return_tensors="pt", truncation=True, padding=True)
outputs = finbert(**inputs, output_hidden_states=True)
# logits = outputs.logits
# probabilities = logits.softmax(dim=-1).tolist()[0]

outputs.hidden_states[-1].mean(dim=1)

tensor([[ 4.9657e-01,  1.3090e-01, -6.3200e-02, -3.8852e-02, -4.4911e-01,
         -9.5320e-01,  7.2255e-01,  4.1358e-01, -2.3714e-01, -2.1281e-01,
          1.9503e-01, -6.3107e-01,  3.2011e-01, -2.4609e-03, -3.1287e-01,
         -1.9257e-01,  4.9090e-02,  3.7381e-01,  4.2787e-02, -1.5683e-01,
          2.6213e-01,  3.8908e-02, -1.6212e-01,  1.6726e-01,  2.4482e-01,
          5.4331e-01, -1.0891e-01,  1.7704e-01,  8.6575e-02, -3.2981e-01,
         -1.2469e-01,  6.6089e-02, -2.2721e-01, -1.9966e-01,  4.2461e-01,
         -5.3609e-01,  4.9235e-01, -3.5622e-01, -4.5563e-01,  1.6188e-01,
         -1.2379e+00, -3.6276e-01, -1.3031e-01,  1.3809e-01, -2.7757e-01,
         -2.2755e-01,  5.5868e-01,  5.0482e-01, -2.9229e-01,  8.2040e-01,
         -3.0340e-02,  6.7432e-02, -4.1842e-01,  1.2779e-01,  3.7652e-01,
          3.4655e-01, -3.4712e-01, -9.6518e-01, -5.5033e-01, -5.6727e-02,
          3.1613e-01,  3.0858e-01, -6.0979e-01,  2.1034e-01,  5.7790e-01,
          1.7279e-01,  4.2284e-03, -1.

In [85]:
tfidf = {"2020": {}, "2021": {}, "2022": {}, "2023": {}, "2024": {}, "2025": {}}

for year in data:
    for impact in data[year]:

        if data[year][impact] is None:
            print(f"No data for {year} {impact}")

        if len(data[year][impact]) > 0:
            text = data[year][impact]
            
            vectorizer = TfidfVectorizer()

            try:
                X = vectorizer.fit_transform(text)
                tfidf[year][f"{impact}_tfidf"] = X
            except ValueError as e:
                print(f"Error processing {year} {impact}: {e}")
                continue
            
            feature_names = vectorizer.get_feature_names_out()
            tfidf_scores = X.toarray()[0]

            sorted_items = sorted(zip(tfidf_scores, feature_names), reverse=True)
            top_n = sorted_items[:10]

            print(f"Top 10 words for {year} {impact}:")
            for score, word in top_n:
                print(f"{word}: {score}")

for year in tfidf:
    for impact in tfidf[year]:
        data[year][impact] = tfidf[year][impact]

del tfidf

Top 10 words for 2020 high_impact:
nvidia: 1.0
שבת: 0.0
פרסום: 0.0
פוסט: 0.0
מעריב: 0.0
לוח: 0.0
כניסת: 0.0
חדשות: 0.0
חגים: 0.0
זמני: 0.0
Top 10 words for 2020 medium_impact:
send: 1.0
香港繁中: 0.0
台灣繁中: 0.0
zz74si: 0.0
zoom: 0.0
zeus: 0.0
zeroday: 0.0
zelda: 0.0
zealandenglish: 0.0
zealand: 0.0


MemoryError: Unable to allocate 389. GiB for an array with shape (1190387, 43840) and data type float64

In [79]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

pca = PCA(n_components=2)
reduced_matrix = pca.fit_transform(corpus["2020"]["high_impact_tfidf"].toarray())

plt.figure(figsize=(10, 8))
plt.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1])
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("TF-IDF Vectors Visualized with PCA")
plt.show()

ValueError: n_components=2 must be between 0 and min(n_samples, n_features)=1 with svd_solver='full'

In [83]:
corpus["2020"]["news_tfidf"].toarray().shape

(1, 43504)