# Imports and Configuration

In [1]:
import os

import numpy as np
import pandas as pd
import pyLDAvis
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

  from pandas.core import (


In [2]:
# Directory holding the input CSV. Set the FINANCE_DATA_DIR environment variable
# to point the notebook at another location without editing it; the fallback is
# the original hard-coded directory, so existing setups keep working.
# os.path.join(..., "") guarantees a trailing separator, which matters because
# the load cell builds the file path by plain string concatenation.
INPUT_PATH = os.path.join(
    os.environ.get("FINANCE_DATA_DIR", "C:/Users/mushj/Downloads/PROCESSED FINANCE DATA/"),
    "",
)
OUTPUT_PATH = INPUT_PATH

# Column names used by later cells: SOURCE_COL is printed when inspecting a
# sample document, CLEANED_COL is the text fed to the vectorizer.
SOURCE_COL = 'Lsa_summary'
CLEANED_COL = 'Lsa_summary_cleaned'

# Text representation: 'TF-IDF' selects TfidfVectorizer, 'BoW' selects CountVectorizer.
REP_METHOD = 'TF-IDF'
# REP_METHOD = 'BoW'

# Topic model: 'LDA' (LatentDirichletAllocation) or 'NMF'.
# MODEL = 'LDA'
MODEL = 'NMF'

In [3]:
# Read FNSPID_NVDA_cleaned.csv from INPUT_PATH (INPUT_PATH must end with a
# path separator because the file name is appended directly).
df = pd.read_csv(f"{INPUT_PATH}FNSPID_NVDA_cleaned.csv")
# Last expression of the cell: display (rows, columns) as a quick sanity check.
df.shape

(8716, 3)

# Numerical representation of text

In [4]:
# Document-frequency cut-offs passed to the vectorizer as max_df / min_df.
# Given as floats, scikit-learn interprets them as proportions of documents:
# terms in more than 75% of documents or in fewer than 5% are dropped.
MAX_DF = 0.75
MIN_DF = 0.05

In [5]:
# Choose one representation for the next step.
# Explicit lookup instead of if/else so that a typo in REP_METHOD raises an
# error rather than silently falling back to bag-of-words.
_vectorizer_classes = {'TF-IDF': TfidfVectorizer, 'BoW': CountVectorizer}
if REP_METHOD not in _vectorizer_classes:
    raise ValueError(
        f"Unknown REP_METHOD {REP_METHOD!r}; expected one of {sorted(_vectorizer_classes)}"
    )

# Both vectorizers share the same settings: document-frequency pruning via
# MAX_DF / MIN_DF and removal of scikit-learn's built-in English stop words.
vectorizer = _vectorizer_classes[REP_METHOD](max_df=MAX_DF, min_df=MIN_DF, stop_words='english')
# Learn the vocabulary from the cleaned text column and build the
# document-term matrix in one step.
text_matrix = vectorizer.fit_transform(df[CLEANED_COL])

In [6]:
# Display the matrix repr (documents x vocabulary terms, number of stored entries).
text_matrix

<8716x129 sparse matrix of type '<class 'numpy.float64'>'
	with 129924 stored elements in Compressed Sparse Row format>

# Topic modeling (LDA or NMF)

## - Fit model

In [7]:
N_TOPICS = 20  # Number of topics (passed as n_components to the LDA / NMF model)

In [12]:
%%time
if MODEL == 'LDA':
    # Latent Dirichlet Allocation (LDA)
    model = LatentDirichletAllocation(n_components=N_TOPICS, random_state=42)
    topics = model.fit_transform(text_matrix)
elif MODEL == 'NMF':
    # Non-Negative Matrix Factorization (NMF)
    model = NMF(n_components=N_TOPICS, random_state=42)
    topics = model.fit_transform(text_matrix)

CPU times: total: 1.91 s
Wall time: 1.01 s


## - Analyze topics

In [13]:
# Get top words per topic
def display_topics(model, feature_names, no_top_words):
    """Build one summary line per topic listing its highest-weighted words.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row holds one topic's per-term weights and
        must support ``.argsort()``.
    feature_names : sequence of str
        Term names, indexable by the positions returned from ``argsort``.
    no_top_words : int
        How many of the highest-weighted terms to list for each topic.

    Returns
    -------
    list of str
        Strings of the form ``"Topic <n>: word1, word2, ..."`` where ``n``
        starts at 1 and words are ordered from highest to lowest weight.
    """
    def _summarize(position, weights):
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        words = ', '.join(feature_names[j] for j in ranked)
        return f"Topic {position + 1}: {words}"

    return [_summarize(pos, row) for pos, row in enumerate(model.components_)]

In [14]:
# Term names from the fitted vectorizer, used to translate weight positions into words.
feature_names = vectorizer.get_feature_names_out()
# One summary string per topic, each listing its 5 highest-weighted words.
topics_to_display = display_topics(model, feature_names, no_top_words=5)

In [15]:
# Print one topic summary per line (labels are 1-based: "Topic 1" .. "Topic N").
for topic in topics_to_display:
    print(topic)

Topic 1: company, like, business, make, technology
Topic 2: free, report, analysis, stock, click
Topic 3: rate, federal, reserve, wall, street
Topic 4: stock, nasdaqnvda, nvda, source, market
Topic 5: fool, motley, video, decade, run
Topic 6: amd, device, advanced, micro, intel
Topic 7: etf, research, holding, report, sp
Topic 8: trading, share, month, today, far
Topic 9: nvda, corp, growth, industry, computer
Topic 10: chip, said, reuters, nvdao, china
Topic 11: earnings, zacks, nvda, estimate, share
Topic 12: meta, platform, microsoft, alphabet, apple
Topic 13: graphic, unit, processing, gpus, gaming
Topic 14: fund, average, nvda, nasdaqnvda, report
Topic 15: nasdaq, nvda, video, stock, market
Topic 16: data, center, revenue, nvidias, gaming
Topic 17: ai, artificial, intelligence, chip, cloud
Topic 18: analyst, price, share, buy, higher
Topic 19: semiconductor, industry, giant, demand, global
Topic 20: year, market, past, time, gain


## - Analyze topic distribution

In [16]:
# Sanity check: expect (number of documents, N_TOPICS).
topics.shape

(8716, 20)

In [17]:
# Assign each document its dominant topic: the column with the largest weight
# in that document's row of `topics`. The stored value is the 0-based column
# index, whereas display_topics labels topics starting from 1, so index k
# corresponds to the label "Topic k+1".
# NOTE(review): np.argmax returns the first maximum, so a row whose weights are
# all equal (e.g. all zeros) would be assigned index 0 — verify if that matters.
dominant_topics = np.argmax(topics, axis=1)
df['topic'] = dominant_topics

In [18]:
# Share of documents per dominant topic, ordered by the 0-based topic index.
df['topic'].value_counts(normalize=True).sort_index()

topic
0     0.000115
1     0.151790
2     0.095801
3     0.027077
4     0.039009
5     0.019849
6     0.012735
7     0.022487
8     0.023520
9     0.034305
10    0.031207
11    0.040730
12    0.051514
13    0.018701
14    0.060808
15    0.056333
16    0.102226
17    0.077214
18    0.071592
19    0.062988
Name: proportion, dtype: float64

## - Sample text to topic mappings

In [19]:
# Positional index (0-based, used with iloc) of the document to inspect.
i = 0

row = df.iloc[i]
# row['topic'] is the 0-based dominant-topic index written by the earlier cell,
# so it indexes topics_to_display directly; the printed label reads
# "Topic <index + 1>".
print(topics_to_display[row['topic']], '\n')
# Print the document's text from SOURCE_COL to compare against the topic words.
print(row[SOURCE_COL])

Topic 13: graphic, unit, processing, gpus, gaming 

Stock splits, meanwhile, continue to get attention from investors after nearly every big tech stock split its shares in 2021 and 2022, including Tesla, Apple, Alphabet, Amazon, Nvidia (NASDAQ: NVDA), and Shopify. The launch of OpenAI's ChatGPT in late 2022 set off a new race to harness generative AI technologies, which some tech CEOs think could be as transformative as the internet has been over the past three decades. Its graphics processing units (GPUs) and accelerators have been in high demand from cloud infrastructure companies and others looking to scale up and build capacity for AI applications.
