## Assignment 2, Natural Language Processing, Group 14

## Load the dataset and keep only the articles whose content mentions the word NVIDIA

In [1]:
import pandas as pd

# Read the complete news dataset from the CSV file
df = pd.read_csv('us_equities_news_dataset.csv')

# Boolean mask: True where 'content' contains 'NVIDIA' in any letter case;
# rows whose content is missing are treated as non-matching (na=False)
mentions_nvidia = df['content'].str.contains('NVIDIA', case=False, na=False)
nvidia_rows = df.loc[mentions_nvidia]

# Show the matching rows
print(nvidia_rows)

            id ticker                                              title  \
24      221539    NIO  A Central Bank War Just Started And Its Good F...   
32      221547    NIO         6 Stocks To Watch  Nivida Could Be Falling   
57      221572    NIO  Stocks   Dow Drops Nearly 400 Points as Apple ...   
78      221593   UBER  The Zacks Analyst Blog Highlights  Advanced Mi...   
82      221597   UBER                     The Best Of CES 2020  Revised    
...        ...    ...                                                ...   
221141  442657    AMD    Here s Why Nvidia  NVDA  Stock Is Gaining Today   
221166  442682    AMD      4 Stocks To Watch Today  ATW  CWEI  MXL  SLCA   
221189  442705    AMD  Here s What The Buy Side Expects From AMD Thur...   
221468  442984      T  Zacks com Featured Highlights  AT T  Nu Skin E...   
221471  442987      T  5 Dividend Growth Stocks To Sail Through Uncer...   

       category                                            content  \
24      opinion  

## Remove unnecessary columns

In [4]:
# Columns to discard from the filtered frame
columns_to_drop = ['id', 'ticker', 'url', 'release_date']
nvidia_rows = nvidia_rows.drop(columns=columns_to_drop)

# Write the trimmed frame to a new CSV file, leaving out the index column
nvidia_rows.to_csv('nvidia_rows_news_dataset.csv', index=False)

Index(['id', 'ticker', 'title', 'category', 'content', 'release_date',
       'provider', 'url', 'article_id'],
      dtype='object')


## Preprocess the data

In [8]:
from nltk.tokenize import word_tokenize
import re
from unidecode import unidecode
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Build the stop-word set and the stemmer once; preprocess_text reuses both
# for every article instead of rebuilding them per call or per token.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(texts, n=0):
    """Turn one raw article into a flat list of stemmed tokens.

    Steps, in order: lowercase, transliterate accented characters with
    ``unidecode``, delete every character that is not an ASCII letter or
    whitespace, tokenize, drop English stop words, drop tokens of length
    two or less, and Porter-stem what remains.

    Parameters
    ----------
    texts : str
        The text of a single article (the name is kept for backward
        compatibility). Any non-string value, such as a NaN coming from a
        CSV, yields an empty list.
    n : int or float, optional
        If greater than 0, additionally remove the ``n`` percent most
        frequent and the ``n`` percent least frequent distinct stems.
        Frequencies are counted within this single text only. Defaults to
        0, which disables this step.

    Returns
    -------
    list of str
        The stemmed tokens in their original order.
    """
    # Missing content has nothing to tokenize.
    if not isinstance(texts, str):
        return []

    # lowercasing, remove accents, keep letters and whitespace only, tokenization.
    # Non-letters are deleted rather than replaced by a space, so words joined
    # only by punctuation or digits are merged into one token.
    cleaned = re.sub(r'[^a-zA-Z\s]', '', unidecode(texts.lower()))
    tokens = word_tokenize(cleaned)

    # stopword removal (set membership against the prebuilt stop_words)
    # and removal of short words
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Apply stemming to each token
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    # remove top-n% and bottom-n% words (optional)
    if n > 0:
        word_freq = Counter(stemmed_tokens)
        cutoff = int(n / 100 * len(word_freq))
        ranked = [word for word, _ in word_freq.most_common()]
        top_n = set(ranked[:cutoff])
        # Reversed slice picks the `cutoff` least frequent stems (empty when cutoff is 0).
        bottom_n = set(ranked[:-cutoff - 1:-1])
        stemmed_tokens = [
            word for word in stemmed_tokens
            if word not in top_n and word not in bottom_n
        ]

    return stemmed_tokens

# Run the preprocessing over the filtered articles saved earlier
FILE_PATH = 'nvidia_rows_news_dataset.csv'
df = pd.read_csv(FILE_PATH)
# df = df.head(100)  # uncomment to work on the first 100 articles only
text_data = df['content'].tolist()

# Preprocess every article and store the token lists in a new column
preprocessed = df['content'].apply(preprocess_text)
df['preprocessed_content'] = preprocessed

print("Preprocessing complete. Sample of preprocessed content:")
sample = df[['content', 'preprocessed_content']].head()
print(sample)  # raw text next to its token list

# Write the frame, including the new column, to a new CSV file without the index
df.to_csv('nvidia_rows_news_dataset_preprocessed.csv', index=False)

print(f"Length of text_data {len(text_data)}")

Preprocessing complete. Sample of preprocessed content:
                                             content  \
0  ECB Effects\nThe move in the euro was huge  fa...   
1  6 Stocks To Watch  March 6 Trading Session\nSt...   
2  Investing com   A rout in Apple and Facebook  ...   
3  For Immediate ReleaseChicago  IL   January 13 ...   
4  With 4 500 companies bringing their innovation...   

                                preprocessed_content  
0  [ecb, effect, move, euro, huge, fall, pip, hug...  
1  [stock, watch, march, trade, session, stock, s...  
2  [invest, com, rout, appl, facebook, nasdaq, mo...  
3  [immedi, releasechicago, januari, zack, com, a...  
4  [compani, bring, innov, ce, jan, get, realli, ...  
Length of text_data 100


## Train LDA model

In [10]:
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import re

# Token lists produced by the preprocessing step
read_processed_data = df['preprocessed_content']

# Build the token dictionary, then encode every article as a bag of words
dictionary = corpora.Dictionary(read_processed_data)
corpus = list(map(dictionary.doc2bow, read_processed_data))

# Fit the LDA model (fixed random_state so repeated runs use the same seed)
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=15,
    random_state=100,
    passes=10,
)

# Print the five highest-weighted words of each topic
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Compute and print the c_v coherence score of the fitted model
coherence_model = CoherenceModel(
    model=lda_model,
    texts=read_processed_data,
    dictionary=dictionary,
    coherence='c_v',
)
print(coherence_model.get_coherence())

0    [ecb, effect, move, euro, huge, fall, pip, hug...
1    [stock, watch, march, trade, session, stock, s...
Name: preprocessed_content, dtype: object
(0, '0.014*"billion" + 0.013*"invest" + 0.010*"firm" + 0.009*"also" + 0.007*"new"')
(1, '0.021*"nasdaq" + 0.013*"nyse" + 0.012*"stock" + 0.010*"trade" + 0.010*"inc"')
(2, '0.000*"compani" + 0.000*"stock" + 0.000*"market" + 0.000*"fund" + 0.000*"nasdaq"')
(3, '0.001*"compani" + 0.001*"market" + 0.001*"year" + 0.000*"nasdaq" + 0.000*"drive"')
(4, '0.001*"nasdaq" + 0.001*"compani" + 0.001*"inc" + 0.001*"stock" + 0.001*"billion"')
(5, '0.022*"nasdaq" + 0.015*"inc" + 0.012*"stock" + 0.010*"nyse" + 0.009*"day"')
(6, '0.018*"year" + 0.017*"stock" + 0.015*"zack" + 0.014*"compani" + 0.011*"rate"')
(7, '0.014*"drive" + 0.010*"vehicl" + 0.009*"compani" + 0.009*"self" + 0.009*"market"')
(8, '0.017*"zack" + 0.016*"compani" + 0.014*"year" + 0.014*"nasdaq" + 0.012*"earn"')
(9, '0.033*"fund" + 0.029*"softbank" + 0.020*"vision" + 0.013*"invest" + 0.010*

## BERTopic

In [None]:
import pandas as pd
from bertopic import BERTopic
# from sentence_transformers import SentenceTransformer
import nltk
# Fetch the NLTK resources by name, in the same order as before.
# NOTE(review): the preprocessing cell earlier in the notebook already calls
# stopwords.words and word_tokenize, so on a fresh environment these downloads
# presumably need to run before that cell — confirm.
for resource in ("stopwords", 'punkt_tab'):
    nltk.download(resource)

In [None]:
## Bert Topic code
# Join each article's token list back into one space-separated string
documents = list(map(" ".join, df['preprocessed_content']))

# Create a BERTopic model with its default settings
topic_model = BERTopic()

# Fit on the documents; keep the two values returned by fit_transform
topics, probabilities = topic_model.fit_transform(documents)

# Print the topic summary table
topic_info = topic_model.get_topic_info()
print(topic_info)


In [None]:
# Inspect one topic of the fitted BERTopic model.
# num_topic is the id of the topic to look up; change it to inspect another topic.
# The call is the last expression of the cell, so its result is shown as the cell output.
num_topic = 0
topic_model.get_topic(num_topic)

In [None]:
# Heatmap visualisation of the fitted BERTopic model, requested with
# width=1000 and height=1000; the returned figure is displayed as the cell output.
topic_model.visualize_heatmap(width=1000, height=1000)

## FLSA-W Model

In [None]:
## FLSA-W Model