## Assignment 2, Natural Language Processing, Group 14

## Load the dataset and keep only the articles whose content mentions the word NVIDIA

In [1]:
import pandas as pd

# Read the full US equities news dataset
df = pd.read_csv('us_equities_news_dataset.csv')

# Boolean mask: True where 'content' mentions 'NVIDIA' (case-insensitive);
# rows with missing content count as non-matching (na=False)
mentions_nvidia = df['content'].str.contains('NVIDIA', case=False, na=False)
nvidia_rows = df[mentions_nvidia]

# Show the matching rows
print(nvidia_rows)

            id ticker                                              title  \
24      221539    NIO  A Central Bank War Just Started And Its Good F...   
32      221547    NIO         6 Stocks To Watch  Nivida Could Be Falling   
57      221572    NIO  Stocks   Dow Drops Nearly 400 Points as Apple ...   
78      221593   UBER  The Zacks Analyst Blog Highlights  Advanced Mi...   
82      221597   UBER                     The Best Of CES 2020  Revised    
...        ...    ...                                                ...   
221141  442657    AMD    Here s Why Nvidia  NVDA  Stock Is Gaining Today   
221166  442682    AMD      4 Stocks To Watch Today  ATW  CWEI  MXL  SLCA   
221189  442705    AMD  Here s What The Buy Side Expects From AMD Thur...   
221468  442984      T  Zacks com Featured Highlights  AT T  Nu Skin E...   
221471  442987      T  5 Dividend Growth Stocks To Sail Through Uncer...   

       category                                            content  \
24      opinion  

## Remove unnecessary columns

In [4]:
# Remove the id, ticker, url and release_date columns.
# errors='ignore' keeps this cell safe to re-run: nvidia_rows is reassigned from itself,
# so on a second execution the columns are already gone and a plain drop would raise KeyError.
nvidia_rows = nvidia_rows.drop(columns=['id', 'ticker', 'url', 'release_date'], errors='ignore')

# Save the modified DataFrame to a new CSV file (without the index column)
nvidia_rows.to_csv('nvidia_rows_news_dataset.csv', index=False)

Index(['id', 'ticker', 'title', 'category', 'content', 'release_date',
       'provider', 'url', 'article_id'],
      dtype='object')


## Preprocess the data

In [89]:
from nltk.tokenize import word_tokenize
import re
from unidecode import unidecode
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Build the stopword set and the stemmer once; set membership tests are O(1) per token.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_texts(texts, n=0):
    """Clean, tokenize and stem one or more raw texts.

    Each text is lowercased, transliterated to ASCII, stripped of every
    character that is not a letter or whitespace, tokenized, filtered
    (English stopwords and tokens of length <= 2 are dropped) and stemmed
    with the Porter stemmer.

    Args:
        texts: A single string, or an iterable of strings. A single string
            is treated as one document.
        n: Optional percentage. When greater than 0, the n% most frequent
            and n% least frequent distinct words are removed. Frequencies
            are counted only over the texts passed in this call, so this
            filter is only meaningful when several documents are passed
            together.

    Returns:
        A list of token lists, one per input text. For a single string the
        result is therefore a one-element list: [[token, ...]].
    """
    # Accept a bare string (one document) as well as a collection of documents.
    documents = [texts] if isinstance(texts, str) else list(texts)

    processed_texts = []
    for document in documents:
        # lowercasing, remove accents, keep letters/whitespace only, tokenization
        letters_only = re.sub(r'[^a-zA-Z\s]', '', unidecode(document.lower()))
        tokens = word_tokenize(letters_only)

        # stopword removal (against the precomputed set) and short-word removal
        tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

        # Apply stemming to each token
        processed_texts.append([stemmer.stem(token) for token in tokens])

    # remove top-n% and bottom-n% words (optional)
    if n > 0:
        word_freq = Counter(word for sentence in processed_texts for word in sentence)
        cutoff = int(n / 100 * len(word_freq))
        top_n = {word for word, _ in word_freq.most_common(cutoff)}
        # Reversed slice takes the `cutoff` least frequent words (empty when cutoff is 0).
        bottom_n = {word for word, _ in word_freq.most_common()[:-cutoff - 1:-1]}
        excluded = top_n | bottom_n
        processed_texts = [[word for word in sentence if word not in excluded] for sentence in processed_texts]
    return processed_texts

# Load the filtered NVIDIA articles saved by the column-removal cell
FILE_PATH = 'nvidia_rows_news_dataset.csv'
df = pd.read_csv(FILE_PATH)
# For a quick trial run, restrict to a subset here, e.g. df = df.head(100)
text_data = df['content'].tolist()
print(f"Length of text_data {len(text_data)}")

processed_articles = []
print(f"Length of processed_articles before processing {len(processed_articles)}")

# preprocess_texts is called with one article string at a time and returns a list
# that wraps that article's token list, so every entry here is a list of token lists.
for single_article in text_data:
    processed_article = preprocess_texts(single_article, 0)
    processed_articles.append(processed_article)

print(f"Length of processed_articles {len(processed_articles)}")
print("Finished with processing the data")
print("Writing to processed_data.txt")

# Write one article per line, followed by an empty separator line
with open("processed_data.txt", "w", encoding="utf-8") as file:
    for article in processed_articles:
        # Flatten the nested token lists so plain space-separated words are written,
        # not the Python repr of a list (brackets, quotes and commas).
        tokens = [token for token_list in article for token in token_list]
        file.write(" ".join(tokens) + "\n")

        # Write an empty line to separate articles
        file.write("\n")

Length of text_data 100


AttributeError: 'list' object has no attribute 'lower'

## Train LDA model

In [114]:
import gensim
from gensim import corpora
from gensim.models import CoherenceModel

import re

read_processed_data = []  # One token list per document
current_document = []     # Tokens collected for the document currently being read

with open("processed_data.txt", "r") as file:
    for raw_line in file:
        stripped = raw_line.strip()

        # A blank line is a document separator: flush whatever has been collected
        if not stripped:
            if current_document:
                read_processed_data.append(current_document)
                current_document = []
            continue

        # Drop list brackets, then strip quotes and commas from every word
        unbracketed = stripped.replace('[', '').replace(']', '')
        for word in unbracketed.split():
            current_document.append(re.sub(r"[',]", '', word))

    # Flush the final document when the file does not end with a separator
    if current_document:
        read_processed_data.append(current_document)

# # Check the structure of the read_processed_data
# for i, doc in enumerate(read_processed_data[:2]):  # Print the first two documents for inspection
#     print(f"Document {i+1}: {doc}")

# Map every token to an id, then convert each document to its bag-of-words form
dictionary = corpora.Dictionary(read_processed_data)
corpus = [dictionary.doc2bow(document_tokens) for document_tokens in read_processed_data]

# Fit a 15-topic LDA model (fixed random_state for reproducibility)
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=15,
    random_state=100,
    passes=10,
)

# Show the five highest-weighted words of each topic
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Report the c_v coherence score of the fitted model
coherence_model = CoherenceModel(
    model=lda_model,
    texts=read_processed_data,
    dictionary=dictionary,
    coherence='c_v',
)
print(coherence_model.get_coherence())

(0, '0.019*"compani" + 0.013*"technolog" + 0.013*"nasdaq" + 0.011*"zack" + 0.011*"market"')
(1, '0.026*"drive" + 0.022*"car" + 0.018*"nvidia" + 0.017*"vehicl" + 0.017*"self"')
(2, '0.033*"quarter" + 0.031*"year" + 0.022*"revenu" + 0.021*"zack" + 0.021*"compani"')
(3, '0.012*"market" + 0.011*"trade" + 0.009*"week" + 0.008*"year" + 0.007*"china"')
(4, '0.025*"stock" + 0.022*"earn" + 0.020*"year" + 0.020*"zack" + 0.016*"compani"')
(5, '0.030*"zack" + 0.020*"invest" + 0.017*"analyst" + 0.015*"stock" + 0.015*"research"')
(6, '0.035*"nvidia" + 0.029*"amd" + 0.027*"game" + 0.021*"gpu" + 0.014*"graphic"')
(7, '0.039*"stock" + 0.035*"trade" + 0.032*"invest" + 0.023*"nvidia" + 0.021*"nasdaq"')
(8, '0.055*"inc" + 0.039*"nasdaq" + 0.026*"nyse" + 0.016*"biotech" + 0.015*"share"')
(9, '0.011*"simo" + 0.010*"motion" + 0.010*"ssd" + 0.007*"silicon" + 0.006*"control"')
(10, '0.011*"appl" + 0.009*"googl" + 0.008*"compani" + 0.006*"new" + 0.006*"also"')
(11, '0.019*"stock" + 0.013*"etf" + 0.012*"nasdaq" 

## BERTopic

In [38]:
## Bert Topic code

## FLSA-W Model

In [None]:
## FLSA-W Model