## Assignment 2, Natural Language Processing, Group 14

## Load the dataset and keep only the articles whose content contains the word "NVIDIA"

In [1]:
import pandas as pd

# Read the full news dataset from disk
df = pd.read_csv('us_equities_news_dataset.csv')

# Keep the rows whose 'content' mentions "NVIDIA" in any letter case;
# rows with missing content are treated as non-matching (na=False)
nvidia_rows = df.loc[
    df['content'].str.contains('NVIDIA', case=False, na=False)
]

# Show the matching rows
print(nvidia_rows)

            id ticker                                              title  \
24      221539    NIO  A Central Bank War Just Started And Its Good F...   
32      221547    NIO         6 Stocks To Watch  Nivida Could Be Falling   
57      221572    NIO  Stocks   Dow Drops Nearly 400 Points as Apple ...   
78      221593   UBER  The Zacks Analyst Blog Highlights  Advanced Mi...   
82      221597   UBER                     The Best Of CES 2020  Revised    
...        ...    ...                                                ...   
221141  442657    AMD    Here s Why Nvidia  NVDA  Stock Is Gaining Today   
221166  442682    AMD      4 Stocks To Watch Today  ATW  CWEI  MXL  SLCA   
221189  442705    AMD  Here s What The Buy Side Expects From AMD Thur...   
221468  442984      T  Zacks com Featured Highlights  AT T  Nu Skin E...   
221471  442987      T  5 Dividend Growth Stocks To Sail Through Uncer...   

       category                                            content  \
24      opinion  

## Remove unnecessary columns

In [4]:
# Drop the columns that are not needed downstream.
# The result is assigned back to `nvidia_rows`, so a second run of this cell
# would see the columns already gone; errors='ignore' makes the drop a no-op
# in that case instead of raising KeyError.
nvidia_rows = nvidia_rows.drop(
    columns=['id', 'ticker', 'url', 'release_date'],
    errors='ignore',
)

# Persist the reduced DataFrame (without the index column) for the next stage
nvidia_rows.to_csv('nvidia_rows_news_dataset.csv', index=False)

Index(['id', 'ticker', 'title', 'category', 'content', 'release_date',
       'provider', 'url', 'article_id'],
      dtype='object')


## Preprocess the data

In [1]:
import nltk;

import click 
import joblib
import regex
import tqdm

# Fetch every NLTK resource the preprocessing code in this notebook relies on:
#   - 'punkt'      : tokenizer models behind word_tokenize
#   - 'punkt_tab'  : tokenizer tables that newer NLTK releases look up instead
#                    of 'punkt' (NOTE(review): confirm against the installed version)
#   - 'stopwords'  : corpus read by stopwords.words('english')
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


ModuleNotFoundError: No module named 'tdqm'

In [28]:
from nltk.tokenize import word_tokenize
import re
from unidecode import unidecode
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# One Porter stemmer instance, created once at module level
stemmer = PorterStemmer()

# NLTK's English stopword list, materialised as a set
stop_words = set(stopwords.words('english'))

def preprocess_texts(texts, n=0):
    """Turn raw documents into lists of stemmed tokens.

    Each document is lowercased, transliterated to ASCII with ``unidecode``,
    stripped of every character that is not a letter or whitespace,
    tokenised, filtered for English stopwords and for tokens of two
    characters or fewer, and finally Porter-stemmed.

    Args:
        texts: Iterable of raw document strings. Entries that are not strings
            (for example the float NaN pandas uses for missing cells) are
            treated as empty documents so that the output stays aligned with
            the input.
        n: Percentage of the vocabulary to remove from BOTH the most frequent
            and the least frequent end, computed once over the whole
            processed corpus. ``0`` (the default) disables this step.

    Returns:
        A list with one list of stemmed tokens per input document, in input
        order.
    """
    processed_texts = []
    for text in texts:
        if not isinstance(text, str):
            # Missing / non-text cell: keep a placeholder to preserve alignment
            processed_texts.append([])
            continue

        # lowercasing, remove accents, keep letters and whitespace only, tokenization
        cleaned = re.sub(r'[^a-zA-Z\s]', '', unidecode(text.lower()))
        tokens = word_tokenize(cleaned)

        # stopword removal (set lookup against the module-level `stop_words`)
        # and removal of short words
        tokens = [token for token in tokens
                  if token not in stop_words and len(token) > 2]

        # Apply stemming to each token
        processed_texts.append([stemmer.stem(token) for token in tokens])

    # remove top-n% and bottom-n% words (optional); done once, after every
    # document has been processed, so frequencies reflect the full corpus
    if n > 0:
        word_freq = Counter(word for doc in processed_texts for word in doc)
        cutoff = int(n / 100 * len(word_freq))
        # cutoff == 0 must be skipped: ranked[-0:] would select the whole vocabulary
        if cutoff > 0:
            ranked = [word for word, _ in word_freq.most_common()]
            dropped = set(ranked[:cutoff]) | set(ranked[-cutoff:])
            processed_texts = [[word for word in doc if word not in dropped]
                               for doc in processed_texts]

    return processed_texts

# Demo run: preprocess the first 100 rows of the filtered NVIDIA dataset
FILE_PATH = 'nvidia_rows_news_dataset.csv'
df = pd.read_csv(FILE_PATH).head(100)
text_data = df['content'].tolist()

# n=0 leaves the optional frequency-based trimming switched off
processed_data = preprocess_texts(text_data, n=0)
print(processed_data)

[['ecb', 'effect', 'move', 'euro', 'huge', 'fall', 'pip', 'huge', 'import', 'piec', 'euro', 'broke', 'essenti', 'support', 'wrote', 'thought', 'euro', 'could', 'even', 'fall', 'pariti', 'fed', 'may', 'need', 'cut', 'rate', 'later', 'year', 'draghi', 'dovish', 'today', 'outlook', 'much', 'german', 'bund', 'fell', 'around', 'bp', 'bp', 'year', 'fell', 'break', 'send', 'year', 'back', 'decemb', 'low', 'time', 'investor', 'come', 'realiz', 'rate', 'remain', 'low', 'fed', 'need', 'stay', 'hold', 'avoid', 'dollar', 'strengthen', 'much', 'dollar', 'rise', 'kill', 'whatev', 'inflationari', 'forc', 'hurt', 'multin', 'compani', 'result', 'may', 'fed', 'need', 'cut', 'rate', 'late', 'keep', 'dollar', 'line', 'versu', 'euro', 'avoid', 'scenario', 'good', 'stock', 'investor', 'move', 'risk', 'curv', 'low', 'interest', 'rate', 'foster', 'multipl', 'expans', 'stock', 'spi', 'nyse', 'spi', 'stock', 'end', 'fall', 'drop', 'roughli', 'basi', 'point', 'certainli', 'end', 'day', 'declin', 'worth', 'note',

## Train LDA model

In [35]:
import gensim
from gensim import corpora
from gensim.models import CoherenceModel

# Build the token dictionary, then encode each document as a bag of words
dictionary = corpora.Dictionary(processed_data)
corpus = list(map(dictionary.doc2bow, processed_data))

# Fit a 5-topic LDA model (10 passes, random_state fixed at 100)
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    random_state=100,
    passes=10,
)

# Show the 5 highest-weighted words of every topic
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Report the c_v coherence score of the fitted model
print(
    CoherenceModel(
        model=lda_model,
        texts=processed_data,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()
)

(0, '0.012*"year" + 0.011*"compani" + 0.009*"billion" + 0.009*"invest" + 0.008*"zack"')
(1, '0.028*"nasdaq" + 0.016*"nyse" + 0.013*"inc" + 0.013*"stock" + 0.012*"trade"')
(2, '0.011*"drive" + 0.010*"vehicl" + 0.010*"japan" + 0.009*"driverless" + 0.008*"technolog"')
(3, '0.016*"compani" + 0.014*"drive" + 0.010*"self" + 0.010*"stock" + 0.009*"vehicl"')
(4, '0.010*"year" + 0.009*"uber" + 0.009*"amazon" + 0.009*"compani" + 0.006*"invest"')
0.3889805030932033


## BERTopic

In [38]:
## BERTopic code

## FLSA-W Model

In [None]:
## FLSA-W Model