# Stock Sentiment Analysis
Natural Language Processing project that uses financial news headlines from finviz (https://finviz.com) to visualize and compare stock prospects for selected tickers over time   

The Sentiment Analysis Model is trained with the 5 different approaches below, and the one with the best accuracy is selected as the final model.
- MLP (Multi Layer Perceptron)
- CNN (Convolutional Neural Network)
- RNN (Recurrent Neural Network)
- LSTM
- GRU


In [None]:
# All imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import SimpleRNN
from keras.layers import LSTM
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
import json


## Training Data Set
Sentiment Analysis data for Financial News (https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news)   
This dataset (FinancialPhraseBank) contains the sentiments for financial news headlines from the perspective of a retail investor.

In [None]:
# Load the training CSV. The file is read without a header row, so the two
# column names are supplied explicitly; latin-1 was chosen while debugging an
# encoding problem.
colnames = ['label', 'headline']
df = pd.read_csv(
    './data/all-data.csv',
    sep=',',
    encoding='latin-1',
    header=None,
    names=colnames,
)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Count how many headlines carry each sentiment label, then inspect the
# container types behind the counts (index = labels, values = counts).
cnt_pro = df['label'].value_counts()
print(cnt_pro)
for part in (cnt_pro.index, cnt_pro.values):
    print(type(part))

In [None]:
# Bar chart of the number of headlines per sentiment label
cnt_pro = df['label'].value_counts()
plt.figure(figsize=(12, 4))
# Keep the Axes returned by seaborn so the bars can be annotated
ax = sns.barplot(x=cnt_pro.index, y=cnt_pro.values, alpha=0.8)

# Write each bar's count (as a whole number) just above its top edge
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid_x = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        str(int(bar_top)),
        xy=(bar_mid_x, bar_top),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='center',
        fontsize=10,
        color='black',
    )

plt.xlabel('sentiment', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xticks(rotation=90)
plt.show()

## Data Processing
- converting label to numeric (positive: 1, neutral: 0, negative: -1)
- cleaning text: tokenizing, removing punctuation, digits, convert to lowercase
- converting dataframe to numpy arrays
- fixing input data to tensors with same dimensions

In [None]:
# Convert the text labels to numbers.
# An unexpected label raises KeyError instead of being silently passed through.
sentiment = {'positive': 1, 'neutral': 0, 'negative': -1}

df['label'] = [sentiment[name] for name in df['label']]
print(df)

In [None]:
# Punctuation characters considered for removal from the headlines.
# The period is left out on purpose because it might be a decimal point.
punctstr = ''.join(ch for ch in string.punctuation if ch != '.')
print(punctstr)

In [None]:
# cleaning text: remove punctuation and digits, convert to lowercase
punctdigstr = string.punctuation
punctdigstr += "0123456789"

# Translation table built once; str.translate then deletes every character
# listed in punctdigstr in a single pass.
_PUNCT_DIGIT_TABLE = str.maketrans('', '', punctdigstr)


def cleanText(text):
    """Lower-case a headline, strip punctuation/digits and split it into words.

    Args:
        text: Raw headline string.

    Returns:
        List of lower-case word tokens. A token that is exactly ``s`` is
        dropped: it is what remains of a detached possessive ``'s`` once the
        apostrophe has been deleted.
    """
    stripped = text.lower().translate(_PUNCT_DIGIT_TABLE)
    # Filtering tokens (instead of replacing the substring ' s ') also removes
    # a stray "s" at the start or end of the headline and back-to-back ones.
    return [token for token in stripped.split() if token != 's']

# Replace every raw headline string with its cleaned list of word tokens
df['headline'] = df['headline'].apply(cleanText)

In [None]:
# Preview the first 20 tokenized headlines
print(df['headline'].head(20))

# Number of tokens in the longest headline
headline_lengths = df['headline'].apply(len)
max_length = headline_lengths.max()
print("Length of the longest list:", max_length)

In [None]:
# The vocabulary size is needed later as the input_dim of the embedding layers.
# Flatten all token lists of the 'headline' column into one list of words.
all_words = []
for tokens in df['headline']:
    all_words.extend(tokens)

# Vocabulary size = number of distinct words across all headlines
vocabulary_size = len(set(all_words))

print("Vocabulary size:", vocabulary_size)

In [None]:
# Initialize the tokenizer
# NOTE: max_words is the vocabulary cap here; a later cell reuses the same name
# for the padded sequence length.
max_words = vocabulary_size  # Set the maximum number of words in your vocabulary
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')  # <OOV> for out-of-vocabulary words
# NOTE(review): the tokenizer presumably reserves index 0 and gives '<OOV>' its
# own index, so capping num_words at vocabulary_size may send the rarest words
# to '<OOV>' - confirm against the Keras Tokenizer documentation.

# Fit the tokenizer on text data (each headline is already a list of tokens)
tokenizer.fit_on_texts(df['headline'])

# Get the word-to-index mapping
word_index = tokenizer.word_index

# Convert each headline into a sequence of word indices
sequences = tokenizer.texts_to_sequences(df['headline'])

In [None]:
# Persist the word -> index mapping as JSON so it can be reloaded later
with open('word_index_library.json', 'w') as f:
    f.write(json.dumps(word_index))

In [None]:
# Inspect the index sequences produced by the tokenizer and their container type
print(sequences)
print(type(sequences))

In [None]:
# Create a new DataFrame with the 'headline' column containing the sequences
# (one list of word indices per row, in the same row order as df)
df_headline_tokenized = pd.DataFrame({'headline': sequences})

print(df_headline_tokenized)

In [None]:
# Split into training and testing data (80% - 20%)
X = df_headline_tokenized  # features: tokenized headlines
y = df['label']            # target: numeric sentiment label

# Fraction of the rows held out for testing
test_size = 0.2

# A fixed random_state keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=test_size,
)


In [None]:
# Inspect the training features and training labels produced by the split
print(X_train)
print(y_train)

In [None]:
# Pull the 'headline' column out of each feature frame as a numpy array
# (each element is still one list of word indices)
X_train = X_train['headline'].to_numpy()
X_test = X_test['headline'].to_numpy()

print(X_train)
print(type(X_train))


In [None]:
# Convert the label Series of both splits into numpy arrays
# (y is a pandas Series, so it is converted with to_numpy())
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()
print(y_train)
print(type(y_train))

In [None]:
# Convert y_train, y_test into one-hot encoded format.
# The labels are -1/0/1, but one-hot columns are addressed by non-negative
# class ids. Taking the label modulo num_classes makes the mapping explicit
# (0 -> column 0, 1 -> column 1, -1 -> column 2) instead of leaving the -1
# case to however to_categorical happens to treat a negative index.
num_classes = 3
y_train = to_categorical(np.mod(y_train, num_classes), num_classes=num_classes)
y_test = to_categorical(np.mod(y_test, num_classes), num_classes=num_classes)

In [None]:
# Inspect the one-hot encoded training labels and their container type
print(y_train)
print(type(y_train))

In [None]:
# Pad / truncate every headline to the same length (50 token ids) so the data
# forms one rectangular array.
# NOTE: from here on max_words is the padded sequence length, not the
# vocabulary cap it held when the tokenizer was created.
max_words = 50
# Token ids are integers, so build an int32 array directly; dtype=object
# yields an array of Python objects that needs another conversion before use.
X_train = sequence.pad_sequences(X_train, maxlen=max_words, dtype='int32')
X_test = sequence.pad_sequences(X_test, maxlen=max_words, dtype='int32')

In [None]:
# Check the shapes of both feature arrays after padding
print("X_train.shape: ",  X_train.shape)
print("X_test.shape: " , X_test.shape)

In [None]:
# Look at the first padded training sequence
print(X_train[0])

In [None]:
# debugging: every value must share one numeric type, so both feature arrays
# are rebuilt as float32 arrays
X_train = np.array(X_train, dtype=np.float32)
X_test = np.array(X_test, dtype=np.float32)

In [None]:
# Same first sequence again, now after the float32 conversion
print(X_train[0])

In [None]:
# Full training feature array
print(X_train)

In [None]:
# Final shapes of the training features and training labels
print("X_train.shape: ", X_train.shape)
print("y_train.shape: ", y_train.shape)

## Text Data Vectorization
Purpose:
- map text to a geometric (vector) space, where the positions of the vectors describe the relationships between pieces of text
- the input and output data of RNN, LSTM, and GRU layers are all vectors

Implementation: Adding Embedding layer with Keras

In [None]:
# this is an example that will be implemented likewise in every model
# Embedding must be the first layer of sequential model
model = Sequential()
# vocabulary_size is the embedding's input dimension and the output dimension
# is set to 32; input_length is the padded headline length (max_words)
model.add(Embedding(vocabulary_size, 32, input_length=max_words))

## Using MLP for Sentiment Analysis
The architecture is based on how MLPs were built in similar sentiment analysis projects.

In [None]:
num_classes = 3  # three classes: positive, neutral, and negative
# MLP: embedding -> flatten -> one hidden dense layer -> softmax over classes
MLP_model = Sequential([
    Embedding(vocabulary_size, 32, input_length=max_words),
    Dropout(0.25),  # 25% of the units are set to 0 during training
    Flatten(),  # flatten to a 1D vector per headline
    Dense(256, activation="relu"),  # relu adds the non-linearity
    Dropout(0.25),
    Dense(units=num_classes, activation='softmax'),  # output layer
])

In [None]:
# Compile and train the MLP.
# 'categorical_crossentropy' is the loss used for multi-class classification;
# validation_split holds back 20% of the training data for validation.
MLP_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
MLP_history = MLP_model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=102,
    validation_split=0.2,
    verbose=2,
)

In [None]:
# evaluating model performance with testing data
# evaluate() yields the loss followed by the compiled 'accuracy' metric
MLP_loss, MLP_accuracy = MLP_model.evaluate(X_test, y_test)
print("testing dataset's accuracy = {:.2f}".format(MLP_accuracy))

## Using RNN for Sentiment Analysis
An RNN keeps a memory over sequential data, so each headline can be treated as a sequence.


In [None]:
num_classes = 3  # three classes: positive, neutral, and negative
RNN_model = Sequential()
RNN_model.add(Embedding(vocabulary_size, 32, input_length=max_words))
RNN_model.add(Dropout(0.25))
# return_sequences is not set here, so (by the Keras default) only the output
# of the last time step is passed on to the layers below
RNN_model.add(SimpleRNN(32))
RNN_model.add(Dropout(0.25))
RNN_model.add(Dense(units=num_classes, activation='softmax'))  # Output layer

In [None]:
# Print the layer-by-layer summary of the RNN model
RNN_model.summary()

In [None]:
# Compile and train the RNN.
# 'categorical_crossentropy' is the loss used for multi-class classification;
# validation_split holds back 20% of the training data for validation.
RNN_model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
RNN_history = RNN_model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=102,
    validation_split=0.2,
    verbose=2,
)

In [None]:
# evaluating model performance with testing data
# evaluate() yields the loss followed by the compiled 'accuracy' metric
RNN_loss, RNN_accuracy = RNN_model.evaluate(X_test, y_test)
print("testing dataset's accuracy = {:.2f}".format(RNN_accuracy))

## Using LSTM for Sentiment Analysis
LSTM is an improvement over the plain RNN that mitigates its vanishing gradient problem: it keeps a long-term memory via a cell state, so it can relate words that are separated by many time steps.