# Sentiment Analysis of Stock Market News Data
* The goal is to classify text into categories such as positive or negative.

## Models for Sentiment Analysis
1. **Baseline Model: Logistic Regression**: A baseline model that uses traditional machine learning techniques with text features. 
2. **Intermediate Model: Recurrent Neural Networks (RNNs) with LSTM**:  Captures sequential dependencies and context in text.
3. **Advanced Model: Transformers (BERT etc.)**: Provides state-of-the-art performance by leveraging deep contextual understanding.

# Get the Utils File from GitHub
https://github.com/kamran945/NLP-Text-Classification/raw/main/nlp_sentiment_utils.py

In [None]:
# Install the `contractions` package; the downloaded utils file
# (nlp_sentiment_utils.py) needs it.
!pip install contractions # required in utils file

In [None]:
import os
import urllib.request

file_name = "nlp_sentiment_utils.py"
github_url = "https://github.com/kamran945/NLP-Text-Classification/raw/main/nlp_sentiment_utils.py"

# Fetch the helper module once; later runs reuse the local copy.
if not os.path.exists(file_name):
    print(f"{file_name} not found. Downloading from GitHub...")
    # Download under a temporary name and rename only on success, so a failed
    # or interrupted transfer never leaves an empty/truncated module behind
    # that the existence check above would mistake for a valid download.
    partial_name = file_name + ".part"
    try:
        urllib.request.urlretrieve(github_url, partial_name)
        os.replace(partial_name, file_name)
    finally:
        # Clean up the leftover temporary file if the download failed.
        if os.path.exists(partial_name):
            os.remove(partial_name)
else:
    print(f"{file_name} already exists. No need to download.")

import nlp_sentiment_utils

# Load and Explore the data

In [None]:
import pandas as pd

# Read the stock-market sentiment CSV from the Kaggle input directory and
# preview its first rows.
data_path = "/kaggle/input/stockmarket-sentiment-dataset/stock_data.csv"
df = pd.read_csv(data_path)
df.head()

In [None]:
# check distribution of classes
import matplotlib.pyplot as plt

# Bar chart of the relative frequency of each sentiment label.
ax = df["Sentiment"].value_counts(normalize=True).plot(kind='bar')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Frequency')
ax.set_title('Sentiment Frequency Distribution');

In [None]:
# Map the -1 label to 0. The result is assigned back to the column rather than
# calling replace(..., inplace=True) on the df["Sentiment"] selection: the
# in-place form on a column selection is chained assignment, which pandas
# warns about and which does not update `df` under copy-on-write.
df["Sentiment"] = df["Sentiment"].replace(-1, 0)
df.rename(columns={'Text': 'text'}, inplace=True) # rename 'Text' column to 'text'
df.tail()

## Clean Text

In [None]:
from nltk.corpus import stopwords
import re

stop_words = stopwords.words('english')
# Negation words flip the sentiment of a sentence, so they must NOT be treated
# as stop words.
important_stop_words = ['not', 'no', 'nor', "isn't", "aren't", "wasn't", "weren't", "haven't", "hasn't", "hadn't", "doesn't", "don't", "didn't", "won't", "wouldn't", "shan't", "shouldn't", "can't", "couldn't", "mustn't", "mightn't", "needn't"]

# Exact lookup instead of a suffix pattern: a pattern for words ending in
# "n"/"t" also matches ordinary stop words such as "it", "at", "in" and "on",
# which would then wrongly be kept out of the stop-word list.
negation_words = set(important_stop_words)
# Also cover the apostrophe-less stems of the contractions ("isn", "didn",
# ...). "can't" is skipped because its stem "can" is an ordinary, non-negating
# word.
negation_words.update(
    word[:-2]
    for word in important_stop_words
    if word.endswith("n't") and word != "can't"
)

stop_words = [word for word in stop_words if word not in negation_words]
print(stop_words)

In [None]:
# Run the utils module's clean_text over the "text" column and rebind df to
# the result. The cleaning steps themselves live in nlp_sentiment_utils and
# are not visible here.
df = nlp_sentiment_utils.clean_text(df, column="text")
# Trailing bare name: displays the resulting frame as the cell output.
df

## Vocabulary

In [None]:
# Build the vocabulary from the "text" column of df. The structure of the
# returned object is defined in nlp_sentiment_utils (not visible here).
vocabulary = nlp_sentiment_utils.get_vocabulary(df, column='text')

In [None]:
# Shrink the vocabulary with the utils helper, using quantile=0.95.
# NOTE(review): presumably filters tokens by a frequency quantile — confirm
# the exact rule in nlp_sentiment_utils.reduce_vocabulary.
reduced_vocab = nlp_sentiment_utils.reduce_vocabulary(vocabulary, quantile=0.95)

### View Vocabulary

In [None]:
# Word cloud of the full vocabulary (both sentiment classes together).
nlp_sentiment_utils.plot_wordcloud(vocabulary, title='Both Classes: Word Cloud')

In [None]:
# get vocabulary for different classes

# Pass column='text' explicitly so these calls read the same column as the
# full-vocabulary call instead of relying on the helper's default.
vocab_class_1 = nlp_sentiment_utils.get_vocabulary(df[df["Sentiment"] == 1], column='text')
vocab_class_0 = nlp_sentiment_utils.get_vocabulary(df[df["Sentiment"] == 0], column='text')

# One word cloud per sentiment class.
nlp_sentiment_utils.plot_wordcloud(vocab_class_1, title='Class 1: Word Cloud')
nlp_sentiment_utils.plot_wordcloud(vocab_class_0, title='Class 0: Word Cloud')


# Create Train, Validation and Test Splits

In [None]:
from sklearn.model_selection import train_test_split

X = df['text']  # Feature
y = df['Sentiment']  # Target

# First carve off 20% of the data as a hold-out set, then halve that hold-out
# into validation and test. Both splits are stratified on the label and use a
# fixed seed.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=42,
)

# Sizes of every split, features and labels side by side.
tuple(len(part) for part in (X_train, y_train, X_val, y_val, X_test, y_test))

In [None]:
import numpy as np

# Share of label 1 in the train, validation and test targets, in that order.
tuple(np.sum(labels == 1) / len(labels) for labels in (y_train, y_val, y_test))

# Baseline: Logistic Regression Model

## Count Vectorizer
* CountVectorizer implements the Bag of Words (BoW) model
* **BoW Model**: Represents text data as a collection of words and their frequencies, ignoring grammar and word order. **Counts the occurrences of each word** in the documents and converts these counts into a numerical feature matrix.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Fit the bag-of-words vocabulary on the training split only, then reuse the
# fitted vectorizer to transform the validation and test splits.
vectorizer_count = CountVectorizer()
X_train_count = vectorizer_count.fit_transform(X_train)
X_val_count, X_test_count = (
    vectorizer_count.transform(split) for split in (X_val, X_test)
)

# Logistic regression on the count features, with balanced class weights.
model = LogisticRegression(class_weight='balanced')
model.fit(X_train_count, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from collections import defaultdict

# Collects one metrics entry per model variant, keyed by a short model name.
eval_metrics = defaultdict(dict)

# Score the count-feature model on the validation split.
y_pred = model.predict(X_val_count)
eval_metrics['logistic_reg_count'] = nlp_sentiment_utils.get_eval_metrics(y_val, y_pred)
print(classification_report(y_val, y_pred))

## TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary and weights on the training split only, then reuse
# the fitted vectorizer to transform the validation and test splits.
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    vectorizer_tfidf.transform(split) for split in (X_val, X_test)
)

# Rebinds `model` to a fresh classifier trained on the TF-IDF features.
model = LogisticRegression(class_weight='balanced')
model.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from collections import defaultdict

# Score the TF-IDF model on the validation split and record its metrics next
# to the earlier entries in eval_metrics.
y_pred = model.predict(X_val_tfidf)
eval_metrics['logistic_reg_tfidf'] = nlp_sentiment_utils.get_eval_metrics(y_val, y_pred)
print(classification_report(y_val, y_pred))

In [None]:
# Display the validation metrics collected so far, keyed by model name.
eval_metrics

# RNN Model (LSTM Based) using PyTorch

## Prepare the data for Deep Learning Model

## LSTM Model

In [None]:
import torch
import torch.nn as nn

class LSTMClassifierWithEmbedding(nn.Module):
    """Embedding -> stacked unidirectional LSTM -> dropout -> linear classifier.

    Args:
        vocab_size: Number of entries in the embedding table; token indices
            fed to ``forward`` must be smaller than this value.
        embedding_dim: Size of each token embedding vector.
        hidden_size: Size of the LSTM hidden state.
        num_classes: Number of output classes (size of the returned logits).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied before the final linear layer
            and, when ``num_layers > 1``, between the stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim,
                 hidden_size, num_classes, num_layers=1,
                 dropout=0.5):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # nn.LSTM applies its dropout only *between* stacked layers; with a
        # single layer it has no effect and PyTorch emits a UserWarning, so
        # only forward the value when there is more than one layer.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_size,
                            num_layers,  # number of stacked layers in lstm
                            batch_first=True,  # first dimension represents the batch size
                            dropout=lstm_dropout,
                            bidirectional=False)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Classify a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding the raw outputs
            of the linear layer (no softmax is applied here).
        """
        # (batch, seq_len) -> (batch, seq_len, embedding_dim)
        embedded = self.embedding(x)

        # Per-time-step outputs of the top LSTM layer:
        # (batch, seq_len, hidden_size). The final states are not needed.
        lstm_out, _ = self.lstm(embedded)

        # Use the output at the last time step as the sequence summary.
        # NOTE(review): if batches are right-padded, the last step may be a
        # padding position — confirm how sequences are padded by the caller.
        last_step = lstm_out[:, -1, :]

        # Regularize, then project to class scores.
        last_step = self.dropout(last_step)
        out = self.fc(last_step)

        return out


In [None]:
# work in progress



# import torch
# import torch.nn as nn
# import torch.optim as optim

# vocab_size = len(reduced_vocab)  # TODO confirm: nn.Embedding needs an integer size (plus any padding/unknown indices)
# embedding_dim = 256
# hidden_size = 128
# num_classes = 2
# num_layers = 3

# model_lstm = LSTMClassifierWithEmbedding(vocab_size, embedding_dim, 
#                                          hidden_size, num_classes,
#                                          num_layers=num_layers)
# loss_fcn = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model_lstm.parameters(), lr=0.001)

# # Training loop
# num_epochs = 10
# for epoch in range(num_epochs):
#     for inputs, labels in dataloader:
#         # Forward pass
#         outputs = model_lstm(inputs)
#         loss = loss_fcn(outputs, labels)
        
#         # Backward pass and optimization
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()