Possible datasets
 - https://data.world/crowdflower/apple-twitter-sentiment
 - https://paperswithcode.com/dataset/stocknet-1
 - https://www.kaggle.com/datasets/equinxx/stock-tweets-for-sentiment-analysis-and-prediction
 - https://www.kaggle.com/datasets/thedevastator/tweet-sentiment-s-impact-on-stock-returns
 - https://ieee-dataport.org/open-access/stock-market-tweets-data
 - https://www.kaggle.com/datasets/yash612/stockmarket-sentiment-dataset
 - https://www.kaggle.com/datasets/kazanova/sentiment140

In [2]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import gensim
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import time
from multiprocessing import Pool
from tqdm import tqdm
import nltk

# Fetch the NLTK stop-word corpus read later via stopwords.words("english").
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Howard\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# --- Dataset -----------------------------------------------------------------
# Column names handed to pd.read_csv(names=...) when the CSV is loaded.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8  # fraction of rows used for training; the rest is held out

# --- Text cleaning -----------------------------------------------------------
# Raw string: "\S" is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning on current Python versions. The pattern
# value itself is unchanged. It matches @mentions, URLs and any run of
# non-alphanumeric characters.
# NOTE(review): the `http?:\S` alternative looks redundant next to
# `https?:\S+`; kept as-is so cleaning results do not change.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# --- Word2Vec ----------------------------------------------------------------
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# --- Classifier training -----------------------------------------------------
BATCH_SIZE = 1024
EPOCHS = 8
SEQUENCE_LENGTH = 300  # number of token positions per padded/truncated text

# --- Sentiment labels --------------------------------------------------------
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (lower, upper) score bounds; presumably scores between them count as NEUTRAL
# -- TODO confirm against the code that consumes this constant.
SENTIMENT_THRESHOLDS = (0.4, 0.7)

In [4]:
# os.listdir() returns entries in arbitrary order and also lists directories
# (for example Jupyter's `.ipynb_checkpoints`), so indexing its raw result can
# pick a different or invalid entry from run to run. Keep regular files only
# and take the first one in sorted order.
_data_files = sorted(
    entry for entry in os.listdir("data")
    if os.path.isfile(os.path.join("data", entry))
)
if not _data_files:
    raise FileNotFoundError("No dataset file found in the 'data' directory")

dataset_filename = _data_files[0]
dataset_path = os.path.join("data", dataset_filename)
# Column names are supplied explicitly through `names=`.
df = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

## Preprocessing:

In [5]:
# Raw numeric target code -> sentiment name.
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}


def decode_sentiment(label):
    """Translate a raw target code into its sentiment name.

    Args:
        label: A target code (anything ``int()`` accepts, e.g. ``0``, ``"4"``
            or ``2.0``) or a sentiment name that was already decoded.

    Returns:
        The matching name from ``decode_map``. Names that are already decoded
        are returned unchanged, so applying this function twice (for instance
        when the cell is re-run on the same DataFrame) is harmless.

    Raises:
        KeyError: If the numeric code is not a key of ``decode_map``.
        ValueError: If ``label`` is neither a known name nor convertible to int.
    """
    # Idempotence guard: without it a second pass would call int("NEGATIVE").
    if isinstance(label, str) and label in decode_map.values():
        return label
    return decode_map[int(label)]

# Replace the numeric target codes with their sentiment names.
df["target"] = df["target"].apply(decode_sentiment)

stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")  # not referenced by preprocess() below

# Frozen set copy of the stop words: membership is tested once per token, and
# a set lookup is O(1) where scanning the list is O(len(stop_words)).
# It is built once here, so re-run this cell after editing `stop_words`.
_STOP_WORD_SET = frozenset(stop_words)


def preprocess(text):
    """Lower-case, clean and stop-word-filter a piece of text.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces. Every match of
        ``TEXT_CLEANING_RE`` is replaced by a space before tokenising, and
        tokens found in the English stop-word list are dropped.
    """
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    return " ".join(word for word in text.split() if word not in _STOP_WORD_SET)

# Clean every text in place so later cells can tokenise with a plain split().
df["text"] = df["text"].apply(preprocess)

## Word2Vec Training:

In [None]:
# Token lists, one per text; the texts are whitespace-tokenised here.
documents = [_text.split() for _text in df.text]

# vector_size must be given explicitly: without it gensim falls back to its
# default dimensionality, which does not match the W2V_SIZE-wide padding and
# embedding matrix built in later cells.
w2v_model = gensim.models.word2vec.Word2Vec(
    vector_size=W2V_SIZE,
    window=W2V_WINDOW,
    min_count=W2V_MIN_COUNT,
    workers=8,
)
w2v_model.build_vocab(documents)
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

NameError: name 'df' is not defined

## Preparation for PyTorch Model:

In [22]:
class TextDataset(Dataset):
    """Dataset of (embedded text, binary label) pairs backed by Word2Vec.

    Each item is a ``(max_length, vector_size)`` float32 tensor holding the
    word vectors of the text's in-vocabulary tokens (zero-padded at the end or
    truncated), together with a float label tensor: ``0.0`` for ``"NEGATIVE"``
    and ``1.0`` for every other label.

    Texts are embedded lazily in ``__getitem__``. Embedding everything up front
    would keep ``len(texts) * max_length * vector_size`` numbers in memory, and
    doing it through a multiprocessing pool requires pickling the Word2Vec
    model and this notebook-defined class into the worker processes.

    Args:
        texts: Iterable of already preprocessed strings.
        labels: Iterable of sentiment names, aligned with ``texts``.
        w2v_model: Trained model exposing ``wv.key_to_index``,
            ``wv.vector_size`` and ``wv[word]``.
        max_length: Number of token positions per item.
    """

    def __init__(self, texts, labels, w2v_model, max_length=300):
        # Materialise as plain lists so positional indexing works even when a
        # pandas Series with a shuffled index is passed in.
        self.texts = [str(text) for text in texts]
        self.labels = [0 if label == "NEGATIVE" else 1 for label in labels]
        self.w2v_model = w2v_model
        self.max_length = max_length

    @staticmethod
    def text_to_sequence(text, w2v_model, max_length):
        """Embed ``text`` as a ``(max_length, vector_size)`` float32 array.

        Tokens missing from the model vocabulary are skipped. Shorter texts
        are zero-padded at the end; longer ones are truncated to
        ``max_length`` known tokens.
        """
        wv = w2v_model.wv
        # The padding width comes from the model itself, so rows always have
        # the same size as the word vectors.
        sequence = np.zeros((max_length, wv.vector_size), dtype=np.float32)
        position = 0
        for word in text.split():
            if position >= max_length:
                break
            if word in wv.key_to_index:
                sequence[position] = wv[word]
                position += 1
        return sequence

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sequence = self.text_to_sequence(self.texts[idx], self.w2v_model, self.max_length)
        tensor_text = torch.as_tensor(sequence, dtype=torch.float)
        tensor_label = torch.tensor(self.labels[idx], dtype=torch.float)
        return tensor_text, tensor_label

# Hold out (1 - TRAIN_SIZE) of the rows; the fixed seed keeps the split stable.
df_train, df_test = train_test_split(
    df,
    test_size=1 - TRAIN_SIZE,
    random_state=42,
)

# Training batches are reshuffled every epoch.
train_data = TextDataset(
    texts=df_train.text,
    labels=df_train.target,
    w2v_model=w2v_model,
    max_length=SEQUENCE_LENGTH,
)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation batches keep the original row order.
test_data = TextDataset(
    texts=df_test.text,
    labels=df_test.target,
    w2v_model=w2v_model,
    max_length=SEQUENCE_LENGTH,
)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# word -> row index, in the model's own vocabulary order.
vocab = {word: index for index, word in enumerate(w2v_model.wv.index_to_key)}

# `wv.vectors` already stores one row per entry of `index_to_key`, in that
# order, so a single array copy replaces the per-word Python loop. The width
# is taken from the model, not from W2V_SIZE, so a model trained with another
# dimensionality cannot cause a shape mismatch here.
embedding_matrix = np.array(w2v_model.wv.vectors, dtype=np.float64)
assert embedding_matrix.shape[0] == len(vocab), "one embedding row per vocabulary word expected"


In [None]:
class LSTMClassifier(nn.Module):
    """Single-layer LSTM binary classifier on top of frozen word embeddings.

    Args:
        embedding_weights: 2-D array-like ``(vocab, dim)`` of pretrained
            vectors. They are copied into a frozen ``nn.Embedding``.
        vocab_size: Kept for interface compatibility; the embedding table is
            sized from ``embedding_weights``.
        embed_size: Kept for interface compatibility; the LSTM input width is
            taken from ``embedding_weights`` so it always matches the table.
        hidden_size: LSTM hidden state width.
        output_size: Number of sigmoid outputs per sample.

    ``forward`` accepts either
      * an integer tensor ``(batch, seq)`` of vocabulary indices, which is
        looked up in the embedding table, or
      * a floating-point tensor ``(batch, seq, dim)`` of vectors that are
        already embedded, which is passed straight to the LSTM.
    It returns a ``(batch, output_size)`` tensor of probabilities.
    """

    def __init__(self, embedding_weights, vocab_size, embed_size=300, hidden_size=100, output_size=1):
        super().__init__()
        # clone() so the parameter never shares memory with the caller's array.
        weights = torch.as_tensor(embedding_weights, dtype=torch.float32).clone()
        # from_pretrained(freeze=True) yields a non-trainable `embedding.weight`.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)
        self.lstm = nn.LSTM(weights.shape[1], hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # nn.Embedding only accepts integer indices; float input is treated as
        # already-embedded vectors and skips the lookup.
        if not torch.is_floating_point(x):
            x = self.embedding(x)
        x, _ = self.lstm(x)
        # Only the last time step feeds the classifier head, so dropout is
        # applied to that slice instead of the whole sequence output.
        x = self.dropout(x[:, -1, :])
        return self.sigmoid(self.fc(x))

# Classifier initialised from the Word2Vec embedding matrix.
model = LSTMClassifier(
    embedding_weights=embedding_matrix,
    vocab_size=len(vocab),
    embed_size=W2V_SIZE,
)
# Binary cross-entropy on the sigmoid outputs, optimised with Adam defaults.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

def train_model(model, train_loader, criterion, optimizer, device):
    """Run one training epoch and return the mean per-sample loss.

    Args:
        model: Module mapping a batch of inputs to predictions.
        train_loader: Iterable of ``(inputs, labels)`` batches that also
            exposes ``.dataset`` (its length is the normalising sample count).
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batches are moved to.

    Returns:
        float: Sum of batch losses weighted by batch size, divided by
        ``len(train_loader.dataset)``.
    """
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        # Labels arrive as (batch,) while the model emits (batch, 1); BCELoss
        # rejects mismatched shapes, so align the targets with the outputs.
        targets = labels.float().reshape(outputs.shape)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        # loss is a batch mean; weight it by batch size to average per sample.
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    return epoch_loss

def evaluate_model(model, test_loader, criterion, device):
    """Evaluate the model and return ``(mean loss, accuracy)``.

    Args:
        model: Module mapping a batch of inputs to probabilities in ``[0, 1]``.
        test_loader: Iterable of ``(inputs, labels)`` batches that also
            exposes ``.dataset`` (its length is the normalising sample count).
        criterion: Loss called as ``criterion(outputs, targets)``.
        device: Device the batches are moved to.

    Returns:
        tuple[float, float]: Per-sample mean loss and the fraction of samples
        whose rounded prediction equals the label. Both are plain Python
        floats, so they can be formatted and plotted whatever device was used.
    """
    model.eval()
    running_loss = 0.0
    correct_predictions = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            # Align (batch,) labels with (batch, 1) outputs. Without this the
            # `==` below broadcasts to a (batch, batch) matrix and over-counts
            # matches, and BCELoss rejects the shape mismatch.
            targets = labels.float().reshape(outputs.shape)
            loss = criterion(outputs, targets)

            running_loss += loss.item() * inputs.size(0)
            preds = torch.round(outputs)
            correct_predictions += (preds == targets).sum().item()

    n_samples = len(test_loader.dataset)
    epoch_loss = running_loss / n_samples
    epoch_acc = correct_predictions / n_samples
    return epoch_loss, epoch_acc

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# NOTE: this overrides the EPOCHS value from the configuration cell.
EPOCHS = 10  # or however many you want

# Per-epoch history, read by the plotting cell further down.
train_losses = []
val_losses = []
val_accuracies = []

for epoch in range(EPOCHS):
    train_loss = train_model(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate_model(model, test_loader, criterion, device)
    # Store a plain float: a 0-dim tensor (possibly on the GPU) cannot be
    # handed to matplotlib later. float() accepts both tensors and numbers.
    val_acc = float(val_acc)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{EPOCHS} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")



## Evaluate

In [None]:
# Plot the per-epoch history recorded by the training loop (`train_losses`,
# `val_losses`, `val_accuracies`). The lists must not be re-created here:
# doing so discards every epoch except the last and the curves collapse to a
# single point.
# float() also unwraps 0-dim tensors (including CUDA ones) for matplotlib.
val_accs = [float(acc) for acc in val_accuracies]

plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Losses')

plt.subplot(1, 2, 2)
plt.plot(val_accs, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Accuracy')

plt.tight_layout()
plt.show()


In [None]:
def _score_to_sentiment(score, include_neutral=True):
    """Map a probability-like score in [0, 1] to a sentiment name.

    With ``include_neutral`` the SENTIMENT_THRESHOLDS pair splits the range
    into NEGATIVE (score <= lower), POSITIVE (score >= upper) and NEUTRAL in
    between; otherwise 0.5 separates NEGATIVE from POSITIVE.
    """
    if not include_neutral:
        return NEGATIVE if score < 0.5 else POSITIVE
    lower, upper = SENTIMENT_THRESHOLDS
    if score <= lower:
        return NEGATIVE
    if score >= upper:
        return POSITIVE
    return NEUTRAL


def predict(model, text):
    """Predict the sentiment name for one raw text.

    The text goes through the same preprocessing and the same
    ``TextDataset.text_to_sequence`` encoding used to build the training data,
    is wrapped in a batch of size one and scored by ``model``.

    Args:
        model: Trained classifier returning one score per sample.
        text: Raw input text.

    Returns:
        One of POSITIVE, NEUTRAL or NEGATIVE.
    """
    model.eval()
    processed_text = preprocess(text)
    sequence = TextDataset.text_to_sequence(processed_text, w2v_model, SEQUENCE_LENGTH)
    # unsqueeze(0) adds the batch dimension the model expects.
    input_tensor = torch.tensor(np.asarray(sequence, dtype=np.float32)).unsqueeze(0).to(device)
    with torch.no_grad():
        score = model(input_tensor)
    # decode_sentiment() translates raw target codes and takes no
    # `include_neutral` argument, so scores use their own helper.
    return _score_to_sentiment(score.item(), include_neutral=True)

# Ground-truth names and one prediction per held-out text, in the same order.
y_true = df_test.target.tolist()
y_pred = [predict(model, text) for text in df_test.text]

print("Classification Report:")
print(classification_report(y_true, y_pred))
print("\nAccuracy Score:", accuracy_score(y_true, y_pred))


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# One shared class order keeps the matrix rows/columns and both sets of tick
# labels in agreement.
_class_order = [POSITIVE, NEUTRAL, NEGATIVE]
cnf_matrix = confusion_matrix(y_true, y_pred, labels=_class_order)

# Heatmap of raw counts: rows are actual labels, columns are predictions.
plt.figure(figsize=(8,6))
sns.heatmap(
    cnf_matrix,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=_class_order,
    yticklabels=_class_order,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.show()


In [3]:
# Persist only the model's state_dict (parameters and buffers); loading it
# back requires constructing an LSTMClassifier first.
torch.save(model.state_dict(), "sentiment_model.pth")

In [None]:
# Rebuild the architecture, then restore the saved parameters into it.
model = LSTMClassifier(embedding_weights=embedding_matrix, vocab_size=len(vocab), embed_size=W2V_SIZE)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
state_dict = torch.load("sentiment_model.pth", map_location=device)
model.load_state_dict(state_dict)
model = model.to(device)
