In [177]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

In [178]:
class TwitterDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
class LSTMModel(nn.Module):
    def __init__(self,vocab_size ,embedding_dim, lstm_hidden_dim, output_dim, dropout_prob=0.2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(p=dropout_prob)
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, dropout=dropout_prob, batch_first=True)
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded_dropout = self.dropout(embedded)
        lstm_out, _ = self.lstm(embedded_dropout)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]
        output = self.fc(lstm_out)
        return output


# Load data

In [179]:
input_data_size = 100000
data_path = "./data/cleaned_data.csv"
df = pd.read_csv(data_path).dropna().sample(100000)

# Split data

In [180]:
X = df['processed_text']
Y = df['sentiment']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)


# Vectorizer

## Config

In [164]:
stop_words= "english"
max_word_freq_pct = 0.9 # 0.9
min_word_freq_cnt = 2 # 2
max_features = 100

## Bag of words

In [165]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(max_df=max_word_freq_pct, min_df=min_word_freq_cnt, max_features=max_features, stop_words=stop_words)
bow_vectorizer.fit(train_df['text'])

X_train_vectorized = bow_vectorizer.transform(train_df['text'])
X_test_vectorized = bow_vectorizer.transform(test_df['text'])

In [166]:
len(bow_vectorizer.vocabulary_)

100

## TF-IDF

In [167]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = CountVectorizer(max_df=max_word_freq_pct, min_df=min_word_freq_cnt, max_features=max_features, stop_words=stop_words)
tfidf_vectorizer.fit(train_df['text'])
len(tfidf_vectorizer.vocabulary_)

100

# Create Dataset

In [168]:
device = torch.device("cuda") 
vectorizer = tfidf_vectorizer

In [169]:
X_train_vectorized = vectorizer.transform(train_df['text'])
X_test_vectorized = vectorizer.transform(test_df['text'])

In [170]:
vocab_size = len(vectorizer.vocabulary_)
vocab_size

100

In [174]:
embedding_dim = 128
hidden_dim = 100
output_dim = 2
dropout = 0.2
learning_rate = 0.01
epochs = 100
batch_size = 8

model = LSTMModel(vocab_size,embedding_dim, hidden_dim, output_dim, dropout).to(device)
optimizer = optim.Adam(model.parameters(),lr=learning_rate)
criterion = nn.CrossEntropyLoss()


train_dataset = TwitterDataset(X_train_vectorized, Y_train)
test_dataset = TwitterDataset(X_test_vectorized, Y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) 

In [175]:
print(model)

LSTMModel(
  (embedding): Embedding(100, 128)
  (lstm): LSTM(128, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=2, bias=True)
)


## Setup wandb

In [None]:
import wandb
wandb.login(key="")

In [162]:
import wandb
from datetime import datetime
run_name = datetime.today().strftime('%d%m%y%H%M%S')
wandb_run = wandb.init(
    project="nlp-sentiement-analysis",
    name=run_name,
    config={
        "learning_rate": learning_rate,
        "architecture": "LSTM",
        "dataset": "Twitter Sentiment Analysis",
        "epochs": epochs,
        "train_size": len(train_loader),
        "test_size": len(test_loader),
        "batch_size": batch_size,
        "embedding_dim": embedding_dim,
        "hidden_dim": hidden_dim,
        "dropout": dropout,
        "vectorizer": vectorizer.__class__.__name__,
        "input_data_size": input_data_size,
        "vocab_size": vocab_size

    }
)

In [176]:

for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for inputs, labels in train_loader:

        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels) # input to the model
        loss.backward() # update weights based on loss
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_correct += (predicted == labels).sum().item()
        train_total += labels.size(0)
        
    steps_cnt = len(train_loader)
    train_loss_avg = train_loss / len(train_loader)
    train_acc_avg = train_correct / train_total
    print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss_avg:.4f}, Train Acc: {train_acc_avg:.4f}')
    

    # Validation, we have to use "eval" so that the dropout isn't added
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            val_loss += loss.item() 
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    # Print validation loss and accuracy after each epoch
    val_loss_avg = val_loss / len(test_loader)
    val_acc_avg = val_correct / val_total
    print(f'Epoch [{epoch+1}/{epochs}], Val Loss: {val_loss_avg:.4f}, Val Acc: {val_acc_avg:.4f}')
    wandb_run.log({
            f"epoch": epoch,
            f"train_loss": train_loss_avg,
            f"val_loss": val_loss_avg,
            f"val_accuracy": val_acc_avg,
            f"train_accuracy":train_acc_avg
    

    })

torch.save(model.state_dict(), "./data/model.pth")
artifact = wandb.Artifact(run_name, type='model')
artifact.add_file("./data/model.pth")
wandb_run.log_artifact(artifact)
wandb_run.finish()

Epoch [1/100], Train Loss: 0.6934, Train Acc: 0.5054
Epoch [1/100], Val Loss: 0.6939, Val Acc: 0.5037
Epoch [2/100], Train Loss: 0.6936, Train Acc: 0.5044
Epoch [2/100], Val Loss: 0.6954, Val Acc: 0.5032


KeyboardInterrupt: 