In [2]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score

In [13]:

    
class LSTMModel(nn.Module):
    def __init__(self,vocab_size ,embedding_dim, lstm_hidden_dim, output_dim, dropout_prob=0.2):
        super(LSTMModel, self).__init__()
        self.linear = nn.Linear(vocab_size, embedding_dim)
    
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim,dropout=dropout_prob, batch_first=True)
    
        self.fc = nn.Linear(lstm_hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.linear(x)
        
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out
        output = self.fc(lstm_out)
        return output



In [4]:
input_data_size = 1600000
data_path = "/kaggle/input/input-data-twitter/cleaned_data.csv"
df = pd.read_csv(data_path).dropna().sample(input_data_size)

In [5]:
import random
 # drop rows with missing values
df = [[x[-1], 0 if x[0] == 0 else 1] for x in df.values.tolist()] # convert the labels from 0 and 4 to 0 and 1
random.shuffle(df)
split_idx = int(len(df) * 0.8)  # index of element around ~ the first 80% of data

train_set, test_set = df[:split_idx], df[split_idx:] # training: 80% of data, validation: 20% of data

In [6]:
stop_words= "english"
max_word_freq_pct = 0.9 # 0.9
min_word_freq_cnt = 2 # 2
max_features = None

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(max_df=max_word_freq_pct, min_df=min_word_freq_cnt, max_features=max_features, stop_words=stop_words)
bow_vectorizer.fit([x[0] for x in train_set])



In [8]:
vocab_size = len(bow_vectorizer.vocabulary_)

In [9]:
device = torch.device("cuda") 
vectorizer = bow_vectorizer

In [10]:
X_train_vectorized =  vectorizer.transform([x[0] for x in train_set])
X_test_vectorized = vectorizer.transform([x[0] for x in test_set])
# torch.tensor(vectorizer.transform([x[0] for x in test_set]).toarray(), dtype=torch.float32)

In [11]:
class TwitterDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx].toarray()[0], dtype=torch.float32), self.y[idx]

In [14]:
embedding_dim = 128
hidden_dim = 100
output_dim = 2
dropout = 0.2
learning_rate = 0.01
epochs = 100
batch_size = 128

model = LSTMModel(vocab_size,embedding_dim, hidden_dim, output_dim, dropout).to(device)
optimizer = optim.Adam(model.parameters(),lr=learning_rate)
criterion = nn.CrossEntropyLoss()

y_train = [x[1] for x in train_set]
y_test = [x[1] for x in test_set] 

y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)
train_dataset = TwitterDataset(X_train_vectorized, y_train)
test_dataset = TwitterDataset(X_test_vectorized, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) 



In [17]:
import wandb
wandb.login(key="2fff0adbd8c9e77982d937a66b6de880392e9e92")

[34m[1mwandb[0m: Currently logged in as: [33mkarpinski-j[0m ([33mkarpinski-gsn[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [18]:
import wandb
from datetime import datetime
run_name = datetime.today().strftime('%d%m%y%H%M%S')
wandb_run = wandb.init(
    project="nlp-sentiement-analysis",
    name=run_name,
    config={
        "learning_rate": learning_rate,
        "architecture": "LSTM",
        "dataset": "Twitter Sentiment Analysis",
        "epochs": epochs,
        "train_size": len(train_loader),
        "test_size": len(test_loader),
        "batch_size": batch_size,
        "embedding_dim": embedding_dim,
        "hidden_dim": hidden_dim,
        "dropout": dropout,
        "vectorizer": vectorizer.__class__.__name__,
        "input_data_size": input_data_size,
        "vocab_size": vocab_size

    }
)

In [None]:

for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for batch_idx, (inputs, labels) in enumerate(train_loader):

        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels) # input to the model
        loss.backward() # update weights based on loss
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs, 1)
        train_correct += (predicted == labels).sum().item()
        train_total += labels.size(0)
        if batch_idx % 100 == 9:  # Print every 10 batches
            train_loss_avg = train_loss / train_total
            train_acc_avg = train_correct / train_total
            print(f'Epoch [{epoch+1}/{epochs}], Step [{batch_idx+1}/{len(train_loader)}], Train Loss: {train_loss_avg:.4f}, Train Acc: {train_acc_avg:.4f}')        
        
    steps_cnt = len(train_loader)
    train_loss_avg = train_loss / train_total
    train_acc_avg = train_correct / train_total

    

    # Validation, we have to use "eval" so that the dropout isn't added
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            val_loss += loss.item() 
            _, predicted = torch.max(outputs, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    # Print validation loss and accuracy after each epoch
    val_loss_avg = val_loss / len(test_loader)
    val_acc_avg = val_correct / val_total
    precision = precision_score(all_labels, all_predictions, average='weighted')
    recall = recall_score(all_labels, all_predictions, average='weighted')
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    print(f'Epoch [{epoch+1}/{epochs}], Val Loss: {val_loss_avg:.4f}, Val Acc: {val_acc_avg:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')
    wandb_run.log({
            f"epoch": epoch,
            f"train_loss": train_loss_avg,
            f"val_loss": val_loss_avg,
            f"val_accuracy": val_acc_avg,
            f"train_accuracy":train_acc_avg,
            f"precision": precision,
            f"recall": recall,
            f"f1": f1
    

    })

torch.save(model.state_dict(), "./data/model.pth")
artifact = wandb.Artifact(run_name, type='model')
artifact.add_file("./data/model.pth")
wandb_run.log_artifact(artifact)
wandb_run.finish()

Epoch [1/100], Step [10/10000], Train Loss: 0.6809, Train Acc: 0.5734
Epoch [1/100], Step [110/10000], Train Loss: 0.5847, Train Acc: 0.6903
Epoch [1/100], Step [210/10000], Train Loss: 0.5643, Train Acc: 0.7111
Epoch [1/100], Step [310/10000], Train Loss: 0.5500, Train Acc: 0.7217
Epoch [1/100], Step [410/10000], Train Loss: 0.5431, Train Acc: 0.7272
Epoch [1/100], Step [510/10000], Train Loss: 0.5389, Train Acc: 0.7298
Epoch [1/100], Step [610/10000], Train Loss: 0.5358, Train Acc: 0.7322
Epoch [1/100], Step [710/10000], Train Loss: 0.5335, Train Acc: 0.7341
Epoch [1/100], Step [810/10000], Train Loss: 0.5296, Train Acc: 0.7371
Epoch [1/100], Step [910/10000], Train Loss: 0.5274, Train Acc: 0.7392
Epoch [1/100], Step [1010/10000], Train Loss: 0.5261, Train Acc: 0.7403
Epoch [1/100], Step [1110/10000], Train Loss: 0.5248, Train Acc: 0.7413
Epoch [1/100], Step [1210/10000], Train Loss: 0.5233, Train Acc: 0.7418
Epoch [1/100], Step [1310/10000], Train Loss: 0.5227, Train Acc: 0.7422
Epo