In [1]:
import pandas as pd
import re
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class TwitterDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
class LSTMModel(nn.Module):
    def __init__(self,vocab_size ,embedding_dim, lstm_hidden_dim, output_dim, dropout_prob=0.2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(p=dropout_prob)
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, dropout=dropout_prob, batch_first=True)
        
        self.fc = nn.Linear(lstm_hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded_dropout = self.dropout(embedded)
        lstm_out, _ = self.lstm(embedded_dropout)
        lstm_out = lstm_out[:, -1, :]
        output = self.fc(lstm_out)
        return output
    


# Load data

In [46]:
input_data_size = 100000
data_path = "./data/cleaned_data.csv"
df = pd.read_csv(data_path).dropna().sample(100000)

# Split data

In [47]:
import random
 # drop rows with missing values
df = [[x[-1], 0 if x[0] == 0 else 1] for x in df.values.tolist()] # convert the labels from 0 and 4 to 0 and 1
random.shuffle(df)
split_idx = int(len(df) * 0.8)  # index of element around ~ the first 80% of data

train_set, test_set = df[:split_idx], df[split_idx:] # training: 80% of data, validation: 20% of data

# Tokenizer

In [22]:
tokenizer_model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name, padding_side="left") 
max_len = 100 # max length of the input. everything else will be cut off

In [48]:
def preprocess_text(text):
    return tokenizer(
        text,
        max_length=max_len,        # Maximum length of the sequence
        padding='max_length',     # Pad to maximum length
        truncation=True,          # Truncate longer sequences
        return_tensors='pt'       # Return PyTorch tensors, which are like numpy arrays
    )

X_train = preprocess_text([x[0] for x in train_set])['input_ids']
X_test = preprocess_text([x[0] for x in test_set])['input_ids']


y_train = [x[1] for x in train_set]
y_test = [x[1] for x in test_set] 

y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)


# Create Dataset

In [31]:
embedding_dim = 128
hidden_dim = 100
output_dim = 2
dropout = 0.2
learning_rate = 0.01
epochs = 100
batch_size = 8
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMModel(tokenizer.vocab_size,embedding_dim, hidden_dim, output_dim, dropout).to(device)
optimizer = optim.Adam(model.parameters(),lr=learning_rate)
criterion = nn.CrossEntropyLoss()


# train_dataset = TwitterDataset(X_train_tokenied, torch.tensor(Y_train.values,dtype=torch.long) )
# test_dataset = TwitterDataset(X_test_tokenied, torch.tensor(Y_test.values,dtype=torch.long))
train_dataset = TwitterDataset(X_train, y_train)
test_dataset = TwitterDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) 



In [49]:
print(model)

LSTMModel(
  (embedding): Embedding(30522, 128)
  (dropout): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(128, 100, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=100, out_features=2, bias=True)
)


## Setup wandb

In [None]:
import wandb
wandb.login(key="")

In [50]:
import wandb
from datetime import datetime
run_name = datetime.today().strftime('%d%m%y%H%M%S')
wandb_run = wandb.init(
    project="nlp-sentiement-analysis",
    name=run_name,
    config={
        "learning_rate": learning_rate,
        "architecture": "LSTM",
        "dataset": "Twitter Sentiment Analysis",
        "epochs": epochs,
        "train_size": len(train_loader),
        "test_size": len(test_loader),
        "batch_size": batch_size,
        "embedding_dim": embedding_dim,
        "hidden_dim": hidden_dim,
        "dropout": dropout,
        "input_data_size": input_data_size,
        "vocab_size": tokenizer.vocab_size

    }
)

In [52]:

for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for batch_idx, (inputs, labels) in enumerate(train_loader):

        inputs = inputs.to(device)
        labels = labels.to(device)
        print(inputs.shape)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels) # input to the model
        loss.backward() # update weights based on loss
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs, 1)
        train_correct += (predicted == labels).sum().item()
        train_total += labels.size(0)
        if batch_idx % 100 == 9:  # Print every 10 batches
            train_loss_avg = train_loss / train_total
            train_acc_avg = train_correct / train_total
            print(f'Epoch [{epoch+1}/{epochs}], Step [{batch_idx+1}/{len(train_loader)}], Train Loss: {train_loss_avg:.4f}, Train Acc: {train_acc_avg:.4f}')        
        
    steps_cnt = len(train_loader)
    train_loss_avg = train_loss / train_total
    train_acc_avg = train_correct / train_total

    

    # Validation, we have to use "eval" so that the dropout isn't added
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            val_loss += loss.item() 
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    # Print validation loss and accuracy after each epoch
    val_loss_avg = val_loss / len(test_loader)
    val_acc_avg = val_correct / val_total
    print(f'Epoch [{epoch+1}/{epochs}], Val Loss: {val_loss_avg:.4f}, Val Acc: {val_acc_avg:.4f}')
    wandb_run.log({
            f"epoch": epoch,
            f"train_loss": train_loss_avg,
            f"val_loss": val_loss_avg,
            f"val_accuracy": val_acc_avg,
            f"train_accuracy":train_acc_avg
    

    })

torch.save(model.state_dict(), "./data/model.pth")
artifact = wandb.Artifact(run_name, type='model')
artifact.add_file("./data/model.pth")
wandb_run.log_artifact(artifact)
wandb_run.finish()

torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
Epoch [1/100], Step [10/10000], Train Loss: 0.5152, Train Acc: 0.7250
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])
torch.

KeyboardInterrupt: 