In [1]:
import pandas as pd
import re
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
import random
from torch.utils.data import DataLoader, Dataset, Subset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class TwitterDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
class LSTMModel(nn.Module):
    def __init__(self,vocab_size ,embedding_dim, lstm_hidden_dim, output_dim, dropout_prob=0.2,lstm_layers=2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim,num_layers=lstm_layers, dropout=dropout_prob, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_dim, output_dim)
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]
        output = self.fc(lstm_out)
        return output



class EarlyStopping:
    def __init__(self, patience=5, min_delta=0):
        """
        Args:
            patience (int): How many epochs to wait after last time validation loss improved.
            min_delta (float): Minimum change in the monitored quantity to qualify as an improvement.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.counter = 0


# Load data

In [3]:
input_data_size = 100000
data_path = "./data/cleaned_data.csv"
df = pd.read_csv(data_path)\
    .dropna()\
     .sample(input_data_size)


# Split data

In [4]:
df_values = df.values

X = df_values[:, 1].flatten()
Y = np.array(df_values[:, 0], dtype=int)

# Tokenizer

In [5]:
tokenizer_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest" #"bert-base-uncased" # 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name, padding_side="left") 
max_len = 64 # 32 max length of the input. everything else will be cut off

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
def preprocess_text(text):
    return tokenizer(
        text,
        max_length=max_len,        # Maximum length of the sequence
        padding='max_length',     # Pad to maximum length
        truncation=True,          # Truncate longer sequences
        return_tensors='pt'       # Return PyTorch tensors, which are like numpy arrays
    )

X_toknized = preprocess_text(X.tolist())['input_ids']
Y = torch.tensor(Y, dtype=torch.long)


# Training parameters

In [7]:
embedding_dim = 128
hidden_dim = 100
output_dim = 2
dropout = 0.2
learning_rate = 0.01
epochs = 10
batch_size = 64
weight_decay = 1e-3
k_folds = 5
learning_rate_decay = 0.1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Create Dataset

In [8]:

X_train, X_test, Y_train, Y_test = train_test_split(X_toknized, Y, test_size=0.1, stratify=Y, random_state=2)

train_dataset = TwitterDataset(X_train, Y_train)
test_dataset = TwitterDataset(X_test, Y_test)


## Cross validation

In [9]:
kf = KFold(n_splits=k_folds, shuffle=True)

## Setup wandb

In [60]:
import wandb
wandb.login(key="")



True

In [11]:
import wandb
from datetime import datetime
run_name = datetime.today().strftime('%d%m%y%H%M%S')
wandb_run = wandb.init(
    project="nlp-sentiement-analysis",
    name=run_name,
    config={
        "learning_rate": learning_rate,
        "architecture": "LSTM",
        "dataset": "Twitter Sentiment Analysis",
        "epochs": epochs,
        "train_size": len(train_dataset)*0.8,
        "test_size": len(train_dataset)*0.2,
        "batch_size": batch_size,
        "embedding_dim": embedding_dim,
        "hidden_dim": hidden_dim,
        "dropout": dropout,
        "input_data_size": input_data_size,
        "vocab_size": tokenizer.vocab_size,
        "token_size":max_len,
        "weight_decay": weight_decay,
        "k_folds": k_folds,
        "learning_rate_decay": learning_rate_decay

    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkarpinski-j[0m ([33mkarpinski-gsn[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
import os
models_path = f"./models/{run_name}"
if not os.path.exists(models_path):
    os.makedirs(models_path)

for fold, (train_indices, val_indices) in enumerate(kf.split(train_dataset)):
    ############ MODEL ###############
    models = []
    model = LSTMModel(tokenizer.vocab_size,embedding_dim, hidden_dim, output_dim, dropout).to(device)  
    optimizer = optim.Adam(model.parameters(),lr=learning_rate,weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=learning_rate_decay)
    criterion = nn.CrossEntropyLoss()

    ####### EARLY STOPPING ########

    early_stopping = EarlyStopping(patience=4, min_delta=0.01)

    fold_val_loss = []
    fold_val_acc = []
    fold_val_precision = []
    fold_val_recall = []
    fold_val_f1 = []

    print(f'#########Fold: {fold+1}##############')
    #### SPLIT DATASET ####
    train_subset = Subset(train_dataset, train_indices)
    val_subset = Subset(train_dataset, val_indices)
    train_dataloader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)
    
    train_samples = len(train_subset)
    val_samples = len(val_subset)

    #### TRAINING LOOP ####
    for epoch in range(epochs):
        best_val_loss = float('inf')
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        #### EPOCH TRAINING LOOP ####
        for batch_idx, (inputs, labels) in enumerate(train_dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_loss += loss.item() * inputs.size(0)
            train_total += labels.size(0)
            _, predicted = torch.max(outputs, 1)
            train_correct += (predicted == labels).sum().item()
            ### EPCH METIRCS ###     
        train_loss_avg = train_loss / train_total
        train_acc_avg = train_correct / train_total


        ### EPOCH VALIDATION LOOP ###

        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for inputs, labels in val_dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                val_correct += (predicted == labels).sum().item()
                val_total += labels.size(0)

        
            val_loss_avg = val_loss / val_samples
            val_acc_avg = val_correct / val_total
            val_precision = precision_score(all_labels, all_predictions,average='weighted')
            val_recall = recall_score(all_labels, all_predictions,average='weighted')
            val_f1 = f1_score(all_labels, all_predictions,average='weighted')

        #Save the model with the best validation loss
        if val_loss_avg < best_val_loss:
            best_val_loss = val_loss_avg
            best_model = model.state_dict()
  
        ### PRINT EPOCH METRICS ###
        lr = optimizer.param_groups[0]['lr']
        print("#################################################")
        print(f'Fold {fold} Epoch [{epoch+1}/{epochs}], Learing Rate: {lr}, Training Loss: {train_loss_avg:.4f}  Val Loss: {val_loss_avg:.4f}, Val Acc: {val_acc_avg:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}')
        
        scheduler.step()

        logs = {
            f"epoch_fold_{fold}": epoch,
            f"train_loss_fold_{fold}": train_loss_avg,
            f"train_accuracy_fold_{fold}": train_acc_avg,
            f"val_loss_fold_{fold}": val_loss_avg,
            f"val_accuracy_fold_{fold}": val_acc_avg,
            f"val_precision_fold_{fold}": val_precision,
            f"val_recall_fold_{fold}": val_recall,
            f"val_f1_fold_{fold}": val_f1
        }
        wandb_run.log(logs)

        best_model_name = f"model_fold_{fold}.pth"    
        best_model_path = f"{models_path}/{best_model_name}.pth"  
        if early_stopping(val_loss_avg):
            print(f'#######Early stopping on epoch {epoch}################')
            torch.save(best_model, best_model_path)
            artifact = wandb.Artifact(best_model_name, type='model')
            artifact.add_file(best_model_path)
            wandb_run.log_artifact(artifact)
            break

    best_model_name = f"model_fold_{fold}"    
    best_model_path = f"{models_path}/{best_model_name}.pth"        
    torch.save(best_model, best_model_path)
    artifact = wandb.Artifact(best_model_name, type='model')
    artifact.add_file(best_model_path)
    wandb_run.log_artifact(artifact)

wandb_run.finish()
    
    





#########Fold: 1##############
#################################################
Fold 0 Epoch [1/10], Learing Rate: 0.01, Training Loss: 0.5770  Val Loss: 0.5114, Val Acc: 0.7523, Val Precision: 0.7561, Val Recall: 0.7523, Val F1: 0.7509
#################################################
Fold 0 Epoch [2/10], Learing Rate: 0.01, Training Loss: 0.5155  Val Loss: 0.5186, Val Acc: 0.7477, Val Precision: 0.7557, Val Recall: 0.7477, Val F1: 0.7462
#################################################
Fold 0 Epoch [3/10], Learing Rate: 0.01, Training Loss: 0.5043  Val Loss: 0.5057, Val Acc: 0.7552, Val Precision: 0.7580, Val Recall: 0.7552, Val F1: 0.7549
#################################################
Fold 0 Epoch [4/10], Learing Rate: 0.01, Training Loss: 0.5007  Val Loss: 0.4923, Val Acc: 0.7638, Val Precision: 0.7646, Val Recall: 0.7638, Val F1: 0.7634
#################################################
Fold 0 Epoch [5/10], Learing Rate: 0.01, Training Loss: 0.4993  Val Loss: 0.4976, Val Acc: 

0,1
epoch_fold_0,▁▂▃▃▄▅▆▆▇█
epoch_fold_1,▁▂▃▃▄▅▆▆▇█
epoch_fold_2,▁▂▃▃▄▅▆▆▇█
epoch_fold_3,▁▂▃▃▄▅▆▆▇█
epoch_fold_4,▁▂▃▃▄▅▆▆▇█
train_accuracy_fold_0,▆███████▁▃
train_accuracy_fold_1,▁▆▇▇██▇███
train_accuracy_fold_2,▁▇████████
train_accuracy_fold_3,▁▆█▅██████
train_accuracy_fold_4,▁▇████████

0,1
epoch_fold_0,9.0
epoch_fold_1,9.0
epoch_fold_2,9.0
epoch_fold_3,9.0
epoch_fold_4,9.0
train_accuracy_fold_0,0.64087
train_accuracy_fold_1,0.76446
train_accuracy_fold_2,0.7661
train_accuracy_fold_3,0.76769
train_accuracy_fold_4,0.76383


# Sentiment analysis evaluation

In [13]:


model_1 = LSTMModel(tokenizer.vocab_size,embedding_dim, hidden_dim, output_dim, dropout).to(device)
model_2 = LSTMModel(tokenizer.vocab_size,embedding_dim, hidden_dim, output_dim, dropout).to(device)
model_3 = LSTMModel(tokenizer.vocab_size,embedding_dim, hidden_dim, output_dim, dropout).to(device)
model_4 = LSTMModel(tokenizer.vocab_size,embedding_dim, hidden_dim, output_dim, dropout).to(device)
model_5 = LSTMModel(tokenizer.vocab_size,embedding_dim, hidden_dim, output_dim, dropout).to(device)

model_1.load_state_dict(torch.load(f"{models_path}/model_fold_0.pth"))
model_2.load_state_dict(torch.load(f"{models_path}/model_fold_1.pth"))
model_3.load_state_dict(torch.load(f"{models_path}/model_fold_2.pth"))
model_4.load_state_dict(torch.load(f"{models_path}/model_fold_3.pth"))
model_5.load_state_dict(torch.load(f"{models_path}/model_fold_4.pth"))

model_1.eval()
model_2.eval()
model_3.eval()
model_4.eval()
model_5.eval()



  model_1.load_state_dict(torch.load(f"{models_path}/model_fold_0.pth"))
  model_2.load_state_dict(torch.load(f"{models_path}/model_fold_1.pth"))
  model_3.load_state_dict(torch.load(f"{models_path}/model_fold_2.pth"))
  model_4.load_state_dict(torch.load(f"{models_path}/model_fold_3.pth"))
  model_5.load_state_dict(torch.load(f"{models_path}/model_fold_4.pth"))


LSTMModel(
  (embedding): Embedding(50265, 128)
  (lstm): LSTM(128, 100, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=100, out_features=2, bias=True)
)

In [14]:
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
all_predictions = []
all_labels = []
test_correct = 0
test_total = 0

results = []
for model in [model_1, model_2, model_3, model_4, model_5]:
    all_predictions = []
    all_labels = []
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            _, predicted = torch.max(outputs, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)


        val_acc_avg = val_correct / val_total
        val_precision = precision_score(all_labels, all_predictions,average='weighted')
        val_recall = recall_score(all_labels, all_predictions,average='weighted')
        val_f1 = f1_score(all_labels, all_predictions,average='weighted')
        results.append({
            "val_acc_avg":f"{val_acc_avg:.4f}",
            "val_precision": f"{val_precision:.4f}",
            "val_recall":f"{val_recall:.4f}",
            "val_f1": f"{val_f1:.4f}"
        }
            
            )

In [16]:
for results in results:
    print(results)

{'val_acc_avg': '0.7093', 'val_precision': '0.6473', 'val_recall': '0.6473', 'val_f1': '0.6472'}
{'val_acc_avg': '0.7206', 'val_precision': '0.7808', 'val_recall': '0.7431', 'val_f1': '0.7337'}
{'val_acc_avg': '0.7333', 'val_precision': '0.7873', 'val_recall': '0.7714', 'val_f1': '0.7680'}
{'val_acc_avg': '0.7452', 'val_precision': '0.7930', 'val_recall': '0.7926', 'val_f1': '0.7926'}
{'val_acc_avg': '0.7495', 'val_precision': '0.7713', 'val_recall': '0.7713', 'val_f1': '0.7713'}


# User input test

In [82]:
text = "" #TODO FILL ME 
embedding = preprocess_text(text)['input_ids'].to(device)

In [86]:
outputs = model_1(embedding)
_, predicted = torch.max(outputs, 1)

In [90]:
if predicted:
    print(f"Tweet '{text}' has positive sentiment")
else:
    print(f"Tweet '{text}' has negative sentiment")

Tweet '' has positive sentiment
