In [10]:
import os
import joblib
from glob import glob
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

import torch.nn as nn
import torch.optim as optim

from sentimentRNN import datasetclass
from sentimentRNN import textpreprocess

# create vocab

In [14]:
tokenizer = get_tokenizer("basic_english")

fns = glob(os.path.join('train', '*', '*.txt'))

def yield_tokens(fns):
    for fn in fns:
        with open(fn, 'r') as f:
            text = f.readlines()[0]
            tokens = tokenizer(textpreprocess.clean_text(text))
            yield tokens

# make vocabulary from tokenized text
vocab = build_vocab_from_iterator(yield_tokens(fns))

25000lines [00:04, 5575.63lines/s]


Vocabulary Sample:


AttributeError: 'Vocab' object has no attribute 'items'

# Create Dataset

In [16]:
fns = glob(os.path.join('train', '*', '*.txt'))
label_map = {'pos': 1, 'neg': 0}





dataset = datasetclass.TextSentimentDataset(fns, tokenizer, vocab, label_map)


def collate_batch(batch):
    text_list, label_list = [], []
    for _text, _label in batch:
        text_list.append(torch.tensor(_text))
        label_list.append(torch.tensor(_label))
    
    # pad sequences so that each one has the same length (for training)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=0)
    
    return text_list, torch.tensor(label_list)


dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)


# Train Model

In [None]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size) # create embedding for sequence
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size) # classification head

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        out = self.fc(hidden[-1])
        return out

# hyperparameters
vocab_size = len(vocab)
embed_size = 64
hidden_size = 128
output_size = 2  # Positive or Negative
learning_rate = 0.001
num_epochs = 50

# set device
if torch.cuda.is_available(): device = 'cuda'
else: device = 'cpu'
print(f'Running on {device}')

# initialize model, loss, and optimizer
model = RNNModel(vocab_size, embed_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# train loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for texts, labels in tqdm(dataloader):
        texts, labels = texts.to(device), labels.to(device) # move to the correct device
        outputs = model(texts)
        loss = criterion(outputs, labels)

        running_loss += loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch}, Loss{running_loss/len(dataloader)}')


Running on cuda


  text_list.append(torch.tensor(_text))
  label_list.append(torch.tensor(_label))
100%|██████████| 782/782 [00:09<00:00, 78.20it/s]


Epoch 0, Loss0.6966196894645691


100%|██████████| 782/782 [00:08<00:00, 87.64it/s]


Epoch 1, Loss0.6960961818695068


100%|██████████| 782/782 [00:08<00:00, 89.03it/s]


Epoch 2, Loss0.7008267045021057


 28%|██▊       | 219/782 [00:02<00:06, 86.69it/s]

In [1]:

import torch
print("PyTorch Version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)

PyTorch Version: 2.4.1
CUDA Available: True
CUDA Version: 12.4
