In [10]:
import os
import joblib
from glob import glob
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

import torch.nn as nn
import torch.optim as optim

from sentimentRNN import datasetclass
from sentimentRNN import textpreprocess

# create vocab

In [14]:
tokenizer = get_tokenizer("basic_english")

fns = glob(os.path.join('train', '*', '*.txt'))

def yield_tokens(fns):
    for fn in fns:
        with open(fn, 'r') as f:
            text = f.readlines()[0]
            tokens = tokenizer(textpreprocess.clean_text(text))
            yield tokens

# make vocabulary from tokenized text
vocab = build_vocab_from_iterator(yield_tokens(fns))

25000lines [00:04, 5575.63lines/s]


Vocabulary Sample:


AttributeError: 'Vocab' object has no attribute 'items'

# Create Dataset

In [16]:
fns = glob(os.path.join('train', '*', '*.txt'))
label_map = {'pos': 1, 'neg': 0}





dataset = datasetclass.TextSentimentDataset(fns, tokenizer, vocab, label_map)


def collate_batch(batch):
    text_list, label_list = [], []
    for _text, _label in batch:
        text_list.append(torch.tensor(_text))
        label_list.append(torch.tensor(_label))
    
    # pad sequences so that each one has the same length (for training)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=0)
    
    return text_list, torch.tensor(label_list)


dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)


# Train Model

In [17]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size) # create embedding for sequence
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size) # classification head

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        out = self.fc(hidden[-1])
        return out

# hyperparameters
vocab_size = len(vocab)
embed_size = 64
hidden_size = 128
output_size = 2  # Positive or Negative
learning_rate = 0.001
num_epochs = 50

# set device
if torch.cuda.is_available(): device = 'cuda'
else: device = 'cpu'
print(f'Running on {device}')

# initialize model, loss, and optimizer
model = RNNModel(vocab_size, embed_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# train loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for texts, labels in tqdm(dataloader):
        texts, labels = texts.to(device), labels.to(device) # move to the correct device
        outputs = model(texts)
        loss = criterion(outputs, labels)

        running_loss += loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch}, Loss {running_loss/len(dataloader)}')


Running on cuda


  text_list.append(torch.tensor(_text))
  label_list.append(torch.tensor(_label))
100%|██████████| 782/782 [00:09<00:00, 78.20it/s]


Epoch 0, Loss0.6966196894645691


100%|██████████| 782/782 [00:08<00:00, 87.64it/s]


Epoch 1, Loss0.6960961818695068


100%|██████████| 782/782 [00:08<00:00, 89.03it/s]


Epoch 2, Loss0.7008267045021057


100%|██████████| 782/782 [00:08<00:00, 87.23it/s]


Epoch 3, Loss0.6981198787689209


100%|██████████| 782/782 [00:08<00:00, 88.88it/s]


Epoch 4, Loss0.6998290419578552


100%|██████████| 782/782 [00:08<00:00, 88.40it/s]


Epoch 5, Loss0.6987996101379395


100%|██████████| 782/782 [00:08<00:00, 87.34it/s]


Epoch 6, Loss0.6984308958053589


100%|██████████| 782/782 [00:08<00:00, 90.36it/s]


Epoch 7, Loss0.6977647542953491


100%|██████████| 782/782 [00:08<00:00, 88.27it/s]


Epoch 8, Loss0.6976149082183838


100%|██████████| 782/782 [00:08<00:00, 90.17it/s]


Epoch 9, Loss0.6989143490791321


100%|██████████| 782/782 [00:08<00:00, 89.07it/s]


Epoch 10, Loss0.6987650990486145


100%|██████████| 782/782 [00:08<00:00, 88.86it/s]


Epoch 11, Loss0.6978203058242798


100%|██████████| 782/782 [00:08<00:00, 90.11it/s]


Epoch 12, Loss0.6970860362052917


100%|██████████| 782/782 [00:08<00:00, 89.22it/s]


Epoch 13, Loss0.6989297866821289


100%|██████████| 782/782 [00:08<00:00, 88.95it/s]


Epoch 14, Loss0.6952516436576843


100%|██████████| 782/782 [00:08<00:00, 89.91it/s]


Epoch 15, Loss0.695522129535675


100%|██████████| 782/782 [00:08<00:00, 88.29it/s]


Epoch 16, Loss0.695698082447052


100%|██████████| 782/782 [00:08<00:00, 90.21it/s]


Epoch 17, Loss0.6939797401428223


100%|██████████| 782/782 [00:08<00:00, 88.62it/s]


Epoch 18, Loss0.693157434463501


100%|██████████| 782/782 [00:08<00:00, 89.37it/s]


Epoch 19, Loss0.6910996437072754


100%|██████████| 782/782 [00:08<00:00, 89.34it/s]


Epoch 20, Loss0.6895789504051208


100%|██████████| 782/782 [00:08<00:00, 88.54it/s]


Epoch 21, Loss0.6884390711784363


100%|██████████| 782/782 [00:08<00:00, 89.31it/s]


Epoch 22, Loss0.6891595721244812


100%|██████████| 782/782 [00:08<00:00, 88.47it/s]


Epoch 23, Loss0.6867878437042236


100%|██████████| 782/782 [00:08<00:00, 88.57it/s]


Epoch 24, Loss0.6861751675605774


100%|██████████| 782/782 [00:08<00:00, 89.20it/s]


Epoch 25, Loss0.6866506338119507


100%|██████████| 782/782 [00:08<00:00, 87.33it/s]


Epoch 26, Loss0.6852633953094482


100%|██████████| 782/782 [00:08<00:00, 89.20it/s]


Epoch 27, Loss0.6841714978218079


100%|██████████| 782/782 [00:08<00:00, 88.25it/s]


Epoch 28, Loss0.6851123571395874


100%|██████████| 782/782 [00:08<00:00, 88.31it/s]


Epoch 29, Loss0.6840166449546814


100%|██████████| 782/782 [00:08<00:00, 88.58it/s]


Epoch 30, Loss0.6847826242446899


100%|██████████| 782/782 [00:08<00:00, 88.53it/s]


Epoch 31, Loss0.685257613658905


100%|██████████| 782/782 [00:08<00:00, 90.18it/s]


Epoch 32, Loss0.6853000521659851


100%|██████████| 782/782 [00:08<00:00, 88.22it/s]


Epoch 33, Loss0.6821085214614868


100%|██████████| 782/782 [00:08<00:00, 89.18it/s]


Epoch 34, Loss0.6830213069915771


100%|██████████| 782/782 [00:09<00:00, 86.04it/s]


Epoch 35, Loss0.6831216812133789


100%|██████████| 782/782 [00:09<00:00, 85.55it/s]


Epoch 36, Loss0.6835502982139587


100%|██████████| 782/782 [00:08<00:00, 89.55it/s]


Epoch 37, Loss0.681152880191803


100%|██████████| 782/782 [00:08<00:00, 89.91it/s]


Epoch 38, Loss0.6836217045783997


100%|██████████| 782/782 [00:08<00:00, 90.29it/s]


Epoch 39, Loss0.6807709336280823


100%|██████████| 782/782 [00:08<00:00, 89.70it/s]


Epoch 40, Loss0.6843134164810181


100%|██████████| 782/782 [00:08<00:00, 89.87it/s]


Epoch 41, Loss0.6824333071708679


100%|██████████| 782/782 [00:08<00:00, 89.68it/s]


Epoch 42, Loss0.6815732717514038


100%|██████████| 782/782 [00:08<00:00, 88.55it/s]


Epoch 43, Loss0.6822087168693542


100%|██████████| 782/782 [00:08<00:00, 90.21it/s]


Epoch 44, Loss0.6799623370170593


100%|██████████| 782/782 [00:08<00:00, 90.89it/s]


Epoch 45, Loss0.6812628507614136


100%|██████████| 782/782 [00:08<00:00, 90.04it/s]


Epoch 46, Loss0.6813211441040039


100%|██████████| 782/782 [00:08<00:00, 91.16it/s]


Epoch 47, Loss0.6801704168319702


100%|██████████| 782/782 [00:08<00:00, 90.27it/s]


Epoch 48, Loss0.6807780861854553


100%|██████████| 782/782 [00:08<00:00, 90.70it/s]

Epoch 49, Loss0.6805717945098877





In [18]:
joblib.dump(model, 'first model atempt.pkl')

['first model atempt.pkl']

In [1]:

import torch
print("PyTorch Version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)

PyTorch Version: 2.4.1
CUDA Available: True
CUDA Version: 12.4
