In [None]:
!pip install torchtext==0.6 torch==1.11

Collecting torchtext==0.6
  Using cached torchtext-0.6.0-py3-none-any.whl (64 kB)
Collecting torch==1.11
  Downloading torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl (750.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.6/750.6 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from torchtext==0.6)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 2.1.0+cu121
    Uninstalling torch-2.1.0+cu121:
      Successfully uninstalled torch-2.1.0+cu121
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.16.0
    Uninstalling torchtext-0.16.0:
      Successfully uninstalled torchtext-0.16.0
[31mERROR: pip's dependency resolve

In [None]:
import torch
import torch.nn.functional as F
import torchtext
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
from torchtext.data.utils import get_tokenizer
import time
import random
import pandas as pd

# Request deterministic cuDNN kernels so repeated GPU runs are comparable.
torch.backends.cudnn.deterministic = True

In [None]:
# --- Reproducibility ---------------------------------------------------------
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)  # seeds torch only; Python's `random` is seeded where the data is split

# --- Data / optimisation settings --------------------------------------------
vocab_size = 20000   # passed as max_size when the text vocabulary is built
l_rate = 0.005       # learning rate (same value the optimizer is built with)
batch_size = 128     # batch size handed to the iterators
num_epochs = 15      # number of passes over the training split

# --- Model dimensions --------------------------------------------------------
embedding_dim = 128  # size of each token embedding
hidden_dim = 256     # size of the LSTM hidden state
num_classes = 2      # number of output logits

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Download the gzipped review CSV (movie_data.csv.gz) into the working directory.
!wget https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz

--2024-01-05 03:45:19--  https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz [following]
--2024-01-05 03:45:19--  https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26521894 (25M) [application/octet-stream]
Saving to: ‘movie_data.csv.gz’


2024-01-05 03:45:19 (173 MB/s) - ‘movie_data.csv.gz’ saved [26521894/26521894]



In [None]:
# Decompress to movie_data.csv; -f overwrites a copy left behind by an earlier run.
!gunzip -f movie_data.csv.gz

In [None]:
import pandas as pd

# Quick look at the raw file with pandas. As far as this notebook shows, `df` is
# only inspected here; the model input is parsed from the same CSV by torchtext.
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [None]:
# spaCy-backed tokenizer; assumes the `en_core_web_sm` spaCy model is available
# in this environment.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [None]:
# LABEL turns the sentiment column into int64 class ids;
# TEXT runs every review through the spaCy tokenizer defined above.
LABEL = torchtext.data.LabelField(dtype=torch.long)
TEXT = Field(tokenize=tokenizer)

In [None]:
# (attribute name, field) pairs applied to the CSV columns in order:
# the review text first, the sentiment label second.
fields= [('text', TEXT), ('label', LABEL)]

In [None]:
# Parse the CSV into torchtext examples: the header row is skipped and the
# columns are bound to the (name, field) pairs declared in `fields`.
dataset = TabularDataset(
    path='movie_data.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)

In [None]:
# 80/20 train/test split with a fixed shuffle.
# random.seed() returns None, so it must not be passed as `random_state` itself:
# seed Python's RNG first, then hand split() the resulting generator state.
random.seed(RANDOM_SEED)
train_data, test_data = dataset.split(
    split_ratio=[0.8, 0.2],
    random_state=random.getstate())

print(f'Num Train: {len(train_data)}')
print(f'Num Test: {len(test_data)}')

Num Train: 40000
Num Test: 10000


In [None]:
# Carve a 15% validation set out of the training split.
# NOTE: this rebinds `train_data`, so the cell is not idempotent; re-run the
# train/test split cell before re-running this one.
# random.seed() returns None, so seed first and pass the generator state explicitly.
random.seed(RANDOM_SEED)
train_data, valid_data = train_data.split(
    split_ratio=[0.85, 0.15],
    random_state=random.getstate())

print(f'Num Train: {len(train_data)}')
print(f'Num Validation: {len(valid_data)}')

Num Train: 34000
Num Validation: 6000


In [None]:
# Both vocabularies are fitted on the training split only. `max_size` caps the
# text vocabulary at `vocab_size` entries; the reported length can be slightly
# larger (20002 in the recorded run), presumably from torchtext's special tokens.
LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data, max_size=vocab_size)

print(
    f'Vocabulary size: {len(TEXT.vocab)}',
    f'Number of classes: {len(LABEL.vocab)}',
    sep='\n',
)

Vocabulary size: 20002
Number of classes: 2


In [None]:
# Raw label string -> class index used by the model.
# NOTE: in the recorded run sentiment '1' maps to index 0 and sentiment '0' to
# index 1, so predicted class ids are inverted relative to the raw CSV labels.
print(LABEL.vocab.stoi)

defaultdict(None, {'1': 0, '0': 1})


In [None]:
# Per-label example counts in the training split (class-balance check).
LABEL.vocab.freqs

Counter({'0': 16981, '1': 17019})

In [None]:
# One iterator per split, all using the same batch size and placing their
# batches on `device`. `sort_key` measures an example by its token count.
train_loader, valid_loader, test_loader = BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    sort_key=lambda example: len(example.text),
    sort_within_batch=False,
    batch_size=batch_size,
)

In [None]:
class RNN(torch.nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier on the final hidden state.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return class logits for a batch of token-id sequences.

        Args:
            text: Integer tensor of token ids laid out as (seq_len, batch); the
                LSTM is constructed with its default batch_first=False.

        Returns:
            Tensor of shape (batch, output_dim) holding unnormalized logits.
        """
        embedded = self.embedding(text)        # (seq_len, batch, embedding_dim)
        _, (hidden, _) = self.rnn(embedded)    # hidden: (1, batch, hidden_dim)
        # Out-of-place squeeze: leave the tensor returned by the LSTM unmodified.
        return self.fc(hidden.squeeze(0))

In [None]:
# Re-seed right before construction so the initial weights do not depend on how
# much randomness earlier cells consumed.
torch.manual_seed(RANDOM_SEED)
model = RNN(input_dim=len(TEXT.vocab),
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            output_dim=num_classes # could use 1 for binary classification
)
model = model.to(device)
# Read the learning rate from the configuration cell instead of repeating the literal.
optimizer = torch.optim.Adam(model.parameters(), lr=l_rate)

In [None]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of `model` over `data_loader`, in percent.

    Args:
        model: Module mapping a batch of features to logits of shape
            (batch, num_classes).
        data_loader: Iterable whose items unpack into (features, targets).
        device: Device the batches are moved to before the forward pass.

    Returns:
        0-dim float tensor with the accuracy in [0, 100]; 0.0 when the loader
        yields no examples.
    """
    was_training = model.training
    model.eval()  # inference behaviour for layers such as dropout / batch norm

    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0
    try:
        with torch.no_grad():
            for features, targets in data_loader:
                features = features.to(device)
                targets = targets.to(device)

                logits = model(features)
                _, predicted_labels = torch.max(logits, 1)

                num_examples += targets.size(0)
                correct_pred += (predicted_labels == targets).sum()
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)

    if num_examples == 0:
        # Nothing was evaluated, so there is nothing to divide by.
        return correct_pred.float()
    return correct_pred.float() / num_examples * 100

In [None]:
start_time = time.time()

for epoch in range(num_epochs):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        text = batch_data.text.to(device)
        labels = batch_data.label.to(device)

        # Forward pass; cross_entropy takes the raw logits.
        preds = model(text)
        loss = F.cross_entropy(preds, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Progress line every 50 batches.
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{num_epochs:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Loss: {loss:.4f}')

    # Report metrics in inference mode; model.train() at the top of the loop
    # switches back for the next epoch. compute_accuracy disables gradient
    # tracking itself, so no extra wrapper is needed here.
    model.eval()
    print(f'training accuracy: '
          f'{compute_accuracy(model, train_loader, device):.2f}%'
          f'\nvalid accuracy: '
          f'{compute_accuracy(model, valid_loader, device):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
# The model is still in eval mode from the last epoch's metrics step.
print(f'Test accuracy: {compute_accuracy(model, test_loader, device):.2f}%')

Epoch: 001/015 | Batch 000/266 | Loss: 0.7168
Epoch: 001/015 | Batch 050/266 | Loss: 0.6913
Epoch: 001/015 | Batch 100/266 | Loss: 0.6896
Epoch: 001/015 | Batch 150/266 | Loss: 0.6994
Epoch: 001/015 | Batch 200/266 | Loss: 0.6907
Epoch: 001/015 | Batch 250/266 | Loss: 0.6928
training accuracy: 50.08%
valid accuracy: 51.15%
Time elapsed: 0.92 min
Epoch: 002/015 | Batch 000/266 | Loss: 0.6926
Epoch: 002/015 | Batch 050/266 | Loss: 0.6949
Epoch: 002/015 | Batch 100/266 | Loss: 0.6927
Epoch: 002/015 | Batch 150/266 | Loss: 0.6930
Epoch: 002/015 | Batch 200/266 | Loss: 0.6917
Epoch: 002/015 | Batch 250/266 | Loss: 0.6920
training accuracy: 50.14%
valid accuracy: 49.53%
Time elapsed: 1.81 min
Epoch: 003/015 | Batch 000/266 | Loss: 0.7058
Epoch: 003/015 | Batch 050/266 | Loss: 0.6929
Epoch: 003/015 | Batch 100/266 | Loss: 0.6975
Epoch: 003/015 | Batch 150/266 | Loss: 0.6912
Epoch: 003/015 | Batch 200/266 | Loss: 0.6910
Epoch: 003/015 | Batch 250/266 | Loss: 0.6929
training accuracy: 50.20%
va

NameError: name 'DEVICE' is not defined

In [None]:
# Repeats the two summary lines that end the training cell. The elapsed time is
# measured from `start_time` to the moment this cell runs, so it grows if the
# cell is executed later than the training itself.
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, device):.2f}%')

Total Training Time: 14.59 min
Test accuracy: 83.66%


In [None]:
# Colab-only: mount Google Drive under /content/drive so files written there
# outlive the runtime. This import is unavailable outside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist only the parameters (state_dict) to the mounted Drive folder; the
# model class is needed again to rebuild the module before loading them.
torch.save(model.state_dict(), '/content/drive/My Drive/Colab Notebooks/hello.pt')

In [None]:
# The model object must exist before its weights can be restored. In a fresh
# session, rebuild it first with the same dimensions used at save time:
#     model = RNN(...).to(device)
# map_location remaps tensors saved from a GPU run onto the current device, so
# the checkpoint also loads on a CPU-only machine.
model.load_state_dict(
    torch.load('/content/drive/My Drive/Colab Notebooks/hello.pt',
               map_location=device))


<All keys matched successfully>

In [None]:
model

RNN(
  (embedding): Embedding(20002, 128)
  (rnn): LSTM(128, 256)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)