In [1]:
import torch
from torchtext import data

In [2]:
# Field objects describe how each column of the dataset is processed.
# Review text is tokenized with the 'spacy' tokenizer.
TEXT = data.Field(tokenize='spacy')
# Labels are produced with a float dtype.
LABEL = data.LabelField(dtype=torch.float)

In [None]:
from torchtext import datasets

# Build the IMDB train and test datasets, processing review text with TEXT
# and sentiment labels with LABEL.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# Report how many examples each split holds as a quick check on the load.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

In [26]:
# Dump the attributes of the first training example to see what the
# tokenized data looks like.
print(vars(train_data[0]))

{'text': ['GBS', 'wrote', 'his', 'own', 'screen', 'adaptation', 'of', 'this', 'Nobel', 'Prize', 'winning', 'play', 'but', 'did', "n't", 'live', 'to', 'see', 'it', 'produced', '(', 'he', 'had', 'won', 'an', 'Oscar', 'in', '1938', 'for', 'his', 'brilliant', 'adaptation', 'of', 'his', '1914', 'play', 'PYGMALION', ')', '.', 'When', 'Otto', 'Preminger', 'mounted', '(', 'produced', 'and', 'directed', ')', 'this', 'production', 'in', '1957', ',', 'seven', 'years', 'after', 'Shaw', "'s", 'death', ',', 'he', 'had', 'noted', 'British', 'author', 'Graham', 'Greene', 'do', 'the', 'adaptation', 'and', 'it', 'was', 'a', 'solid', 'choice.<br', '/><br', '/>Taking', 'a', 'cue', 'from', 'Shaw', "'s", 'own', 'screenplay', ',', 'Greene', 'uses', 'material', 'from', 'the', 'stage', 'Epilogue', 'to', 'create', 'a', 'framing', 'device', 'to', 'meld', 'the', 'two', 'acts', 'of', 'the', 'play', '(', 'one', 'early', 'and', 'one', 'late', 'in', 'Joan', "'s", 'story', ')', 'into', 'a', 'unified', 'and', 'most', '

In [0]:
import random

SEED = 0

# random.seed() returns None, so its result must not be used as the
# random_state argument. Seed the global generator first, then pass the
# resulting generator state explicitly (the torchtext docs describe
# random_state as a return value of random.getstate()).
random.seed(SEED)

# NOTE: this rebinds train_data to the smaller training portion, so running
# the cell a second time would split the already-reduced set again.
train_data, valid_data = train_data.split(random_state=random.getstate())

In [0]:
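# NOTE: reference only, not executed in this notebook. This commented-out
# sketch shows a LabelField forcing sequential=False, unk_token=None and
# is_target=True before delegating to Field.__init__.
# class LabelField(Field):
#     def __init__(self, **kwargs):
#         kwargs['sequential'] = False
#         kwargs['unk_token'] = None
#         kwargs['is_target'] = True
        
#         super(LabelField, self).__init__(**kwargs)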

In [0]:
# Upper bound handed to TEXT.build_vocab as max_size.
MAX_VOCAB_SIZE = 50_000

# Both vocabularies are built from the training split only.
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [39]:
# Show the 20 most frequent tokens counted while building the TEXT
# vocabulary, together with their counts.
print(TEXT.vocab.freqs.most_common(20))

[('the', 202935), (',', 192993), ('.', 165982), ('and', 109677), ('a', 109555), ('of', 101096), ('to', 93933), ('is', 76589), ('in', 61368), ('I', 54708), ('it', 53827), ('that', 49401), ('"', 44483), ("'s", 43319), ('this', 42513), ('-', 37019), ('/><br', 35796), ('was', 35010), ('as', 30457), ('with', 29878)]


In [0]:
BATCH_SIZE = 64

# Run on the GPU when one is available, otherwise fall back to the CPU.
# The device string must be spelled 'cuda' for torch.device to accept it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The validation iterator must be called `valid_iterator`: that is the name
# the training loop passes to evaluate().
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [0]:
import torch.nn as nn

class RNN(nn.Module):
    """Recurrent classifier: embedding -> nn.RNN -> linear head.

    Args:
        input_dim: number of rows in the embedding table.
        embedding_dim: size of each embedding vector (input size of the RNN).
        hidden_dim: size of the RNN hidden state (input size of the head).
        output_dim: number of values the linear head emits per sequence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # The attribute names double as state_dict keys, so they are kept
        # exactly as they are (including `nn` for the recurrent layer).
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.nn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map a tensor of token indices to the linear head's raw outputs.

        The RNN is built with its default arguments, so the first axis of
        `text` is treated as the sequence axis.
        """
        token_vectors = self.embedding(text)
        all_steps, final_state = self.nn(token_vectors)
        last_hidden = final_state.squeeze(0)

        # Sanity check: the output at the last time step must be identical to
        # the final hidden state returned by the RNN.
        assert torch.equal(all_steps[-1, :, :], last_hidden)
        return self.fc(last_hidden)

In [0]:
# Fixed model sizes.
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

# The embedding table needs one row per entry in the TEXT vocabulary.
INPUT_DIM = len(TEXT.vocab)

In [0]:
# Instantiate the classifier and move its parameters to the chosen device.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

In [0]:
import torch.optim as optim
# Plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.01)

# BCEWithLogitsLoss takes raw logits, so the model output is passed to it
# without applying a sigmoid first.
criterion = nn.BCEWithLogitsLoss().to(device)

In [0]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in a batch that match the labels.

    `preds` are raw logits: they are passed through a sigmoid and rounded to
    the nearest integer before being compared with `y`. The result is a
    tensor holding a ratio, e.g. 0.8 (NOT 8) when 8 out of 10 are right.
    """
    hard_preds = torch.sigmoid(preds).round()
    hits = (hard_preds == y).float()  # bool -> float so the sum can be divided
    return hits.sum() / len(hits)

In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean batch loss and accuracy.

    Args:
        model: module called as ``model(batch.text)``; its output is squeezed
            along dimension 1 before being scored.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: optimizer whose gradients are reset and stepped per batch.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` averaged over the batches.

    Raises:
        ZeroDivisionError: if ``len(iterator)`` is 0.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        predictions = model(batch.text).squeeze(1)

        loss = criterion(predictions, batch.label)
        # Per-batch accuracy must be computed here; the running total below
        # accumulates it.
        acc = binary_accuracy(predictions, batch.label)

        loss.backward()
        optimizer.step()

        # .item() extracts Python floats so the totals hold no graph references.
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)
        

In [0]:
def evaluate(model, iterator, criterion):
    """Score `model` on every batch of `iterator` without updating it.

    The model is switched to eval mode and all batches are processed with
    gradient tracking disabled. Returns the loss and the accuracy, each
    averaged over the number of batches.
    """
    running_loss = 0
    running_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)
            running_loss += batch_loss.item()
            running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [0]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds, returned as ``(mins, secs)``.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds


In [69]:
# Number of full passes over train_iterator.
N_EPOCHS = 5

# Lowest validation loss seen so far; starting at +inf means any finite
# first-epoch loss counts as an improvement.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One pass of training followed by one pass of validation.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint the weights only when the validation loss improves, so the
    # file always holds the best model seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    # Per-epoch report; accuracies are fractions, so scale them to percent.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

AttributeError: ignored