In [152]:
import os
import torch
from torchvision.datasets.utils import download_url
# import zipfile 

import pandas as pd

### Load data

In [153]:
# Read data - short version 3400+ documents
reuters = pd.read_pickle('input/reuters_small.pkl')
print(len(reuters))
reuters[0:5]

3426


Unnamed: 0,codes,headline,text,classes,classes_pad
0,"[C18, C181, CCAT]",Eureko is latest suitor for French insurer GAN.,"\nEureko, an alliance of six European financia...","[25, 26, 44]","[25, 26, 44, -1, -1, -1, -1, -1, -1, -1, -1, -..."
1,"[G15, GCAT]",Reuter EC Report Long-Term Diary for July 28 -...,\n****\nHIGHLIGHTS\n****\nLUXEMBOURG - Luxembo...,"[80, 90]","[80, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
2,"[G15, GCAT]",Official Journal contents - OJ L 190 of July 1...,\n*\n(Note - contents are displayed in reverse...,"[80, 90]","[80, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
3,"[G15, GCAT]",Official Journal contents - OJ C 221 of July 1...,\n*\n(Note - contents are displayed in reverse...,"[80, 90]","[80, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
4,"[G15, GCAT]",Official Journal contents - OJ C 220 of July 1...,\n*\n(Note - contents are displayed in reverse...,"[80, 90]","[80, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"


In [154]:
# load full data, if produced. 435 MB. NOT in Github currently. 
# You can produce this in about 30 min with preprocess_data.ipynb

#reuters = pd.read_pickle('input/reuters_all.pkl')

In [155]:
# read classcodes
classcodes= pd.read_csv('input/classcodes.csv')
print(len(classcodes))
#classcodes[0:12]

126


### Using DataSet


Example in:
https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/A%20-%20Using%20TorchText%20with%20Your%20Own%20Datasets.ipynb

In [156]:
from torchtext import data
from torchtext import datasets

#Define the Fields
TEXT = data.Field()
HEADLINE = data.Field()

LABEL = data.LabelField(use_vocab=False)
#CLASSES = data.LabelField()

In [157]:
reuters[0:5]

Unnamed: 0,codes,headline,text,classes,classes_pad
0,"[C18, C181, CCAT]",Eureko is latest suitor for French insurer GAN.,"\nEureko, an alliance of six European financia...","[25, 26, 44]","[25, 26, 44, -1, -1, -1, -1, -1, -1, -1, -1, -..."
1,"[G15, GCAT]",Reuter EC Report Long-Term Diary for July 28 -...,\n****\nHIGHLIGHTS\n****\nLUXEMBOURG - Luxembo...,"[80, 90]","[80, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
2,"[G15, GCAT]",Official Journal contents - OJ L 190 of July 1...,\n*\n(Note - contents are displayed in reverse...,"[80, 90]","[80, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
3,"[G15, GCAT]",Official Journal contents - OJ C 221 of July 1...,\n*\n(Note - contents are displayed in reverse...,"[80, 90]","[80, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
4,"[G15, GCAT]",Official Journal contents - OJ C 220 of July 1...,\n*\n(Note - contents are displayed in reverse...,"[80, 90]","[80, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"


In [158]:
len(reuters)

3426

In [159]:
### Test, save pd df to JSON
                            # one record at time, one record by line
reuters.to_json('input/reuters_small.json', orient='records', lines=True)

In [160]:
# split it
train = reuters[0:2500]
test = reuters[2500:3000]
val = reuters[3000:len(reuters)]

train.to_json('input/train.json', orient='records', lines=True)
test.to_json('input/test.json', orient='records', lines=True)
val.to_json('input/val.json', orient='records', lines=True)

In [161]:
# Tell torchText which Fields to apply to which json elements

fields = {'headline': ('h', HEADLINE), 'text': ('t', TEXT), 'classes_pad': ('l', LABEL)}

In [162]:
fields

{'headline': ('h', <torchtext.data.field.Field at 0x7ff4f70f6710>),
 'text': ('t', <torchtext.data.field.Field at 0x7ff4f70f66d8>),
 'classes_pad': ('l', <torchtext.data.field.LabelField at 0x7ff4f70f6748>)}

In [163]:
# Create dataset (TabularDataset)
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = 'input',
                                        train = 'train.json',
                                        validation = 'val.json',
                                        test = 'test.json',
                                        format = 'json',
                                        fields = fields
)

In [164]:
# test
print(vars(train_data[2]))

{'h': ['Official', 'Journal', 'contents', '-', 'OJ', 'L', '190', 'of', 'July', '19,', '1997.'], 't': ['*', '(Note', '-', 'contents', 'are', 'displayed', 'in', 'reverse', 'order', 'to', 'that', 'in', 'the', 'printed', 'Journal)', '*', 'COMMISSION', 'DECISION', 'of', '16', 'July', '1997', 'amending', 'Decision', '97/285/EC', 'concerning', 'certain', 'protection', 'measures', 'relating', 'to', 'classical', 'swine', 'fever', 'in', 'Spain', '(Text', 'with', 'EEA', 'relevance)', 'COMMISSION', 'REGULATION', '(EC)', 'No', '1399/97', 'of', '18', 'July', '1997', 'establishing', 'the', 'standard', 'import', 'values', 'for', 'determining', 'the', 'entry', 'price', 'of', 'certain', 'fruit', 'and', 'vegetables', 'COMMISSION', 'REGULATION', '(EC)', 'No', '1398/97', 'of', '18', 'July', '1997', 'fixing', 'the', 'agricultural', 'conversion', 'rates', 'COMMISSION', 'REGULATION', '(EC)', 'No', '1397/97', 'of', '18', 'July', '1997', 'fixing,', 'for', 'the', '1996/97', 'marketing', 'year,', 'the', 'specific

### continue

train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_sizes=(16, 256, 256),
    sort_key=lambda x: len(x.TEXT), device=0)
    #sort_key=lambda x: len(x.TEXT), device=device)


In [165]:
TEXT

<torchtext.data.field.Field at 0x7ff4f70f66d8>

In [189]:
TEXT.build_vocab(train, max_size=25000)
HEADLINE.build_vocab(train)
#LABEL.build_vocab(train)

In [176]:
TEXT.vocab

<torchtext.vocab.Vocab at 0x7ff4f70f6400>

In [177]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 2500
Number of validation examples: 426
Number of testing examples: 500


In [190]:
#TEXT.build_vocab(train_data, max_size=25000)
#LABEL.build_vocab(train_data)

In [191]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens HEADLINE vocabulary: {len(HEADLINE.vocab)}")

Unique tokens in TEXT vocabulary: 16
Unique tokens HEADLINE vocabulary: 16


In [192]:
print(TEXT.vocab.freqs.most_common(10))

[('s', 7), ('e', 6), ('a', 4), ('c', 3), ('d', 3), ('l', 3), ('t', 2), ('o', 1), ('h', 1), ('i', 1)]


In [193]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 's', 'e', 'a', 'c', 'd', 'l', 't', '_']


In [194]:
print(LABEL.vocab.stoi)

AttributeError: 'LabelField' object has no attribute 'vocab'

In [195]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    device=device)

### simple model

In [196]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):

        #x = [sent len, batch size]
        
        embedded = self.embedding(x)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [197]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [198]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [199]:
criterion = nn.BCEWithLogitsLoss()

### Or??

In [200]:
criterion = nn.BCELoss()

In [201]:
criterion = nn.MultiLabelSoftMarginLoss()

In [202]:
criterion = nn.MultiLabelMarginLoss()

In [203]:
model = model.to(device)
criterion = criterion.to(device)

In [204]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [205]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [206]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [207]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

TypeError: an integer is required (got type NoneType)