In [1]:
import os
import torch
from torchvision.datasets.utils import download_url
# import zipfile 

import pandas as pd

### Load data

In [2]:
# Load the short version of the data set (3400+ documents).
# NOTE(review): read_pickle unpickles arbitrary objects; only use it on trusted files.
reuters = pd.read_pickle('input/reuters_small.pkl')
print(reuters.shape[0])
reuters.head()

3426


Unnamed: 0,codes,headline,text
0,"[C18, C181, CCAT]",Eureko is latest suitor for French insurer GAN.,"\nEureko, an alliance of six European financia..."
1,"[G15, GCAT]",Reuter EC Report Long-Term Diary for July 28 -...,\n****\nHIGHLIGHTS\n****\nLUXEMBOURG - Luxembo...
2,"[G15, GCAT]",Official Journal contents - OJ L 190 of July 1...,\n*\n(Note - contents are displayed in reverse...
3,"[G15, GCAT]",Official Journal contents - OJ C 221 of July 1...,\n*\n(Note - contents are displayed in reverse...
4,"[G15, GCAT]",Official Journal contents - OJ C 220 of July 1...,\n*\n(Note - contents are displayed in reverse...


In [3]:
# Load the full data set instead, if it has been produced (435 MB; not currently on GitHub).
# It can be generated in about 30 minutes with preprocess_data.ipynb.

#reuters = pd.read_pickle('input/reuters_all.pkl')

In [4]:
# Read the class-code table and report how many rows it has.
classcodes = pd.read_csv('input/classcodes.csv')
print(classcodes.shape[0])

126


In [5]:
# Add the current row index to the DataFrame as an explicit 'index' column.
classcodes = classcodes.reset_index()

# Lookup tables: integer row index -> class code, and class code -> integer.
itocode = dict(zip(classcodes.index, classcodes.Code))
codetoi = dict(zip(classcodes.Code, classcodes.index))


def listToInt(mylist):
    """Translate a list of class-code strings into their integer ids.

    Uses the module-level `codetoi` mapping; raises KeyError for a code that
    is not present in it.
    """
    return [codetoi[item] for item in mylist]


# Replace each document's list of code strings with a list of integer ids.
reuters['codes'] = [listToInt(codelist) for codelist in reuters.codes]
# Multi-hot encoding for a single list of class ids (one row)
def multihot(tags, all_tags=None):
    """Return a 0/1 list with one entry per class: 1 where the class is in `tags`.

    tags: iterable of class ids attached to one document.
    all_tags: ordered iterable of every class id. Defaults to the module-level
        `taglist`, which is looked up when the function is called.
    """
    if all_tags is None:
        all_tags = taglist
    # A set gives constant-time membership tests instead of scanning the
    # list once per class.
    present = set(tags)
    return [1 if tag in present else 0 for tag in all_tags]

# Every class id, taken from the class-code table's index (126 ints: 0..125).
taglist = list(classcodes.index)
# One multi-hot vector per document; it replaces the integer id lists in 'codes'.
Y_hot = list(map(multihot, reuters.codes))
reuters['codes'] = Y_hot

In [6]:
# Preview the first rows only instead of dumping the whole DataFrame.
reuters.head(10)

Unnamed: 0,codes,headline,text
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Eureko is latest suitor for French insurer GAN.,"\nEureko, an alliance of six European financia..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Reuter EC Report Long-Term Diary for July 28 -...,\n****\nHIGHLIGHTS\n****\nLUXEMBOURG - Luxembo...
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Official Journal contents - OJ L 190 of July 1...,\n*\n(Note - contents are displayed in reverse...
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Official Journal contents - OJ C 221 of July 1...,\n*\n(Note - contents are displayed in reverse...
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Official Journal contents - OJ C 220 of July 1...,\n*\n(Note - contents are displayed in reverse...
5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Official Journal contents - OJ C 219 of July 1...,\n*\n(Note - contents are displayed in reverse...
6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Fears for Kali & Salz jobs after BASF sale blo...,\nGerman Economics Minister Guenter Rexrodt ha...
7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",TABLE - Canada historical interest and other r...,\n3-MO 6-MO 1-YR BRATE C30YR CAN D...
8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Toronto stocks jump one percent after Greenspan.,\nCHANGE\t\t\t\t CHANGE\nTSE\t 6718.00 +7...
9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Toronto stocks open higher as bullion boosts g...,\nCHANGE\t\t\t\t CHANGE\nTSE\t 6674.62 +2...


### Using DataSet


Example in:
https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/A%20-%20Using%20TorchText%20with%20Your%20Own%20Datasets.ipynb

In [7]:
from torchtext import data
from torchtext import datasets

# Define the torchtext Fields that will process each column.
# TEXT and HEADLINE use the default Field settings.
TEXT = data.Field()
HEADLINE = data.Field()

# The 'codes' column already holds numeric multi-hot vectors, so no vocabulary
# lookup is requested for the labels (use_vocab=False).
LABELS = data.LabelField(sequential=False, use_vocab=False)
#CLASSES = data.LabelField()

In [8]:
# First five rows, now with multi-hot 'codes'.
reuters.head()

Unnamed: 0,codes,headline,text
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Eureko is latest suitor for French insurer GAN.,"\nEureko, an alliance of six European financia..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Reuter EC Report Long-Term Diary for July 28 -...,\n****\nHIGHLIGHTS\n****\nLUXEMBOURG - Luxembo...
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Official Journal contents - OJ L 190 of July 1...,\n*\n(Note - contents are displayed in reverse...
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Official Journal contents - OJ C 221 of July 1...,\n*\n(Note - contents are displayed in reverse...
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Official Journal contents - OJ C 220 of July 1...,\n*\n(Note - contents are displayed in reverse...


In [9]:
# Number of documents in the frame.
reuters.shape[0]

3426

In [10]:
# Test: save the pandas DataFrame to JSON.
# orient='records' with lines=True writes one record per line.
reuters.to_json('input/reuters_small.json', orient='records', lines=True)

In [11]:
# Split by row position: first 2500 rows for training, the next 500 for
# testing, and everything after row 3000 for validation.
train = reuters.iloc[:2500]
test = reuters.iloc[2500:3000]
val = reuters.iloc[3000:]

# Write each split as line-delimited JSON records.
train.to_json('input/train.json', lines=True, orient='records')
test.to_json('input/test.json', lines=True, orient='records')
val.to_json('input/val.json', lines=True, orient='records')

In [12]:
# Map each JSON key to (attribute name on the example, Field that processes it).
fields = {
    'headline': ('h', HEADLINE),
    'text': ('t', TEXT),
    'codes': ('l', LABELS),
}

In [13]:
# Display the JSON-key -> (attribute name, Field) mapping.
fields

{'headline': ('h', <torchtext.data.field.Field at 0x7fadf125e0b8>),
 'text': ('t', <torchtext.data.field.Field at 0x7fadf125ea20>),
 'codes': ('l', <torchtext.data.field.LabelField at 0x7fadf125e2b0>)}

In [14]:
# Build one TabularDataset per JSON split file found under 'input'.
train_data, valid_data, test_data = data.TabularDataset.splits(
    path='input',
    format='json',
    fields=fields,
    train='train.json',
    validation='val.json',
    test='test.json',
)

In [15]:
# Sanity check: print the parsed attributes of one training example.
print(train_data[2].__dict__)

{'h': ['Official', 'Journal', 'contents', '-', 'OJ', 'L', '190', 'of', 'July', '19,', '1997.'], 't': ['*', '(Note', '-', 'contents', 'are', 'displayed', 'in', 'reverse', 'order', 'to', 'that', 'in', 'the', 'printed', 'Journal)', '*', 'COMMISSION', 'DECISION', 'of', '16', 'July', '1997', 'amending', 'Decision', '97/285/EC', 'concerning', 'certain', 'protection', 'measures', 'relating', 'to', 'classical', 'swine', 'fever', 'in', 'Spain', '(Text', 'with', 'EEA', 'relevance)', 'COMMISSION', 'REGULATION', '(EC)', 'No', '1399/97', 'of', '18', 'July', '1997', 'establishing', 'the', 'standard', 'import', 'values', 'for', 'determining', 'the', 'entry', 'price', 'of', 'certain', 'fruit', 'and', 'vegetables', 'COMMISSION', 'REGULATION', '(EC)', 'No', '1398/97', 'of', '18', 'July', '1997', 'fixing', 'the', 'agricultural', 'conversion', 'rates', 'COMMISSION', 'REGULATION', '(EC)', 'No', '1397/97', 'of', '18', 'July', '1997', 'fixing,', 'for', 'the', '1996/97', 'marketing', 'year,', 'the', 'specific

### Build iterators and vocabularies

In [16]:
device = torch.device('cpu')
# Iterate over the torchtext datasets, not the pandas DataFrames
# `train`/`val`/`test`. The examples expose the article text as attribute `t`
# (see the `fields` mapping), so that is what the sort key must read.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_sizes=(16, 256, 256),
    sort_key=lambda ex: len(ex.t),
    device=device)


In [17]:
# Build the vocabularies from the torchtext training dataset. The pandas
# DataFrame `train` would be iterated by column name and yield a meaningless
# vocabulary.
TEXT.build_vocab(train_data)
HEADLINE.build_vocab(train_data)
LABELS.build_vocab(train_data)

In [18]:
# Confirm that TEXT now has a vocabulary object attached.
TEXT.vocab

<torchtext.vocab.Vocab at 0x7fadf1563898>

In [19]:
# Report how many examples each split contains.
print(
    f'Number of training examples: {len(train_data)}',
    f'Number of validation examples: {len(valid_data)}',
    f'Number of testing examples: {len(test_data)}',
    sep='\n',
)

Number of training examples: 2500
Number of validation examples: 426
Number of testing examples: 500


In [20]:
# Cap the article-text vocabulary at the 25000 most frequent tokens.
TEXT.build_vocab(train_data, max_size=25000)
# Every batch also numericalizes the headline field 'h', so HEADLINE needs a
# vocabulary built from the training dataset as well.
HEADLINE.build_vocab(train_data)
LABELS.build_vocab(train_data)

In [21]:
# Vocabulary sizes after building.
print(
    f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}",
    f"Unique tokens in LABELS vocabulary: {len(LABELS.vocab)}",
    sep='\n',
)

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABELS vocabulary: 2


In [22]:
# The ten most frequent tokens in the training text, with their counts.
print(TEXT.vocab.freqs.most_common(10))

[('the', 22529), ('of', 12791), ('to', 12004), ('and', 9792), ('in', 9599), ('a', 8831), ('on', 5491), ('said', 4417), ('for', 4282), ('The', 3677)]


In [23]:
# First ten entries of the index -> token table.
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', 'of', 'to', 'and', 'in', 'a', 'on', 'said']


In [24]:
# Value -> index mapping that build_vocab produced for the labels.
print(LABELS.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7fadf12af378>, {0: 0, 1: 1})


In [25]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# A TabularDataset has no built-in sort key, and the validation/test iterators
# sort their data by default, so give BucketIterator an explicit key: the
# token count of the article text (attribute `t` in the `fields` mapping).
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda ex: len(ex.t),
    device=device)

### simple model

In [26]:
import torch.nn as nn

class RNN(nn.Module):
    """Embedding layer -> nn.RNN -> linear layer applied to the final hidden state."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map token ids x = [sent len, batch size] to one output row per batch item."""
        # [sent len, batch size] -> [sent len, batch size, emb dim]
        token_vectors = self.embedding(x)

        # step_outputs = [sent len, batch size, hid dim]
        # final_state  = [1, batch size, hid dim]
        step_outputs, final_state = self.rnn(token_vectors)
        last_hidden = final_state.squeeze(0)

        # Sanity check: the output at the last time step is the final hidden state.
        assert torch.equal(step_outputs[-1, :, :], last_hidden)

        return self.fc(last_hidden)

In [27]:
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 50
HIDDEN_DIM = 256
OUTPUT_DIM = 126  # one output per class code

model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)

In [28]:
import torch.optim as optim

# Plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [29]:
# Binary cross-entropy on raw logits, applied to each of the multi-hot targets.
criterion = nn.BCEWithLogitsLoss()

In [30]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [31]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    preds: raw logits; a sigmoid is applied here before rounding.
    y: 0/1 targets with the same shape as `preds`.

    The result is a scalar tensor holding the fraction of correct entries,
    averaged over every element. For 1-D input this equals the previous
    sum / len; for [batch, classes] input it stays within [0, 1] instead of
    being divided by the batch size only.
    """

    # round the sigmoid probabilities to the closest integer (0 or 1)
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # convert into float for averaging
    acc = correct.mean()
    return acc

In [32]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator` and update `model` in place.

    Each batch must expose `t` (model input) and `l` (targets, cast to float
    here). Returns a tuple (mean loss per batch, mean accuracy per batch).
    """
    model.train()

    loss_total, acc_total = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.t).squeeze(1)
        targets = batch.l.float()

        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [33]:
def evaluate(model, iterator, criterion):
    """Compute the mean loss and mean accuracy of `model` over `iterator`.

    Runs in eval mode with gradients disabled; no parameters are updated.
    Each batch must expose `t` (model input) and `l` (targets).
    Returns a tuple (mean loss per batch, mean accuracy per batch).
    """

    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():

        for batch in iterator:

            predictions = model(batch.t).squeeze(1)

            # Cast the targets to float, as train() does: the loss is
            # computed against float logits and rejects integer targets.
            targets = batch.l.float()

            loss = criterion(predictions, targets)

            acc = binary_accuracy(predictions, targets)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [34]:
N_EPOCHS = 1

# One training pass and one validation pass per epoch, followed by a summary line.
for epoch in range(N_EPOCHS):
    print(epoch)

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    print(
        f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% '
        f'| Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |'
    )

0


KeyboardInterrupt: 