In [23]:
import os
import torch
from torchvision.datasets.utils import download_url
from torchtext import data
from torchtext import datasets
import pandas as pd
import numpy as np

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
SEED = 2
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### Load data

In [4]:
# Read data - short version 3400+ documents
reuters = pd.read_pickle('input/reuters_small.pkl')
print(len(reuters))
reuters[0:2]

3426


Unnamed: 0,codes,headline,text,classes,classes_pad
0,"[C18, C181, CCAT]",Eureko is latest suitor for French insurer GAN.,"\nEureko, an alliance of six European financia...","[25, 26, 44]","[25, 26, 44, -1, -1, -1, -1, -1, -1, -1, -1, -..."
1,"[G15, GCAT]",Reuter EC Report Long-Term Diary for July 28 -...,\n****\nHIGHLIGHTS\n****\nLUXEMBOURG - Luxembo...,"[80, 90]","[80, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"


In [4]:
# 8x small size
reuters = pd.read_pickle('input/reuters_small8.pkl')
print(len(reuters))
reuters[0:2]

24247


Unnamed: 0,codes,headline,text,classes
0,"[C18, C181, CCAT]",Eureko is latest suitor for French insurer GAN.,"\nEureko, an alliance of six European financia...","[25, 26, 44]"
1,"[G15, GCAT]",Reuter EC Report Long-Term Diary for July 28 -...,\n****\nHIGHLIGHTS\n****\nLUXEMBOURG - Luxembo...,"[80, 90]"


In [5]:
# load full data, if produced. 435 MB. NOT in Github currently. 
# You can produce this in about 30 min with preprocess_data.ipynb

reuters = pd.read_pickle('input/reuters_all.pkl')

In [5]:
# read classcodes
classcodes= pd.read_csv('input/classcodes.csv')
print(len(classcodes))
#classcodes[0:12]

# add index field to DataFrame
classcodes = classcodes.reset_index()
# Create dictionary index/int to classcode and classcode to int
itocode = dict(zip(classcodes.index, classcodes.Code))
codetoi = dict(zip(classcodes.Code, classcodes.index))
def listToInt(mylist):
    return [codetoi[item] for item in mylist]

reuters['codes'] = [listToInt(codelist) for codelist in reuters.codes]
reuters[0:3]
# Multihot, for single list - one row
def multihot(tags):
    return [1 if tag in tags else 0 for tag in taglist]

# list of classes, 126 int: [0...125]
taglist = list(classcodes.index)
Y_hot = [multihot(claslist) for claslist in reuters.codes]
reuters['codes'] = Y_hot

126


In [6]:
reuters[0:3]

Unnamed: 0,codes,headline,text,classes,classes_pad
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Eureko is latest suitor for French insurer GAN.,"\nEureko, an alliance of six European financia...","[25, 26, 44]","[25, 26, 44, -1, -1, -1, -1, -1, -1, -1, -1, -..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Reuter EC Report Long-Term Diary for July 28 -...,\n****\nHIGHLIGHTS\n****\nLUXEMBOURG - Luxembo...,"[80, 90]","[80, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Official Journal contents - OJ L 190 of July 1...,\n*\n(Note - contents are displayed in reverse...,"[80, 90]","[80, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"


In [7]:
len(reuters)

3426

### Using DataSet


Example in:
https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/A%20-%20Using%20TorchText%20with%20Your%20Own%20Datasets.ipynb

In [8]:
#Define the Fields
TEXT = data.Field()
HEADLINE = data.Field()

LABELS = data.LabelField(sequential=False, use_vocab=False)

In [9]:
# Put data in random order
idx = np.random.permutation(len(reuters))
reuters = reuters.iloc[idx]

In [10]:
# split it
size = len(reuters)
train_size = int(0.7*size)
test_size = int(0.85*size)

train = reuters[0: train_size]
val = reuters[train_size : test_size]
test = reuters[test_size : size]

train.to_json('input/train.json', orient='records', lines=True)
test.to_json('input/test.json', orient='records', lines=True)
val.to_json('input/val.json', orient='records', lines=True)

In [11]:
# Tell torchText which Fields to apply to which json elements

fields = {'headline': ('h', HEADLINE), 'text': ('t', TEXT), 'codes': ('l', LABELS)}
fields

{'headline': ('h', <torchtext.data.field.Field at 0x7f8ec70182b0>),
 'text': ('t', <torchtext.data.field.Field at 0x7f8ec7018198>),
 'codes': ('l', <torchtext.data.field.LabelField at 0x7f8ec7018320>)}

In [12]:
# Create dataset (TabularDataset)
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = 'input',
                                        train = 'train.json',
                                        validation = 'val.json',
                                        test = 'test.json',
                                        format = 'json',
                                        fields = fields
)
# test
#print(vars(train_data[2]))

### continue

In [20]:
# GLOVE

#TEXT.build_vocab(train)
TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.50d")
HEADLINE.build_vocab(train)
LABELS.build_vocab(train)

pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([25002, 50])


In [21]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
print('------------')
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABELS vocabulary: {len(LABELS.vocab)}")

Number of training examples: 2398
Number of validation examples: 514
Number of testing examples: 514
------------
Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABELS vocabulary: 5


In [22]:
print(TEXT.vocab.freqs.most_common(10))
print('-----')
print(TEXT.vocab.itos[:10])
print(LABELS.vocab.stoi)

[('the', 22500), ('of', 12430), ('to', 11882), ('in', 9658), ('and', 9178), ('a', 8790), ('on', 5732), ('said', 4431), ('for', 4175), ('The', 3585)]
-----
['<unk>', '<pad>', 'the', 'of', 'to', 'in', 'and', 'a', 'on', 'said']
defaultdict(<function _default_unk_index at 0x7f8f28809268>, {'classes': 0, 'classes_pad': 1, 'codes': 2, 'headline': 3, 'text': 4})


In [18]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    device=device,
    sort_key= lambda x: len(x.t)
)

### F1 accuracy

The results will be ranked according to the highest micro-averaged F1 score. 
This will be calculated using the f1_score function found in scikit-learn, using a command like 
f1_score(y_true, y_pred, average='micro') where y_true is the matrix with the ground truth, and y_pred 
the predicted output. Both matrices are binary, a 1 in row i and column j means that the image/document
i contains the label j.

Scikit:  Micro-average in F1-score
 
'micro':
    Calculate metrics globally by counting the total true positives, false negatives and false positives.

In [36]:
# f1 score for BATCH
from sklearn.metrics import f1_score
def f1_accuracy(preds, y):
    """
    Returns f1 accuracy from sklearn
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    
    preds_cpu = rounded_preds.cpu().data.numpy()
    y_cpu = y.cpu().data.numpy()
    f1 = f1_score(y_cpu, preds_cpu, average='micro')
    return f1 

In [37]:
def f1_own_accuracy(preds, y):
    '''Returns counts of true_pos, false_pos and false_negative.
    For counting precision, recall and F1 globally
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    '''
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    
    preds = rounded_preds.cpu().data.numpy()
    y = y.cpu().data.numpy()
        
    # True positive
    tpos = np.sum(np.logical_and(preds == 1, y == 1))
 
    # True negative
    #tneg = np.sum(np.logical_and(preds == 0, y == 0))
 
    # False positive
    fpos = np.sum(np.logical_and(preds == 1, y == 0))
 
    # False negative
    fneg = np.sum(np.logical_and(preds == 0, y == 1))

    return tpos, fpos, fneg

In [None]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [38]:
# F1 version
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    
    epoch_tpos = 0
    epoch_fpos = 0
    epoch_fneg = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.t).squeeze(1)
        
        loss = criterion(predictions, batch.l.float())
        
        tpos, fpos, fneg = f1_own_accuracy(predictions, batch.l.float())
        epoch_tpos += tpos
        epoch_fpos += fpos
        epoch_fneg += fneg
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        #epoch_acc += acc.item()
    
    # Counted f1-score is Micro-average version
    # avoid div by zero with epsilon
    eps = 1e-7
    epoch_precision = epoch_tpos / (epoch_tpos + epoch_fpos +eps )
    epoch_recall = epoch_tpos / (epoch_tpos + epoch_fneg +eps)
    epoch_f1 = 2* (  ((epoch_precision * epoch_recall)+eps) / (epoch_precision + epoch_recall +eps))
    
    return epoch_loss / len(iterator), epoch_precision, epoch_recall, epoch_f1

In [39]:
# F1 version
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    #epoch_acc = 0
    epoch_tpos = 0
    epoch_fpos = 0
    epoch_fneg = 0    
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.t).squeeze(1)
            
            loss = criterion(predictions, batch.l.float())
            
            tpos, fpos, fneg = f1_own_accuracy(predictions, batch.l.float())
            epoch_tpos += tpos
            epoch_fpos += fpos
            epoch_fneg += fneg            

            epoch_loss += loss.item()
            #epoch_acc += acc.item()

    # avoid div by zero with epsilon
    eps = 1e-7
    epoch_precision = epoch_tpos / (epoch_tpos + epoch_fpos +eps)
    epoch_recall = epoch_tpos / (epoch_tpos + epoch_fneg +eps)
    epoch_f1 = 2* (  ((epoch_precision * epoch_recall)+eps) / (epoch_precision + epoch_recall +eps))            
        
    return epoch_loss / len(iterator), epoch_precision, epoch_recall, epoch_f1

### Load model

#### simple model

In [84]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):

        #x = [sent len, batch size]
        
        embedded = self.embedding(x)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [86]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 256
OUTPUT_DIM = 126

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

#### OR CNN model

In [28]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs,embedding_dim)) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        x = x.permute(1, 0)
                
        #x = [batch size, sent len]
        
        embedded = self.embedding(x)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim=1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [140]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 126
DROPOUT = 0.5

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

In [29]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
N_FILTERS = 200
FILTER_SIZES = [3,5,7]
OUTPUT_DIM = 126
DROPOUT = 0.5

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

# 14 epochs gave f1 0.84

In [32]:
model = model.to(device)
model.embedding.weight.data.copy_(pretrained_embeddings)

#optimizer = optim.SGD(model.parameters(), lr=1e-3)
optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

### Train

In [40]:
# Ver2 - F1
N_EPOCHS = 4

for epoch in range(N_EPOCHS):
    print(epoch)

    train_loss, train_precision, train_recall, train_f1 = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_precision, valid_recall, valid_f1 = evaluate(model, valid_iterator, criterion)
    
    #print(f'| Epoch:{epoch+1:02} | Train Loss: {train_loss:.3f} | Tr Precision: {train_precision:.3f} | Tr recall: {train_recall:.3f} | Tr f1: {train_f1:.3f} | Valid f1: {valid_f1:.3f} |')    
    print(f'| Ep:{epoch+1:02} |Tr Loss:{train_loss:.3f} |Prec:{train_precision:.3f} |Rec:{train_recall:.3f} |f1:{train_f1:.3f} |Val Loss:{valid_loss:.3f} |prec:{valid_precision:.3f} |rec:{valid_recall:.3f} |f1:{valid_f1:.3f} |')        

0
| Ep:01 |Tr Loss:0.040 |Prec:0.844 |Rec:0.608 |f1:0.707 |Val Loss:0.033 |prec:0.791 |rec:0.815 |f1:0.803 |
1
| Ep:02 |Tr Loss:0.028 |Prec:0.876 |Rec:0.733 |f1:0.798 |Val Loss:0.031 |prec:0.798 |rec:0.844 |f1:0.820 |
2
| Ep:03 |Tr Loss:0.025 |Prec:0.884 |Rec:0.760 |f1:0.817 |Val Loss:0.031 |prec:0.792 |rec:0.859 |f1:0.824 |
3
| Ep:04 |Tr Loss:0.024 |Prec:0.890 |Rec:0.776 |f1:0.829 |Val Loss:0.029 |prec:0.809 |rec:0.856 |f1:0.832 |


In [42]:
# Ver2 - F1
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    print(epoch)

    train_loss, train_precision, train_recall, train_f1 = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_precision, valid_recall, valid_f1 = evaluate(model, valid_iterator, criterion)
    
    #print(f'| Epoch:{epoch+1:02} | Train Loss: {train_loss:.3f} | Tr Precision: {train_precision:.3f} | Tr recall: {train_recall:.3f} | Tr f1: {train_f1:.3f} | Valid f1: {valid_f1:.3f} |')    
    print(f'| Ep:{epoch+1:02} |Tr Loss:{train_loss:.3f} |Prec:{train_precision:.3f} |Rec:{train_recall:.3f} |f1:{train_f1:.3f} |Val Loss:{valid_loss:.3f} |prec:{valid_precision:.3f} |rec:{valid_recall:.3f} |f1:{valid_f1:.3f} |')        

0
| Ep:01 |Tr Loss:0.023 |Prec:0.893 |Rec:0.787 |Tr f1:0.837 |Val prec:0.799 |Val rec:0.864 |Val f1:0.830 |
1
| Ep:02 |Tr Loss:0.022 |Prec:0.897 |Rec:0.796 |Tr f1:0.843 |Val prec:0.798 |Val rec:0.866 |Val f1:0.831 |
2
| Ep:03 |Tr Loss:0.021 |Prec:0.900 |Rec:0.804 |Tr f1:0.849 |Val prec:0.803 |Val rec:0.865 |Val f1:0.833 |
3
| Ep:04 |Tr Loss:0.020 |Prec:0.903 |Rec:0.811 |Tr f1:0.855 |Val prec:0.810 |Val rec:0.862 |Val f1:0.835 |
4
| Ep:05 |Tr Loss:0.020 |Prec:0.905 |Rec:0.817 |Tr f1:0.859 |Val prec:0.821 |Val rec:0.855 |Val f1:0.838 |
5
| Ep:06 |Tr Loss:0.019 |Prec:0.908 |Rec:0.823 |Tr f1:0.863 |Val prec:0.831 |Val rec:0.851 |Val f1:0.841 |
6
| Ep:07 |Tr Loss:0.019 |Prec:0.910 |Rec:0.828 |Tr f1:0.867 |Val prec:0.834 |Val rec:0.849 |Val f1:0.841 |
7
| Ep:08 |Tr Loss:0.018 |Prec:0.912 |Rec:0.834 |Tr f1:0.871 |Val prec:0.838 |Val rec:0.848 |Val f1:0.843 |
8
| Ep:09 |Tr Loss:0.018 |Prec:0.914 |Rec:0.838 |Tr f1:0.874 |Val prec:0.844 |Val rec:0.844 |Val f1:0.844 |
9
| Ep:10 |Tr Loss:0.017 |Pr

In [43]:
# Ver2 - F1
N_EPOCHS = 1

for epoch in range(N_EPOCHS):
    print(epoch)

    train_loss, train_precision, train_recall, train_f1 = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_precision, valid_recall, valid_f1 = evaluate(model, valid_iterator, criterion)
    
    #print(f'| Epoch:{epoch+1:02} | Train Loss: {train_loss:.3f} | Tr Precision: {train_precision:.3f} | Tr recall: {train_recall:.3f} | Tr f1: {train_f1:.3f} | Valid f1: {valid_f1:.3f} |')    
    print(f'| Ep:{epoch+1:02} |Tr Loss:{train_loss:.3f} |Prec:{train_precision:.3f} |Rec:{train_recall:.3f} |f1:{train_f1:.3f} |Val Loss:{valid_loss:.3f} |prec:{valid_precision:.3f} |rec:{valid_recall:.3f} |f1:{valid_f1:.3f} |')        

0
| Ep:01 |Tr Loss:0.017 |Prec:0.917 |Rec:0.846 |f1:0.880 |Val Loss:0.027 |prec:0.842 |rec:0.846 |f1:0.844 |


### Save the model

For this version create the model object with same paramters as when training. Then load weights.
This version saves also gradients etc.

In [44]:
torch.save(model.state_dict(), 'model.pkl')
#torch.save(model, 'filename.pt')

In [55]:
model2 = RNN(input_dim=25002, embedding_dim=50, hidden_dim=256, output_dim=126)

In [62]:
model2.load_state_dict(torch.load('model.pkl'))
                      

In [63]:
model2.eval()

RNN(
  (embedding): Embedding(25002, 50)
  (rnn): RNN(50, 256)
  (fc): Linear(in_features=256, out_features=126, bias=True)
)

#### easier but not so compatible
This version does not save the gradients, only final model. 
Here you dont ened to create model object frist.

In [45]:
torch.save(model, 'modelx.pt')


  "type " + obj.__name__ + ". It won't be checked "


In [111]:
modelx = torch.load('modelx.pt')

### Test it on test data

In [46]:
test_loss, test_precision, test_recall, test_f1 = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} ||Prec:{test_precision:.3f} |Rec:{test_recall:.3f} |Test F1:{test_f1: .3f} |')

| Test Loss: 0.027 ||Prec:0.843 |Rec:0.845 |Test F1: 0.844 |


### Use final scoring 

The results will be ranked according to the highest micro-averaged F1 score. This will be calculated using the f1_score function found in scikit-learn, using a command like f1_score(y_true, y_pred, average='micro') where y_true is the matrix with the ground truth, and y_pred the predicted output. Both matrices are binary, a 1 in row i and column j means that the image/document i contains the label j.

In [None]:
from sklearn.metrics import f1_score
f1_score()