In [14]:
import os
import torch
from torchtext import data
from torchtext import datasets
import pandas as pd
import numpy as np

SEED = 1
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

data_path = 'input/reuters_small.pkl'
labels_path = 'input/classcodes.csv'
BATCH_SIZE = 64

"""
def listToInt(mylist, dict_):
    return [dict_[item] for item in mylist]

def multihot(tags, taglist):
    return [1 if tag in tags else 0 for tag in taglist]

reuters = pd.read_pickle(data_path)
# read classcodes
classcodes= pd.read_csv(labels_path)
codetoi = dict(zip(classcodes.Code, range(len(classcodes))))
# convert classcodes to int
reuters['codes'] = [listToInt(codelist, codetoi) for codelist in reuters.codes]
# convert classcodes to multihot
reuters['codes'] = [multihot(claslist, range(126)) for claslist in reuters.codes]

# construct train/test/val sets
train = reuters[0:2500]
test = reuters[2500:3000]
val = reuters[3000:len(reuters)]
# train.to_json('input/train.json', orient='records', lines=True)
# test.to_json('input/test.json', orient='records', lines=True)
# val.to_json('input/val.json', orient='records', lines=True)
################################################################################
"""
train = 'input/train.json'
valid = 'input/val.json'
test = 'input/test.json'

#Define the Fields
TEXT = data.Field()
HEADLINE = data.Field()
LABELS = data.LabelField(sequential=False, use_vocab=False)
fields = {'headline': ('h', HEADLINE), 'text': ('t', TEXT), 'codes': ('l', LABELS)}

# Create dataset (TabularDataset)
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = '',
                                        train = train,
                                        validation = valid,
                                        test = test,
                                        format = 'json',
                                        fields = fields
)

# Without or with GLOVE
TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.50d")
HEADLINE.build_vocab(train)
LABELS.build_vocab(train)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
(train_data, valid_data, test_data), 
batch_size=BATCH_SIZE,
device=device,
sort_key= lambda x: len(x.t)
)



.vector_cache/glove.6B.zip: 862MB [02:04, 6.93MB/s]                           
 99%|█████████▉| 397710/400000 [00:12<00:00, 30392.10it/s]

In [52]:
import torch.optim as optim
import torch.nn.functional as F

def f1_own_accuracy(preds, y):
    '''Returns counts of true_pos, false_pos and false_negative.
    For counting precision, recall and F1 globally
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    '''

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    
    preds = rounded_preds.cpu().data.numpy()
    y = y.cpu().data.numpy()
        
    # True positive
    tpos = np.sum(np.logical_and(preds == 1, y == 1))
 
    # True negative
    #tneg = np.sum(np.logical_and(preds == 0, y == 0))
 
    # False positive
    fpos = np.sum(np.logical_and(preds == 1, y == 0))
 
    # False negative
    fneg = np.sum(np.logical_and(preds == 0, y == 1))

    return tpos, fpos, fneg
  
# F1 version
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    #epoch_acc = 0
    epoch_tpos = 0
    epoch_fpos = 0
    epoch_fneg = 0    
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.t).squeeze(1)
            
            loss = criterion(predictions, batch.l.float())
            
            tpos, fpos, fneg = f1_own_accuracy(predictions, batch.l.float())
            epoch_tpos += tpos
            epoch_fpos += fpos
            epoch_fneg += fneg            

            epoch_loss += loss.item()
            #epoch_acc += acc.item()

    # avoid div by zero with epsilon
    eps = 1e-7
    epoch_precision = epoch_tpos / (epoch_tpos + epoch_fpos +eps)
    epoch_recall = epoch_tpos / (epoch_tpos + epoch_fneg +eps)
    epoch_f1 = 2* (  (epoch_precision * epoch_recall) / (epoch_precision + epoch_recall +eps))            
        
    return epoch_loss / len(iterator), epoch_precision, epoch_recall, epoch_f1


# model
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 256
OUTPUT_DIM = 126
N_EPOCHS = 50

import torch.nn as nn

class Net(nn.Module):
    def __init__(self, input_dim, embedding_dim):
        super(Net, self).__init__()
        self.em = nn.Embedding(input_dim, embedding_dim)
        self.conv4 = nn.Conv2d(1, 50, (5,50))
        self.conv1 = nn.Conv2d(1, 50, (4,50))
        self.conv2 = nn.Conv2d(1, 50, (3,50))
        self.conv3 = nn.Conv2d(1, 50, (2,50))
        self.fc1 = nn.Linear(50*4,150)
        self.fc2 = nn.Linear(150, 126)
        
        
    def forward(self, x):
        x = self.em(x).permute(1,0,2).unsqueeze(1)
        c1 = F.relu(self.conv1(x))
        c2 = F.relu(self.conv2(x))
        c3 = F.relu(self.conv3(x))
        c4 = F.relu(self.conv4(x))
        c1 = F.max_pool1d(c1.squeeze(3), c1.shape[2])
        c2 = F.max_pool1d(c2.squeeze(3), c2.shape[2])
        c3 = F.max_pool1d(c3.squeeze(3), c3.shape[2])
        c4 = F.max_pool1d(c4.squeeze(3), c4.shape[2])
        x = F.dropout(F.relu(self.fc1(torch.cat((c1, c2, c3, c4), dim=1).squeeze(2))))
        output = self.fc2(x)
        return output
    
model = Net(INPUT_DIM, EMBEDDING_DIM)
model.em.weight.data.copy_(TEXT.vocab.vectors)
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_tpos = 0
    epoch_fpos = 0
    epoch_fneg = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.t).squeeze(1)
        loss = criterion(predictions, batch.l.float())
        tpos, fpos, fneg = f1_own_accuracy(predictions, batch.l.float())
        epoch_tpos += tpos
        epoch_fpos += fpos
        epoch_fneg += fneg
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    eps = 1e-7
    epoch_precision = epoch_tpos / (epoch_tpos + epoch_fpos +eps )
    epoch_recall = epoch_tpos / (epoch_tpos + epoch_fneg +eps)
    epoch_f1 = 2* (  (epoch_precision * epoch_recall) / (epoch_precision + epoch_recall +eps))
    
    return epoch_loss / len(iterator), epoch_precision, epoch_recall, epoch_f1

for epoch in range(N_EPOCHS):
    print(epoch)
    train_loss, train_precision, train_recall, train_f1 = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_precision, valid_recall, valid_f1 = evaluate(model, valid_iterator, criterion)
    #print(f'| Epoch:{epoch+1:02} | Train Loss: {train_loss:.3f} | Tr Precision: {train_precision:.3f} | Tr recall: {train_recall:.3f} | Tr f1: {train_f1:.3f} | Valid f1: {valid_f1:.3f} |')    
    print(f'| Ep:{epoch+1:02} |Tr Loss:{train_loss:.3f} |Prec:{train_precision:.3f} |Rec:{train_recall:.3f} |f1:{train_f1:.3f} |Val prec:{valid_precision:.3f} |Val rec:{valid_recall:.3f} |Val f1:{valid_f1:.3f} |')     

0
| Ep:01 |Tr Loss:0.255 |Prec:0.059 |Rec:0.175 |f1:0.089 |Val prec:0.294 |Val rec:0.163 |Val f1:0.209 |
1
| Ep:02 |Tr Loss:0.100 |Prec:0.392 |Rec:0.127 |f1:0.192 |Val prec:0.400 |Val rec:0.139 |Val f1:0.207 |
2
| Ep:03 |Tr Loss:0.090 |Prec:0.500 |Rec:0.127 |f1:0.203 |Val prec:0.516 |Val rec:0.123 |Val f1:0.199 |
3
| Ep:04 |Tr Loss:0.085 |Prec:0.635 |Rec:0.159 |f1:0.254 |Val prec:0.588 |Val rec:0.150 |Val f1:0.239 |
4
| Ep:05 |Tr Loss:0.078 |Prec:0.781 |Rec:0.251 |f1:0.380 |Val prec:0.709 |Val rec:0.159 |Val f1:0.259 |
5
| Ep:06 |Tr Loss:0.070 |Prec:0.840 |Rec:0.329 |f1:0.473 |Val prec:0.676 |Val rec:0.256 |Val f1:0.372 |
6
| Ep:07 |Tr Loss:0.065 |Prec:0.824 |Rec:0.369 |f1:0.510 |Val prec:0.733 |Val rec:0.281 |Val f1:0.406 |
7
| Ep:08 |Tr Loss:0.060 |Prec:0.848 |Rec:0.394 |f1:0.538 |Val prec:0.735 |Val rec:0.316 |Val f1:0.442 |
8
| Ep:09 |Tr Loss:0.056 |Prec:0.849 |Rec:0.423 |f1:0.565 |Val prec:0.736 |Val rec:0.321 |Val f1:0.447 |
9
| Ep:10 |Tr Loss:0.054 |Prec:0.876 |Rec:0.443 |f1:0.5