In [1]:
import os, sys, json, random, time
sys.path.append(os.path.abspath('./src'))
from utils import save_value, load_value, load_env_keys
import numpy as np
from numpy.linalg import norm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# PATHS
data_path = '/scratch/juanmoo1'
EMA_dump_path = os.path.join(data_path, './jsons/EMA_dump.json')
EMA_xmls_path = os.path.join(data_path, './xmls/')
EMA_annotations_path = os.path.join(data_path, './bayer/VendorEMAforMIT/annotations.xlsx')

pickle_dumps_path = os.path.join(data_path, './pickle_dumps/')
checkpoint_path = os.path.join(pickle_dumps_path, 'checkpoint.pickle')

models_path = os.path.join(data_path, './models')

In [5]:
labeled_raw_documents = load_value('labeled_raw_documents', path=checkpoint_path)

In [2]:
import re, spacy

NLP = spacy.load('en_core_web_sm')
MAX_CHARS = 20000

def tokenizer(comment):
    comment = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(comment))
    comment = re.sub(r"[ ]+", " ", comment)
    comment = re.sub(r"\!+", "!", comment)
    comment = re.sub(r"\,+", ",", comment)
    comment = re.sub(r"\?+", "?", comment)
    if (len(comment) > MAX_CHARS):
        comment = comment[:MAX_CHARS]
    return [x.text for x in NLP.tokenizer(comment) if x.text != " "]


In [188]:
minimum_paragraph_length = 5
vocab = set()
label_vocab = set()
tag_vocab = set()

processed_docs = dict()

for doc_name in labeled_raw_documents:
    doc = labeled_raw_documents[doc_name]
    
    texts = []
    labels = []
    tags = []
    
    for text, label, tag in zip(doc['paragraphs'], doc['labels'], doc['tags']):
        words = tokenizer(text.lower())
        vocab |= set(words)
        tag_vocab.add(tag.lower())

        
        if len(words) >= minimum_paragraph_length and tag in ['head', 'p']:
            vocab |= set(words)
            label_vocab.add(label.lower())

            texts.append(words)
            labels.append(label.lower())
            tags.append(tag.lower())
    
    processed_docs[doc_name] = {
        'texts': texts,
        'labels': labels,
        'tags': tags
    }



# Match text with the previous header
for doc_name in processed_docs:
    texts = processed_docs[doc_name]['texts']
    tags = processed_docs[doc_name]['tags']
    
    # First Header
    header1 = [-1] * len(texts)    
    last_header = 0
    
    while(last_header < len(tags) and tags[last_header] != 'head'):
        last_header += 1
        
    i = last_header + 1
    
    while i < len(tags):
        header1[i] = last_header
        if tags[i] == 'head':
            last_header = i
        i += 1
    
    # Second Header
    header2 = [-1] * len(header1)
    last_header = 0
    while(last_header < len(tags) and header1[last_header] == -1):
        last_header += 1
        
    i = last_header + 1
    while i < len(header2):
        header2[i] = header1[last_header]
        if header1[i] != -1:
            last_header = i
        i += 1
        
    first_header = [texts[i] if i != -1 else [] for i in header1]
    second_header = [texts[i] if i != -1 else [] for i in header2]
    
    processed_docs[doc_name]['first_header'] = first_header
    processed_docs[doc_name]['second_header'] = second_header

save_value('processed_documents_2', processed_docs, path=checkpoint_path)

In [192]:
lab2index = {l:i for i, l in enumerate(label_vocab)}
print(lab2index)



## Google's Pretrained word2vec embeddings

In [268]:
import gensim

# Load Google's pre-trained Word2Vec model.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(models_path, 'GoogleNews-vectors-negative300.bin'), binary=True)

In [269]:
# Extract pretrained embeading params
google_weights = word2vec.vectors
google_index2word = word2vec.index2word
google_word2index = word2vec.vocab

In [270]:
google_vocab = set(word2vec.vocab)
unknown_vocab = list(vocab - google_vocab)

In [196]:
# Portion of unknown words
print('Unknown Portion:', len(unknown_vocab)/len(vocab))

Unknown Portion: 0.36820633384040935


In [271]:
# Create random embedding for unknown words
unknown = nn.Embedding(len(unknown_vocab), 300)
unknown_weights = unknown.weight.detach().numpy()

In [272]:
# Joint Embeddings
weights = torch.tensor(np.vstack([google_weights, unknown_weights]), dtype=torch.float)
index2word = google_index2word + unknown_vocab
word2index = {w:i for i, w in enumerate(index2word)}

In [273]:
# Create new Torch embedding
embedding = nn.EmbeddingBag.from_pretrained(weights)

In [274]:
embedding(torch.tensor([0]), torch.tensor([0])).shape

torch.Size([1, 300])

In [275]:
embedding(torch.tensor([[word2index['their']], [word2index['our']]])).shape

torch.Size([2, 300])

## MLP w/ Embeddings Model

In [323]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Classifier(nn.Module):
    def __init__(self, embedding, num_classes):
        super().__init__()
        self.embedding = embedding
        # Input to first fc layer is 3 * embedding size to account for prev two headers
        self.fc = nn.Linear(embedding.embedding_dim * 3, 500)
        self.fc2 = nn.Linear(500, num_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        # Embedding doesn't get randomly initialized
#         self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc2.bias.data.zero_()
        
    def forward(self, data, offsets):
        emb_text = self.embedding(data[0], offsets[0]) # text
        emb_h1 = self.embedding(data[1], offsets[1]) # first header
        emb_h2 = self.embedding(data[2], offsets[2]) # second header
        embedded = torch.cat([emb_text, emb_h1, emb_h2], dim=1)
        
        out = self.fc(embedded)
        out = F.relu(out)
        out = self.fc2(out)
        
        return out

In [324]:
num_classes = len(label_vocab)
model = Classifier(embedding, num_classes)#.to(device)

In [325]:
[e.shape for e in model.parameters()]

[torch.Size([3005325, 300]),
 torch.Size([500, 900]),
 torch.Size([500]),
 torch.Size([12, 500]),
 torch.Size([12])]

In [326]:
names = list(processed_docs)
random.shuffle(names)

i = int(.8 * len(names))
train_docs = names[:i]
test_docs = names[i:]

# (texts, header1, header2), label

train_data = []
for doc_name in train_docs:
    doc = processed_docs[doc_name]
    texts = doc['texts']
    header1s = doc['first_header']
    header2s = doc['second_header']
    labels = doc['labels']
    
    for t, h1, h2, l in zip(texts, header1s, header2s, labels):
        data = (t, h1, h2)
        train_data.append((data, l))
        
test_data = []
for doc_name in test_docs:
    doc = processed_docs[doc_name]
    texts = doc['texts']
    header1s = doc['first_header']
    header2s = doc['second_header']
    labels = doc['labels']
    
    for t, h1, h2, l in zip(texts, header1s, header2s, labels):
        data = (t, h1, h2)
        test_data.append((data, l))

### Training & Testing

In [327]:
def process_batch(batch): #batch <- [((text, header_1, header_2), label), ...]
    data, labels = zip(*batch)
    
    label = torch.tensor([lab2index[entry[1]] for entry in batch])
    texts, h1s, h2s = zip(*data)
    
    texts = [ torch.tensor([word2index[w] for w in t], dtype=torch.long) for t in texts ]
    h1s = [ torch.tensor([word2index[w] for w in h], dtype=torch.long) for h in h1s ]
    h2s = [ torch.tensor([word2index[w] for w in h], dtype=torch.long) for h in h2s ]
    
    text_offsets = [0] + [len(entry) for entry in texts[:-1]]
    text_offsets = torch.tensor(text_offsets).cumsum(dim=0)
    
    h1_offsets = [0] + [len(entry) for entry in h1s[:-1]]
    h1_offsets = torch.tensor(h1_offsets).cumsum(dim=0)
    
    h2_offsets = [0] + [len(entry) for entry in h2s[:-1]]
    h2_offsets = torch.tensor(h2_offsets).cumsum(dim=0)
    
    texts = torch.cat(texts)
    h1s = torch.cat(h1s)
    h2s = torch.cat(h2s)
    
    data = [texts, h1s, h2s]
    offsets = [text_offsets, h1_offsets, h2_offsets]
    
    return label, data, offsets
    

In [328]:
import numpy as np
BATCH_SIZE = 16

# Training Function
def train_func(data):
    
    # Metrics
    train_loss = 0.0
    train_acc = 0.0
    
    # Mini-batch training
    random.shuffle(data)
    for i in range(len(data)//BATCH_SIZE):
        batch = data[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]
        optimizer.zero_grad()
        
        labels, d, offsets = process_batch(batch)
        output = model(d, offsets)
        loss = criterion(output, labels)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == labels).sum().item()
    
    scheduler.step()
    
    return train_loss/len(data), train_acc/len(data)

# Testing Function
def test_func(data):
    test_loss = 0
    test_acc = 0
    all_pred = torch.tensor([], dtype=torch.long)
    all_lab = torch.tensor([], dtype=torch.long)
    
    for i in range(len(data)//BATCH_SIZE):
        batch = data[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]
        labels, d, offsets = process_batch(batch)
        
        with torch.no_grad():
            output = model(d, offsets)
            loss = criterion(output, labels)
            test_loss += loss.item()
            pred = output.argmax(1)
            all_pred = torch.cat([all_pred, pred])
            all_lab = torch.cat([all_lab, labels])
            test_acc += (pred == labels).sum().item()
    
    return all_pred, all_lab, test_loss/len(data), test_acc/len(data)

# Precision & Recall from confusion matrix
def get_metrics(cm, other_index = None):
    # Diagonal elemetns were correctly classified
    diagonal = cm.diagonal()
    
    # Input class Counts
    class_sum = cm.sum(axis=1)
    
    # Predicted class counts
    pred_sum = cm.sum(axis=0)
    
    # Per-class performance w/ no-examples -> 0 perf
    precision = np.where(class_sum == 0, 0, diagonal/class_sum)
    recall = np.where(pred_sum == 0, 0, diagonal/pred_sum)
    
    # Frequency Weighted Performance
    c_freq = cm.sum(axis=1)/cm.sum()
    pres = c_freq * precision
    rec = c_freq * recall

    # Remove 'other' Category
    if other_index is not None:
        c_freq = np.hstack([c_freq[0:other_index], c_freq[other_index + 1:]])
        pres = np.hstack([pres[0:other_index], pres[other_index + 1:]])
        rec = np.hstack([rec[0:other_index], rec[other_index + 1:]])
    
    return pres.sum()/c_freq.sum(), rec.sum()/c_freq.sum()

In [337]:
N_EPOCHS = 30
min_valid_loss = float('inf')
criterion = torch.nn.CrossEntropyLoss()#.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [345]:
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_func(train_data)
    pred, lab, valid_loss, valid_acc = test_func(test_data)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'Train:\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'Test:\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
    print('-' * 70)

Epoch: 1  | time in 0 minutes, 4 seconds
Train:	Loss: 0.0098(train)	|	Acc: 94.3%(train)
Test:	Loss: 0.0248(valid)	|	Acc: 93.6%(valid)
----------------------------------------------------------------------
Epoch: 2  | time in 0 minutes, 3 seconds
Train:	Loss: 0.0097(train)	|	Acc: 94.3%(train)
Test:	Loss: 0.0242(valid)	|	Acc: 94.2%(valid)
----------------------------------------------------------------------
Epoch: 3  | time in 0 minutes, 4 seconds
Train:	Loss: 0.0096(train)	|	Acc: 94.5%(train)
Test:	Loss: 0.0245(valid)	|	Acc: 94.5%(valid)
----------------------------------------------------------------------
Epoch: 4  | time in 0 minutes, 4 seconds
Train:	Loss: 0.0095(train)	|	Acc: 94.4%(train)
Test:	Loss: 0.0244(valid)	|	Acc: 94.3%(valid)
----------------------------------------------------------------------
Epoch: 5  | time in 0 minutes, 3 seconds
Train:	Loss: 0.0094(train)	|	Acc: 94.5%(train)
Test:	Loss: 0.0246(valid)	|	Acc: 94.4%(valid)
----------------------------------------------

In [346]:
from sklearn.metrics import confusion_matrix
pred = pred.numpy()
lab = lab.numpy()

cm = confusion_matrix(lab, pred)
print(cm)

[[   0    1    0    0    0    0    0    0]
 [   0 2665   52    0    0    3    0    6]
 [   0   65   72    0    0    0    0    0]
 [   0    1    0    0    0    0    0    0]
 [   0    1    0    0    0    0    0    0]
 [   0   14    2    0    0    4    0    0]
 [   0    4    0    0    0    0    0    0]
 [   0   21    1    0    0    0    0   32]]


### Reorder based on frequency

In [352]:
class_count = dict(zip(*np.unique(pred, return_counts=True)))
rev_map = sorted(class_count, key=class_count.get, reverse=True)
missing = list(set(range(12)) - set(rev_map))
rev_map.extend(missing)

mapping = list(range(12))
for i, e in enumerate(rev_map):
    mapping[e] = i

new_pred = [mapping[e] for e in pred]
new_lab = [mapping[e] for e in lab]

cm = confusion_matrix(new_pred, new_lab)
print(cm)

[[2665   65   21   14    1    1    1    4]
 [  52   72    1    2    0    0    0    0]
 [   6    0   32    0    0    0    0    0]
 [   3    0    0    4    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]]


In [353]:
precision, recall = get_metrics(cm, other_index=None)
print('Precision:', precision)
print('Recall:', recall)

print('Without \'other\':')
precision, recall = get_metrics(cm, other_index=0)
print('Precision:', precision)
print('Recall:', recall)

Precision: 0.9419157608695652
Recall: 0.9513022068741
Without 'other':
Precision: 0.627906976744186
Recall: 0.5271107213137429




## MLP, Embeddings, No header

In [356]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Classifier(nn.Module):
    def __init__(self, embedding, num_classes):
        super().__init__()
        self.embedding = embedding
        # Input to first fc layer is 3 * embedding size to account for prev two headers
        self.fc = nn.Linear(embedding.embedding_dim, 500)
        self.fc2 = nn.Linear(500, num_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        # Embedding doesn't get randomly initialized
#         self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc2.bias.data.zero_()
        
    def forward(self, data, offsets):
        embedded = self.embedding(data[0], offsets[0]) # text
        out = self.fc(embedded)
        out = F.relu(out)
        out = self.fc2(out)
        
        return out

In [357]:
num_classes = len(label_vocab)
model = Classifier(embedding, num_classes)#.to(device)

In [358]:
N_EPOCHS = 30
min_valid_loss = float('inf')
criterion = torch.nn.CrossEntropyLoss()#.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [362]:
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_func(train_data)
    pred, lab, valid_loss, valid_acc = test_func(test_data)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'Train:\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'Test:\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
    print('-' * 70)

Epoch: 1  | time in 0 minutes, 2 seconds
Train:	Loss: 0.0051(train)	|	Acc: 97.1%(train)
Test:	Loss: 0.0223(valid)	|	Acc: 93.1%(valid)
----------------------------------------------------------------------
Epoch: 2  | time in 0 minutes, 2 seconds
Train:	Loss: 0.0050(train)	|	Acc: 97.2%(train)
Test:	Loss: 0.0233(valid)	|	Acc: 92.7%(valid)
----------------------------------------------------------------------
Epoch: 3  | time in 0 minutes, 2 seconds
Train:	Loss: 0.0049(train)	|	Acc: 97.2%(train)
Test:	Loss: 0.0223(valid)	|	Acc: 93.2%(valid)
----------------------------------------------------------------------
Epoch: 4  | time in 0 minutes, 2 seconds
Train:	Loss: 0.0048(train)	|	Acc: 97.3%(train)
Test:	Loss: 0.0222(valid)	|	Acc: 93.5%(valid)
----------------------------------------------------------------------
Epoch: 5  | time in 0 minutes, 2 seconds
Train:	Loss: 0.0048(train)	|	Acc: 97.4%(train)
Test:	Loss: 0.0232(valid)	|	Acc: 93.2%(valid)
----------------------------------------------

In [360]:
from sklearn.metrics import confusion_matrix
pred = pred.numpy()
lab = lab.numpy()
class_count = dict(zip(*np.unique(pred, return_counts=True)))
rev_map = sorted(class_count, key=class_count.get, reverse=True)
missing = list(set(range(12)) - set(rev_map))
rev_map.extend(missing)

mapping = list(range(12))
for i, e in enumerate(rev_map):
    mapping[e] = i

new_pred = [mapping[e] for e in pred]
new_lab = [mapping[e] for e in lab]

cm = confusion_matrix(new_pred, new_lab)
print(cm)

[[2699  121   41    8    2    1    0    1    1]
 [  12   16    0    0    0    0    0    0    0]
 [   6    0   13    0    0    0    0    0    0]
 [   4    0    0   12    0    0    0    0    0]
 [   4    0    0    0    0    0    0    0    0]
 [   0    0    0    0    2    0    0    0    0]
 [   1    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0]]


In [361]:
precision, recall = get_metrics(cm, other_index=None)
print('Precision:', precision)
print('Recall:', recall)

print('\nWithout \'other\':')
precision, recall = get_metrics(cm, other_index=0)
print('Precision:', precision)
print('Recall:', recall)

Precision: 0.9307065217391305
Recall: 0.9724790295473974

Without 'other':
Precision: 0.5857142857142856
Recall: 0.24920210095392573


