### Researcher Name Extraction Dataset

Dataset statistics:

| Data file  | Documents | Sentences | Tokens | Names |
|------------|-----------|-----------|--------|-------|
| Training   | 80        | 24728     | 110269 | 5822  |
| Validation | 35        | 8743      | 36757  | 1788  |
| Test       | 35        | 10399     | 44795  | 2723  |
| Total      | 145       | 43870     | 191821 | 10333 |

In [None]:
import numpy as np
import time
import os
import random
import sys
sys.path.insert(1, os.path.realpath(os.path.pardir))

from optparse import OptionParser
from pathlib import Path
from model.hmm import HiddenMarkov, load_dataset

start_time = time.time()
for name in ['train', 'valid', 'test']:
    _, Y, T = load_dataset('../data/ner_on_html/' + name)
    t = [[['O', 'B-PER', 'I-PER'][t__] for t__ in t_] for t_ in Y]
    p = [[['O', 'B-PER', 'I-PER'][p__] for p__ in p_] for p_ in Y]
    w = T
    
    with Path('../results/score/{}.preds.txt'.format(name)).open('wb') as f:
        for words, preds, tags in zip(w, p, t):
            f.write(b'\n')
            for word, pred, tag in zip(words, preds, tags):
                f.write(' '.join([word, tag, pred]).encode() + b'\n')

!cd .. && ./eval.sh | grep processed

In [None]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

def plot_word_frequency(directory, color):
    """Plot the counts of the 100 most frequent tokens of a dataset.

    Reads the ``train``, ``valid`` and ``test`` files inside ``directory``,
    takes the first whitespace-separated field of every non-blank line
    (lower-cased), ignores ``-docstart-`` markers, and counts occurrences.
    The 100 largest counts are printed as ``(rank, count)`` pairs and drawn
    on the current matplotlib axes using the format string ``color``.

    Returns the 50 most frequent ``(token, count)`` pairs, most frequent first.
    """
    counts = Counter()
    for split in ('train', 'valid', 'test'):
        with open(directory + '/' + split) as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                token = stripped.lower().split()[0]
                if token != '-docstart-':
                    counts[token] += 1

    # Stable sort: tokens with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    top_counts = [count for _, count in ranked[:100]]

    print(list(enumerate(top_counts)))
    plt.plot(top_counts, color)
    return ranked[:50]
    
# Draw both corpora on the same axes: CoNLL-2003 in red, NER-on-HTML in blue.
plt.title('Word frequencies')
data1 = plot_word_frequency('../data/conll2003', 'r')
data2 = plot_word_frequency('../data/ner_on_html', 'b')

# Ten most frequent tokens of each corpus, one corpus per line.
print(' '.join(token for token, _ in data1[:10]))
print()
print(' '.join(token for token, _ in data2[:10]))

# Side-by-side rows with '&' separators (token, count, token, count).
for (token1, count1), (token2, count2) in zip(data1, data2):
    print('%s & %d & %s & %d' % (token1, count1, token2, count2))

In [None]:
import pandas as pd
from dython import nominal

def load_raw_dataset(f):
    """Read a dataset file and return one trimmed column list per token line.

    The file is read as UTF-8 and split into blocks on blank lines; blocks
    that start with ``-DOCSTART-`` are dropped. Every remaining non-blank
    line is split on single spaces and reduced to columns 2-4 followed by
    columns 7 onward. The code that consumes the result in this notebook
    reads those kept columns as: token, eight integer flags, HTML tag and
    CSS class.

    Blank lines inside a block (e.g. from three consecutive newlines) and
    empty files yield no rows, instead of empty rows that would break the
    positional indexing done by the caller.

    Args:
        f: Path of the dataset file.

    Returns:
        A list of rows, each a list of strings.
    """
    with open(f, 'r', encoding='utf8') as handle:
        data = handle.read().strip()

    rows = []
    for sentence in data.split('\n\n'):
        if sentence.startswith('-DOCSTART-'):
            continue
        for line in sentence.split('\n'):
            # Skip blank lines rather than emitting an empty row for them.
            if not line.strip():
                continue
            columns = line.split(' ')
            rows.append(columns[2:5] + columns[7:])
    return rows

# Gather the token rows of all three splits into one list.
X = []
for split_path in ('../data/ner_on_html/train',
                   '../data/ner_on_html/valid',
                   '../data/ner_on_html/test'):
    X.extend(load_raw_dataset(split_path))

# Row layout: column 0 is the word, columns 1-8 are integer flags,
# columns 9 and 10 are the HTML tag and the CSS class.
flag_columns = ['exact_match', 'partial_match', 'email', 'number',
                'honorific', 'url', 'capitalized', 'punctuation']

data = {'words': [row[0] for row in X]}
for position, column in enumerate(flag_columns, start=1):
    data[column] = [int(row[position]) for row in X]
data['html_tag'] = [row[9] for row in X]
data['css_class'] = [row[10] for row in X]

# Leftover inspection expression; it displays nothing because it is not the
# last statement of the cell.
data['words'][0]
df = pd.DataFrame(data)

# Pairwise association between all columns; the three string columns are
# declared nominal.
nominal.associations(df, nominal_columns=['words','html_tag', 'css_class'])

### How to do it: https://github.com/shakedzy/dython/issues/2

Calculates Cramér's V statistic for categorical-categorical association.
Uses the bias correction from Wicher Bergsma, Journal of the Korean Statistical Society 42 (2013): 323-328.
This is a symmetric coefficient: V(x,y) = V(y,x)

https://github.com/shakedzy/dython/blob/master/dython/nominal.py
https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

### Nested cross-validation

5-fold cross validation


Partition the training data randomly into five folds.

Nested CV
https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html

Common error with cross validation
https://www.youtube.com/watch?v=S06JpVoNaA0

https://www.kdnuggets.com/2017/08/dataiku-predictive-model-holdout-cross-validation.html

https://www.datarobot.com/wiki/training-validation-holdout/

The dataset is split into 3 different files: train, valid, and test. Also, we provide 11 features alongside each token.

| Feature                          | Type        |
|----------------------------------|-------------|
| Unaccented lowercase token       | Categorical |
| Exact dictionary match           | Binary      |
| Partial dictionary match         | Binary      |
| Email                            | Binary      |
| Number                           | Binary      |
| Honorific (Mr., Mrs., Dr., etc.) | Binary      |
| Matches a URL                    | Binary      |
| Is capitalized                   | Binary      |
| Is a punctuation sign            | Binary      |
| HTML tag + parent                | Categorical |
| CSS class                        | Categorical |

### Hidden Markov Models

In [None]:
import numpy as np
import time
import os
import random
import sys
sys.path.insert(1, os.path.realpath(os.path.pardir))

from optparse import OptionParser
from pathlib import Path
from model.hmm import HiddenMarkov, load_dataset

_TAG_NAMES = ['O', 'B-PER', 'I-PER']
_N_FOLDS = 5


def _group_into_documents(sentences):
    """Group (x, y, t) sentence triples into documents split at -DOCSTART- sentences."""
    documents = []
    for sentence in sentences:
        starts_document = sentence[0][0][0] == '-DOCSTART-'
        # Open an implicit first document if the data has no leading marker.
        if starts_document or not documents:
            documents.append([])
        if not starts_document:
            documents[-1].append(sentence)
    return documents


def _split_into_folds(documents, n_folds):
    """Cut documents into n_folds consecutive slices; the last one takes the remainder."""
    fold_size = len(documents) // n_folds
    folds = [documents[i * fold_size:(i + 1) * fold_size] for i in range(n_folds - 1)]
    folds.append(documents[(n_folds - 1) * fold_size:])
    return folds, fold_size


def _write_predictions(path, words, gold, predicted):
    """Write 'word gold predicted' lines, with a blank line before each sentence."""
    with Path(path).open('wb') as f:
        for sent_words, sent_gold, sent_pred in zip(words, gold, predicted):
            f.write(b'\n')
            for word, tag, pred in zip(sent_words, sent_gold, sent_pred):
                f.write(' '.join([word, tag, pred]).encode() + b'\n')


def test_hmm(timesteps, use_features, self_train, dataset):
    """Run 5-fold, document-level cross-validation of the HiddenMarkov model.

    The train, valid and test files of ``dataset`` are loaded and pooled,
    grouped into documents at ``-DOCSTART-`` sentences, shuffled, and cut
    into five folds. For every fold a fresh model is fitted on the other four
    folds and used to predict the held-out fold one document at a time. The
    result is written to ``../results/score/fold_<i>.preds.txt`` as
    ``word gold predicted`` lines.

    The shuffle is not seeded here, so the folds differ between runs unless
    the caller seeds ``random`` beforehand.

    Args:
        timesteps: Forwarded to ``HiddenMarkov``. ``0`` selects the
            naive-Bayes mode, in which case ``1`` is forwarded instead.
        use_features: Forwarded to ``HiddenMarkov``.
        self_train: Forwarded to ``HiddenMarkov``.
        dataset: Directory containing the ``train``, ``valid`` and ``test`` files.

    Returns:
        None. Progress and the elapsed time are printed.
    """
    start_time = time.time()
    naive_bayes = timesteps == 0
    if naive_bayes:
        timesteps = 1

    print('Fitting...')
    X1, Y1, T1 = load_dataset(dataset + '/train')
    X2, Y2, T2 = load_dataset(dataset + '/valid')
    X3, Y3, T3 = load_dataset(dataset + '/test')

    sentences = list(zip(X1 + X2 + X3, Y1 + Y2 + Y3, T1 + T2 + T3))

    # Folds are built from whole documents so that sentences of one document
    # never end up on both sides of a split.
    documents = _group_into_documents(sentences)
    random.shuffle(documents)
    folds, fold_size = _split_into_folds(documents, _N_FOLDS)
    print('Fold size:', fold_size)

    for i, held_out in enumerate(folds):
        # Flatten the documents of all other folds into one sentence list.
        train = [
            sentence
            for j, fold in enumerate(folds) if j != i
            for document in fold
            for sentence in document
        ]
        train_X, train_Y, _ = [list(column) for column in zip(*train)]

        hmm = HiddenMarkov(timesteps, naive_bayes=naive_bayes, use_features=use_features, self_train=self_train)
        hmm.fit(train_X, train_Y)

        test_Y = []
        test_T = []
        predictions = []
        for document in held_out:
            # A marker directly followed by another marker yields an empty document.
            if not document:
                continue
            x, y, t = [list(column) for column in zip(*document)]
            test_Y.extend(y)
            test_T.extend(t)
            predictions.extend(hmm.predict(x))

        gold_tags = [[_TAG_NAMES[label] for label in sentence] for sentence in test_Y]
        pred_tags = [[_TAG_NAMES[label] for label in sentence] for sentence in predictions]

        name = 'fold_' + str(i)
        print('Writing', name)
        _write_predictions('../results/score/{}.preds.txt'.format(name), test_T, gold_tags, pred_tags)

    print('Elapsed time: %.4f' % (time.time() - start_time))

#### Naive Bayes

In [None]:
# Naive Bayes baseline: timesteps=0 makes test_hmm switch on its naive_bayes mode.
# The call is commented out, so the commands below operate on whatever fold_*
# prediction files are already present in ../results/score.
# test_hmm(0, False, False, '../data/ner_on_html')

# Run the evaluation script from the parent directory, then archive the fold files.
!cd .. && ./eval_model.sh
!mkdir -p ../results/cross_validation/nb
!mv ../results/score/fold* ../results/cross_validation/nb

#### HMM

In [None]:
test_hmm(2, True, True, '../data/ner_on_html')

!cd .. && ./eval_model.sh
!mkdir -p ../results/cross_validation/hmm_1_features
!mv ../results/score/fold* ../results/cross_validation/hmm_1

### Maximum Entropy, CRFs, NNs

#### Maxent

In [None]:
import numpy as np
import time
import os
import sys
sys.path.insert(1, os.path.realpath(os.path.pardir))

import tensorflow as tf
from pathlib import Path
from model.estimator import Estimator

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable debug logs Tensorflow.
tf.logging.set_verbosity(tf.logging.ERROR)

estimator = Estimator()
estimator.set_dataset_params({
    'datadir': '../data/ner_on_html',
    'dataset_mode': 'sentences',
    "model": "maxent",  
    "epochs": 5,
    "batch_size": 10,
    "use_features": False,
    "word_embeddings": "one_hot",
    "char_representation": "none",
    "decoder": "crf",  
    "loss": "cross_entropy"
})
estimator.train_cv()

!cd .. && ./eval_model.sh
!mkdir -p ../results/cross_validation/maxent
!mv ../results/score/fold* ../results/cross_validation/maxent

#### CRF

In [None]:
import numpy as np
import time
import os
import sys
sys.path.insert(1, os.path.realpath(os.path.pardir))

import tensorflow as tf
from pathlib import Path
from model.estimator import Estimator

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable debug logs Tensorflow.
tf.logging.set_verbosity(tf.logging.ERROR)

estimator = Estimator()
estimator.set_dataset_params({
    'datadir': '../data/ner_on_html',
    'dataset_mode': 'sentences',
    "model": "maxent",  
    "epochs": 5,
    "batch_size": 10,
    "use_features": False,
    "word_embeddings": "one_hot",
    "char_representation": "none",
    "decoder": "crf"
    # "loss": "cross_entropy"
})
estimator.train_cv()

!cd .. && ./eval_model.sh
!mkdir -p ../results/cross_validation/crf_no_features
!mv ../results/score/fold* ../results/cross_validation/crf_no_features

#### CRF with features

In [None]:
import numpy as np
import time
import os
import sys
sys.path.insert(1, os.path.realpath(os.path.pardir))

import tensorflow as tf
from pathlib import Path
from model.estimator import Estimator

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable debug logs Tensorflow.
tf.logging.set_verbosity(tf.logging.ERROR)

estimator = Estimator()
estimator.set_dataset_params({
    'datadir': '../data/ner_on_html',
    'dataset_mode': 'sentences',
    "model": "crf",  
    "epochs": 5,
    "batch_size": 10,
    "use_features": True,
    "word_embeddings": "one_hot",
    "char_representation": "none",
    "decoder": "crf"
    # "loss": "cross_entropy"
})
estimator.train_cv()

!cd .. && ./eval_model.sh
!mkdir -p ../results/cross_validation/crf_group_a
!mv ../results/score/fold* ../results/cross_validation/crf_group_a