### Researcher Name Extraction Dataset

In [None]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

def plot_word_frequency(directory, color):
    """Plot the counts of the 100 most frequent first-column tokens of a dataset.

    Reads the files ``train``, ``valid`` and ``test`` inside ``directory``,
    takes the first whitespace-separated field of every non-blank line,
    lower-cases it, skips ``-docstart-`` markers and counts the rest.  The
    counts of the 100 most frequent tokens are printed together with their
    rank and drawn with ``plt.plot``.

    Args:
        directory: Path of the folder holding the three split files.
        color: Matplotlib format string handed to ``plt.plot`` (e.g. ``'r'``).

    Returns:
        A list of at most 50 ``(token, count)`` tuples, most frequent first.
        Tokens with equal counts keep the order in which they were first seen.
    """
    my_counter = Counter()
    for fname in ['train', 'valid', 'test']:
        # Explicit UTF-8 (as in load_raw_dataset) so the result does not depend
        # on the platform's default locale encoding.
        with open(directory + '/' + fname, encoding='utf8') as f:
            for line in f:
                fields = line.lower().split()
                # Blank lines yield no fields; document separators are not words.
                if fields and fields[0] != '-docstart-':
                    my_counter[fields[0]] += 1

    # most_common() is a stable descending sort by count.
    data = my_counter.most_common()
    top_counts = [count for _, count in data[:100]]

    print(list(enumerate(top_counts)))
    plt.plot(top_counts, color)
    return data[:50]
    
# Draw both corpora on the same figure: the conll2003 curve uses format 'r',
# the ner_on_html curve uses format 'b'.
plt.title('Word frequencies')
data1 = plot_word_frequency('../data/conll2003', 'r')
data2 = plot_word_frequency('../data/ner_on_html', 'b')

# Ten most frequent tokens of each corpus, separated by one blank line.
print(
    *(' '.join(word for word, _ in top[:10]) for top in (data1, data2)),
    sep='\n\n'
)

# Side-by-side "token & count" rows, one per rank (presumably meant to be
# pasted into a LaTeX table).
for (word1, count1), (word2, count2) in zip(data1, data2):
    print('%s & %d & %s & %d' % (word1, count1, word2, count2))

In [None]:
import pandas as pd
from dython import nominal

def load_raw_dataset(f):
    """Read a blank-line-separated token file and return one trimmed row per token.

    The file is split into blocks on empty lines.  Blocks that start with
    ``-DOCSTART-`` are dropped.  Every remaining non-blank line is split on
    single spaces and reduced to its fields 2-4 followed by fields 7 onward.

    Args:
        f: Path of the UTF-8 encoded dataset file.

    Returns:
        A flat list (block boundaries are not preserved) with one list of
        strings per token line.
    """
    # Use a separate name for the handle so the path argument is not shadowed.
    with open(f, 'r', encoding='utf8') as fh:
        text = fh.read().strip()

    rows = []
    for block in text.split('\n\n'):
        if block.startswith('-DOCSTART-'):
            continue
        for line in block.split('\n'):
            # Three or more consecutive newlines leave empty lines inside a
            # block; skipping them avoids emitting empty rows that would break
            # positional access (row[0], row[1], ...) downstream.
            if not line.strip():
                continue
            fields = line.split(' ')
            rows.append(fields[2:5] + fields[7:])
    return rows

# Pool the token rows of all three splits into one list.
X = []
for split in ('train', 'valid', 'test'):
    X.extend(load_raw_dataset('../data/ner_on_html/' + split))

# Row layout produced by load_raw_dataset: position 0 is the word, positions
# 1-8 are integer-valued features, positions 9 and 10 are strings.
int_columns = [
    'exact_match',
    'partial_match',
    'email',
    'number',
    'honorific',
    'url',
    'capitalized',
    'punctuation',
]

data = {'words': [x[0] for x in X]}
for position, column in enumerate(int_columns, start=1):
    data[column] = [int(x[position]) for x in X]
data['html_tag'] = [x[9] for x in X]
data['css_class'] = [x[10] for x in X]

df = pd.DataFrame(data)

# Associations between all columns; the three string columns are declared
# nominal (categorical).
nominal.associations(df, nominal_columns=['words','html_tag', 'css_class'])

### How to do it: https://github.com/shakedzy/dython/issues/2

Calculates the Cramér's V statistic for categorical-categorical association.
Uses the bias correction from Wicher Bergsma, Journal of the Korean Statistical Society 42 (2013): 323-328.
This is a symmetric coefficient: V(x,y) = V(y,x)

https://github.com/shakedzy/dython/blob/master/dython/nominal.py
https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

### Nested cross-validation

5-fold cross-validation


Partition the training data randomly into five folds; each fold serves once as the held-out set while the other four are used for fitting.

Nested CV
https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html

Common error with cross validation
https://www.youtube.com/watch?v=S06JpVoNaA0

https://www.kdnuggets.com/2017/08/dataiku-predictive-model-holdout-cross-validation.html

https://www.datarobot.com/wiki/training-validation-holdout/

### Naive Bayes

In [24]:
import numpy as np
import time
import os
import random
import sys
sys.path.insert(1, os.path.realpath(os.path.pardir))

from optparse import OptionParser
from pathlib import Path
from model.hmm import HiddenMarkov, load_dataset

def test_hmm(timesteps, dataset):
    start_time = time.time()
    naive_bayes = timesteps == 0
    if naive_bayes:
        timesteps = 1
        
    print('Fitting...')
    X1, Y1, _ = load_dataset(dataset + '/train')
    X2, Y2, _ = load_dataset(dataset + '/valid')
    training_set = [x for x in zip(X1 + X2, Y1 + Y2)]

    random.shuffle(training_set)
    fold_size = len(training_set) // 5
    
    folds = []
    for i in range(5):
        start = i * fold_size
        end = start + fold_size if (i < 4) else len(training_set)
        folds.append(training_set[start:end])
    print('Fold size:', fold_size)
    
    for f in folds:
        map(list, zip(*f))
        X, Y = [list(t) for t in zip(*f)]
  
        hmm = HiddenMarkov(1, naive_bayes=naive_bayes, use_features=True, self_train=True)
        hmm.fit(X, Y)

        for name in ['train', 'valid', 'test']:
        print('Predicting ' + name)
        x, t, w = load_raw_dataset('data/ner_on_html/' + name)
        p = hmm.predict(x)

        t = [[['O', 'B-PER', 'I-PER'][t__] for t__ in t_] for t_ in t]
        p = [[['O', 'B-PER', 'I-PER'][p__] for p__ in p_] for p_ in p]

        """
        with Path('results/score/{}.preds.txt'.format(name)).open('wb') as f:
          for words, preds, tags in zip(w, p, t):
            f.write(b'\n')
            for word, pred, tag in zip(words, preds, tags):
              f.write(' '.join([word, tag, pred]).encode() + b'\n')
        """
    print('Elapsed time: %.4f' % (time.time() - start_time))
    
# timesteps=0 makes test_hmm select the naive Bayes variant: it sets
# naive_bayes=True and resets timesteps to 1 before building the model.
test_hmm(0, '../data/ner_on_html')

# Presumably an alternative dataset path, kept for reference:
# 'data/conll2003_person'

Fitting...
Fold size: 6694
[[['Staff', 'staff', '0', '1', '2', '7', '0', '0', '0', '0', '1', '0', 'a.li', 'menu-item.menu-item-type-post_type.menu-item-object-page.menu-item-766']], [['Ph.D.', 'ph.d.', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', 'div.div', ''], ['Electrical', 'electrical', '0', '0', '0', '4', '0', '0', '0', '0', '1', '0', 'div.div', ''], ['and', 'and', '0', '1', '0', '10', '0', '0', '0', '0', '0', '0', 'div.div', ''], ['Computer', 'computer', '0', '0', '0', '7', '0', '0', '0', '0', '1', '0', 'div.div', ''], ['Engineering', 'engineering', '0', '0', '0', '7', '0', '0', '0', '0', '1', '0', 'div.div', ''], [',', ',', '0', '1', '0', '0', '0', '0', '0', '0', '0', '1', 'div.div', ''], ['Auburn', 'auburn', '0', '1', '1', '2', '0', '0', '0', '0', '1', '0', 'div.div', ''], ['University', 'university', '0', '1', '1', '8', '0', '0', '0', '0', '1', '0', 'div.div', '']], [['Education', 'education', '0', '0', '0', '8', '0', '0', '0', '0', '1', '0', 'li.ul', 'megaChild-inner-nav