In [10]:
import os
import numpy as np
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC

from generator import Generator
from corpus import ConllCorpusReaderX

import warnings
# Silence every warning category for the rest of the session; this also hides
# convergence and deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Files handed to Generator as `path=` for the train and test feature sets.
# test_percentage() deletes both before each run.
TRAINSET_PATH = "./factrueval_trainset.npz"
TESTSET_PATH = "./factrueval_testset.npz"

In [11]:
# Reader over devset.txt in ./factrueval2016_dataset/ with four columns per
# token (words, offset, len, ne). test_percentage() uses it as training data.
factrueval_devset = ConllCorpusReaderX('./factrueval2016_dataset/',
                                       fileids='devset.txt', 
                                       columntypes=['words', 'offset', 'len', 'ne'])

# Reader over testset.txt with the same column layout; used as the held-out
# evaluation data in test_percentage().
factrueval_testset = ConllCorpusReaderX('./factrueval2016_dataset/', 
                                        fileids='testset.txt', 
                                        columntypes=['words', 'offset', 'len', 'ne'])

def clean(Y_pred, Y_test):
    """Drop every position where both the prediction and the reference are 'O'.

    A position is kept when at least one of the two tags differs from 'O',
    so the result still contains 'O' on one side wherever the other side
    holds a non-'O' tag.

    Args:
        Y_pred: Sequence of predicted tags.
        Y_test: Sequence of reference tags, aligned element-wise with Y_pred.

    Returns:
        tuple: (Y_pred_fixed, Y_test_fixed), two NumPy arrays of equal length
        holding the retained predicted and reference tags, in input order.

    Raises:
        ValueError: If Y_pred and Y_test do not have the same shape.
    """
    Y_pred = np.array(Y_pred)
    Y_test = np.array(Y_test)

    # Fail early and clearly: without this check a length-1 input would be
    # broadcast against the other one and fail later with an unrelated
    # indexing/reshape error.
    if Y_pred.shape != Y_test.shape:
        raise ValueError(
            "Y_pred and Y_test must have the same shape, got {} and {}".format(
                Y_pred.shape, Y_test.shape))

    # Nothing to filter. Also avoids comparing an empty (non-string dtype)
    # array against a string, whose result depends on the NumPy version.
    if Y_pred.size == 0:
        return Y_pred, Y_test

    # Element-wise mask built directly, with no wrap-in-a-list/reshape round-trip.
    keep = (Y_pred != 'O') | (Y_test != 'O')

    return Y_pred[keep], Y_test[keep]

In [12]:
def _strip_tag_prefix(tag):
    """Return "O" unchanged; for any other tag drop its first two characters.

    Presumably the dropped characters are a BIO-style prefix such as "B-" or
    "I-" (verify against the corpus tag set).
    """
    return tag if tag == "O" else tag[2:]


def _entity_weighted_f1(Y_true, Y_pred):
    """Weighted F1 over the prefix-stripped, non-"O" labels found in Y_true.

    Positions where both sides are "O" are removed with clean() before the
    tags are reduced with _strip_tag_prefix().

    Args:
        Y_true: Reference tags.
        Y_pred: Predicted tags, aligned with Y_true.

    Returns:
        The value of sklearn's f1_score with average="weighted", restricted
        to the labels that occur in the cleaned reference tags, except "O".
    """
    Y_pred_c, Y_true_c = clean(Y_pred, Y_true)

    pred_light = [_strip_tag_prefix(tag) for tag in Y_pred_c]
    true_light = [_strip_tag_prefix(tag) for tag in Y_true_c]

    # "O" is only present when the model predicted an entity on an "O" token.
    # Filter it out instead of list.remove("O"), which raises ValueError when
    # no such position exists. dict.fromkeys keeps first-seen label order.
    light_labels = [label for label in dict.fromkeys(true_light) if label != "O"]

    return f1_score(true_light, pred_light, average="weighted", labels=light_labels)


def test_percentage(percentage):
    """Train and score a word-context classifier for one weight_percentage.

    Builds features with Generator(column_types=['WORD'], context_len=2,
    weight_percentage=percentage), fits LogisticRegression on the dev corpus
    and evaluates it on both the dev (training) and the test corpus.

    Args:
        percentage: Value forwarded to Generator as weight_percentage.

    Returns:
        tuple: (train_f1, test_f1), the weighted F1 on the training data
        followed by the weighted F1 on the test data.
    """
    # Delete previously written feature files before regenerating them.
    # NOTE(review): presumably Generator would otherwise reuse the files at
    # these paths; confirm against generator.py.
    for cached_path in (TRAINSET_PATH, TESTSET_PATH):
        if os.path.exists(cached_path):
            os.remove(cached_path)

    gen = Generator(column_types=['WORD'], context_len=2, weight_percentage=percentage)

    Y_train = [el[1] for el in factrueval_devset.get_ne()]
    Y_test = [el[1] for el in factrueval_testset.get_ne()]

    # Each token is wrapped in a one-element list before being passed on.
    X_train = gen.fit_transform([[el] for el in factrueval_devset.words()],
                                Y_train,
                                path=TRAINSET_PATH)
    X_test = gen.transform([[el] for el in factrueval_testset.words()],
                           path=TESTSET_PATH)

    clf = LogisticRegression()
    clf.fit(X_train, Y_train)

    test_f1 = _entity_weighted_f1(Y_test, clf.predict(X_test))
    train_f1 = _entity_weighted_f1(Y_train, clf.predict(X_train))

    return train_f1, test_f1

In [None]:
# Candidate weight_percentage values to evaluate, in the order they are run.
poss_percentage = [0.6, 0.7, 0.8, 0.9, 0.95]

# One test_percentage() result per candidate: (train score, test score).
results = list(map(test_percentage, poss_percentage))