# (1) Load Data
First, we load all the data we need into pandas dataframes.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
import nltk

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 500)

In [3]:
from collections import namedtuple
from collections import defaultdict

Dataset = namedtuple('Dataset', 'name, train, test')
FeatureDataset = namedtuple('FeatureDataset', 'name, fc, agg, train, test')
FeatureCategory = namedtuple('FeatureCategory', 'name, func')
Aggregation = namedtuple('Aggregation', 'name, agg')

In [4]:
from nltk import word_tokenize

columns = ['id', 'sentence', "start", "end", "target", 
           "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]

def load_df(path, d_type, header):
    df = pd.read_csv(path, header=header, sep = "\t")
    if len(df.columns) == len(columns):
        df.columns = columns
    if d_type == 'word':
        df = df.loc[df.target.map(lambda target : len(word_tokenize(target)))<=1,]
    elif d_type == 'phrase':
        df = df.loc[df.target.map(lambda target : len(word_tokenize(target)))>1,]
    return df

def load_datasets(names, train_name, test_name, type_train = None, type_test = None, header=None):
    MAIN_PATH_DATASET = "../cwishareddataset/traindevset/english/"
    datasets = [Dataset(name, load_df(MAIN_PATH_DATASET + name + '_' + train_name + '.tsv', type_train, header),
                              load_df(MAIN_PATH_DATASET + name + '_' + test_name + '.tsv', type_test, header))
                              for name in names]
    return datasets

# (2.1) Preprocessing
Here we compute preprocessed variants of the target words. We provide a preprocessed target word with whitespace removel, lowercasing etc. In addition, we provide the lemma of the target and the preprocessed versions of the lemma. Finall, we also compute the POS tags and the PennTreebank POS tags, so later feature functions requiring POS tags can easily access the precomputed tags.

In [5]:
from nltk.stem.wordnet import *
from nltk import word_tokenize
from functools import lru_cache
from utils import penn_to_wn
import re
import unicodedata
import sys

wordNetLemmatizer = WordNetLemmatizer()

def overlaps(start1, end1, start2, end2):
    return bool(range(max(start1, start2), min(end1, end2)+1))

tbl = dict.fromkeys(i for i in range(sys.maxunicode)
                      if unicodedata.category(chr(i)).startswith('P'))

def remove_punctuation(text):
    return text.translate(tbl)

def ratio_non_alpha(target):
    return 1 - (np.sum([1 for letter in target if (ord(letter)>=65 and ord(letter)<=90) 
             or (ord(letter)>=97 and ord(letter)<=122)]) / len(target))

@lru_cache(maxsize=None)
def targets_with_index(start, end, context):
    curr_pos = 0
    targets = []
    j = 0
    w = 0
    curr_split = ''
    ctx_split = context.split()
    whitespaces = re.findall('\s+', context)
    num_whitespaces = [len(token) for token in whitespaces]
    num_whitespaces.append(1)
    tokens = word_tokenize(context)
    tokens = ['"' if token not in context else token for token in tokens]
    for index, token in enumerate(tokens, 1):
        targets.append((token, index, curr_pos, (curr_pos + len(token))))
        curr_pos += len(token)
        curr_split += token
        if ctx_split[j] == curr_split:
            curr_pos += num_whitespaces[w]
            j += 1
            w += 1
            curr_split = ''
    vals = [(target[0], target[1]) for target in targets \
            if overlaps(start, end, target[2], target[3])]
    return [val for val in vals if val[0] != '"']

@lru_cache(maxsize=None)
def wordnet_pos_tagging(sentence):
    tokens = word_tokenize(sentence)
    return nltk.pos_tag(tokens)

def pos_tags(start, end, target, sentence):
    wordPOSPairs = wordnet_pos_tagging(sentence)
    targets_index = targets_with_index(start, end, sentence)
    results = [wordPOSPairs[tpl[1]-1][1] for tpl in targets_index]
    filtered_results = [result for result in results 
                        if remove_punctuation(result).strip() and result != 'POS']
    if len(nltk.word_tokenize(target)) != len(filtered_results):
            return ['n' for word in target.split()]
    return filtered_results if len(filtered_results) > 0 else None

def wordnet_lemma(target, pos):
    #tokens = nltk.word_tokenize(target)
    tokens = target.split()
    if pos:
        if len(pos) != len(tokens):
            return target
        pos = [penn_to_wn(poss) if penn_to_wn(poss) else 'n' for poss in pos]
        lemmas = [wordNetLemmatizer.lemmatize(token, poss)
                     for token, poss in zip(tokens, pos)]
        return ' '.join(lemmas)
    return target

def preprocessing(dataframe):
    df = dataframe.copy()
    df['sentence'] = df.sentence.apply(lambda sent : sent.replace("''", "``"))
    df['p_target'] = df.target.apply(lambda target : target.strip().lower())
    df['pos_tags'] = df[['start', 'end', 'target', 'sentence']].apply(lambda vals : pos_tags(*vals), axis = 1)
    df['pos_tags_pt'] = df.pos_tags.apply(lambda pos : [penn_to_wn(poss) if penn_to_wn(poss) else 'n' for poss in pos] 
                                          if pos else [])
    df['lemma'] = df[['target', 'pos_tags']].apply(lambda vals : wordnet_lemma(*vals), axis = 1)
    df['p_lemma'] = df.lemma.apply(lambda lemma : lemma.strip().lower())
    return df

In [6]:
def preprocess_datasets(datasets):
    return [Dataset(ds.name, preprocessing(ds.train), 
                             preprocessing(ds.test)) 
                             for ds in datasets]

# (2.1.2) Regularization
Here we provide some functions to compute a regularized binary label based on thresholds of the probability, the number of native annotations, the number of non-native annotations and the sum of native and non-native annotations. Setting the threshold up in order to require more than a single mark for a word to be complex, may help in regularizing the model. Note that this regularized binary label of course should only used on the training set.

In [7]:
def create_regularied_label_prob(dataframe, prob_thresh = 0.05):
    df = dataframe.copy()
    df['binary'] = df.prob.apply(lambda prob : 1 if prob >= prob_thresh else 0)
    return df

def create_regularized_label_nat(dataframe, nat_thresh = 1):
    df = dataframe.copy()
    df['binary'] = df.nat_marked.apply(lambda nat : 1 if nat >= nat_thresh else 0)
    return df

def create_regularized_label_non_nat(dataframe, non_nat_thresh = 1):
    df = dataframe.copy()
    df['binary'] = df.non_nat_marked.apply(lambda nat : 1 if nat >= non_nat_thresh else 0)
    return df

def create_regularized_label_marks_sum(dataframe, sum_thresh = 1):
    df = dataframe.copy()
    df['binary'] = df[['nat_marked','non_nat_marked']].apply(lambda marks : 1 \
                                        if sum(marks) > sum_thresh else 0, axis=1)
    return df

In [8]:
def apply_regularization(datasets, regularizer, val):
     return [Dataset(ds.name, regularizer(ds.train, val), 
                        ds.test) for ds in datasets]

# (2.2) Aggregation (A2)
Since many labels are multi-word expression, we first of all define some aggregation functions that aggregate feature values over multiple tokens. Implementing this seperately allows to easily exchange the used aggregation function and keeps the feature computation functions clean. These feature computation functions should only compute features for a single target word.

In [9]:
word_freq_wiki = {}
sum_counts = 0
with open("resources/word-freq-dumps/enwiki-20150602-words-frequency.txt", encoding="utf8") as file:
    for line in file:
        word, freq = line.partition(" ")[::2]
        sum_counts+=int(freq)
        word_freq_wiki[word.strip()] = int(freq)
        
def get_unigram_probability(word):
    return word_freq_wiki.get(word,1) / (sum_counts + len(word_freq_wiki))

In [10]:
from nltk.tokenize import word_tokenize

def agg_feat_num_average(target, func_feature, *args, **kwargs):
    if 'pos' in kwargs:
        pos = kwargs.pop('pos')
        return np.mean([func_feature(token, *args, pos=poss) 
                for token, poss in zip(word_tokenize(target), pos)])
    return np.mean([func_feature(token, *args) for token in word_tokenize(target)])

def agg_feat_num_weighted_average(target, func_feature, alpha, *args, **kwargs):
    if 'pos' in kwargs:
        pos = kwargs.pop('pos')
        prob_sum = np.sum([(alpha/(alpha+get_unigram_probability(token))) for token in word_tokenize(target)])
        return np.mean([((alpha/(alpha+get_unigram_probability(token)))/prob_sum) * 
                func_feature(token, *args, pos=poss) for token, poss in zip(word_tokenize(target), pos)])
    prob_sum = np.sum([(alpha/(alpha+get_unigram_probability(token))) for token in word_tokenize(target)])
    return np.sum([((alpha/(alpha+get_unigram_probability(token)))/prob_sum) * 
                func_feature(token, *args) for token in word_tokenize(target)])

agg_feat_num_weighted_average_medium = lambda target, func_feature, *args, **kwargs: \
                        agg_feat_num_weighted_average(target, func_feature, 0.0001, *args, **kwargs)

def agg_feat_num_median(target, func_feature, *args, **kwargs):
    if 'pos' in kwargs:
        pos = kwargs.pop('pos')
        return np.median([func_feature(token, *args, pos=poss) 
                for token, poss in zip(word_tokenize(target), pos)])
    return np.median([func_feature(token, *args) for token in word_tokenize(target)])

def agg_feat_num_max(target, func_feature, *args, **kwargs):
    if 'pos' in kwargs:
        pos = kwargs.pop('pos')
        return np.max([func_feature(token, *args, pos=poss) for token, poss in zip(word_tokenize(target), pos)])
    return np.max([func_feature(token, *args) for token in word_tokenize(target)])

def agg_feat_num_min(target, func_feature, *args, **kwargs):
    if 'pos' in kwargs:
        pos = kwargs.pop('pos')
        return np.min([func_feature(token, *args, pos=poss) for token, poss in zip(word_tokenize(target), pos)])
    return np.min([func_feature(token, *args) for token in word_tokenize(target)])

In [11]:
def simple(target):
    return len(target)

agg_feat_num_weighted_average_medium('and web science group', simple)

4.400640363656745

In [12]:
agg_mean = [Aggregation('mean', agg_feat_num_average)]
agg_max = [Aggregation('max', agg_feat_num_max)]
agg_weighted = [Aggregation('weighted_mean', agg_feat_num_weighted_average_medium)]
agg_default = [Aggregation('mean', agg_feat_num_average)]
aggs_small = [Aggregation('mean', agg_feat_num_average), Aggregation('max', agg_feat_num_max)]
aggs_all = [Aggregation('mean', agg_feat_num_average),
            Aggregation('max', agg_feat_num_max), Aggregation('min', agg_feat_num_min),
           Aggregation('weighted_mean', agg_feat_num_weighted_average_medium)]

In [13]:
aggs = agg_default 

In [14]:
def concat_feature_datasets(*args, name=None):
    zipped = zip(*args)
    concat_features = []
    for dataset in zipped:
        df_train = None
        df_test = None
        fcs = []
        aggs = []
        for tpl in dataset:
            if not fcs:
                df_train = tpl.train.copy()
                df_test = tpl.test.copy()
            else:
                df_train = pd.concat([df_train, tpl.train.copy()], axis = 1)
                df_test = pd.concat([df_test, tpl.test.copy()], axis = 1)
            fcs.append(tpl.fc)
            aggs.append(tpl.agg)
        if name:
            data_name = (name,)
        else:
            data_name = fcs
        concat_features.append(FeatureDataset(tpl.name, data_name, aggs,
                    df_train.loc[:,~df_train.columns.duplicated()], 
                    df_test.loc[:,~df_test.columns.duplicated()]))
    return concat_features

# (3) Features 

## (3.0.1) Baseline I
The baseline I feature set covers only the two most relevant features as previous work has been shown. In many research work, only these two features, namely the word length and the word frequency are employed as features to compute complexity. Hence, we set this as our first feature baseline.

In [15]:
word_freq_wiki = {}
freq_sum_wiki = 0
with open("resources/word-freq-dumps/enwiki-20150602-words-frequency.txt", encoding="utf8") as file:
    for line in file:
        word, freq = line.partition(" ")[::2]
        word_freq_wiki[word.strip()] = int(freq)
        freq_sum_wiki+=int(freq)
        
def get_dict_count(target, freqs):
    return freqs.get(target.strip().lower(), 0)

In [16]:
def features_baseline_1(dataframe, agg, drop_features):
    df = dataframe.copy()
    df['length (bl1)'] = df.target.apply(lambda target : agg(target, len))
    df['freq_wiki (bl1)'] = df.p_target.apply(lambda target : agg(target, get_dict_count, word_freq_wiki))
    df['log_freq_wiki (bl1)'] = df['freq_wiki (bl1)'].apply(lambda freq : np.log(freq))
    df = df.drop(drop_features, axis = 1)
    return df

fc_baseline_1 = FeatureCategory('baseline_1', features_baseline_1)

In [17]:
def compute_features_baseline_1(datasets, aggs = agg_default, drop_features = []):
    return [FeatureDataset(ds.name, fc_baseline_1, agg,
                        fc_baseline_1.func(ds.train, agg.agg, drop_features), 
                        fc_baseline_1.func(ds.test, agg.agg, drop_features)) 
                        for ds in datasets
                        for agg in aggs]

## (3.0.2) Basline II

In [18]:
from wordmodel import Word

words_mrc_database = {}
with open("resources/mrc-database/mrc2.dct", encoding="utf8") as file:
    for index, line in enumerate(file):
        line = line.strip()
        word, phon, dphon, stress = line[51:].split('|')
        w = Word(
                wid = index,
                nlet = int(line[0:2]),
                nphon = int(line[2:4]),
                nsyl = int(line[4]),
                kf_freq = int(line[5:10]),
                kf_ncats = int(line[10:12]),
                kf_nsamp = int(line[12:15]),
                tl_freq = int(line[15:21]),
                brown_freq = int(line[21:25]),
                fam = int(line[25:28]),
                conc = int(line[28:31]),
                imag = int(line[31:34]),
                meanc = int(line[34:37]),
                meanp = int(line[37:40]),
                aoa = int(line[40:43]),
                tq2 = line[43],
                wtype = line[44],
                pdwtype = line[45],
                alphasyl = line[46],
                status = line[47],
                var = line[48],
                cap = line[49],
                irreg = line[50],
                word=word,
                phon=phon,
                dphon=dphon,
                stress=stress)
        words_mrc_database[w.word.strip().lower()] = w

def mrc_database(target, func, missing_val):
    word = words_mrc_database.get(target.strip().lower())
    val = func(word) if word else missing_val
    return val if val != 0 else missing_val

word_concreteness = {}
with open("resources/word-freq-dumps/concreteness_brysbaert_et_al.txt", encoding="utf8") as file:
    for line in file:
        word, bigram, conc_m, conc_sd, \
        unknown, total, percent_known, \
        subtlex, dom_pos = line.split('\t')
        word_concreteness[word.strip()] = float(conc_m)

word_freq_wiki = {}
freq_sum_wiki = 0
with open("resources/word-freq-dumps/enwiki-20150602-words-frequency.txt", encoding="utf8") as file:
    for line in file:
        word, freq = line.partition(" ")[::2]
        word_freq_wiki[word.strip()] = int(freq)
        freq_sum_wiki+=int(freq)
        
def get_dict_count(target, freqs):
    return freqs.get(target.strip().lower(), 0)

In [19]:
def features_baseline_2(dataframe, agg, drop_features):
    df = dataframe.copy()
    df['length (bl2)'] = df.target.apply(lambda target : agg(target, len))
    df['freq_wiki (bl2)'] = df.p_target.apply(lambda target : agg(target, get_dict_count, word_freq_wiki))
    df['log_freq_wiki (bl2)'] = df[['freq_wiki (bl2)']].apply(lambda freq : np.log(freq))
    df['mrc_fam (bl2)'] = df.p_target.apply(lambda target : agg(target, mrc_database, lambda word : word.fam, 400))
    df['mrc_conc (bl2)'] = df.p_target.apply(lambda target : agg(target, mrc_database, lambda word : word.conc, 400))
    df['mrc_imag (bl2)'] = df.p_target.apply(lambda target : agg(target, mrc_database, lambda word : word.imag, 400))
    df['mrc_meanc (bl2)'] = df.p_target.apply(lambda target : agg(target, mrc_database, lambda word : word.meanc, 400))
    df['concreteness (bl2)'] = df.p_target.apply(lambda target : agg(target, \
                                                lambda target : word_concreteness.get(target, 2.5)))
    df = df.drop(drop_features, axis = 1)
    return df

fc_baseline_2 = FeatureCategory('baseline_2', features_baseline_2)

In [20]:
def compute_features_baseline_2(datasets, aggs = agg_default, drop_features = []):
     return [FeatureDataset(ds.name, fc_baseline_2, agg,
                        fc_baseline_2.func(ds.train, agg.agg, drop_features), 
                        fc_baseline_2.func(ds.test, agg.agg, drop_features)) 
                        for ds in datasets
                        for agg in aggs]

## (3.1) Linguistic Features
Here we compute linguistic word features like the number of vowels the word has.

In [21]:
from nltk.corpus import cmudict
import numpy as np
import pronouncing as pnc
from wordmodel import Word
from nltk.stem.porter import *
from nltk.stem.wordnet import *
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
import os
from functools import lru_cache
from collections import Counter
from mezmorize import Cache
from nltk.tokenize import word_tokenize
import string
import pronouncing as pnc
from utils import penn_to_wn
from nltk.parse.corenlp import *
from collections import Counter
import pickle


java_path = "C:/Program Files (x86)/Java/jdk1.8.0_144/bin/java.exe"
os.environ['JAVAHOME'] = java_path
path_to_jar = 'resources/stanford-dependency-parser/stanford-parser.jar'
path_to_models_jar = 'resources/stanford-dependency-parser/stanford-parser-3.9.1-models.jar'

porterStemmer = PorterStemmer()
wordNetLemmatizer = WordNetLemmatizer()
nerTagger = StanfordNERTagger('resources/stanford-ner-tagger/classifiers/english.all.3class.distsim.crf.ser.gz',
               'resources/stanford-ner-tagger/stanford-ner.jar',
               encoding='utf-8')

words_mrc_database = {}
with open("resources/mrc-database/mrc2.dct", encoding="utf8") as file:
    for index, line in enumerate(file):
        line = line.strip()
        word, phon, dphon, stress = line[51:].split('|')
        w = Word(
                wid = index,
                nlet = int(line[0:2]),
                nphon = int(line[2:4]),
                nsyl = int(line[4]),
                kf_freq = int(line[5:10]),
                kf_ncats = int(line[10:12]),
                kf_nsamp = int(line[12:15]),
                tl_freq = int(line[15:21]),
                brown_freq = int(line[21:25]),
                fam = int(line[25:28]),
                conc = int(line[28:31]),
                imag = int(line[31:34]),
                meanc = int(line[34:37]),
                meanp = int(line[37:40]),
                aoa = int(line[40:43]),
                tq2 = line[43],
                wtype = line[44],
                pdwtype = line[45],
                alphasyl = line[46],
                status = line[47],
                var = line[48],
                cap = line[49],
                irreg = line[50],
                word=word,
                phon=phon,
                dphon=dphon,
                stress=stress)
        words_mrc_database[w.word.strip().lower()] = w

def mrc_database(target, func, missing_val):
    word = words_mrc_database.get(target.strip().lower())
    val = func(word) if word else missing_val
    return val if val != 0 else missing_val

d = cmudict.dict()

def num_syllables_rule_based(target):
    vowels = "aeiouy"
    numVowels = 0
    lastWasVowel = False
    for wc in target:
        foundVowel = False
        for v in vowels:
            if v == wc:
                if not lastWasVowel: numVowels+=1  
                foundVowel = lastWasVowel = True
                break
        if not foundVowel:  
            lastWasVowel = False
    if len(target) > 2 and target[-2:] == "es":
        numVowels-=1
    elif len(target) > 1 and target[-1:] == "e":
        numVowels-=1
    return numVowels

def num_syllables(target):
    if target in d:
        return np.mean([len(list(y for y in x if y[-1].isdigit())) for x in d[target.lower()]])
    else:
        return num_syllables_rule_based(target)
    
def num_vowels(target):
    return np.sum([target.lower().count(vowel) for vowel in 'aeiouy'])

def cognate_across_languages_sim(target, sim_func, agg_func, translations):
    targ = target.strip().lower()
    translated = translations.get(targ)
    if not translated:
        return 0
    trans_texts = set([trans_word.text for trans_word in translated])
    similarities = [sim_func(targ,trans_text) 
                    for trans_text in trans_texts]
    return agg_func(similarities)

def porter_stem_len(target):
    return len(str(porterStemmer.stem(target)))

def porter_stemmer_num_steps(target):
    stem = target.lower()
    applied_steps = 0
    if porterStemmer.mode == porterStemmer.NLTK_EXTENSIONS and target in porterStemmer.pool:
            return applied_steps
    if porterStemmer.mode != porterStemmer.ORIGINAL_ALGORITHM and len(target) <= 2:
            return applied_steps
    step_funcs = [porterStemmer._step1a, porterStemmer._step1b, porterStemmer._step1c,
                  porterStemmer._step2, porterStemmer._step3, porterStemmer._step3,
                  porterStemmer._step4, porterStemmer._step5a, porterStemmer._step5b]
    for step_func in step_funcs:
        stem_step = step_func(stem)
        if stem_step != stem:
            stem = stem_step
            applied_steps += 1
    return applied_steps

def is_named_entity(sentence, target):
    tokenized_sent = word_tokenize(sentence)
    tagged_sent = nerTagger.tag(tokenized_sent)
    for token, tag in tagged_sent:
        if token == target and tag != 'O':
            return 1
    return 0

def named_entity_type(sentence, target):
    tokenized_sent = word_tokenize(sentence)
    tagged_sent = nerTagger.tag(tokenized_sent)
    return [tag for token, tag in tagged_sent if token == target][0]

def ratio_cap_letters(target):
    return np.sum([1 for letter in target if letter.isupper()]) / len(target)

def ratio_num_letters(target):
    return np.sum([1 for letter in target if letter.isdigit()]) / len(target)

def ratio_non_ascii_letters(target):
    ascii = set(string.printable)   
    return 1 - (np.sum([1 for letter in target if letter in ascii]) / len(target))

def ratio_non_alpha(target):
    return 1 - (np.sum([1 for letter in target if (ord(letter)>=65 and ord(letter)<=90) 
             or (ord(letter)>=97 and ord(letter)<=122)]) / len(target))

def grapheme_to_phoneme_ratio(target):
    phoneme_lengths = [len(prons.split()) 
            for prons in pnc.phones_for_word(target)]
    if phoneme_lengths:
        return len(target) / np.mean(phoneme_lengths)
    return 1

def num_pronounciations(target):
    length = len(pnc.phones_for_word(target))
    return length if length != 0 else 1

# First make sure that the StanfordCoreNLP Server is running under port 9011
# cd to stanfordCoreNLP directory
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9011 -timeout 15000
parser = CoreNLPDependencyParser(url='http://localhost:9011/')

@lru_cache(maxsize=None)
def dependency_parse_with_root(sentence):
    try:
        dependency_parser = parser.raw_parse(sentence)
        dependencies = []
        parsetree = list(dependency_parser)[0]
        for index, node in parsetree.nodes.items():
            for relation, dependant in parsetree.nodes[index]['deps'].items():
                for dep in dependant:
                    triple = ((node['word'], index), relation, \
                              (parsetree.nodes[dep]['word'], dep))
                    dependencies.append(triple)
        return dependencies
    except:
        return []

@lru_cache(maxsize=None)
def dependency_parse(sentence):
    dependencies = dependency_parse_with_root(sentence)
    filtered_dependencies = [triple for triple in dependencies if triple[1] != 'ROOT']
    return filtered_dependencies


def dep_dist_to_head(target, start, end, context):
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    return np.nan_to_num(np.mean([np.abs(triple[0][1] - triple[2][1])-1 
                                for triple in triples if triple[2] in targets]))

def dep_dist_to_root(target, start, end, context):
    targets = targets_with_index(start, end, context)
    triples = dependency_parse_with_root(context)
    root_nodes = list(filter(lambda triple : triple[1] == 'ROOT' , triples))
    if root_nodes: 
        root_node = root_nodes[0]
    else:
        return 0
    dist = np.nan_to_num(np.mean([np.abs(root_node[2][1] - triple[2][1])-1 
                                for triple in triples if triple[2] in targets]))
    return dist if dist != -1 else 0

def dep_relation_to_head(target, start, end, context):
    targets = targets_with_index(start, end, context)
    triples = dependency_parse_with_root(context)
    relations = [triple[1] for triple in triples if triple[2] in targets]
    return relations[0] if len(relations) == 1 else 'misc'
    
def dep_head_word_len(target, start, end, context):
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    return np.nan_to_num(np.mean([len(triple[0][0]) 
        for triple in triples if triple[2] in targets]))

def dep_num_dependents(target, start, end, context):
    targets = targets_with_index(start, end, context)
    triples = dependency_parse_with_root(context)
    return len([triple[1] for triple in triples if triple[0] in targets])

def dep_max_num_dependents(context):
    triples = dependency_parse_with_root(context)
    most = Counter([triple[0][0] for triple in triples]).most_common(1)
    return most[0][1] if most else 0

In [22]:
datasets = load_datasets(['Wikipedia', 'WikiNews', 'News'], 'TrainDev', 'Test')
targets_train = list(set([ngram for ds in datasets for mwe in ds.train['target'].tolist() for ngram in mwe.split()]))
targets_test = list(set([ngram for ds in datasets for mwe in ds.test['target'].tolist() for ngram in mwe.split()]))
targets = targets_train.copy()
targets.extend(targets_test)
print('Len ta_train : {}'.format(len(targets_train)))
print('Len ta_test : {}'.format(len(targets_test)))
print('Len targets : {}'.format(len(targets)))

Len ta_train : 8251
Len ta_test : 2097
Len targets : 10348


from googletrans import Translator
from collections import defaultdict
import pickle
translator = Translator()
targets = [target.strip().lower() for target in targets]

trans_word = translator.translate(word, dest='de')

from googletrans import Translator
from collections import defaultdict
import pickle

translator = Translator()
targets = [target.strip().lower() for target in targets]
languages = ['fr', 'de', 'es']
translations = defaultdict(list)
for index, word in enumerate(targets):
    print(word)
    translator = Translator()
    for lang in languages:
        trans_word = translator.translate(word, dest=lang)
        translations[word].append(trans_word)
        print(str(index) + " " + word + " " + trans_word.text)
with open('resources/translations/translations.json', 'wb') as fp:
    pickle.dump(translations, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [23]:
from googletrans import Translator
from collections import defaultdict
import pickle
with open('resources/translations/data.json', 'rb') as fp:
    data = pickle.load(fp)

if not data:
    translator = Translator()
    targets = [target.strip().lower() for target in targets]
    languages = ['fr', 'de', 'es']
    translations = defaultdict(list)
    for index, word in enumerate(targets):
        translator = Translator()
        for lang in languages:
            trans_word = translator.translate(word, dest=lang)
            translations[word].append(trans_word)
            print(str(index) + " " + word + " " + trans_word.text)
    with open('resources/translations/data.json', 'wb') as fp:
        pickle.dump(translations, fp, protocol=pickle.HIGHEST_PROTOCOL)
else:
    translations = data

In [24]:
from similarity.ngram import NGram
bigram_dist = NGram(2)
trigram_dist = NGram(3)

def features_linguistic(dataframe, agg, drop_features):
    df = dataframe.copy()
    df['length (lin)'] = df.target.apply(lambda target : agg(target, len))
    df['phrase_length (lin)'] = df.target.apply(lambda target : len(target))
    df['target_num_words (lin)'] = df.target.apply(lambda target : len(word_tokenize(target)))
    # Relative positions of the target word based on character counting
    df['relative_position_left (lin)'] = df[['sentence', 'start']].apply(lambda vals : vals[1] / len(vals[0]), axis = 1)
    df['relative_position_centered (lin)'] = df[['sentence', 'start', 'end']].apply(lambda vals : 
                ((vals[1] + vals[2]) / 2) / len(vals[0]), axis = 1)
    df['relative_position_right (lin)'] = df[['sentence', 'end']].apply(lambda vals : vals[1] / len(vals[0]), axis = 1)
    df['ratio_cap_letters (lin)'] = df.target.apply(lambda target : agg(target, ratio_cap_letters))
    df['all_caps (lin)'] = df[['ratio_cap_letters (lin)']] == 1
    df['ratio_num_letters (lin)'] = df.target.apply(lambda target : agg(target, ratio_num_letters))
    df['ratio_non_ascii_letters (lin)'] = df.target.apply(lambda target : agg(target, ratio_non_ascii_letters))
    df['ratio_non_alpha (lin)'] = df.target.apply(lambda target : agg(target, ratio_non_alpha))
    df['grapheme_to_phoneme_ratio (lin)'] = df.target.apply(lambda target : agg(target, grapheme_to_phoneme_ratio))
    df['num_pronounciations (lin)'] = df.target.apply(lambda target : agg(target, num_pronounciations))
    df['hyphenated (lin)'] = df.target.apply(lambda target : int('-' in target))
    df['is_title (lin)'] = df.target.apply(lambda target : target.istitle())
    df['mrc_nphon (lin)'] = df.p_target.apply(lambda target : agg(target, mrc_database, lambda word : word.nphon, 0))
    df['cal_ngram_2_sim_min (lin)'] = df.p_target.apply(lambda target : agg(target, cognate_across_languages_sim, \
                                lambda source, dest : 1 - bigram_dist.distance(source, dest), np.min, translations))
    df['cal_ngram_2_sim_max (lin)'] = df.p_target.apply(lambda target : agg(target, cognate_across_languages_sim, \
                                lambda source, dest : 1 - bigram_dist.distance(source, dest), np.max, translations))
    df['cal_ngram_2_sim_mean (lin)'] = df.p_target.apply(lambda target : agg(target, cognate_across_languages_sim, \
                                lambda source, dest : 1 - bigram_dist.distance(source, dest), np.mean, translations))
    df['cal_ngram_3_sim_min (lin)'] = df.p_target.apply(lambda target : agg(target, cognate_across_languages_sim, \
                                lambda source, dest : 1 - trigram_dist.distance(source, dest), np.min, translations))
    df['cal_ngram_3_sim_max (lin)'] = df.p_target.apply(lambda target : agg(target, cognate_across_languages_sim, \
                                lambda source, dest : 1 - trigram_dist.distance(source, dest), np.max, translations))
    df['cal_ngram_3_sim_mean (lin)'] = df.p_target.apply(lambda target : agg(target, cognate_across_languages_sim, \
                                lambda source, dest : 1 - trigram_dist.distance(source, dest), np.mean, translations))
    df['num_syllables (lin)'] = df.p_target.apply(lambda target : agg(target, num_syllables))
    df['num_vowels (lin)'] = df.p_target.apply(lambda target : agg(target, num_vowels))
    df['vowel_consonant_ratio (lin)'] = df.p_target.apply(lambda target : agg(target, \
                                            lambda target : num_vowels(target) / (len(target) - num_vowels(target))))
    # Porter stemmer stem length, number of applied steps,
    # difference of stem length to target and reduction ratio
    df['porter_stem_len (lin)'] = df.p_target.apply(lambda target : agg(target, porter_stem_len))
    df['porter_stemmer_num_steps (lin)'] = df.p_target.apply(lambda target : agg(target, porter_stemmer_num_steps))
    df['diff_len_stem_len (lin)'] = df['length (lin)'] - df['porter_stem_len (lin)']
    df['reduction_stem_len (lin)'] = 1 - df['porter_stem_len (lin)'] / df['length (lin)']
    df['norm_num_syllables (lin)'] = df['num_syllables (lin)'] / df['length (lin)']
    df['norm_num_vowels (lin)'] = df['num_vowels (lin)'] / df['length (lin)']
    df['lemma_len (lin)'] = df.lemma.apply(lambda lemma : agg(lemma, len))
    df['reduction_lemma_len (lin)'] = 1 - df['lemma_len (lin)'] / df['length (lin)']
    df['diff_len_lemma_len (lin)'] = df['length (lin)'] - df['lemma_len (lin)']
    df['dep_dist_to_head (lin)'] = df[['target', 'start', 'end', 'sentence']].apply(lambda vals : 
                                               dep_dist_to_head(*vals), axis=1)
    df['dep_dist_to_root (lin)'] = df[['target', 'start', 'end', 'sentence']].apply(lambda vals : 
                                                dep_dist_to_root(*vals), axis=1)
    df['dep_dist_to_root_norm (lin)'] = df[['dep_dist_to_root (lin)', 'sentence']].apply(lambda vals : \
                                                float(vals[0]) / (len(word_tokenize(vals[1]))-1), axis=1)
    df['dep_relation_to_head (lin)'] = df[['target', 'start', 'end', 'sentence']].apply(lambda vals : \
                                                dep_relation_to_head(*vals), axis = 1)
    df['dep_num_dependents (lin)'] = df[['target', 'start', 'end', 'sentence']].apply(lambda vals : \
                                                dep_num_dependents(*vals), axis = 1)
    df['dep_max_num_dependents (lin)'] = df.sentence.apply(lambda sentence : dep_max_num_dependents(sentence))
    df['dep_num_dependents_norm (lin)'] = df['dep_num_dependents (lin)'] / df['dep_max_num_dependents (lin)']
    df['dep_head_word_len (lin)'] = df[['target', 'start', 'end', 'sentence']].apply(lambda vals : \
                                                dep_head_word_len(*vals), axis = 1)
    df = df.drop(drop_features, axis = 1)
    return df
    
fc_linguistic = FeatureCategory('linguistic', features_linguistic)

In [25]:
def compute_features_linguistic(datasets, aggs = agg_default, drop_features = []):
    return [FeatureDataset(ds.name, fc_linguistic, agg,
                        fc_linguistic.func(ds.train, agg.agg, drop_features), 
                        fc_linguistic.func(ds.test, agg.agg, drop_features)) 
                        for ds in datasets
                        for agg in aggs]

# (3.3) Corpus-Based Features
Here we compute features which are based on larger corpora. In this category we distinguish e.g. between frequency counts and N-Gram Language Model probabilites.

### (1) Frequency

In [26]:
from nltk.stem.wordnet import *
from collections import defaultdict
from wordmodel import Word
import pandas as pd

words_mrc_database = {}
with open("resources/mrc-database/mrc2.dct", encoding="utf8") as file:
    for index, line in enumerate(file):
        line = line.strip()
        word, phon, dphon, stress = line[51:].split('|')
        w = Word(
                wid = index,
                nlet = int(line[0:2]),
                nphon = int(line[2:4]),
                nsyl = int(line[4]),
                kf_freq = int(line[5:10]),
                kf_ncats = int(line[10:12]),
                kf_nsamp = int(line[12:15]),
                tl_freq = int(line[15:21]),
                brown_freq = int(line[21:25]),
                fam = int(line[25:28]),
                conc = int(line[28:31]),
                imag = int(line[31:34]),
                meanc = int(line[34:37]),
                meanp = int(line[37:40]),
                aoa = int(line[40:43]),
                tq2 = line[43],
                wtype = line[44],
                pdwtype = line[45],
                alphasyl = line[46],
                status = line[47],
                var = line[48],
                cap = line[49],
                irreg = line[50],
                word=word,
                phon=phon,
                dphon=dphon,
                stress=stress)
        words_mrc_database[w.word.strip().lower()] = w

def mrc_database(target, func, missing_val):
    word = words_mrc_database.get(target.strip().lower())
    val = func(word) if word else missing_val
    return val if val != 0 else missing_val

word_freq_wiki = {}
freq_sum_wiki = 0
with open("resources/word-freq-dumps/enwiki-20150602-words-frequency.txt", encoding="utf8") as file:
    for line in file:
        word, freq = line.partition(" ")[::2]
        word_freq_wiki[word.strip()] = int(freq)
        freq_sum_wiki+=int(freq)
        
word_freq_simple_wiki = {}
freq_sum_simple_wiki = 0
with open("resources/word-freq-dumps/simple_wiki_word_freqs.txt", encoding="ISO-8859-1") as file:
    for line in file:
        word, freq = line.split()
        word_freq_simple_wiki[word.strip()] = int(freq)
        freq_sum_simple_wiki+=int(freq)
        
word_freq_lang8 = {}
freq_sum_lang8 = 0
with open("resources/word-freq-dumps/word_freqs_lang8.txt", encoding="ISO-8859-1") as file:
    for line in file:
        word, freq = line.split()
        word_freq_lang8[word.strip()] = int(freq)
        freq_sum_lang8+=int(freq)

word_freq_bnc = {}
with open("resources/word-freq-dumps/bnc_freq_all.al", encoding="utf8") as file:
    for line in file:
        freq, word, pos, num_files = line.split()
        word_freq_bnc[word.strip()] = (int(freq), pos, int(num_files))

word_freq_bnc_lemma = {}
with open("resources/word-freq-dumps/bnc_lemma.al", encoding="utf8") as file:
    for line in file:
        sort_order, frequency, word, word_class = line.split()
        word_freq_bnc_lemma[word.strip()] = (int(sort_order), word_class, int(frequency))

        
word_pknown_nobs_prev_freqZipf = {}
with open("resources/word-freq-dumps/word_prevelance.csv", encoding="utf8") as file:
    for line in file:
        word, p_known, nobs, prevelance, freqZipf = line.split(";")
        word_pknown_nobs_prev_freqZipf[word.strip()] = (float(p_known.replace(',','.')), 
                                                        float(nobs.replace(',','.')), 
                                                        float(prevelance.replace(',','.')), 
                                                        float(freqZipf.replace(',','.')))
        
subtlex_us = {}
with open("resources/dictionaries/SUBTLEXus.txt", encoding="utf8") as file:
    for line in file:
        word, freq, cd_count, freq_low, cd_low, subtl_wf, lg10_wf, Subtlcd, lg10_cd = line.split('\t')
        subtlex_us[word.strip().lower()] = (int(freq), int(cd_count))
        
subtlex_uk = pd.read_csv("resources/dictionaries/SUBTLEXuk.txt", sep = "\t")
subtlex_uk_dict = dict(zip(subtlex_uk['Spelling'], subtlex_uk['CD_count']))

def get_dict_count(target, freqs):
    return freqs.get(target.strip().lower(), 0)

def freqZipf_func(target):
    stats = word_pknown_nobs_prev_freqZipf.get(target)
    return stats[3] if stats else 3.5


WEIGHT_WIKI_SIMPLE_WIKI = freq_sum_wiki / freq_sum_simple_wiki
WEIGHT_WIKI_LANG_8 = freq_sum_wiki / freq_sum_lang8

def weighted_freq_ratio(target, word_freq_n, word_freq_m, weight):
    freq_n = word_freq_n.get(target.strip().lower(), 1)
    freq_m = word_freq_m.get(target.strip().lower(), 1)
    return -1 + (2 * (freq_n / ((freq_m * weight) + freq_n)))

  interactivity=interactivity, compiler=compiler, result=result)


In [27]:
mwe_targets_train = list(set([mwe for ds in datasets for mwe in ds.train['target'].tolist()]))
mwe_targets_test = list(set([mwe for ds in datasets for mwe in ds.test['target'].tolist()]))
mwe_targets = mwe_targets_train.copy()
mwe_targets.extend(mwe_targets_test)
print('Len ta_train : {}'.format(len(mwe_targets_train)))
print('Len ta_test : {}'.format(len(mwe_targets_test)))
print('Len targets : {}'.format(len(mwe_targets)))

Len ta_train : 11555
Len ta_test : 2546
Len targets : 14101


In [28]:
import phrasefinder as pf

google_books_n_grams = {}
options = pf.SearchOptions()
options.topk = 10
n_grams = mwe_targets

with open('resources/word-freq-dumps/ngram_google.json', 'rb') as fp:
    google_books_n_grams = pickle.load(fp)

if not google_books_n_grams:
    for index, n_gram in enumerate(n_grams):
        try:
            print(index, n_gram)
            result = pf.search(pf.Corpus.AMERICAN_ENGLISH, n_gram, options)
            vals = [(phrase.match_count, phrase.volume_count, phrase.first_year, phrase.last_year)
                        for phrase in result.phrases]
            mean_vals = [np.sum(elem) / len(elem) for elem in zip(*vals)]
            google_books_n_grams[n_gram] = mean_vals
            if result.status != pf.Status.Ok:
                print('Request was not successful: {}'.format(result.status))
        except Exception as error:
            pass
    with open('resources/word-freq-dumps/ngram_google.json', 'wb') as fp:
        pickle.dump(google_books_n_grams, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
def features_frequency(dataframe, agg, drop_features):   
    df = dataframe.copy()
    df['mrc_kf_freq (cor)'] = df.p_target.apply(lambda target : agg(target, mrc_database, \
                                                                    lambda word : word.kf_freq, 0))
    df['mrc_kf_ncats (cor)'] = df.p_target.apply(lambda target : agg(target, mrc_database, \
                                                                    lambda word : word.kf_ncats, 0))
    df['mrc_tl_freq (cor)'] = df.p_target.apply(lambda target : agg(target, mrc_database, \
                                                                    lambda word : word.tl_freq, 0))
    df['mrc_brown_freq (cor)'] = df.p_target.apply(lambda target : agg(target, mrc_database, \
                                                                    lambda word : word.brown_freq, 0))
    df['freq_wiki (cor)'] = df.p_target.apply(lambda target : agg(target, get_dict_count, word_freq_wiki))
    df['log_freq_wiki (cor)'] = df['freq_wiki (cor)'].apply(lambda freq : np.log(freq))
    df['freq_simple_wiki (cor)'] = df.p_target.apply(lambda target : agg(target, get_dict_count, word_freq_simple_wiki))
    df['log_freq_simple_wiki (cor)'] = df['freq_simple_wiki (cor)'].apply(lambda freq : np.log(freq))
    df['freq_bnc (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                    lambda target : word_freq_bnc.get(target)[0] if word_freq_bnc.get(target) else 0))
    df['log_freq_bnc (cor)'] = df['freq_bnc (cor)'].apply(lambda freq : np.log(freq))
    df['freq_bnc_lemma (cor)'] = df.p_lemma.apply(lambda target : agg(target, \
                    lambda target : word_freq_bnc_lemma.get(target)[2] \
                                         if word_freq_bnc_lemma.get(target) else 0))
    df['log_freq_bnc_lemma (cor)'] = df['freq_bnc_lemma (cor)'].apply(lambda freq : np.log(freq))
    df['freqZipf (cor)'] = df.p_target.apply(lambda target : agg(target, freqZipf_func))
    df['google_books_n_gram_freq (cor)'] = df.p_target.apply(lambda target : google_books_n_grams.get(target)[0] \
                                                     if google_books_n_grams.get(target) else 0)
    df['log_google_books_n_gram_freq (cor)'] = df['google_books_n_gram_freq (cor)'].apply(lambda freq : np.log(freq))
    df['google_books_n_gram_doc_freq (cor)'] = df.p_target.apply(lambda target : google_books_n_grams.get(target)[1] \
                                                        if google_books_n_grams.get(target)  else 0)
    df['log_google_books_n_gram_doc_freq (cor)'] = df['google_books_n_gram_doc_freq (cor)'].apply(lambda freq : np.log(freq))
    df['google_books_n_gram_first_year (cor)'] = df.p_target.apply(lambda target : google_books_n_grams.get(target)[2] \
                                                          if google_books_n_grams.get(target) else 1900)
    df['google_books_n_gram_last_year (cor)'] = df.p_target.apply(lambda target : google_books_n_grams.get(target)[3] \
                                                         if google_books_n_grams.get(target)  else 1900)
    df['subtlex_cd_us (cor)'] = df.p_target.apply(lambda target : agg(target, \
                    lambda target : subtlex_us[target.strip().lower()][1] if subtlex_us.get(target.strip().lower()) else 0))
    df['subtlex_cd_uk (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                    lambda target : subtlex_uk_dict.get(target, 0)))
    df['weighted_wiki_simple_wiki_ratio (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                    weighted_freq_ratio, word_freq_wiki, word_freq_simple_wiki, WEIGHT_WIKI_SIMPLE_WIKI))
    df['weighted_wiki_lang8_ratio (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                    weighted_freq_ratio, word_freq_wiki, word_freq_lang8, WEIGHT_WIKI_SIMPLE_WIKI))
    df = df.drop(drop_features, axis = 1)
    df = df.fillna(0)
    return df

fc_frequency = FeatureCategory('frequency', features_frequency)

In [30]:
def compute_features_frequency(datasets, aggs = agg_default, drop_features = []):
    return [FeatureDataset(ds.name, fc_frequency, agg,
                        fc_frequency.func(ds.train, agg.agg, drop_features), 
                        fc_frequency.func(ds.test, agg.agg, drop_features)) 
                        for ds in datasets
                        for agg in aggs]

### (2) Language Model
Here we load the different Kneser-Ney n-gram models we trained previously.

In [31]:
import pickle

with open('resources/language-models/ngram_char_1.json', 'rb') as fp:
    ngram_char_1 = pickle.load(fp)
    
with open('resources/language-models/ngram_word_1.json', 'rb') as fp:
    ngram_word_1 = pickle.load(fp)

with open('resources/language-models/ngram_char_2.json', 'rb') as fp:
    ngram_char_2 = pickle.load(fp)
    
with open('resources/language-models/ngram_word_2.json', 'rb') as fp:
    ngram_word_2 = pickle.load(fp)
    
with open('resources/language-models/ngram_char_3.json', 'rb') as fp:
    ngram_char_3 = pickle.load(fp)
    
with open('resources/language-models/ngram_char_2_complex.json', 'rb') as fp:
    ngram_char_2_complex = pickle.load(fp)
    
with open('resources/language-models/ngram_char_2_non_complex.json', 'rb') as fp:
    ngram_char_2_non_complex = pickle.load(fp)
    
with open('resources/language-models/ngram_char_2_complex_cleaned.json', 'rb') as fp:
    ngram_char_2_complex_cleaned = pickle.load(fp)
    
with open('resources/language-models/ngram_char_2_non_complex_cleaned.json', 'rb') as fp:
    ngram_char_2_non_complex_cleaned = pickle.load(fp)
    
def kneser_ney_word_uni_gram(target):
    return ngram_word_1.cond_prob(target)

def kneser_ney_word_bi_gram(target):
    words = target.split()
    if len(words) <= 1:
        return ngram_word_2.cond_prob(target)
    return np.mean([ngram_word_2.cond_prob(words[index+1], (word,)) 
                for index, word in enumerate(words) 
                if index <= len(words)-2])
    
def kneser_ney_char_uni_gram_avg(target):
    return np.mean([ngram_char_1.cond_prob(character) 
            for character in target])

def kneser_ney_char_bi_gram_avg(target):
    return np.mean([ngram_char_2.cond_prob(target[index+1], (character,)) 
            for index, character in enumerate(target) if index <= len(target)-2])

def kneser_ney_char_bi_gram_avg_model(target, kn_model):
    return np.mean([kn_model.cond_prob(target[index+1], (character,)) 
            for index, character in enumerate(target) if index <= len(target)-2])

def kneser_ney_char_tri_gram_avg(target):
    return np.mean([ngram_char_3.cond_prob(target[index+2], (character, target[index+1])) 
            for index, character in enumerate(target) if index <= len(target)-3])

In [32]:
from nltk.stem.wordnet import *
wordNetLemmatizer = WordNetLemmatizer()

def features_language_model(dataframe, agg, drop_features):
    df = dataframe.copy()
    df['kneser_ney_word_uni_gram (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                                    kneser_ney_word_uni_gram))
    df['kneser_ney_word_bi_gram (cor)'] = df.p_target.apply(lambda target :  kneser_ney_word_bi_gram(target))
    df['kneser_ney_char_uni_gram_avg (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                                    kneser_ney_char_uni_gram_avg))
    df['kneser_ney_char_bi_gram_avg (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                                    kneser_ney_char_bi_gram_avg))
    df['kneser_ney_char_tri_gram_avg (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                                    kneser_ney_char_tri_gram_avg))
    df['kneser_ney_char_bi_complex (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                                    kneser_ney_char_bi_gram_avg_model, ngram_char_2_complex))
    df['kneser_ney_char_bi_non_complex (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                                    kneser_ney_char_bi_gram_avg_model, ngram_char_2_non_complex))
    df['kneser_ney_char_bi_c_nc_ratio (cor)'] = df['kneser_ney_char_bi_complex (cor)'] / df['kneser_ney_char_bi_non_complex (cor)']
    df['kneser_ney_char_bi_complex_cl (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                                    kneser_ney_char_bi_gram_avg_model, ngram_char_2_complex_cleaned))
    df['kneser_ney_char_bi_non_complex_cl (cor)'] = df.p_target.apply(lambda target : agg(target, \
                                                    kneser_ney_char_bi_gram_avg_model, ngram_char_2_non_complex_cleaned))
    df['kneser_ney_char_bi_c_nc_ratio_cl (cor)'] = df['kneser_ney_char_bi_complex_cl (cor)'] / df['kneser_ney_char_bi_non_complex_cl (cor)']
    df = df.fillna(0)
    df = df.drop(drop_features, axis = 1)
    return df

fc_language_model = FeatureCategory('language_model', features_language_model)

In [33]:
def compute_features_language_model(datasets, aggs = agg_default, drop_features = []):
    return [FeatureDataset(ds.name, fc_language_model, agg,
                        fc_language_model.func(ds.train, agg.agg, drop_features), 
                        fc_language_model.func(ds.test, agg.agg, drop_features)) 
                        for ds in datasets
                        for agg in aggs]

fc_corpus = FeatureCategory('corpus', [fc_frequency, fc_language_model])

def compute_features_corpus(datasets):
    return [FeatureDataset(ds.name, fc_corpus,  ds.agg,
            ds.train, ds.test) for ds in concat_feature_datasets(*datasets)]

# (3.4) Psycholinguistic Features based on MRC Database


In [34]:
from wordmodel import Word

words_mrc_database = {}
with open("resources/mrc-database/mrc2.dct", encoding="utf8") as file:
    for index, line in enumerate(file):
        line = line.strip()
        word, phon, dphon, stress = line[51:].split('|')
        w = Word(
                wid = index,
                nlet = int(line[0:2]),
                nphon = int(line[2:4]),
                nsyl = int(line[4]),
                kf_freq = int(line[5:10]),
                kf_ncats = int(line[10:12]),
                kf_nsamp = int(line[12:15]),
                tl_freq = int(line[15:21]),
                brown_freq = int(line[21:25]),
                fam = int(line[25:28]),
                conc = int(line[28:31]),
                imag = int(line[31:34]),
                meanc = int(line[34:37]),
                meanp = int(line[37:40]),
                aoa = int(line[40:43]),
                tq2 = line[43],
                wtype = line[44],
                pdwtype = line[45],
                alphasyl = line[46],
                status = line[47],
                var = line[48],
                cap = line[49],
                irreg = line[50],
                word=word,
                phon=phon,
                dphon=dphon,
                stress=stress)
        words_mrc_database[w.word.strip().lower()] = w

def mrc_database(target, func, missing_val):
    word = words_mrc_database.get(target.strip().lower())
    val = func(word) if word else missing_val
    return val if val != 0 else missing_val

word_concreteness = {}
with open("resources/word-freq-dumps/concreteness_brysbaert_et_al.txt", encoding="utf8") as file:
    for line in file:
        word, bigram, conc_m, conc_sd, \
        unknown, total, percent_known, \
        subtlex, dom_pos = line.split('\t')
        word_concreteness[word.strip()] = float(conc_m)
        
word_age_of_aquisition = {}
with open("resources/word-freq-dumps/AoA_ratings_Kuperman_et_al_BRM.csv", encoding="utf8") as file:
    for line in file:
        word, occur_total, occur_num, freq_pm, rating_Mean, rating_SD, dunno = line.split()
        word_age_of_aquisition[word.strip()] = float(rating_Mean.replace(',', '.')) if rating_Mean != 'NA' else 0

word_pknown_nobs_prev_freqZipf = {}
with open("resources/word-freq-dumps/word_prevelance.csv", encoding="utf8") as file:
    for line in file:
        word, p_known, nobs, prevelance, freqZipf = line.split(";")
        word_pknown_nobs_prev_freqZipf[word.strip()] = (float(p_known.replace(',','.')), 
                                                        float(nobs.replace(',','.')), 
                                                        float(prevelance.replace(',','.')), 
                                                        float(freqZipf.replace(',','.')))

def perc_known_func(target, missing_value):
    stats = word_pknown_nobs_prev_freqZipf.get(target)
    return stats[0] if stats else missing_value

def nobs_func(target):
    stats = word_pknown_nobs_prev_freqZipf.get(target)
    return stats[1] if stats else 0

def prevelance_func(target):
    stats = word_pknown_nobs_prev_freqZipf.get(target)
    return stats[2] if stats else 0

In [35]:
def features_psycholingusitic(dataframe, agg, drop_features):
    df = dataframe.copy()
    df['mrc_fam (psy)'] = df.p_target.apply(lambda target : agg(target, mrc_database, lambda word : word.fam, 400))
    df['mrc_conc (psy)'] = df.p_target.apply(lambda target : agg(target, mrc_database, lambda word : word.conc, 400))
    df['mrc_imag (psy)'] = df.p_target.apply(lambda target : agg(target, mrc_database, lambda word : word.imag, 400))
    df['mrc_meanc (psy)'] = df.p_target.apply(lambda target : agg(target, mrc_database, lambda word : word.meanc, 400))
    df['mrc_meanp (psy)'] = df.p_target.apply(lambda target : agg(target, mrc_database, lambda word : word.meanp, 400))
    df['mrc_aoa (psy)'] = df.p_target.apply(lambda target : agg(target, mrc_database, lambda word : word.aoa, 3.5))
    df['perc_known (psy)'] = df.p_target.apply(lambda target : agg(target, perc_known_func, 0.5))
    df['nobs (psy)'] = df.p_target.apply(lambda target : agg(target, nobs_func))
    df['prevelance (psy)'] = df.p_target.apply(lambda target : agg(target, prevelance_func))
    df['concreteness (psy)'] = df.p_target.apply(lambda target : agg(target, \
                                                lambda target : word_concreteness.get(target, 2.5)))
    df['age_of_aquisition (psy)'] = df.p_target.apply(lambda target : agg(target, \
                                                    lambda target : word_age_of_aquisition.get(target, 8.5)))
    df = df.drop(drop_features, axis = 1)
    df = df.fillna(0)
    return df

fc_psycholinguistic = FeatureCategory('psycholinguistic', features_psycholingusitic)

In [36]:
def compute_features_psycholinguistic(datasets, aggs = agg_default, drop_features = []):
    return [FeatureDataset(ds.name, fc_psycholinguistic, agg,
                        fc_psycholinguistic.func(ds.train, agg.agg, drop_features), 
                        fc_psycholinguistic.func(ds.test, agg.agg, drop_features)) 
                        for ds in datasets
                        for agg in aggs]

# (3.5) Semantic Features
Here we implement all the relevant features based on WordNet and SentiWordNet. For example, the number of synsets the target word is contained in or the average length of the lemmas of all the synsets the target word is contained in.

### (3.5.1) WordNet

In [37]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
from nltk.stem.wordnet import *
from utils import penn_to_wn

wordNetLemmatizer = WordNetLemmatizer()

def wn_synset_freq(target):
    return len(wn.synsets(target))

def wn_synset_avg_lemma_freq(target):
    return np.nan_to_num(np.mean([len(synset.lemmas()) 
            for synset in wn.synsets(target)]))

def wn_synset_avg_lemma_len(target):
    return np.nan_to_num(np.nanmean([len(lemma.name()) 
            for synset in wn.synsets(target) 
            for lemma in synset.lemmas()]))

def wn_synset_avg_hypernyms(target):
    return np.nan_to_num(np.nanmean([len(synset.hypernyms()) 
            for synset in wn.synsets(target)]))

def wn_synset_avg_hyponyms(target):
    return np.nan_to_num(np.mean([len(synset.hyponyms()) 
            for synset in wn.synsets(target)]))

def wn_synset_sum_hypernyms(target):
    return np.sum(([len(synset.hypernyms()) 
            for synset in wn.synsets(target)]))

def wn_synset_avg_definition_len(target):
    return np.nan_to_num(np.mean([len(str(synset.definition())) 
            for synset in wn.synsets(target)]))

def wn_synset_avg_hyptree_depth(target):
    return np.nan_to_num(np.mean([synset.max_depth() 
            for synset in wn.synsets(target)]))

def wn_synset_num_distinct_pos(target):
    return len(set([synset.pos() for synset in wn.synsets(target)]))

def wn_synset_avg_num_relations(target):
    return np.nan_to_num(np.mean([np.sum([len(synset.hypernyms()), len(synset.hyponyms()), 
             len(synset.instance_hypernyms()), len(synset.instance_hyponyms()),
             len(synset.member_holonyms()), len(synset.substance_holonyms()),
             len(synset.part_holonyms()), len(synset.member_meronyms()),
             len(synset.substance_meronyms()), len(synset.part_meronyms())]) 
             for synset in wn.synsets(target)]))

def wn_synset_avg_freq_pos(target, pos):
    return len(wn.synsets(target, pos = pos))

def wn_synset_pos_ratio_1(target, pos):
    tokens = word_tokenize(target)
    ratios = []
    for token, poss in zip(tokens, pos):
        synsets_freqs = len(wn.synsets(token))
        ratios.append(len(wn.synsets(token, pos = poss)) / synsets_freqs \
                if synsets_freqs != 0 else 0.25)
    return np.mean(ratios)

def wn_synset_pos_ratio_2(target, pos):
    tokens = word_tokenize(target)
    ratios = []
    for token, poss in zip(tokens, pos):
        synsets_counts = np.sum([lemma.count() 
                for sn in wn.synsets(token) for lemma in sn.lemmas()])
        ratios.append(np.sum([lemma.count() for sn in wn.synsets(token, pos = poss) 
                    for lemma in sn.lemmas()]) / synsets_counts if synsets_counts != 0 else 0.25)
    return np.mean(ratios)

def wn_synset_sense_entropy_uniform(target):
    num_senses = len(wn.synsets(target))
    return -np.sum([((1 / num_senses) * np.log2(1 / num_senses)) 
                     for index in range(0, num_senses)])

def wn_synset_sense_entropy_pos_uniform(target):
    num_senses = len(wn.synsets(target))
    pos_distribution = [len(wn.synsets(target, pos = wn.NOUN)),
                        len(wn.synsets(target, pos = wn.VERB)),
                        len(wn.synsets(target, pos = wn.ADJ)),
                        len(wn.synsets(target, pos = wn.ADV))]
    return -np.sum([(np.nan_to_num((count / num_senses) * np.log2(count / num_senses))) 
            for count in pos_distribution]) if num_senses != 0 else 0

def wn_synsets_sense_entropy_pos_central(target, pos):
    num_senses_pos = len(wn.synsets(target, pos = pos))
    return -np.sum([((1 / num_senses_pos) * np.log2(1 / num_senses_pos))
                     for index in range(0, num_senses_pos)])

def wn_synset_pos_probability_1(target, pos):
    synsets = wn.synsets(target)
    syn_freq_other_pos = np.sum([1 for synset in synsets if synset.pos() != pos])
    return len(wn.synsets(target, pos = pos)) / syn_freq_other_pos

def wn_synsets_avg_lemma_freq(target, freqs_func, freqs):
    synsets = wn.synsets(target)
    if not synsets:
        return 0
    return np.mean([np.nan_to_num(freqs_func(lemma.name(), freqs)) for synset in synsets
                    for lemma in synset.lemmas()])

def wn_synsets_freq_ratio_to_max_agg_min(target, freqs_func, freqs):
    lemmas = [lemma.name().split('_') for synset in wn.synsets(target) 
                  for lemma in synset.lemmas()]
    if not lemmas:
        return 1
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata]) 
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    if target_freq not in freqis: freqis.append(target_freq)
    max_freq = np.max(freqis)
    return target_freq / max_freq

def wn_synsets_freq_ratio_to_max_agg_mean(target, freqs_func, freqs):
    lemmas = [lemma.name().split('_') for synset in wn.synsets(target) 
                  for lemma in synset.lemmas()]
    if not lemmas:
        return 1
    freqis = [np.mean([freqs_func(lemma, freqs) for lemma in lemmata]) 
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    if target_freq not in freqis: freqis.append(target_freq)
    max_freq = np.max(freqis)
    return target_freq / max_freq

def wn_synsets_freq_ratio_to_max_agg_median(target, freqs_func, freqs):
    lemmas = [lemma.name().split('_') for synset in wn.synsets(target) 
                  for lemma in synset.lemmas()]
    if not lemmas:
        return 1
    freqis = [np.median([freqs_func(lemma, freqs) for lemma in lemmata]) 
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    if target_freq not in freqis: freqis.append(target_freq)
    max_freq = np.max(freqis)
    return target_freq / max_freq
    
def swn_avg_objective_score(target):
    return np.nan_to_num(np.mean([senti_synset.obj_score() 
                for senti_synset in swn.senti_synsets(target)]))

def wn_synset_lesk_wsd_ratio_hi_freq(target, sentence, freqs_func, freqs, pos):
    wsd_synset = lesk(sentence.split(), target, pos)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([1 for freq in freqis if freq > target_freq]) / len(freqis)

def wn_synset_lesk_wsd_ratio_hi_freq_sum(target, sentence, freqs_func, freqs, pos):
    wsd_synset = lesk(sentence.split(), target, pos)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([freq for freq in freqis if freq > target_freq]) / np.sum(freqis)

def wn_synset_lesk_wsd_ratio_hi_nopos_freq(target, sentence, freqs_func, freqs):
    wsd_synset = lesk(sentence.split(), target)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([1 for freq in freqis if freq > target_freq]) / len(freqis)

def wn_synset_lesk_wsd_ratio_low_freq(target, sentence, freqs_func, freqs, pos):
    wsd_synset = lesk(sentence.split(), target, pos)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([1 for freq in freqis if freq < target_freq]) / len(freqis)

def wn_synset_lesk_wsd_ratio_low_freq_sum(target, sentence, freqs_func, freqs, pos):
    wsd_synset = lesk(sentence.split(), target, pos)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([freq for freq in freqis if freq < target_freq]) / np.sum(freqis)

def wn_synset_lesk_wsd_ratio_low_nopos_freq(target, sentence, freqs_func, freqs):
    wsd_synset = lesk(sentence.split(), target)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return np.sum([1 for freq in freqis if freq < target_freq]) / len(freqis)

def wn_synset_lesk_wsd_ratio_to_freq_sum(target, sentence, freqs_func, freqs, pos):
    wsd_synset = lesk(sentence.split(), target, pos)
    if not wsd_synset:
        return 0
    lemmas = [lemma.name().split('_') for lemma in wsd_synset.lemmas()]
    if not lemmas:
        return 0
    freqis = [np.min([freqs_func(lemma, freqs) for lemma in lemmata])
              for lemmata in lemmas]
    target_freq = freqs_func(target, freqs)
    return target_freq / np.sum(freqis)

def wn_synset_lesk_wsd__norm_sense_rank(target, sentence, freqs_func, freqs, wsd_func, pos):
    wsd_synset = wsd_func(sentence.split(), target, pos)
    senses = wn.synsets(target)
    if not wsd_synset:
        return 0
    wsd_synset = lesk(sentence.split(), target, pos)
    sense_freqs = sorted([(sense, np.sum([lemma.count() for lemma in sense.lemmas()])) 
                   for sense in senses], key = lambda tpl : tpl[1], reverse=True)
    sense_index = [sense for sense, cnt in sense_freqs].index(wsd_synset)
    return sense_index / len(senses)

In [38]:
from nltk.wsd import lesk
from pywsd.lesk import adapted_lesk

def features_wordnet(dataframe, agg, drop_features):
    df = dataframe.copy()
    df['wn_synset_freq (sem)'] = df.p_target.apply(lambda target : agg(target, wn_synset_freq))
    df['wn_synset_avg_lemma_freq (sem)'] = df.p_target.apply(lambda target : agg(target, wn_synset_avg_lemma_freq))
    df['wn_synset_avg_lemma_len (sem)'] = df.p_target.apply(lambda target : agg(target, wn_synset_avg_lemma_len))
    
    df['length'] = df.target.apply(lambda target : agg(target, len))
    df['wn_synset_diff_len_avg_lemma_len (sem)'] = df['wn_synset_avg_lemma_len (sem)'] - df.length
    df['wn_synset_avg_hypernyms (sem)'] = df.p_target.apply(lambda target : agg(target, wn_synset_avg_hypernyms))
    df['wn_synset_sum_hypernyms (sem)'] = df.p_target.apply(lambda target : agg(target, wn_synset_sum_hypernyms))
    df['wn_synset_avg_hyponyms (sem)'] = df.p_target.apply(lambda target : agg(target, wn_synset_avg_hyponyms))

    df['wn_synset_avg_definition_len (sem)'] = df.p_target.apply(lambda target : 
                                                         agg(target, wn_synset_avg_definition_len))
    df['wn_synset_avg_hyptree_depth (sem)'] = df.p_target.apply(lambda target :
                                                         agg(target, wn_synset_avg_hyptree_depth))
    df['wn_synset_num_distinct_pos (sem)'] = df.p_target.apply(lambda target : 
                                                         agg(target, wn_synset_num_distinct_pos))
    df['wn_synset_avg_num_relations (sem)'] = df.p_target.apply(lambda target : 
                                                         agg(target, wn_synset_avg_num_relations))

    df['wn_synset_avg_freq_pos_noun (sem)'] = df.p_target.apply(lambda target : 
                                                        agg(target, wn_synset_avg_freq_pos, wn.NOUN))
    df['wn_synset_avg_freq_pos_verb (sem)'] = df.p_target.apply(lambda target : 
                                                        agg(target, wn_synset_avg_freq_pos, wn.VERB))
    df['wn_synset_avg_freq_pos_adj (sem)'] = df.p_target.apply(lambda target : 
                                                       agg(target, wn_synset_avg_freq_pos, wn.ADJ))
    df['wn_synset_avg_freq_pos_adv (sem)'] = df.p_target.apply(lambda target : 
                                                       agg(target, wn_synset_avg_freq_pos, wn.ADV))

    df['wn_synset_avg_freq_pos_noun_norm (sem)'] = np.nan_to_num(df['wn_synset_avg_freq_pos_noun (sem)'] / \
                                                                 df['wn_synset_freq (sem)'])
    df['wn_synset_avg_freq_pos_verb_norm (sem)'] = np.nan_to_num(df['wn_synset_avg_freq_pos_verb (sem)'] / \
                                                                 df['wn_synset_freq (sem)'])
    df['wn_synset_avg_freq_pos_adj_norm (sem)'] = np.nan_to_num(df['wn_synset_avg_freq_pos_adj (sem)'] / \
                                                                df['wn_synset_freq (sem)'])
    df['wn_synset_avg_freq_pos_adv_norm (sem)'] = np.nan_to_num(df['wn_synset_avg_freq_pos_adv (sem)'] / \
                                                                df['wn_synset_freq (sem)'])

    df['wn_synset_sense_entropy_uniform (sem)'] = df.p_target.apply(lambda target : 
                                            agg(target, wn_synset_sense_entropy_uniform))
    df['wn_synset_sense_entropy_pos_uniform (sem)'] = df.p_target.apply(lambda target :
                                            agg(target, wn_synset_sense_entropy_pos_uniform))
    df['wn_synsets_sense_entropy_pos_central (sem)'] = df[['p_target', 'pos_tags_pt']].apply(
        lambda vals : wn_synsets_sense_entropy_pos_central(vals[0], vals[1]), axis = 1)
    
    df['wn_synset_pos_ratio_1 (sem)'] = df[['p_target', 'pos_tags_pt']].apply(
                    lambda vals : wn_synset_pos_ratio_1(vals[0], vals[1]), axis = 1)
    
    df['wn_synset_pos_ratio_2 (sem)'] = df[['p_target', 'pos_tags_pt']].apply(
                    lambda vals : wn_synset_pos_ratio_2(vals[0], vals[1]), axis = 1)

    df['swn_avg_objective_score (sem)'] = df.p_target.apply(lambda target : agg(target, swn_avg_objective_score))

    df['wn_synsets_freq_ratio_to_max_agg_min (sem)'] = df.p_target.apply(lambda target : \
                                                    agg(target, wn_synsets_freq_ratio_to_max_agg_min, \
                                                                         get_dict_count, word_freq_wiki))
    df['wn_synsets_freq_ratio_to_max_agg_mean (sem)'] = df.p_target.apply(lambda target : \
                                                    agg(target, wn_synsets_freq_ratio_to_max_agg_mean, \
                                                                         get_dict_count, word_freq_wiki))
    df['wn_synsets_freq_ratio_to_max_agg_median (sem)'] = df.p_target.apply(lambda target : \
                                                    agg(target, wn_synsets_freq_ratio_to_max_agg_median, \
                                                                         get_dict_count, word_freq_wiki))
    df['wn_synsets_avg_lemma_freq (sem)'] = df.p_target.apply(lambda target : \
                                                    agg(target, wn_synsets_avg_lemma_freq, \
                                                                         get_dict_count, word_freq_wiki))
    df['freq_wiki'] = df.p_target.apply(lambda target : agg(target, get_dict_count, word_freq_wiki))
    df['wn_synsets_freq_ratio_to_avg (sem)'] = df['wn_synsets_avg_lemma_freq (sem)'] / df.freq_wiki
    df['wn_synset_lesk_wsd_ratio_hi_freq (sem)'] = df[['p_target','sentence', 'pos_tags_pt']].apply(lambda vals : \
                agg(vals[0], wn_synset_lesk_wsd_ratio_hi_freq, vals[1], \
                                     get_dict_count, word_freq_wiki, pos=vals[2]), axis = 1)
    df['wn_synset_lesk_wsd_ratio_low_freq (sem)'] = df[['p_target','sentence', 'pos_tags_pt']].apply(lambda vals : \
                agg(vals[0], wn_synset_lesk_wsd_ratio_low_freq, vals[1], \
                                     get_dict_count, word_freq_wiki, pos=vals[2]), axis = 1)
    df['wn_synset_lesk_wsd_ratio_hi_nopos_freq (sem)'] = df[['p_target','sentence']].apply(lambda vals : \
                agg(vals[0], wn_synset_lesk_wsd_ratio_hi_nopos_freq, vals[1], \
                                     get_dict_count, word_freq_wiki), axis = 1)
    df['wn_synset_lesk_wsd_ratio_low_nopos_freq (sem)'] = df[['p_target','sentence']].apply(lambda vals : \
                agg(vals[0], wn_synset_lesk_wsd_ratio_low_nopos_freq, vals[1], \
                                     get_dict_count, word_freq_wiki), axis = 1)
    df['wn_synset_lesk_wsd_ratio_hi_freq_sum (sem)'] = df[['p_target','sentence', 'pos_tags_pt']].apply(lambda vals : \
                agg(vals[0], wn_synset_lesk_wsd_ratio_hi_freq_sum, vals[1], \
                                     get_dict_count, word_freq_wiki, pos=vals[2]), axis = 1)
    df['wn_synset_lesk_wsd_ratio_low_freq_sum (sem)'] = df[['p_target','sentence', 'pos_tags_pt']].apply(lambda vals : \
                agg(vals[0], wn_synset_lesk_wsd_ratio_low_freq_sum, vals[1], \
                                     get_dict_count, word_freq_wiki, pos=vals[2]), axis = 1)
    df['wn_synset_lesk_wsd_ratio_to_freq_sum (sem)'] = df[['p_target','sentence', 'pos_tags_pt']].apply(lambda vals : \
                agg(vals[0], wn_synset_lesk_wsd_ratio_to_freq_sum, vals[1], \
                                     get_dict_count, word_freq_wiki, pos=vals[2]), axis = 1)
    df['wn_synset_lesk_wsd__norm_sense_rank (sem)'] = df[['p_target','sentence', 'pos_tags_pt']].apply(lambda vals : \
                agg(vals[0], wn_synset_lesk_wsd__norm_sense_rank, vals[1], \
                                     get_dict_count, word_freq_wiki, lesk, pos=vals[2]), axis = 1)
    df = df.drop(['length', 'freq_wiki'], axis = 1)
    df = df.drop(drop_features, axis = 1)
    df = df.fillna(0)
    return df

fc_wordnet = FeatureCategory('wordnet', features_wordnet)

Warming up PyWSD (takes ~10 secs)... took 8.758508920669556 secs.


In [39]:
def compute_features_wordnet(datasets, aggs = agg_default, drop_features = []):
     return [FeatureDataset(ds.name, fc_wordnet, agg,
                        fc_wordnet.func(ds.train, agg.agg, drop_features), 
                        fc_wordnet.func(ds.test, agg.agg, drop_features)) 
                        for ds in datasets
                        for agg in aggs]

### (3.5.2) DBpedia

In [40]:
import pickle

with open('resources/dbpedia-cache/dbpedia_annotations_00.json', 'rb') as fp:
    dbpedia_00 = pickle.load(fp)
    
with open('resources/dbpedia-cache/dbpedia_annotations_25.json', 'rb') as fp:
    dbpedia_25 = pickle.load(fp)
    
with open('resources/dbpedia-cache/dbpedia_annotations_50.json', 'rb') as fp:
    dbpedia_50 = pickle.load(fp)
    
with open('resources/dbpedia-cache/dbpedia_annotations_75.json', 'rb') as fp:
    dbpedia_75 = pickle.load(fp)
    
with open('resources/dbpedia-cache/pagerank.json', 'rb') as fp:
        page_rank = pickle.load(fp)

In [41]:
def overlaps(start1, end1, start2, end2):
    return bool(range(max(start1, start2), min(end1, end2)+1))

def dbp_match_entities(sentence, target, start, end, annotations):
    an_sents = annotations.get(sentence)
    if an_sents:
        ans = [(an['offset'], an['offset']+len(an['surfaceForm']), an) for an in an_sents]
        return [an for s, e, an in ans if overlaps(start, end, s, e)]
    return []

def dbp_entity_ratio(sentence, target, start, end, annotations):
    entities = dbp_match_entities(sentence, target, start, end, annotations)
    if entities:
        return np.min([np.sum([len(entity['surfaceForm']) 
                for entity in entities]) / len(target), 1])
    return 0

def dbp_support(sentence, target, start, end, annotations):
    entities = dbp_match_entities(sentence, target, start, end, annotations)
    if entities:
        return np.mean([entity['support'] for entity in entities])
    return 0

def dbp_type_hierachy_depth(sentence, target, start, end, annotations):
    entities = dbp_match_entities(sentence, target, start, end, annotations)
    if entities:
        return np.mean([np.sum([1 for cat in entity['types'].split(',') if 'DBpedia' in cat])
                 for entity in entities])
    return 0

def dbp_freq_types(sentence, target, start, end, annotations):
    entities = dbp_match_entities(sentence, target, start, end, annotations)
    if entities:
        return np.mean([len(entity['types'].split(',')) for entity in entities])
    return 0

def dbp_confidence(sentence, target, start, end):
    entities = dbp_match_entities(sentence, target, start, end, dbpedia_75)
    if entities:
        return 0.75
    entities = dbp_match_entities(sentence, target, start, end, dbpedia_50)
    if entities:
        return 0.5
    entities = dbp_match_entities(sentence, target, start, end, dbpedia_25)
    if entities:
        return 0.25
    return 0

def dbp_pagerank(sentence, target, start, end, annotations):
    entities = dbp_match_entities(sentence, target, start, end, annotations)
    return np.nan_to_num(np.mean([page_rank.get(entity['URI'], 0) for entity in entities]))

dbp_types = [('DBpedia:Place', 1, 'dbo:Place'), ('DBpedia:Person',2, 'dbo:Person'), 
             ('DBpedia:Organisation',3, 'dbo:Organisation'), ('DBpedia:Timeperiod', 4, 'dbo:Timeperiod')]

def dbp_extract_type(entity):
    types = [(cat, rank, name) for cat, rank, name in dbp_types if cat in entity['types']]
    if not types and not entity['types']:
        return ('dbo:notype', 0, 'dbo:notype')
    if not types and entity['types']:
        return ('dbo:misc', 5, 'dbo:misc')
    else:
        return types[0]

def dbp_type(sentence, target, start, end, annotations):
    entities = dbp_match_entities(sentence, target, start, end, annotations)
    if not entities:
        return 'dbo:missing'
    types = [dbp_extract_type(entity) for entity in entities]
    sorted(types, key=lambda tpl : tpl[1])
    return types[0][2]

In [42]:
def features_dbpedia(dataframe, agg, drop_features):
    df = dataframe.copy()
    df['dbp_confidence (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_confidence(*vals),axis = 1)
    
    df['dbp_entity_ratio_25 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_entity_ratio(*vals, dbpedia_25),axis = 1)
    df['dbp_entity_support_25 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_support(*vals, dbpedia_25),axis = 1)
    df['dbp_type_hierachy_depth_25 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_type_hierachy_depth(*vals, dbpedia_25),axis = 1)
    df['dbp_freq_types_25 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_freq_types(*vals, dbpedia_25),axis = 1)
    df['dbp_pagerank_25 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_pagerank(*vals, dbpedia_25),axis = 1)
    max_page_rank = np.max(df['dbp_pagerank_25 (sem)'])
    df['dbp_norm_pagerank_25 (sem)'] = df['dbp_pagerank_25 (sem)'] / max_page_rank
    df['dbp_type_25 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_type(*vals, dbpedia_25),axis = 1)
    
    df['dbp_entity_ratio_00 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_entity_ratio(*vals, dbpedia_00),axis = 1)
    df['dbp_entity_support_00 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_support(*vals, dbpedia_00),axis = 1)
    df['dbp_type_hierachy_depth_00 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_type_hierachy_depth(*vals, dbpedia_00),axis = 1)
    df['dbp_freq_types_00 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_freq_types(*vals, dbpedia_00),axis = 1)
    df['dbp_pagerank_00 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_pagerank(*vals, dbpedia_00),axis = 1)
    max_page_rank = np.max(df['dbp_pagerank_00 (sem)'])
    df['dbp_norm_pagerank_00 (sem)'] = df['dbp_pagerank_00 (sem)'] / max_page_rank
    df['dbp_type_00 (sem)'] = df[['sentence', 'target', 'start', 'end']].apply(lambda vals :
                                                          dbp_type(*vals, dbpedia_00),axis = 1)
    
    df = df.drop(drop_features, axis = 1)
    df = df.fillna(0)
    return df

fc_dbpedia = FeatureCategory('dbpedia', features_dbpedia)

In [43]:
def compute_features_dbpedia(datasets, aggs = agg_default, drop_features = []):
    return [FeatureDataset(ds.name, fc_dbpedia, agg,
                        fc_dbpedia.func(ds.train, agg.agg, drop_features), 
                        fc_dbpedia.func(ds.test, agg.agg, drop_features)) 
                        for ds in datasets
                        for agg in aggs]

## (3.5.3) Brown Clustering

In [44]:
brown_cluster_word2cluster = {}
brown_cluster_cluster2words = defaultdict(list)
with open("resources/brown-clustering/paths/rcv1.clean-c6000-p1.paths", encoding="utf8") as file:
    for line in file:
        binary_cluster, word, _ = line.split()
        brown_cluster_word2cluster[word] = binary_cluster
        brown_cluster_cluster2words[binary_cluster].append(word)

def brown_clustering_cluster_size(target):
    cluster = brown_cluster_word2cluster.get(target)
    return len(brown_cluster_cluster2words[cluster]) if cluster else 0

def brown_clustering_cluster_depth_simple(target):
    cluster = brown_cluster_word2cluster.get(target)
    return int(cluster, 2) if cluster else 0

def brown_clustering_cluster_depth_bit(target):
    cluster = brown_cluster_word2cluster.get(target)
    if not cluster:
        return 8.75
    return np.sum([1 for bit in cluster if bit == '1'])

def brown_clustering_cluster_size_all(target):
    cluster = brown_cluster_word2cluster.get(target)
    if not cluster:
        return 0
    upper_clusters = [cluster[0:(len(cluster) - index)] + '0' * index \
         for index, bit in enumerate(reversed(cluster)) if bit == '1']
    cluster_counts = [len(brown_cluster_cluster2words.get(clu, [])) \
                         for clu in upper_clusters]
    return np.sum(cluster_counts)

In [45]:
def features_brown_clustering(dataframe, agg, drop_features):
    df = dataframe.copy()
    df['brown_clustering_cluster_size (sem)'] = df.p_target.apply(lambda target : agg(target, \
                                                            brown_clustering_cluster_size))
    df['brown_clustering_cluster_size_all (sem)'] = df.p_target.apply(lambda target : agg(target, \
                                                            brown_clustering_cluster_size_all))
    df['brown_clustering_cluster_depth_simple (sem)'] = df.p_target.apply(lambda target : agg(target, \
                                                            brown_clustering_cluster_depth_simple))
    df['brown_clustering_cluster_depth_bit (sem)'] = df.p_target.apply(lambda target : agg(target, \
                                                            brown_clustering_cluster_depth_bit))
    df = df.drop(drop_features, axis = 1)
    df = df.fillna(0)
    return df

fc_brown_clustering = FeatureCategory('brown_clustering', features_brown_clustering)

In [46]:
def compute_features_brown_clustering(datasets, aggs = agg_default, drop_features = []):
    return [FeatureDataset(ds.name, fc_brown_clustering, agg,
                        fc_brown_clustering.func(ds.train, agg.agg, drop_features), 
                        fc_brown_clustering.func(ds.test, agg.agg, drop_features)) 
                        for ds in datasets
                        for agg in aggs]

In [47]:
fc_semantic = FeatureCategory('semantic', [fc_wordnet, fc_dbpedia, fc_brown_clustering])

def compute_features_semantic(datasets):
    return [FeatureDataset(ds.name, fc_semantic,  ds.agg,
            ds.train, ds.test) for ds in concat_feature_datasets(*datasets)]

# (3.6) Dictionary Features


In [48]:
import textatistic
from collections import Counter

academic_words = {}
with open("resources/dictionaries/academic_word_list.txt", encoding="utf8") as file:
    for line in file:
        word, rank = line.split()
        academic_words[word.strip()] = rank

prefixes = {}
with open("resources/dictionaries/prefixes.txt", encoding="utf8") as file:
    for line in file:
        prefix, definition, examples = line.split('\t')
        prefixes[prefix.replace('-', '').strip()] = definition

suffixes = {}
with open("resources/dictionaries/suffixes.txt", encoding="utf8") as file:
    for line in file:
        suffix, definition, examples = line.split('\t')
        suffixes[suffix.replace('-', '').strip()] = definition

with open("resources/dictionaries/biology_glossary.csv", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    gloss_biology = set(content)

with open("resources/dictionaries/geography_glossary.csv", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    gloss_geography = set(content)
    
with open("resources/dictionaries/physics_glossary.csv", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    gloss_physics = set(content)
    
with open("resources/dictionaries/stopwords_en.txt", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    stop_words = set(content)
    
with open("resources/dictionaries/most_freq_used_3000_words.txt", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    most_freq_used_3000_words = set(content)
    
with open("resources/dictionaries/most_freq_used_5000_words.txt", encoding="utf8") as file:
    content = [line.split()[1].strip().lower() for line in file.readlines()]
    most_freq_used_5000_words = set(content)
    

'''
Extract all words that are exactly identified as either complex
or non-complex and use this as the vocabulary. Words that occur
as both complex and non-complex are neglected for the vocabulary
'''
def build_clean_vocabulary(train):
    targets_complex = set([mwe.strip().lower() for mwe in 
                train.loc[train['binary'] == 1,]['target'].tolist()])
    targets_non_complex = set([mwe.strip().lower() for mwe in
                train.loc[train['binary'] == 0,]['target'].tolist()])
    targets_complex_cleaned = list(targets_complex.difference(targets_non_complex))
    targets_non_complex_cleaned = list(targets_non_complex.difference(targets_complex))
    vocabulary = {}
    for target in targets_complex_cleaned:
        vocabulary[target] = 1
    for target in targets_non_complex_cleaned:
        vocabulary[target] = 0
    return vocabulary

'''
Extract all words that are identified as either complex
or non-complex and use this as the vocabulary. Words that occur
as both complex and non-complex are weighted based on the number
of occurrences. If the word has been tagged more times as non-complex
we save it as non-complex otherwise it is complex
'''
def build_weighted_vocabulary(train):
    targets_complex = [mwe.strip().lower() for mwe in 
                train.loc[train['binary'] == 1,]['target'].tolist()]
    targets_non_complex = [mwe.strip().lower() for mwe in
                train.loc[train['binary'] == 0,]['target'].tolist()]
    counts_complex = Counter(targets_complex)
    counts_non_complex = Counter(targets_non_complex)
    vocabulary = {}
    for word, count in counts_complex.items():
        count_nc = counts_non_complex.get(word, None)
        if count_nc and count_nc > count:
            vocabulary[word] = 0
        else:
            vocabulary[word] = 1
    for word, count in counts_non_complex.items():
        if word not in vocabulary:
            vocabulary[word] = 0
    return vocabulary

def build_confidence_vocabulary_1(train, confidence):
    targets_complex = [mwe.strip().lower() for mwe in 
                train.loc[train['prob'] >= confidence,]['target'].tolist()]
    targets_non_complex = [mwe.strip().lower() for mwe in 
                train.loc[train['prob'] < confidence,]['target'].tolist()]
    counts_complex = Counter(targets_complex)
    counts_non_complex = Counter(targets_non_complex)
    vocabulary = {}
    for word, count in counts_complex.items():
        count_nc = counts_non_complex.get(word, None)
        if count_nc and count_nc > count:
            vocabulary[word] = 0
        else:
            vocabulary[word] = 1
    for word, count in counts_non_complex.items():
        if word not in vocabulary:
            vocabulary[word] = 0
    return vocabulary

def build_confidence_vocabulary_2(train, confidence):
    targets_complex = [mwe.strip().lower() for mwe in 
                train.loc[train['prob'] >= confidence,]['target'].tolist()]
    targets_non_complex = [mwe.strip().lower() for mwe in 
                train.loc[train['binary'] == 0,]['target'].tolist()]
    counts_complex = Counter(targets_complex)
    counts_non_complex = Counter(targets_non_complex)
    vocabulary = {}
    for word, count in counts_complex.items():
        count_nc = counts_non_complex.get(word, None)
        if count_nc and count_nc > count:
            vocabulary[word] = 0
        else:
            vocabulary[word] = 1
    for word, count in counts_non_complex.items():
        if word not in vocabulary:
            vocabulary[word] = 0
    return vocabulary

def build_confidence_vocabulary_mean(train):
    train['target'] = train.target.apply(lambda target : target.strip().lower())
    agg = train[['target', 'prob']].groupby('target',
                        as_index=False).mean().values
    tuples = [tuple(val) for val in agg]
    vocabulary = {}
    for target, confidence in tuples:
        vocabulary[target] = confidence
    return vocabulary

def build_confidence_vocabulary_max(train):
    train['target'] = train.target.apply(lambda target : target.strip().lower())
    agg = train[['target', 'prob']].groupby('target',
                        as_index=False).max().values
    tuples = [tuple(val) for val in agg]
    vocabulary = {}
    for target, confidence in tuples:
        vocabulary[target] = confidence
    return vocabulary

In [49]:
def features_dictionary(dataframe, agg, drop_features):
    df = dataframe.copy()
    df['dict_dale_chall (dic)'] = df.p_target.apply(lambda target : agg(target, \
                            lambda target :  0 if textatistic.notdalechall_count(target) >= 1 else 1))
    df['dict_570_academic_words (dic)'] = df.p_target.apply(lambda target : agg(target, \
                                                    lambda target : int(target in academic_words)))
    df['common_prefix (dic)'] = df.p_target.apply(lambda target : int(np.sum([1 for prefix in prefixes if target.startswith(prefix)]) > 0))
    df['common_suffix (dic)'] = df.p_target.apply(lambda target : int(np.sum([1 for suffix in suffixes if target.endswith(suffix)]) > 0))
    df['gloss_biology (dic)'] = df.p_target.apply(lambda target : int(target in gloss_biology))
    df['gloss_physics (dic)'] = df.p_target.apply(lambda target : int(target in gloss_physics))
    df['gloss_geography (dic)'] = df.p_target.apply(lambda target : int(target in gloss_geography))
    df['stop_word (dic)'] = df.p_target.apply(lambda target : int(target in stop_words))
    df['most_freq_used_3000_words (dic)'] = df.p_target.apply(lambda target : agg(target, \
                                                    lambda target : int(target in most_freq_used_3000_words)))
    df['most_freq_used_5000_words (dic)'] = df.p_target.apply(lambda target : agg(target, \
                                                    lambda target : int(target in most_freq_used_5000_words)))
    df = df.drop(drop_features, axis = 1)
    df = df.fillna(0)
    return df

fc_dictionary = FeatureCategory('dictionary', features_dictionary)

In [50]:
def compute_features_dictionary(datasets, aggs = agg_default, drop_features = []):
    return [FeatureDataset(ds.name, fc_dictionary, agg,
                        fc_dictionary.func(ds.train, agg.agg, drop_features), 
                        fc_dictionary.func(ds.test, agg.agg, drop_features)) 
                        for ds in datasets
                        for agg in aggs]

# 4. Classification Models
Here we compute individual feature importance based on different metrics. For example, we implement and compute the F-Score, providing an idea of the discrimination power the feature has.

In [51]:
from collections import namedtuple
Result = namedtuple('Result', 'dataset, fc, agg, measure')
Dataset = namedtuple('Dataset', 'name, train, test')
FeatureDataset = namedtuple('FeatureDataset', 'name, fc, agg, train, test')
FeatureCategory = namedtuple('FeatureCategory', 'name, func')
Feature = namedtuple('Feature', 'name, fc_name, train, test')
Metric = namedtuple('Metric', 'name, func')

## (4.1) Utility Functions
Here we provide several utility functions for working with the datasets and classification algorithms. For example, we provide functions to clean the datasets from all non-features (such as id, sentence, the annotator information etc.) and functions to transform the feature datasets into a proper representation for the algorithms (such as one-hot-encoding of categorical attributes).

In [52]:
def remove_labels_for_binary_df(dataframe, drop=[]):
    drop_list = ['id', 'sentence', 'target', 'nat', 'non_nat', 
                  'nat_marked', 'non_nat_marked', 'prob', 'start', 
                  'end', 'p_target', 'lemma', 'p_lemma', 'pos_tags', 'pos_tags_pt']
    drop_list.extend(drop)
    df = dataframe.copy()
    df = df.drop(drop_list, axis = 1)
    return df

def remove_labels_phrase_for_binary_df(dataframe, drop=[]):
    drop_list = ['id', 'sentence', 'target', 'nat', 'non_nat', 
                  'nat_marked', 'non_nat_marked', 'prob', 'start', 
                  'end', 'p_target', 'lemma', 'p_lemma', 'pos_tags', 'pos_tags_pt', 'phrase_index']
    drop_list.extend(drop)
    df = dataframe.copy()
    df = df.drop(drop_list, axis = 1)
    return df

def remove_labels_for_regr_df(dataframe, drop=[]):
    drop_list = ['id', 'sentence', 'target', 'nat', 'non_nat', 
                  'nat_marked', 'non_nat_marked', 'binary', 'start', 
                  'end', 'p_target', 'lemma', 'p_lemma', 'pos_tags', 'pos_tags_pt']
    drop_list.extend(drop)
    df = dataframe.copy()
    df = df.drop(drop_list, axis = 1)
    return df
    
def transform_feat_to_num(train, test):
    train_copy = train.copy()
    test_copy = test.copy()
    train_copy = train_copy.replace([np.inf, -np.inf], np.nan)
    train_copy = train_copy.fillna(0)
    test_copy = test_copy.replace([np.inf, -np.inf], np.nan)
    test_copy = test_copy.fillna(0)
    shape_train = train.shape
    shape_test = test.shape
    df = train_copy.append(test_copy, ignore_index=True)
    df = pd.get_dummies(df)
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)
    df = df.applymap(lambda x: 1 if x == True else x)
    df = df.applymap(lambda x: 0 if x == False else x)
    return (df.loc[0:(shape_train[0]-1),], 
            df.loc[shape_train[0]:df.shape[0],])

def prep_data(train, test):
    x_train = train.loc[:, train.columns != 'binary']
    y_train = train['binary'].values
    x_test = test.loc[:, test.columns != 'binary']
    y_test = test.binary.values
    return x_train, y_train, x_test, y_test

def create_eval_df_from_results(results, remove_agg=True):
    if remove_agg:
        evaluation = [{'dataset' : result.dataset.name,
                        'zc' : result.fc[0], 'prec' : result.measure[0][1],
                   'rec' : result.measure[1][1], 'f1' : result.measure[2][1]} 
                       for result in results]
    else:
        evaluation = [{'dataset' : result.dataset.name, 'agg' : result.agg[0],
                        'zc' : result.fc[0], 'prec' : result.measure[0][1],
                   'rec' : result.measure[1][1], 'f1' : result.measure[2][1]} 
                       for result in results]
    return pd.DataFrame.from_records(evaluation)

def create_eval_df_from_results_macro(results, remove_agg=True):
    if remove_agg:
        evaluation = [{'dataset' : result.dataset.name,
                        'zc' : result.fc[0], 'prec' : result.measure[0],
                   'rec' : result.measure[1], 'f1' : result.measure[2]} 
                       for result in results]
    else:
        evaluation = [{'dataset' : result.dataset.name, 'agg' : result.agg[0],
                        'zc' : result.fc[0], 'prec' : result.measure[0],
                   'rec' : result.measure[1], 'f1' : result.measure[2]} 
                       for result in results]
    return pd.DataFrame.from_records(evaluation)

## (4.2.1) Baseline Always Complex

In [53]:
from collections import Counter

def always_complex_prediction(train, test):
    y_test = test.binary.values
    prediction = [1 for val in y_test]
    f1score = precision_recall_fscore_support(y_test, prediction, average='macro')
    return f1score

def baseline_always_complex(dataset):
    results = [Result(ds.name, 'always_complex', agg_default[0],
        always_complex_prediction(remove_labels_for_binary_df(ds.train), 
            remove_labels_for_binary_df(ds.test))) for ds in datasets]
    evaluation = [{'dataset' : result.dataset, 
                        'zc' : result.fc, 'prec' : result.measure[0],
                  'rec' : result.measure[1], 'f1' : result.measure[2]} 
                       for result in results]
    counts = [(ds.name, Counter(ds.test.binary)) for ds in datasets]
    return pd.DataFrame.from_records(evaluation)

## (4.3.2) Baseline Memorize Vocabulary

In [54]:
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support

'''
Extract all words that are exactly identified as either complex
or non-complex and use this as the vocabulary. Words that occur
as both complex and non-complex are neglected for the vocabulary
'''
def build_clean_vocabulary(train):
    targets_complex = set([mwe.strip().lower() for mwe in 
                train.loc[train['binary'] == 1,]['target'].tolist()])
    targets_non_complex = set([mwe.strip().lower() for mwe in
                train.loc[train['binary'] == 0,]['target'].tolist()])
    targets_complex_cleaned = list(targets_complex.difference(targets_non_complex))
    targets_non_complex_cleaned = list(targets_non_complex.difference(targets_complex))
    vocabulary = {}
    for target in targets_complex_cleaned:
        vocabulary[target] = 1
    for target in targets_non_complex_cleaned:
        vocabulary[target] = 0
    return vocabulary

'''
Extract all words that are identified as either complex
or non-complex and use this as the vocabulary. Words that occur
as both complex and non-complex are weighted based on the number
of occurrences. If the word has been tagged more times as non-complex
we save it as non-complex otherwise it is complex
'''
def build_weighted_vocabulary(train):
    targets_complex = [mwe.strip().lower() for mwe in 
                train.loc[train['binary'] == 1,]['target'].tolist()]
    targets_non_complex = [mwe.strip().lower() for mwe in
                train.loc[train['binary'] == 0,]['target'].tolist()]
    counts_complex = Counter(targets_complex)
    counts_non_complex = Counter(targets_non_complex)
    vocabulary = {}
    for word, count in counts_complex.items():
        count_nc = counts_non_complex.get(word, None)
        if count_nc and count_nc > count:
            vocabulary[word] = 0
        else:
            vocabulary[word] = 1
    for word, count in counts_non_complex.items():
        if word not in vocabulary:
            vocabulary[word] = 0
    return vocabulary

def build_confidence_vocabulary_1(train, confidence):
    targets_complex = [mwe.strip().lower() for mwe in 
                train.loc[train['prob'] >= confidence,]['target'].tolist()]
    targets_non_complex = [mwe.strip().lower() for mwe in 
                train.loc[train['prob'] < confidence,]['target'].tolist()]
    counts_complex = Counter(targets_complex)
    counts_non_complex = Counter(targets_non_complex)
    vocabulary = {}
    for word, count in counts_complex.items():
        count_nc = counts_non_complex.get(word, None)
        if count_nc and count_nc > count:
            vocabulary[word] = 0
        else:
            vocabulary[word] = 1
    for word, count in counts_non_complex.items():
        if word not in vocabulary:
            vocabulary[word] = 0
    return vocabulary

def build_confidence_vocabulary_2(train, confidence):
    targets_complex = [mwe.strip().lower() for mwe in 
                train.loc[train['prob'] >= confidence,]['target'].tolist()]
    targets_non_complex = [mwe.strip().lower() for mwe in 
                train.loc[train['binary'] == 0,]['target'].tolist()]
    counts_complex = Counter(targets_complex)
    counts_non_complex = Counter(targets_non_complex)
    vocabulary = {}
    for word, count in counts_complex.items():
        count_nc = counts_non_complex.get(word, None)
        if count_nc and count_nc > count:
            vocabulary[word] = 0
        else:
            vocabulary[word] = 1
    for word, count in counts_non_complex.items():
        if word not in vocabulary:
            vocabulary[word] = 0
    return vocabulary

def build_confidence_vocabulary_mean(train):
    train['target'] = train.target.apply(lambda target : target.strip().lower())
    agg = train[['target', 'prob']].groupby('target',
                        as_index=False).mean().values
    tuples = [tuple(val) for val in agg]
    vocabulary = {}
    for target, confidence in tuples:
        vocabulary[target] = confidence
    return vocabulary

def build_confidence_vocabulary_max(train):
    train['target'] = train.target.apply(lambda target : target.strip().lower())
    agg = train[['target', 'prob']].groupby('target',
                        as_index=False).max().values
    tuples = [tuple(val) for val in agg]
    vocabulary = {}
    for target, confidence in tuples:
        vocabulary[target] = confidence
    return vocabulary
    

def evaluate_label_target_predictions(test, vocabulary):
    dict_test = list(zip(test.target, test.binary))
    data = [(binary, (vocabulary[target.strip().lower()] if target.strip().lower() in vocabulary else 1)) 
            for target, binary in dict_test]
    y_true = [vals[0] for vals in data]
    prediction = [vals[1] for vals in data]
    return precision_recall_fscore_support(y_true, prediction, average='macro')

In [55]:
def baseline_vocab_clean(datasets):
    evaluation_clean = [Result(ds.name, 'vocab_clean', agg_default[0], 
                    evaluate_label_target_predictions(ds.test, 
                    build_clean_vocabulary(ds.train))) for ds in datasets]
    results_clean = [{'dataset' : result.dataset, 
                        'zc' : result.fc, 'prec' : result.measure[0],
                  'rec' : result.measure[1], 'f1' : result.measure[2]} 
                       for result in evaluation_clean]
    return pd.DataFrame.from_records(results_clean)

In [56]:
def baseline_vocab_weighted(datasets):
    evaluation_weighted = [Result(ds.name, 'vocab_weighted', agg_default[0], 
                        evaluate_label_target_predictions(ds.test, 
                    build_weighted_vocabulary(ds.train))) for ds in datasets]
    results_weighted = [{'dataset' : result.dataset, 
                        'zc' : result.fc, 'prec' : result.measure[0],
                  'rec' : result.measure[1], 'f1' : result.measure[2]} 
                       for result in evaluation_weighted]
    return pd.DataFrame.from_records(results_weighted)

In [57]:
def baseline_vocab_conf(datasets, confidence = 0.5):
    evaluation_conf = [Result(ds.name, 'vocab_conf', agg_default[0], 
                            evaluate_label_target_predictions(ds.test, 
                        build_confidence_vocabulary_2(ds.train, confidence))) for ds in datasets]
    results_conf = [{'dataset' : result.dataset, 
                        'zc' : result.fc, 'prec' : result.measure[0],
                  'rec' : result.measure[1], 'f1' : result.measure[2]} 
                       for result in evaluation_conf]
    return pd.DataFrame.from_records(results_conf)

# 4.4 Classification Models

In [58]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn import model_selection
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid

def xgboost(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    xgtrain = xgb.DMatrix(x_train.values, label=y_train)
    xgtest = xgb.DMatrix(x_test.values, label=y_test)
    xg_test_x = xgb.DMatrix(x_test.values)
    param = {'max_depth': 30, 'eta': 1, 'silent': 1, \
             'objective': 'binary:logistic',  'n_estimators':5000}
    evallist = [(xgtest, 'eval'), (xgtrain, 'train')]
    num_round = 70
    bst = xgb.train(param, xgtrain, num_round, evallist)
    prediction = bst.predict(xg_test_x)
    prediction_binary = list(map(lambda val: 1 if val>0.5 else 0, prediction))
    return y_test, prediction_binary

def adaboost(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    adab = AdaBoostClassifier(base_estimator=None, n_estimators=5000, 
                          learning_rate=1.0, algorithm='SAMME.R',
                          random_state=None)
    adab.fit(x_train, y_train) 
    prediction = adab.predict(x_test)
    return y_test, prediction

def random_forest(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    x_train = x_train.values.astype(np.float)
    x_test = x_test.values.astype(np.float)
    clf = RandomForestClassifier(max_depth=10, random_state=14521, n_estimators=1800, \
                    verbose=1, min_samples_split=5, min_samples_leaf=4, bootstrap=False)
    clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    prediction_binary = list(map(lambda val: 1 if val>0.5 else 0, prediction))
    return y_test, prediction_binary

def random_forest_extra(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    x_train = x_train.values.astype(np.float)
    x_test = x_test.values.astype(np.float)
    clf = ExtraTreesClassifier(n_estimators=1800, criterion='gini', max_depth=None,
                     min_samples_split=5, min_samples_leaf=4, min_weight_fraction_leaf=0.0,
                     max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
                     min_impurity_split=None, bootstrap=False, oob_score=False,
                     random_state=15325, verbose=0, warm_start=False)
    clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    prediction_binary = list(map(lambda val: 1 if val>0.5 else 0, prediction))
    return y_test, prediction_binary

def svm(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    seed = 7
    svc = SVC(C=10, kernel='rbf', degree=3, gamma='auto', 
            coef0=0.0, shrinking=True, probability=False, tol=0.001, 
            cache_size=200, class_weight=None, verbose=False, max_iter=-1, 
            decision_function_shape='ovr', random_state=None)
    svc.fit(x_train, y_train) 
    prediction = svc.predict(x_test)
    f1score = f1_score(y_test, prediction)
    return y_test, prediction


def mlp(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    mlp = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
          beta_1=0.9, beta_2=0.999, early_stopping=False,
          epsilon=1e-08, hidden_layer_sizes=(50, 20), learning_rate='constant',
          learning_rate_init=0.001, max_iter=200, momentum=0.9,
          nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
          solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
          warm_start=False)
    mlp.fit(x_train, y_train) 
    prediction = mlp.predict(x_test)
    return y_test, prediction

def decision_tree(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    seed = 7
    dt = DecisionTreeClassifier(criterion='gini', splitter='best', 
                                 max_depth=None, min_samples_split=2, 
                                 min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                 max_features=None, random_state=None, max_leaf_nodes=None, 
                                 min_impurity_decrease=0.0, min_impurity_split=None, 
                                 class_weight=None, presort=False)
    dt.fit(x_train, y_train) 
    prediction = dt.predict(x_test)
    return y_test, prediction


def knn(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', 
                     leaf_size=30, p=2, metric='minkowski')
    knn.fit(x_train, y_train) 
    prediction = knn.predict(x_test)
    return y_test, prediction

def nn(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    clf = NearestCentroid()
    clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    return y_test, prediction

def naive_bayes(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    naive_bayes = GaussianNB(priors=None)
    naive_bayes.fit(x_train, y_train) 
    prediction = naive_bayes.predict(x_test)
    return y_test, prediction

def logistic_regression(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    logistic_regression = LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                                     C=1.0, fit_intercept=True, intercept_scaling=1, 
                                     class_weight=None, random_state=None, solver='lbfgs',
                                     max_iter=100, verbose=0, 
                                     warm_start=False)
    logistic_regression.fit(x_train, y_train) 
    prediction = logistic_regression.predict(x_test)
    return y_test, prediction


def xgboost_with_bst(train, test, silent):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    xgtrain = xgb.DMatrix(x_train.values, label=y_train, feature_names=x_train.columns.values)
    xgtest = xgb.DMatrix(x_test.values, label=y_test, feature_names=x_test.columns.values)
    xg_test_x = xgb.DMatrix(x_test.values, feature_names=x_test.columns.values)
    param = {'max_depth': 30, 'eta': 1, 'silent': silent, 'objective': 'binary:logistic',  'n_estimators':5000}
    evallist = [(xgtest, 'eval'), (xgtrain, 'train')]
    num_round = 70
    bst = xgb.train(param, xgtrain, num_round, evallist)
    prediction = bst.predict(xg_test_x)
    prediction_binary = list(map(lambda val: 1 if val>0.5 else 0, prediction))
    f1score = precision_recall_fscore_support(y_test, prediction_binary)
    return f1score, bst

def random_forest_with_forest(train, test, label):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    x_train = x_train.as_matrix().astype(np.float)
    x_test = x_test.as_matrix().astype(np.float)
    clf = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=1800, \
                            verbose=1, min_samples_split=5, min_samples_leaf=4, bootstrap=False)
    clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    prediction_binary = list(map(lambda val: 1 if val>0.5 else 0, prediction))
    f1score = precision_recall_fscore_support(y_test, prediction_binary)
    return f1score, clf

In [59]:
from sklearn.utils import resample

def get_upsample_count(dataframe):
    return dataframe.groupby('binary').size()[1]

def balance_dataframe(dataframe):
    df = dataframe.copy()
    df_majority = df[df.binary==1]
    df_minority = df[df.binary==0]
    count = get_upsample_count(df)
    df_minority_upsampled = resample(df_minority, 
                                 replace=True,    
                                 n_samples=count,
                                 random_state=721) 
    return pd.concat([df_majority, df_minority_upsampled])

def balance_phrase_label_dist(datasets):
    return [FeatureDataset(ds.name, ds.fc, ds.agg, 
                    balance_dataframe(ds.train), ds.test) for ds in datasets]

# X.x Complex Phrase Identifcation ML Results

## X.1 A1 Prediction Aggregation

Here we conduct the experiments for A1 word prediction aggregation to obtain the complexity of a phrase. We predict the complexity of each word of the phrase individually, and aggregate the predictions with three different simple A1 aggregation functions (min, max, majority voting). We train our models on the single words training set.

In [None]:
def compute_starts(start, target):
    curr_start = start
    starts = [curr_start]
    tokens = target.split()
    for token in tokens:
        curr_start = curr_start + len(token) + 1
        starts.append(curr_start)
    return ' '.join([str(start) for start in starts[:len(tokens)]])

def compute_ends(start, target):
    curr_start = start
    ends = []
    tokens = target.split()
    for index, token in enumerate(tokens):
        if index > 0:
            curr_start = curr_start + len(token) + 1
        else:
            curr_start = curr_start + len(token)
        ends.append(curr_start)
    return ' '.join([str(end) for end in ends])

def phrase_splitter(dataframe):
    df = dataframe.copy()
    df['starts'] = df[['start', 'target']].apply(lambda vals : compute_starts(*vals), axis=1)
    df['ends'] = df[['start', 'target']].apply(lambda vals : compute_ends(*vals), axis=1)
    df['phrase_index'] = df.apply(lambda x : x.name, axis=1)
    s1 = df.target.str.split(' ', expand=True).stack().str.strip().reset_index(level=1, drop=True)
    s2 = df.p_target.str.split(' ', expand=True).stack().str.strip().reset_index(level=1, drop=True)
    s3 = df.starts.str.split(' ', expand=True).stack().str.strip().reset_index(level=1, drop=True)
    s4 = df.ends.str.split(' ', expand=True).stack().str.strip().reset_index(level=1, drop=True)
    s5 = df.p_lemma.str.split(' ', expand=True).stack().str.strip().reset_index(level=1, drop=True)
    s6 = df.lemma.str.split(' ', expand=True).stack().str.strip().reset_index(level=1, drop=True)
    df['pos_tags'] = df.pos_tags.apply(lambda tags : ' '.join(tags))
    s7 = df.pos_tags.str.split(' ', expand=True).stack().str.strip().reset_index(level=1, drop=True)
    df['pos_tags_pt'] = df.pos_tags_pt.apply(lambda tags : ' '.join(tags))
    s8 = df.pos_tags_pt.str.split(' ', expand=True).stack().str.strip().reset_index(level=1, drop=True)
    df1 = pd.concat([s1, s2, s3, s4, s5, s6, s7, s8], axis=1, keys=['target','p_target', \
                                        'start', 'end', 'p_lemma', 'lemma','pos_tags', 'pos_tags_pt'])
    splitted_df = df.drop(['target', 'p_target', 'starts', \
                'start', 'ends', 'end', 'p_lemma','lemma', 
                        'pos_tags', 'pos_tags_pt'], axis=1).join(df1).reset_index(drop=True)
    splitted_df['start'] = pd.to_numeric(splitted_df.start, errors='coerce')
    splitted_df['end'] = pd.to_numeric(splitted_df.end, errors='coerce')
    return splitted_df

def phrase_splitting_datasets(datasets):
    return [Dataset(ds.name, ds.train, phrase_splitter(ds.test)) for ds in datasets]

In [None]:
def phrase_prediction_agg_majority_vote(predictions):
    positive_sum = np.sum(predictions)
    ratio = positive_sum / len(predictions)
    return int(ratio + 0.5)

def phrase_prediction_agg_max(predictions):
    return np.max(predictions)

def phrase_prediction_agg_min(predictions):
    return np.min(predictions)

phrase_agg_mv = Aggregation('phrase_mv', phrase_prediction_agg_majority_vote)
phrase_agg_max = Aggregation('phrase_max', phrase_prediction_agg_max)
phrase_agg_min = Aggregation('phrase_min', phrase_prediction_agg_min)

phrase_aggs = [phrase_agg_mv, phrase_agg_max, phrase_agg_min]

def phrase_merger(df_test, result, agg):
    df_test = df_test.copy()
    prediction = result[1]
    df_test['prediction'] = prediction
    pred_binary = df_test.groupby('phrase_index').apply(lambda row : \
                            (agg(row['prediction']), agg(row['binary']))).values
    predictions = [pred for pred, binary in pred_binary]
    binary = [binary for pred, binary in pred_binary]
    score = precision_recall_fscore_support(binary, predictions, average='macro')
    return score

In [None]:
def create_eval_df_from_results_phrase(results):
    evaluation = [{'dataset' : result.dataset.name, 'agg' : result.agg[0],
                        'zc' : result.fc[0], 'prec' : result.measure[0],
                   'rec' : result.measure[1], 'f1' : result.measure[2]} 
                       for result in results]
    return pd.DataFrame.from_records(evaluation)

In [None]:
# TODO: think about isntead of using all features use best feature sets from CWI experiments

In [None]:
datasets = load_datasets(['Wikipedia', 'WikiNews', 'News'], 'Train', 'Dev', type_train='word', type_test='phrase')
datasets = preprocess_datasets(datasets)
phrase_splitted_datasets = phrase_splitting_datasets(datasets)
# 1. Linguistic Features
datasets_fc_linguistic = compute_features_linguistic(phrase_splitted_datasets)
# 2. Corpus Features
datasets_fc_frequency = compute_features_frequency(phrase_splitted_datasets)
datasets_fc_language_model = compute_features_language_model(phrase_splitted_datasets)
datasets_fc_corpus = compute_features_corpus([datasets_fc_frequency, datasets_fc_language_model])
# # 3. Psycholinguistic
datasets_fc_psycholinguistic = compute_features_psycholinguistic(phrase_splitted_datasets)
# # 4. Semantic Features
datasets_fc_wordnet = compute_features_wordnet(phrase_splitted_datasets)
datasets_fc_dbpedia = compute_features_dbpedia(phrase_splitted_datasets)
datasets_fc_brown_clustering = compute_features_brown_clustering(phrase_splitted_datasets)
datasets_fc_semantic = compute_features_semantic([datasets_fc_wordnet, datasets_fc_dbpedia, datasets_fc_brown_clustering])
# # 5. Dictionary Features
datasets_fc_dictionary = compute_features_dictionary(phrase_splitted_datasets)
# # 6. Concatentation of feature categories
# # (1) Corpus + Semantic
datasets_fc_corpus_semantic = concat_feature_datasets(datasets_fc_corpus, datasets_fc_semantic, name='corpus+semantic')
# # (2) WordNet + Psycholinguistic
datasets_fc_wordnet_psycholinguistic = concat_feature_datasets(datasets_fc_wordnet, \
                             datasets_fc_psycholinguistic, name='wordnet+psycholinguistic')
# (3) All categories
datasets_fc_all = concat_feature_datasets(datasets_fc_linguistic, datasets_fc_psycholinguistic, \
                             datasets_fc_semantic, datasets_fc_corpus, datasets_fc_dictionary, name='all')

In [272]:
all_fc_datasets = []
all_fc_datasets.extend(datasets_fc_linguistic)
all_fc_datasets.extend(datasets_fc_frequency)
all_fc_datasets.extend(datasets_fc_language_model)
all_fc_datasets.extend(datasets_fc_corpus)
all_fc_datasets.extend(datasets_fc_psycholinguistic)
all_fc_datasets.extend(datasets_fc_wordnet)
all_fc_datasets.extend(datasets_fc_dbpedia)
all_fc_datasets.extend(datasets_fc_brown_clustering)
all_fc_datasets.extend(datasets_fc_semantic)
all_fc_datasets.extend(datasets_fc_dictionary)
all_fc_datasets.extend(datasets_fc_corpus_semantic)
all_fc_datasets.extend(datasets_fc_wordnet_psycholinguistic)
all_fc_datasets.extend(datasets_fc_all)

## X.x Baselines

In [71]:
baseline_always_complex(datasets)

  'precision', 'predicted', average, warn_for)


Unnamed: 0,dataset,f1,prec,rec,zc
0,Wikipedia,0.45679,0.420455,0.5,always_complex
1,WikiNews,0.433735,0.382979,0.5,always_complex
2,News,0.440171,0.39313,0.5,always_complex


In [72]:
baseline_vocab_clean(datasets)

  'precision', 'predicted', average, warn_for)


Unnamed: 0,dataset,f1,prec,rec,zc
0,Wikipedia,0.45679,0.420455,0.5,vocab_clean
1,WikiNews,0.433735,0.382979,0.5,vocab_clean
2,News,0.440171,0.39313,0.5,vocab_clean


In [73]:
baseline_vocab_weighted(datasets)

  'precision', 'predicted', average, warn_for)


Unnamed: 0,dataset,f1,prec,rec,zc
0,Wikipedia,0.45679,0.420455,0.5,vocab_weighted
1,WikiNews,0.433735,0.382979,0.5,vocab_weighted
2,News,0.440171,0.39313,0.5,vocab_weighted


## X.1 XGBoost

In [None]:
results_xgboost = [Result(fs, fs.fc, agg, phrase_merger(fs.test, xgboost(*transform_feat_to_num(
                remove_labels_for_binary_df(fs.train), 
                remove_labels_phrase_for_binary_df(fs.test))), agg.agg)) 
          for fs in all_fc_datasets
          for agg in phrase_aggs]

In [79]:
feature_eval_data_xg = create_eval_df_from_results_phrase(results_xgboost)
feature_eval_data_xg

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,phrase_mv,Wikipedia,0.465695,0.514583,0.527027,linguistic
1,phrase_max,Wikipedia,0.455729,0.490196,0.482625,linguistic
2,phrase_min,Wikipedia,0.283998,0.548205,0.545367,linguistic
3,phrase_mv,WikiNews,0.507734,0.542553,0.559343,linguistic
4,phrase_max,WikiNews,0.542208,0.547059,0.560606,linguistic
5,phrase_min,WikiNews,0.302113,0.501899,0.501263,linguistic
6,phrase_mv,News,0.550313,0.566507,0.596394,linguistic
7,phrase_max,News,0.501221,0.504984,0.506155,linguistic
8,phrase_min,News,0.329237,0.56531,0.549237,linguistic
9,phrase_mv,Wikipedia,0.497143,0.529915,0.554054,frequency


In [264]:
idx = feature_eval_data_xg.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_xg['f1']
feature_eval_data[idx]

Unnamed: 0,dataset,f1,prec,rec,zc
11,News,0.613904,0.613904,0.613904,linguistic
99,Wikipedia,0.57529,0.57529,0.57529,semantic
151,WikiNews,0.650831,0.72782,0.631313,all


## X.2 Random Forest

In [80]:
results_rf = [Result(fs, fs.fc, agg, phrase_merger(fs.test, random_forest(*transform_feat_to_num(
                remove_labels_for_binary_df(fs.train), 
                remove_labels_phrase_for_binary_df(fs.test))), agg.agg)) 
          for fs in all_fc_datasets
          for agg in phrase_aggs]

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   17.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   19.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   23.9s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   28.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   32.5s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   33.0s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   54.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.4s f

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   19.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   19.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   14.9s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   22.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   29.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   26.9s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   13.5s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s f

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished


In [81]:
feature_eval_data_rf = create_eval_df_from_results_phrase(results_rf)
feature_eval_data_rf

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,phrase_mv,Wikipedia,0.457851,0.510898,0.52027,linguistic
1,phrase_max,Wikipedia,0.484375,0.514161,0.525097,linguistic
2,phrase_min,Wikipedia,0.283998,0.515525,0.516409,linguistic
3,phrase_mv,WikiNews,0.50972,0.560897,0.583965,linguistic
4,phrase_max,WikiNews,0.567731,0.574443,0.599116,linguistic
5,phrase_min,WikiNews,0.335613,0.553571,0.537879,linguistic
6,phrase_mv,News,0.464065,0.530714,0.54473,linguistic
7,phrase_max,News,0.450678,0.478701,0.469053,linguistic
8,phrase_min,News,0.274754,0.569492,0.533114,linguistic
9,phrase_mv,Wikipedia,0.497143,0.529915,0.554054,frequency


In [85]:
idx = feature_eval_data_rf.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_rf['f1']
feature_eval_data_rf[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
15,phrase_mv,News,0.59836,0.598784,0.635229,frequency
19,phrase_max,Wikipedia,0.571096,0.566362,0.583977,language_model
103,phrase_max,WikiNews,0.627367,0.621614,0.638889,wordnet+psycholinguistic


## X.3 Random Forest (Extra)

In [82]:
results_rfe = [Result(fs, fs.fc, agg, phrase_merger(fs.test, random_forest_extra(*transform_feat_to_num(
                remove_labels_for_binary_df(fs.train), 
                remove_labels_phrase_for_binary_df(fs.test))), agg.agg)) 
          for fs in all_fc_datasets
          for agg in phrase_aggs]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [83]:
feature_eval_data_rfe = create_eval_df_from_results_phrase(results_rfe)
feature_eval_data_rfe

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,phrase_mv,Wikipedia,0.457851,0.510898,0.52027,linguistic
1,phrase_max,Wikipedia,0.484375,0.514161,0.525097,linguistic
2,phrase_min,Wikipedia,0.283998,0.515525,0.516409,linguistic
3,phrase_mv,WikiNews,0.536054,0.575283,0.604798,linguistic
4,phrase_max,WikiNews,0.583854,0.590695,0.621843,linguistic
5,phrase_min,WikiNews,0.335613,0.553571,0.537879,linguistic
6,phrase_mv,News,0.498557,0.545343,0.567354,linguistic
7,phrase_max,News,0.471774,0.487513,0.48275,linguistic
8,phrase_min,News,0.274754,0.569492,0.533114,linguistic
9,phrase_mv,Wikipedia,0.465695,0.514583,0.527027,frequency


In [86]:
idx = feature_eval_data_rfe.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_rfe['f1']
feature_eval_data_rfe[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
19,phrase_max,Wikipedia,0.562552,0.558824,0.57722,language_model
96,phrase_mv,News,0.586171,0.590095,0.62552,corpus+semantic
112,phrase_max,WikiNews,0.627367,0.621614,0.638889,all


## X.4 AdaBoost

In [273]:
results_ada = [Result(fs, fs.fc, agg, phrase_merger(fs.test, adaboost(*transform_feat_to_num(
                remove_labels_for_binary_df(fs.train), 
                remove_labels_phrase_for_binary_df(fs.test))), agg.agg)) 
          for fs in all_fc_datasets
          for agg in phrase_aggs]

In [274]:
feature_eval_data_ada = create_eval_df_from_results_phrase(results_ada)
feature_eval_data_ada

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,phrase_mv,Wikipedia,0.473539,0.518315,0.533784,linguistic
1,phrase_max,Wikipedia,0.461444,0.48422,0.473938,linguistic
2,phrase_min,Wikipedia,0.306729,0.555708,0.55888,linguistic
3,phrase_mv,WikiNews,0.579732,0.600455,0.63952,linguistic
4,phrase_max,WikiNews,0.609847,0.60504,0.625,linguistic
5,phrase_min,WikiNews,0.314181,0.511607,0.508207,linguistic
6,phrase_mv,News,0.475323,0.511454,0.51699,linguistic
7,phrase_max,News,0.475234,0.48779,0.48353,linguistic
8,phrase_min,News,0.316629,0.559489,0.541956,linguistic
9,phrase_mv,Wikipedia,0.547059,0.550868,0.579151,frequency


In [275]:
idx = feature_eval_data_ada.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_ada['f1']
feature_eval_data_ada[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
10,phrase_max,Wikipedia,0.598174,0.59375,0.604247,frequency
33,phrase_mv,News,0.592054,0.592702,0.6263,corpus
76,phrase_max,WikiNews,0.654623,0.650595,0.659722,semantic


## X.5 Decision Tree

In [95]:
results_dt = [Result(fs, fs.fc, agg, phrase_merger(fs.test, decision_tree(*transform_feat_to_num(
                remove_labels_for_binary_df(fs.train), 
                remove_labels_phrase_for_binary_df(fs.test))), agg.agg)) 
          for fs in all_fc_datasets
          for agg in phrase_aggs]

In [96]:
feature_eval_data_dt = create_eval_df_from_results_phrase(results_dt)
feature_eval_data_dt

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,phrase_mv,Wikipedia,0.448498,0.486523,0.475869,linguistic
1,phrase_max,Wikipedia,0.46957,0.480602,0.472008,linguistic
2,phrase_min,Wikipedia,0.258971,0.588608,0.560811,linguistic
3,phrase_mv,WikiNews,0.52621,0.546703,0.564394,linguistic
4,phrase_max,WikiNews,0.591304,0.587662,0.602273,linguistic
5,phrase_min,WikiNews,0.350402,0.514737,0.513258,linguistic
6,phrase_mv,News,0.507918,0.523875,0.533894,linguistic
7,phrase_max,News,0.47994,0.482804,0.480149,linguistic
8,phrase_min,News,0.342131,0.560154,0.550017,linguistic
9,phrase_mv,Wikipedia,0.434039,0.479332,0.462355,frequency


In [97]:
idx = feature_eval_data_dt.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_dt['f1']
feature_eval_data_dt[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
33,phrase_mv,News,0.574116,0.581863,0.615811,corpus
40,phrase_max,WikiNews,0.639017,0.644444,0.69697,psycholinguistic
109,phrase_max,Wikipedia,0.608392,0.599924,0.626448,all


## X.6 Logistic Regression

In [98]:
results_lr = [Result(fs, fs.fc, agg, phrase_merger(fs.test, logistic_regression(*transform_feat_to_num(
                remove_labels_for_binary_df(fs.train), 
                remove_labels_phrase_for_binary_df(fs.test))), agg.agg)) 
          for fs in all_fc_datasets
          for agg in phrase_aggs]

  'precision', 'predicted', average, warn_for)


In [99]:
feature_eval_data_lr = create_eval_df_from_results_phrase(results_lr)
feature_eval_data_lr

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,phrase_mv,Wikipedia,0.442135,0.503618,0.506757,linguistic
1,phrase_max,Wikipedia,0.47678,0.510243,0.51834,linguistic
2,phrase_min,Wikipedia,0.29509,0.520833,0.523166,linguistic
3,phrase_mv,WikiNews,0.540917,0.587095,0.620581,linguistic
4,phrase_max,WikiNews,0.583854,0.590695,0.621843,linguistic
5,phrase_min,WikiNews,0.335613,0.553571,0.537879,linguistic
6,phrase_mv,News,0.440777,0.522488,0.531813,linguistic
7,phrase_max,News,0.421446,0.461066,0.442354,linguistic
8,phrase_min,News,0.274754,0.569492,0.533114,linguistic
9,phrase_mv,Wikipedia,0.499666,0.522321,0.53861,frequency


In [100]:
idx = feature_eval_data_lr.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_lr['f1']
feature_eval_data_lr[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
40,phrase_max,WikiNews,0.644657,0.654304,0.712753,psycholinguistic
96,phrase_mv,News,0.518042,0.562234,0.592493,corpus+semantic
109,phrase_max,Wikipedia,0.572234,0.568896,0.599421,all


## X.7 SVM

In [102]:
results_svm = [Result(fs, fs.fc, agg, phrase_merger(fs.test, svm(*transform_feat_to_num(
                remove_labels_for_binary_df(fs.train), 
                remove_labels_phrase_for_binary_df(fs.test))), agg.agg)) 
          for fs in all_fc_datasets
          for agg in phrase_aggs]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [103]:
feature_eval_data_svm = create_eval_df_from_results_phrase(results_svm)
feature_eval_data_svm

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,phrase_mv,Wikipedia,0.418391,0.492754,0.486486,linguistic
1,phrase_max,Wikipedia,0.455729,0.490196,0.482625,linguistic
2,phrase_min,Wikipedia,0.248447,0.533333,0.525097,linguistic
3,phrase_mv,WikiNews,0.514098,0.573148,0.599747,linguistic
4,phrase_max,WikiNews,0.517949,0.541496,0.557449,linguistic
5,phrase_min,WikiNews,0.311355,0.538618,0.52399,linguistic
6,phrase_mv,News,0.425024,0.506434,0.509102,linguistic
7,phrase_max,News,0.426789,0.464261,0.447209,linguistic
8,phrase_min,News,0.276125,0.551814,0.526612,linguistic
9,phrase_mv,Wikipedia,0.365079,0.527302,0.541506,frequency


In [104]:
idx = feature_eval_data_svm.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_svm['f1']
feature_eval_data_svm[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
1,phrase_max,Wikipedia,0.455729,0.490196,0.482625,linguistic
48,phrase_mv,WikiNews,0.560284,0.558551,0.563763,wordnet
69,phrase_mv,News,0.570657,0.599907,0.648405,brown_clustering


## X.8 Naive Bayes

In [105]:
results_nb = [Result(fs, fs.fc, agg, phrase_merger(fs.test, naive_bayes(*transform_feat_to_num(
                remove_labels_for_binary_df(fs.train), 
                remove_labels_phrase_for_binary_df(fs.test))), agg.agg)) 
          for fs in all_fc_datasets
          for agg in phrase_aggs]

  'precision', 'predicted', average, warn_for)


In [106]:
feature_eval_data_nb = create_eval_df_from_results_phrase(results_nb)
feature_eval_data_nb

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,phrase_mv,Wikipedia,0.453416,0.41954,0.493243,linguistic
1,phrase_max,Wikipedia,0.45679,0.420455,0.5,linguistic
2,phrase_min,Wikipedia,0.557086,0.564935,0.553089,linguistic
3,phrase_mv,WikiNews,0.433735,0.382979,0.5,linguistic
4,phrase_max,WikiNews,0.433735,0.382979,0.5,linguistic
5,phrase_min,WikiNews,0.555784,0.61634,0.556187,linguistic
6,phrase_mv,News,0.519429,0.712106,0.537361,linguistic
7,phrase_max,News,0.454167,0.518411,0.501647,linguistic
8,phrase_min,News,0.570339,0.580811,0.565967,linguistic
9,phrase_mv,Wikipedia,0.593846,0.600877,0.588803,frequency


In [107]:
idx = feature_eval_data_nb.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_nb['f1']
feature_eval_data_nb[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
15,phrase_mv,News,0.641913,0.709412,0.62266,frequency
33,phrase_mv,News,0.641913,0.709412,0.62266,corpus
77,phrase_min,WikiNews,0.562791,0.563927,0.561869,semantic
90,phrase_mv,Wikipedia,0.628138,0.632308,0.624517,corpus+semantic
108,phrase_mv,Wikipedia,0.628138,0.632308,0.624517,all


## X.9 kNN

In [108]:
results_knn = [Result(fs, fs.fc, agg, phrase_merger(fs.test, knn(*transform_feat_to_num(
                remove_labels_for_binary_df(fs.train), 
                remove_labels_phrase_for_binary_df(fs.test))), agg.agg)) 
          for fs in all_fc_datasets
          for agg in phrase_aggs]

In [109]:
feature_eval_data_knn = create_eval_df_from_results_phrase(results_knn)
feature_eval_data_knn

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,phrase_mv,Wikipedia,0.435495,0.511936,0.522201,linguistic
1,phrase_max,Wikipedia,0.513021,0.538126,0.567568,linguistic
2,phrase_min,Wikipedia,0.202899,0.420513,0.440154,linguistic
3,phrase_mv,WikiNews,0.439642,0.472273,0.46149,linguistic
4,phrase_max,WikiNews,0.498838,0.506458,0.508207,linguistic
5,phrase_min,WikiNews,0.28624,0.519048,0.510101,linguistic
6,phrase_mv,News,0.459133,0.524529,0.535801,linguistic
7,phrase_max,News,0.403717,0.442763,0.415569,linguistic
8,phrase_min,News,0.269678,0.515918,0.508755,linguistic
9,phrase_mv,Wikipedia,0.473539,0.518315,0.533784,frequency


In [110]:
idx = feature_eval_data_knn.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_knn['f1']
feature_eval_data_knn[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
24,phrase_mv,News,0.577119,0.583882,0.618239,language_model
91,phrase_max,Wikipedia,0.572234,0.568896,0.599421,corpus+semantic
109,phrase_max,Wikipedia,0.572234,0.568896,0.599421,all
112,phrase_max,WikiNews,0.627367,0.621614,0.638889,all


## X.10 MLP

In [111]:
results_mlp = [Result(fs, fs.fc, agg, phrase_merger(fs.test, mlp(*transform_feat_to_num(
                remove_labels_for_binary_df(fs.train), 
                remove_labels_phrase_for_binary_df(fs.test))), agg.agg)) 
          for fs in all_fc_datasets
          for agg in phrase_aggs]

  'precision', 'predicted', average, warn_for)


In [112]:
feature_eval_data_mlp = create_eval_df_from_results_phrase(results_mlp)
feature_eval_data_mlp

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,phrase_mv,Wikipedia,0.442135,0.503618,0.506757,linguistic
1,phrase_max,Wikipedia,0.461647,0.50265,0.504826,linguistic
2,phrase_min,Wikipedia,0.283998,0.548205,0.545367,linguistic
3,phrase_mv,WikiNews,0.508768,0.581555,0.608586,linguistic
4,phrase_max,WikiNews,0.539216,0.568841,0.59596,linguistic
5,phrase_min,WikiNews,0.28624,0.519048,0.510101,linguistic
6,phrase_mv,News,0.443737,0.516414,0.523665,linguistic
7,phrase_max,News,0.426148,0.461332,0.443135,linguistic
8,phrase_min,News,0.260854,0.56157,0.525832,linguistic
9,phrase_mv,Wikipedia,0.407008,0.526786,0.546332,frequency


In [113]:
idx = feature_eval_data_mlp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_mlp['f1']
feature_eval_data_mlp[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
13,phrase_max,WikiNews,0.645797,0.638528,0.661616,frequency
15,phrase_mv,News,0.633384,0.639811,0.628467,frequency
27,phrase_mv,Wikipedia,0.661196,0.6457,0.697876,corpus


## X.2 A2 Feature Aggregation

Test both training set inputs (DS-P and DS-WP)

In [197]:
datasets = load_datasets(['Wikipedia', 'WikiNews', 'News'], 'Train', 'Dev', type_train='phrase', type_test='phrase')
datasets = preprocess_datasets(datasets)
# 1. Linguistic Features
datasets_fc_linguistic = compute_features_linguistic(datasets, aggs=aggs_all)
# 2. Corpus Features
datasets_fc_frequency = compute_features_frequency(datasets, aggs=aggs_all)
datasets_fc_language_model = compute_features_language_model(datasets, aggs=aggs_all)
datasets_fc_corpus = compute_features_corpus([datasets_fc_frequency, datasets_fc_language_model])
# 3. Psycholinguistic
datasets_fc_psycholinguistic = compute_features_psycholinguistic(datasets, aggs=aggs_all)
# 4. Semantic Features
datasets_fc_wordnet = compute_features_wordnet(datasets, aggs=aggs_all)
datasets_fc_dbpedia = compute_features_dbpedia(datasets, aggs=aggs_all)
datasets_fc_brown_clustering = compute_features_brown_clustering(datasets, aggs=aggs_all)
datasets_fc_semantic = compute_features_semantic([datasets_fc_wordnet, datasets_fc_dbpedia, datasets_fc_brown_clustering])
# 5. Dictionary Features
datasets_fc_dictionary = compute_features_dictionary(datasets, aggs=aggs_all)
# 6. Concatentation of feature categories
# (1) Corpus + Semantic
datasets_fc_corpus_semantic = concat_feature_datasets(datasets_fc_corpus, datasets_fc_semantic, name='corpus+semantic')
# (2) WordNet + Psycholinguistic
datasets_fc_wordnet_psycholinguistic = concat_feature_datasets(datasets_fc_wordnet, \
                            datasets_fc_psycholinguistic, name='wordnet+psycholinguistic')
#(3) All categories
datasets_fc_all = concat_feature_datasets(datasets_fc_linguistic, datasets_fc_psycholinguistic, \
                            datasets_fc_semantic, datasets_fc_corpus, datasets_fc_dictionary, name='all')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return umr_maximum(a, axis, None, out, keepdims)
  return umr_minimum(a, axis, None, out, keepdims)


In [198]:
all_fc_datasets = []
all_fc_datasets.extend(datasets_fc_linguistic)
all_fc_datasets.extend(datasets_fc_frequency)
all_fc_datasets.extend(datasets_fc_language_model)
all_fc_datasets.extend(datasets_fc_corpus)
all_fc_datasets.extend(datasets_fc_psycholinguistic)
all_fc_datasets.extend(datasets_fc_wordnet)
all_fc_datasets.extend(datasets_fc_dbpedia)
all_fc_datasets.extend(datasets_fc_brown_clustering)
all_fc_datasets.extend(datasets_fc_semantic)
all_fc_datasets.extend(datasets_fc_dictionary)
all_fc_datasets.extend(datasets_fc_corpus_semantic)
all_fc_datasets.extend(datasets_fc_wordnet_psycholinguistic)
all_fc_datasets.extend(datasets_fc_all)

In [None]:
datasets = load_datasets(['Wikipedia', 'WikiNews', 'News'], 'Train', 'Dev', type_train='both', type_test='phrase')
datasets = preprocess_datasets(datasets)
# 1. Linguistic Features
datasets_fc_linguistic = compute_features_linguistic(datasets, aggs=aggs_all)
# 2. Corpus Features
datasets_fc_frequency = compute_features_frequency(datasets, aggs=aggs_all)
datasets_fc_language_model = compute_features_language_model(datasets, aggs=aggs_all)
datasets_fc_corpus = compute_features_corpus([datasets_fc_frequency, datasets_fc_language_model])
# 3. Psycholinguistic
datasets_fc_psycholinguistic = compute_features_psycholinguistic(datasets, aggs=aggs_all)
# 4. Semantic Features
datasets_fc_wordnet = compute_features_wordnet(datasets, aggs=aggs_all)
datasets_fc_dbpedia = compute_features_dbpedia(datasets, aggs=aggs_all)
datasets_fc_brown_clustering = compute_features_brown_clustering(datasets, aggs=aggs_all)
datasets_fc_semantic = compute_features_semantic([datasets_fc_wordnet, datasets_fc_dbpedia, datasets_fc_brown_clustering])
# 5. Dictionary Features
datasets_fc_dictionary = compute_features_dictionary(datasets, aggs=aggs_all)
# 6. Concatentation of feature categories
# (1) Corpus + Semantic
datasets_fc_corpus_semantic = concat_feature_datasets(datasets_fc_corpus, datasets_fc_semantic, name='corpus+semantic')
# (2) WordNet + Psycholinguistic
datasets_fc_wordnet_psycholinguistic = concat_feature_datasets(datasets_fc_wordnet, \
                            datasets_fc_psycholinguistic, name='wordnet+psycholinguistic')
#(3) All categories
datasets_fc_all = concat_feature_datasets(datasets_fc_linguistic, datasets_fc_psycholinguistic, \
                            datasets_fc_semantic, datasets_fc_corpus, datasets_fc_dictionary, name='all')

In [132]:
all_fc_datasets_dswp = []
all_fc_datasets_dswp.extend(datasets_fc_linguistic)
all_fc_datasets_dswp.extend(datasets_fc_frequency)
all_fc_datasets_dswp.extend(datasets_fc_language_model)
all_fc_datasets_dswp.extend(datasets_fc_corpus)
all_fc_datasets_dswp.extend(datasets_fc_psycholinguistic)
all_fc_datasets_dswp.extend(datasets_fc_wordnet)
all_fc_datasets_dswp.extend(datasets_fc_dbpedia)
all_fc_datasets_dswp.extend(datasets_fc_brown_clustering)
all_fc_datasets_dswp.extend(datasets_fc_semantic)
all_fc_datasets_dswp.extend(datasets_fc_dictionary)
all_fc_datasets_dswp.extend(datasets_fc_corpus_semantic)
all_fc_datasets_dswp.extend(datasets_fc_wordnet_psycholinguistic)
all_fc_datasets_dswp.extend(datasets_fc_all)

## X.2.0 Baselines

In [176]:
datasets_fc_baseline_1 = compute_features_baseline_1(datasets)

In [179]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*nn(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in datasets_fc_baseline_1]

In [180]:
feature_eval_data_nn = create_eval_df_from_results_macro(results, False)
feature_eval_data_nn

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.337831,0.538235,0.550193,baseline_1
1,mean,WikiNews,0.287153,0.395897,0.392677,baseline_1
2,mean,News,0.251908,0.37474,0.37474,baseline_1


## X.2.1 XGBoost

In [203]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*xgboost(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets]

[0]	eval-error:0.261364	train-error:0.100279
[1]	eval-error:0.25	train-error:0.052925
[2]	eval-error:0.238636	train-error:0.019499
[3]	eval-error:0.261364	train-error:0.004178
[4]	eval-error:0.227273	train-error:0
[5]	eval-error:0.227273	train-error:0
[6]	eval-error:0.215909	train-error:0
[7]	eval-error:0.238636	train-error:0
[8]	eval-error:0.238636	train-error:0
[9]	eval-error:0.227273	train-error:0
[10]	eval-error:0.215909	train-error:0
[11]	eval-error:0.204545	train-error:0
[12]	eval-error:0.204545	train-error:0
[13]	eval-error:0.204545	train-error:0
[14]	eval-error:0.204545	train-error:0
[15]	eval-error:0.204545	train-error:0
[16]	eval-error:0.204545	train-error:0
[17]	eval-error:0.215909	train-error:0
[18]	eval-error:0.215909	train-error:0
[19]	eval-error:0.227273	train-error:0
[20]	eval-error:0.215909	train-error:0
[21]	eval-error:0.227273	train-error:0
[22]	eval-error:0.227273	train-error:0
[23]	eval-error:0.227273	train-error:0
[24]	eval-error:0.227273	train-error:0
[25]	eval-e

[69]	eval-error:0.215909	train-error:0
[0]	eval-error:0.261364	train-error:0.108635
[1]	eval-error:0.261364	train-error:0.044568
[2]	eval-error:0.215909	train-error:0.01532
[3]	eval-error:0.25	train-error:0.001393
[4]	eval-error:0.215909	train-error:0
[5]	eval-error:0.227273	train-error:0
[6]	eval-error:0.215909	train-error:0
[7]	eval-error:0.193182	train-error:0
[8]	eval-error:0.204545	train-error:0
[9]	eval-error:0.204545	train-error:0
[10]	eval-error:0.204545	train-error:0
[11]	eval-error:0.204545	train-error:0
[12]	eval-error:0.204545	train-error:0
[13]	eval-error:0.204545	train-error:0
[14]	eval-error:0.204545	train-error:0
[15]	eval-error:0.204545	train-error:0
[16]	eval-error:0.204545	train-error:0
[17]	eval-error:0.204545	train-error:0
[18]	eval-error:0.204545	train-error:0
[19]	eval-error:0.204545	train-error:0
[20]	eval-error:0.204545	train-error:0
[21]	eval-error:0.215909	train-error:0
[22]	eval-error:0.215909	train-error:0
[23]	eval-error:0.215909	train-error:0
[24]	eval-er

[49]	eval-error:0.223404	train-error:0.001034
[50]	eval-error:0.223404	train-error:0.001034
[51]	eval-error:0.223404	train-error:0.001034
[52]	eval-error:0.234043	train-error:0.001034
[53]	eval-error:0.234043	train-error:0.001034
[54]	eval-error:0.223404	train-error:0.001034
[55]	eval-error:0.234043	train-error:0.001034
[56]	eval-error:0.234043	train-error:0.001034
[57]	eval-error:0.223404	train-error:0.001034
[58]	eval-error:0.223404	train-error:0.001034
[59]	eval-error:0.223404	train-error:0.001034
[60]	eval-error:0.223404	train-error:0.001034
[61]	eval-error:0.244681	train-error:0.001034
[62]	eval-error:0.234043	train-error:0.001034
[63]	eval-error:0.244681	train-error:0.001034
[64]	eval-error:0.234043	train-error:0.001034
[65]	eval-error:0.223404	train-error:0.001034
[66]	eval-error:0.223404	train-error:0.001034
[67]	eval-error:0.223404	train-error:0.001034
[68]	eval-error:0.223404	train-error:0.001034
[69]	eval-error:0.223404	train-error:0.001034
[0]	eval-error:0.351064	train-erro

[18]	eval-error:0.236641	train-error:0.005831
[19]	eval-error:0.232824	train-error:0.005831
[20]	eval-error:0.225191	train-error:0.005831
[21]	eval-error:0.232824	train-error:0.005831
[22]	eval-error:0.240458	train-error:0.005831
[23]	eval-error:0.232824	train-error:0.005831
[24]	eval-error:0.225191	train-error:0.005831
[25]	eval-error:0.225191	train-error:0.005831
[26]	eval-error:0.232824	train-error:0.005831
[27]	eval-error:0.229008	train-error:0.005831
[28]	eval-error:0.225191	train-error:0.005831
[29]	eval-error:0.221374	train-error:0.005831
[30]	eval-error:0.221374	train-error:0.005831
[31]	eval-error:0.221374	train-error:0.005831
[32]	eval-error:0.221374	train-error:0.005831
[33]	eval-error:0.221374	train-error:0.005831
[34]	eval-error:0.221374	train-error:0.005831
[35]	eval-error:0.232824	train-error:0.005831
[36]	eval-error:0.236641	train-error:0.005831
[37]	eval-error:0.229008	train-error:0.005831
[38]	eval-error:0.225191	train-error:0.005831
[39]	eval-error:0.225191	train-err

[57]	eval-error:0.229008	train-error:0.005831
[58]	eval-error:0.225191	train-error:0.005831
[59]	eval-error:0.229008	train-error:0.005831
[60]	eval-error:0.229008	train-error:0.005831
[61]	eval-error:0.229008	train-error:0.005831
[62]	eval-error:0.229008	train-error:0.005831
[63]	eval-error:0.229008	train-error:0.005831
[64]	eval-error:0.229008	train-error:0.005831
[65]	eval-error:0.229008	train-error:0.005831
[66]	eval-error:0.229008	train-error:0.005831
[67]	eval-error:0.229008	train-error:0.005831
[68]	eval-error:0.229008	train-error:0.005831
[69]	eval-error:0.229008	train-error:0.005831
[0]	eval-error:0.236641	train-error:0.099611
[1]	eval-error:0.21374	train-error:0.052478
[2]	eval-error:0.217557	train-error:0.02138
[3]	eval-error:0.194656	train-error:0.011176
[4]	eval-error:0.183206	train-error:0.009232
[5]	eval-error:0.187023	train-error:0.006803
[6]	eval-error:0.20229	train-error:0.007289
[7]	eval-error:0.19084	train-error:0.006317
[8]	eval-error:0.198473	train-error:0.006317
[

[27]	eval-error:0.272727	train-error:0.006964
[28]	eval-error:0.272727	train-error:0.006964
[29]	eval-error:0.261364	train-error:0.006964
[30]	eval-error:0.261364	train-error:0.006964
[31]	eval-error:0.261364	train-error:0.006964
[32]	eval-error:0.25	train-error:0.006964
[33]	eval-error:0.261364	train-error:0.006964
[34]	eval-error:0.261364	train-error:0.006964
[35]	eval-error:0.261364	train-error:0.006964
[36]	eval-error:0.261364	train-error:0.006964
[37]	eval-error:0.261364	train-error:0.006964
[38]	eval-error:0.261364	train-error:0.006964
[39]	eval-error:0.261364	train-error:0.006964
[40]	eval-error:0.25	train-error:0.006964
[41]	eval-error:0.25	train-error:0.006964
[42]	eval-error:0.25	train-error:0.006964
[43]	eval-error:0.25	train-error:0.006964
[44]	eval-error:0.25	train-error:0.006964
[45]	eval-error:0.25	train-error:0.006964
[46]	eval-error:0.25	train-error:0.006964
[47]	eval-error:0.25	train-error:0.006964
[48]	eval-error:0.238636	train-error:0.006964
[49]	eval-error:0.25	tra

[2]	eval-error:0.276596	train-error:0.045502
[3]	eval-error:0.244681	train-error:0.02999
[4]	eval-error:0.244681	train-error:0.025853
[5]	eval-error:0.244681	train-error:0.021717
[6]	eval-error:0.244681	train-error:0.020683
[7]	eval-error:0.255319	train-error:0.020683
[8]	eval-error:0.234043	train-error:0.020683
[9]	eval-error:0.244681	train-error:0.018614
[10]	eval-error:0.234043	train-error:0.020683
[11]	eval-error:0.244681	train-error:0.018614
[12]	eval-error:0.244681	train-error:0.018614
[13]	eval-error:0.244681	train-error:0.018614
[14]	eval-error:0.244681	train-error:0.018614
[15]	eval-error:0.234043	train-error:0.018614
[16]	eval-error:0.234043	train-error:0.018614
[17]	eval-error:0.234043	train-error:0.018614
[18]	eval-error:0.223404	train-error:0.018614
[19]	eval-error:0.223404	train-error:0.018614
[20]	eval-error:0.234043	train-error:0.018614
[21]	eval-error:0.234043	train-error:0.018614
[22]	eval-error:0.234043	train-error:0.018614
[23]	eval-error:0.244681	train-error:0.0196

[42]	eval-error:0.329787	train-error:0.03516
[43]	eval-error:0.340426	train-error:0.03516
[44]	eval-error:0.340426	train-error:0.03516
[45]	eval-error:0.340426	train-error:0.03516
[46]	eval-error:0.340426	train-error:0.03516
[47]	eval-error:0.340426	train-error:0.03516
[48]	eval-error:0.340426	train-error:0.03516
[49]	eval-error:0.340426	train-error:0.03516
[50]	eval-error:0.351064	train-error:0.03516
[51]	eval-error:0.340426	train-error:0.03516
[52]	eval-error:0.340426	train-error:0.03516
[53]	eval-error:0.340426	train-error:0.03516
[54]	eval-error:0.340426	train-error:0.03516
[55]	eval-error:0.340426	train-error:0.03516
[56]	eval-error:0.340426	train-error:0.03516
[57]	eval-error:0.340426	train-error:0.03516
[58]	eval-error:0.329787	train-error:0.03516
[59]	eval-error:0.329787	train-error:0.03516
[60]	eval-error:0.329787	train-error:0.03516
[61]	eval-error:0.329787	train-error:0.03516
[62]	eval-error:0.329787	train-error:0.03516
[63]	eval-error:0.340426	train-error:0.03516
[64]	eval-

[12]	eval-error:0.221374	train-error:0.028183
[13]	eval-error:0.229008	train-error:0.028183
[14]	eval-error:0.221374	train-error:0.028183
[15]	eval-error:0.229008	train-error:0.028183
[16]	eval-error:0.225191	train-error:0.028183
[17]	eval-error:0.225191	train-error:0.028183
[18]	eval-error:0.229008	train-error:0.028183
[19]	eval-error:0.225191	train-error:0.028183
[20]	eval-error:0.232824	train-error:0.028183
[21]	eval-error:0.225191	train-error:0.028183
[22]	eval-error:0.236641	train-error:0.028183
[23]	eval-error:0.229008	train-error:0.028183
[24]	eval-error:0.232824	train-error:0.028183
[25]	eval-error:0.221374	train-error:0.028183
[26]	eval-error:0.229008	train-error:0.028183
[27]	eval-error:0.236641	train-error:0.028183
[28]	eval-error:0.232824	train-error:0.028183
[29]	eval-error:0.229008	train-error:0.028183
[30]	eval-error:0.225191	train-error:0.028183
[31]	eval-error:0.229008	train-error:0.028183
[32]	eval-error:0.229008	train-error:0.028183
[33]	eval-error:0.217557	train-err

[51]	eval-error:0.194656	train-error:0.027211
[52]	eval-error:0.198473	train-error:0.027211
[53]	eval-error:0.198473	train-error:0.027211
[54]	eval-error:0.20229	train-error:0.027211
[55]	eval-error:0.19084	train-error:0.027211
[56]	eval-error:0.194656	train-error:0.027211
[57]	eval-error:0.198473	train-error:0.027211
[58]	eval-error:0.194656	train-error:0.027211
[59]	eval-error:0.194656	train-error:0.027211
[60]	eval-error:0.19084	train-error:0.027211
[61]	eval-error:0.198473	train-error:0.027211
[62]	eval-error:0.19084	train-error:0.027211
[63]	eval-error:0.198473	train-error:0.027211
[64]	eval-error:0.194656	train-error:0.027211
[65]	eval-error:0.194656	train-error:0.027211
[66]	eval-error:0.198473	train-error:0.027211
[67]	eval-error:0.19084	train-error:0.027211
[68]	eval-error:0.19084	train-error:0.027211
[69]	eval-error:0.187023	train-error:0.027211
[0]	eval-error:0.295455	train-error:0.118384
[1]	eval-error:0.238636	train-error:0.084958
[2]	eval-error:0.193182	train-error:0.0389

[24]	eval-error:0.227273	train-error:0.004178
[25]	eval-error:0.227273	train-error:0.004178
[26]	eval-error:0.227273	train-error:0.004178
[27]	eval-error:0.215909	train-error:0.004178
[28]	eval-error:0.227273	train-error:0.004178
[29]	eval-error:0.227273	train-error:0.004178
[30]	eval-error:0.215909	train-error:0.004178
[31]	eval-error:0.227273	train-error:0.004178
[32]	eval-error:0.227273	train-error:0.004178
[33]	eval-error:0.227273	train-error:0.004178
[34]	eval-error:0.215909	train-error:0.004178
[35]	eval-error:0.215909	train-error:0.004178
[36]	eval-error:0.227273	train-error:0.004178
[37]	eval-error:0.238636	train-error:0.004178
[38]	eval-error:0.25	train-error:0.004178
[39]	eval-error:0.238636	train-error:0.004178
[40]	eval-error:0.25	train-error:0.004178
[41]	eval-error:0.238636	train-error:0.004178
[42]	eval-error:0.238636	train-error:0.004178
[43]	eval-error:0.25	train-error:0.004178
[44]	eval-error:0.238636	train-error:0.004178
[45]	eval-error:0.238636	train-error:0.004178


[63]	eval-error:0.287234	train-error:0.018614
[64]	eval-error:0.287234	train-error:0.018614
[65]	eval-error:0.287234	train-error:0.018614
[66]	eval-error:0.297872	train-error:0.018614
[67]	eval-error:0.287234	train-error:0.018614
[68]	eval-error:0.287234	train-error:0.018614
[69]	eval-error:0.287234	train-error:0.018614
[0]	eval-error:0.308511	train-error:0.127198
[1]	eval-error:0.329787	train-error:0.085832
[2]	eval-error:0.308511	train-error:0.05274
[3]	eval-error:0.319149	train-error:0.036194
[4]	eval-error:0.297872	train-error:0.027921
[5]	eval-error:0.329787	train-error:0.025853
[6]	eval-error:0.297872	train-error:0.025853
[7]	eval-error:0.308511	train-error:0.022751
[8]	eval-error:0.308511	train-error:0.021717
[9]	eval-error:0.287234	train-error:0.020683
[10]	eval-error:0.308511	train-error:0.019648
[11]	eval-error:0.297872	train-error:0.018614
[12]	eval-error:0.308511	train-error:0.018614
[13]	eval-error:0.297872	train-error:0.018614
[14]	eval-error:0.297872	train-error:0.018614

[32]	eval-error:0.308511	train-error:0.018614
[33]	eval-error:0.308511	train-error:0.018614
[34]	eval-error:0.308511	train-error:0.018614
[35]	eval-error:0.308511	train-error:0.018614
[36]	eval-error:0.319149	train-error:0.018614
[37]	eval-error:0.319149	train-error:0.018614
[38]	eval-error:0.308511	train-error:0.018614
[39]	eval-error:0.308511	train-error:0.018614
[40]	eval-error:0.308511	train-error:0.018614
[41]	eval-error:0.319149	train-error:0.018614
[42]	eval-error:0.319149	train-error:0.018614
[43]	eval-error:0.319149	train-error:0.018614
[44]	eval-error:0.319149	train-error:0.018614
[45]	eval-error:0.319149	train-error:0.018614
[46]	eval-error:0.319149	train-error:0.018614
[47]	eval-error:0.308511	train-error:0.018614
[48]	eval-error:0.308511	train-error:0.018614
[49]	eval-error:0.308511	train-error:0.018614
[50]	eval-error:0.308511	train-error:0.018614
[51]	eval-error:0.308511	train-error:0.018614
[52]	eval-error:0.297872	train-error:0.018614
[53]	eval-error:0.308511	train-err

[2]	eval-error:0.21374	train-error:0.052478
[3]	eval-error:0.217557	train-error:0.038873
[4]	eval-error:0.229008	train-error:0.032556
[5]	eval-error:0.229008	train-error:0.02964
[6]	eval-error:0.240458	train-error:0.028183
[7]	eval-error:0.225191	train-error:0.028183
[8]	eval-error:0.221374	train-error:0.028183
[9]	eval-error:0.221374	train-error:0.028183
[10]	eval-error:0.209924	train-error:0.028183
[11]	eval-error:0.217557	train-error:0.028183
[12]	eval-error:0.21374	train-error:0.028183
[13]	eval-error:0.217557	train-error:0.028183
[14]	eval-error:0.21374	train-error:0.028183
[15]	eval-error:0.206107	train-error:0.028183
[16]	eval-error:0.206107	train-error:0.028183
[17]	eval-error:0.21374	train-error:0.028183
[18]	eval-error:0.217557	train-error:0.028183
[19]	eval-error:0.221374	train-error:0.028183
[20]	eval-error:0.225191	train-error:0.028183
[21]	eval-error:0.225191	train-error:0.028183
[22]	eval-error:0.232824	train-error:0.028183
[23]	eval-error:0.225191	train-error:0.028183
[

[42]	eval-error:0.215909	train-error:0.004178
[43]	eval-error:0.215909	train-error:0.004178
[44]	eval-error:0.204545	train-error:0.004178
[45]	eval-error:0.215909	train-error:0.004178
[46]	eval-error:0.204545	train-error:0.004178
[47]	eval-error:0.215909	train-error:0.004178
[48]	eval-error:0.204545	train-error:0.004178
[49]	eval-error:0.204545	train-error:0.004178
[50]	eval-error:0.204545	train-error:0.004178
[51]	eval-error:0.204545	train-error:0.004178
[52]	eval-error:0.215909	train-error:0.004178
[53]	eval-error:0.204545	train-error:0.004178
[54]	eval-error:0.215909	train-error:0.004178
[55]	eval-error:0.204545	train-error:0.004178
[56]	eval-error:0.204545	train-error:0.004178
[57]	eval-error:0.204545	train-error:0.004178
[58]	eval-error:0.204545	train-error:0.004178
[59]	eval-error:0.204545	train-error:0.004178
[60]	eval-error:0.204545	train-error:0.004178
[61]	eval-error:0.215909	train-error:0.004178
[62]	eval-error:0.204545	train-error:0.004178
[63]	eval-error:0.204545	train-err

[12]	eval-error:0.261364	train-error:0.004178
[13]	eval-error:0.238636	train-error:0.004178
[14]	eval-error:0.238636	train-error:0.004178
[15]	eval-error:0.238636	train-error:0.004178
[16]	eval-error:0.227273	train-error:0.004178
[17]	eval-error:0.238636	train-error:0.004178
[18]	eval-error:0.238636	train-error:0.004178
[19]	eval-error:0.238636	train-error:0.004178
[20]	eval-error:0.238636	train-error:0.004178
[21]	eval-error:0.238636	train-error:0.004178
[22]	eval-error:0.238636	train-error:0.004178
[23]	eval-error:0.238636	train-error:0.004178
[24]	eval-error:0.238636	train-error:0.004178
[25]	eval-error:0.238636	train-error:0.004178
[26]	eval-error:0.238636	train-error:0.004178
[27]	eval-error:0.25	train-error:0.004178
[28]	eval-error:0.25	train-error:0.004178
[29]	eval-error:0.25	train-error:0.004178
[30]	eval-error:0.25	train-error:0.004178
[31]	eval-error:0.25	train-error:0.004178
[32]	eval-error:0.25	train-error:0.004178
[33]	eval-error:0.25	train-error:0.004178
[34]	eval-error:

[54]	eval-error:0.287234	train-error:0.018614
[55]	eval-error:0.276596	train-error:0.018614
[56]	eval-error:0.287234	train-error:0.018614
[57]	eval-error:0.287234	train-error:0.018614
[58]	eval-error:0.287234	train-error:0.018614
[59]	eval-error:0.287234	train-error:0.018614
[60]	eval-error:0.287234	train-error:0.018614
[61]	eval-error:0.297872	train-error:0.018614
[62]	eval-error:0.287234	train-error:0.018614
[63]	eval-error:0.287234	train-error:0.018614
[64]	eval-error:0.287234	train-error:0.018614
[65]	eval-error:0.287234	train-error:0.018614
[66]	eval-error:0.287234	train-error:0.018614
[67]	eval-error:0.287234	train-error:0.018614
[68]	eval-error:0.287234	train-error:0.018614
[69]	eval-error:0.287234	train-error:0.018614
[0]	eval-error:0.351064	train-error:0.113754
[1]	eval-error:0.319149	train-error:0.066184
[2]	eval-error:0.265957	train-error:0.031024
[3]	eval-error:0.244681	train-error:0.025853
[4]	eval-error:0.255319	train-error:0.021717
[5]	eval-error:0.276596	train-error:0.0

[23]	eval-error:0.21374	train-error:0.027697
[24]	eval-error:0.217557	train-error:0.027697
[25]	eval-error:0.21374	train-error:0.027697
[26]	eval-error:0.221374	train-error:0.027697
[27]	eval-error:0.229008	train-error:0.027697
[28]	eval-error:0.221374	train-error:0.027697
[29]	eval-error:0.229008	train-error:0.027697
[30]	eval-error:0.225191	train-error:0.027697
[31]	eval-error:0.217557	train-error:0.027697
[32]	eval-error:0.221374	train-error:0.027697
[33]	eval-error:0.221374	train-error:0.027697
[34]	eval-error:0.221374	train-error:0.027697
[35]	eval-error:0.217557	train-error:0.027697
[36]	eval-error:0.225191	train-error:0.027697
[37]	eval-error:0.209924	train-error:0.027697
[38]	eval-error:0.209924	train-error:0.027697
[39]	eval-error:0.21374	train-error:0.027697
[40]	eval-error:0.209924	train-error:0.027697
[41]	eval-error:0.21374	train-error:0.027697
[42]	eval-error:0.21374	train-error:0.027697
[43]	eval-error:0.206107	train-error:0.027697
[44]	eval-error:0.21374	train-error:0.0

[63]	eval-error:0.209924	train-error:0.027697
[64]	eval-error:0.209924	train-error:0.027697
[65]	eval-error:0.217557	train-error:0.027697
[66]	eval-error:0.21374	train-error:0.027697
[67]	eval-error:0.217557	train-error:0.027697
[68]	eval-error:0.225191	train-error:0.027697
[69]	eval-error:0.217557	train-error:0.027697
[0]	eval-error:0.282443	train-error:0.098154
[1]	eval-error:0.248092	train-error:0.063654
[2]	eval-error:0.240458	train-error:0.04033
[3]	eval-error:0.263359	train-error:0.031098
[4]	eval-error:0.244275	train-error:0.02964
[5]	eval-error:0.236641	train-error:0.027211
[6]	eval-error:0.240458	train-error:0.027211
[7]	eval-error:0.225191	train-error:0.027211
[8]	eval-error:0.225191	train-error:0.027211
[9]	eval-error:0.221374	train-error:0.027211
[10]	eval-error:0.225191	train-error:0.027211
[11]	eval-error:0.229008	train-error:0.027211
[12]	eval-error:0.209924	train-error:0.027211
[13]	eval-error:0.221374	train-error:0.027211
[14]	eval-error:0.217557	train-error:0.027211
[

[34]	eval-error:0.25	train-error:0.036212
[35]	eval-error:0.25	train-error:0.036212
[36]	eval-error:0.25	train-error:0.036212
[37]	eval-error:0.261364	train-error:0.036212
[38]	eval-error:0.25	train-error:0.036212
[39]	eval-error:0.25	train-error:0.036212
[40]	eval-error:0.261364	train-error:0.036212
[41]	eval-error:0.261364	train-error:0.036212
[42]	eval-error:0.261364	train-error:0.036212
[43]	eval-error:0.261364	train-error:0.036212
[44]	eval-error:0.261364	train-error:0.036212
[45]	eval-error:0.261364	train-error:0.036212
[46]	eval-error:0.261364	train-error:0.036212
[47]	eval-error:0.261364	train-error:0.036212
[48]	eval-error:0.261364	train-error:0.036212
[49]	eval-error:0.261364	train-error:0.036212
[50]	eval-error:0.261364	train-error:0.036212
[51]	eval-error:0.261364	train-error:0.036212
[52]	eval-error:0.261364	train-error:0.036212
[53]	eval-error:0.261364	train-error:0.036212
[54]	eval-error:0.261364	train-error:0.036212
[55]	eval-error:0.261364	train-error:0.036212
[56]	eva

[4]	eval-error:0.276596	train-error:0.063082
[5]	eval-error:0.265957	train-error:0.059979
[6]	eval-error:0.265957	train-error:0.057911
[7]	eval-error:0.287234	train-error:0.057911
[8]	eval-error:0.297872	train-error:0.057911
[9]	eval-error:0.297872	train-error:0.057911
[10]	eval-error:0.287234	train-error:0.057911
[11]	eval-error:0.287234	train-error:0.057911
[12]	eval-error:0.276596	train-error:0.057911
[13]	eval-error:0.276596	train-error:0.057911
[14]	eval-error:0.287234	train-error:0.057911
[15]	eval-error:0.276596	train-error:0.057911
[16]	eval-error:0.276596	train-error:0.057911
[17]	eval-error:0.276596	train-error:0.057911
[18]	eval-error:0.265957	train-error:0.057911
[19]	eval-error:0.255319	train-error:0.057911
[20]	eval-error:0.255319	train-error:0.057911
[21]	eval-error:0.265957	train-error:0.057911
[22]	eval-error:0.265957	train-error:0.057911
[23]	eval-error:0.255319	train-error:0.057911
[24]	eval-error:0.265957	train-error:0.057911
[25]	eval-error:0.276596	train-error:0.0

[43]	eval-error:0.244681	train-error:0.105481
[44]	eval-error:0.234043	train-error:0.105481
[45]	eval-error:0.234043	train-error:0.105481
[46]	eval-error:0.234043	train-error:0.105481
[47]	eval-error:0.234043	train-error:0.105481
[48]	eval-error:0.234043	train-error:0.105481
[49]	eval-error:0.234043	train-error:0.105481
[50]	eval-error:0.234043	train-error:0.105481
[51]	eval-error:0.234043	train-error:0.105481
[52]	eval-error:0.234043	train-error:0.105481
[53]	eval-error:0.234043	train-error:0.105481
[54]	eval-error:0.234043	train-error:0.105481
[55]	eval-error:0.234043	train-error:0.105481
[56]	eval-error:0.234043	train-error:0.105481
[57]	eval-error:0.234043	train-error:0.105481
[58]	eval-error:0.234043	train-error:0.105481
[59]	eval-error:0.234043	train-error:0.105481
[60]	eval-error:0.234043	train-error:0.105481
[61]	eval-error:0.234043	train-error:0.105481
[62]	eval-error:0.234043	train-error:0.105481
[63]	eval-error:0.234043	train-error:0.105481
[64]	eval-error:0.234043	train-err

[14]	eval-error:0.229008	train-error:0.058795
[15]	eval-error:0.225191	train-error:0.058795
[16]	eval-error:0.21374	train-error:0.058795
[17]	eval-error:0.229008	train-error:0.058795
[18]	eval-error:0.232824	train-error:0.058795
[19]	eval-error:0.225191	train-error:0.058795
[20]	eval-error:0.209924	train-error:0.058795
[21]	eval-error:0.217557	train-error:0.058795
[22]	eval-error:0.21374	train-error:0.058795
[23]	eval-error:0.21374	train-error:0.058795
[24]	eval-error:0.217557	train-error:0.058795
[25]	eval-error:0.20229	train-error:0.058795
[26]	eval-error:0.194656	train-error:0.058795
[27]	eval-error:0.20229	train-error:0.058795
[28]	eval-error:0.209924	train-error:0.058795
[29]	eval-error:0.20229	train-error:0.058795
[30]	eval-error:0.209924	train-error:0.058795
[31]	eval-error:0.21374	train-error:0.058795
[32]	eval-error:0.21374	train-error:0.058795
[33]	eval-error:0.21374	train-error:0.058795
[34]	eval-error:0.221374	train-error:0.058795
[35]	eval-error:0.225191	train-error:0.0587

[53]	eval-error:0.255725	train-error:0.037415
[54]	eval-error:0.244275	train-error:0.037415
[55]	eval-error:0.244275	train-error:0.037415
[56]	eval-error:0.244275	train-error:0.037415
[57]	eval-error:0.236641	train-error:0.037415
[58]	eval-error:0.244275	train-error:0.037415
[59]	eval-error:0.244275	train-error:0.037415
[60]	eval-error:0.240458	train-error:0.037415
[61]	eval-error:0.240458	train-error:0.037415
[62]	eval-error:0.244275	train-error:0.037415
[63]	eval-error:0.244275	train-error:0.037415
[64]	eval-error:0.244275	train-error:0.037415
[65]	eval-error:0.251908	train-error:0.037415
[66]	eval-error:0.236641	train-error:0.037415
[67]	eval-error:0.244275	train-error:0.037415
[68]	eval-error:0.244275	train-error:0.037415
[69]	eval-error:0.251908	train-error:0.037415
[0]	eval-error:0.272727	train-error:0.108635
[1]	eval-error:0.181818	train-error:0.05571
[2]	eval-error:0.227273	train-error:0.027855
[3]	eval-error:0.181818	train-error:0.008357
[4]	eval-error:0.193182	train-error:0.0

[22]	eval-error:0.204545	train-error:0.006964
[23]	eval-error:0.193182	train-error:0.006964
[24]	eval-error:0.193182	train-error:0.006964
[25]	eval-error:0.204545	train-error:0.006964
[26]	eval-error:0.193182	train-error:0.006964
[27]	eval-error:0.204545	train-error:0.006964
[28]	eval-error:0.193182	train-error:0.006964
[29]	eval-error:0.204545	train-error:0.006964
[30]	eval-error:0.204545	train-error:0.006964
[31]	eval-error:0.204545	train-error:0.006964
[32]	eval-error:0.215909	train-error:0.006964
[33]	eval-error:0.204545	train-error:0.006964
[34]	eval-error:0.204545	train-error:0.006964
[35]	eval-error:0.193182	train-error:0.006964
[36]	eval-error:0.193182	train-error:0.006964
[37]	eval-error:0.204545	train-error:0.006964
[38]	eval-error:0.193182	train-error:0.006964
[39]	eval-error:0.193182	train-error:0.006964
[40]	eval-error:0.204545	train-error:0.006964
[41]	eval-error:0.204545	train-error:0.006964
[42]	eval-error:0.215909	train-error:0.006964
[43]	eval-error:0.215909	train-err

[62]	eval-error:0.234043	train-error:0.023785
[63]	eval-error:0.255319	train-error:0.023785
[64]	eval-error:0.234043	train-error:0.023785
[65]	eval-error:0.244681	train-error:0.023785
[66]	eval-error:0.265957	train-error:0.023785
[67]	eval-error:0.255319	train-error:0.023785
[68]	eval-error:0.244681	train-error:0.023785
[69]	eval-error:0.255319	train-error:0.023785
[0]	eval-error:0.244681	train-error:0.111686
[1]	eval-error:0.244681	train-error:0.066184
[2]	eval-error:0.255319	train-error:0.043433
[3]	eval-error:0.287234	train-error:0.033092
[4]	eval-error:0.276596	train-error:0.027921
[5]	eval-error:0.276596	train-error:0.026887
[6]	eval-error:0.244681	train-error:0.026887
[7]	eval-error:0.255319	train-error:0.024819
[8]	eval-error:0.223404	train-error:0.024819
[9]	eval-error:0.244681	train-error:0.024819
[10]	eval-error:0.255319	train-error:0.024819
[11]	eval-error:0.265957	train-error:0.024819
[12]	eval-error:0.255319	train-error:0.024819
[13]	eval-error:0.255319	train-error:0.02481

[31]	eval-error:0.244681	train-error:0.023785
[32]	eval-error:0.223404	train-error:0.023785
[33]	eval-error:0.244681	train-error:0.023785
[34]	eval-error:0.244681	train-error:0.023785
[35]	eval-error:0.255319	train-error:0.023785
[36]	eval-error:0.234043	train-error:0.023785
[37]	eval-error:0.212766	train-error:0.023785
[38]	eval-error:0.234043	train-error:0.023785
[39]	eval-error:0.234043	train-error:0.023785
[40]	eval-error:0.212766	train-error:0.023785
[41]	eval-error:0.234043	train-error:0.023785
[42]	eval-error:0.223404	train-error:0.023785
[43]	eval-error:0.244681	train-error:0.023785
[44]	eval-error:0.244681	train-error:0.023785
[45]	eval-error:0.244681	train-error:0.023785
[46]	eval-error:0.255319	train-error:0.023785
[47]	eval-error:0.255319	train-error:0.023785
[48]	eval-error:0.255319	train-error:0.023785
[49]	eval-error:0.234043	train-error:0.023785
[50]	eval-error:0.255319	train-error:0.023785
[51]	eval-error:0.255319	train-error:0.023785
[52]	eval-error:0.255319	train-err

[0]	eval-error:0.244275	train-error:0.116132
[1]	eval-error:0.251908	train-error:0.076774
[2]	eval-error:0.236641	train-error:0.048591
[3]	eval-error:0.20229	train-error:0.035471
[4]	eval-error:0.206107	train-error:0.030612
[5]	eval-error:0.206107	train-error:0.030126
[6]	eval-error:0.21374	train-error:0.028669
[7]	eval-error:0.225191	train-error:0.028669
[8]	eval-error:0.232824	train-error:0.028669
[9]	eval-error:0.225191	train-error:0.029155
[10]	eval-error:0.225191	train-error:0.028669
[11]	eval-error:0.21374	train-error:0.028669
[12]	eval-error:0.225191	train-error:0.028669
[13]	eval-error:0.221374	train-error:0.028669
[14]	eval-error:0.229008	train-error:0.028669
[15]	eval-error:0.229008	train-error:0.028669
[16]	eval-error:0.221374	train-error:0.028669
[17]	eval-error:0.217557	train-error:0.028669
[18]	eval-error:0.225191	train-error:0.028669
[19]	eval-error:0.225191	train-error:0.028669
[20]	eval-error:0.221374	train-error:0.028669
[21]	eval-error:0.221374	train-error:0.028669
[

[40]	eval-error:0.204545	train-error:0.038997
[41]	eval-error:0.204545	train-error:0.038997
[42]	eval-error:0.204545	train-error:0.038997
[43]	eval-error:0.204545	train-error:0.038997
[44]	eval-error:0.204545	train-error:0.038997
[45]	eval-error:0.204545	train-error:0.038997
[46]	eval-error:0.204545	train-error:0.038997
[47]	eval-error:0.204545	train-error:0.038997
[48]	eval-error:0.204545	train-error:0.038997
[49]	eval-error:0.204545	train-error:0.038997
[50]	eval-error:0.204545	train-error:0.038997
[51]	eval-error:0.204545	train-error:0.038997
[52]	eval-error:0.204545	train-error:0.038997
[53]	eval-error:0.204545	train-error:0.038997
[54]	eval-error:0.204545	train-error:0.038997
[55]	eval-error:0.204545	train-error:0.038997
[56]	eval-error:0.204545	train-error:0.038997
[57]	eval-error:0.204545	train-error:0.038997
[58]	eval-error:0.204545	train-error:0.038997
[59]	eval-error:0.204545	train-error:0.038997
[60]	eval-error:0.204545	train-error:0.038997
[61]	eval-error:0.204545	train-err

[10]	eval-error:0.204545	train-error:0.04039
[11]	eval-error:0.193182	train-error:0.04039
[12]	eval-error:0.193182	train-error:0.04039
[13]	eval-error:0.193182	train-error:0.04039
[14]	eval-error:0.193182	train-error:0.04039
[15]	eval-error:0.193182	train-error:0.04039
[16]	eval-error:0.193182	train-error:0.04039
[17]	eval-error:0.193182	train-error:0.04039
[18]	eval-error:0.193182	train-error:0.04039
[19]	eval-error:0.193182	train-error:0.04039
[20]	eval-error:0.193182	train-error:0.04039
[21]	eval-error:0.193182	train-error:0.038997
[22]	eval-error:0.193182	train-error:0.038997
[23]	eval-error:0.193182	train-error:0.038997
[24]	eval-error:0.204545	train-error:0.038997
[25]	eval-error:0.204545	train-error:0.04039
[26]	eval-error:0.204545	train-error:0.04039
[27]	eval-error:0.204545	train-error:0.038997
[28]	eval-error:0.193182	train-error:0.038997
[29]	eval-error:0.204545	train-error:0.038997
[30]	eval-error:0.204545	train-error:0.038997
[31]	eval-error:0.204545	train-error:0.038997
[

[49]	eval-error:0.287234	train-error:0.038263
[50]	eval-error:0.276596	train-error:0.038263
[51]	eval-error:0.276596	train-error:0.038263
[52]	eval-error:0.287234	train-error:0.038263
[53]	eval-error:0.276596	train-error:0.038263
[54]	eval-error:0.276596	train-error:0.038263
[55]	eval-error:0.265957	train-error:0.038263
[56]	eval-error:0.265957	train-error:0.038263
[57]	eval-error:0.276596	train-error:0.038263
[58]	eval-error:0.276596	train-error:0.038263
[59]	eval-error:0.276596	train-error:0.038263
[60]	eval-error:0.265957	train-error:0.038263
[61]	eval-error:0.276596	train-error:0.038263
[62]	eval-error:0.265957	train-error:0.038263
[63]	eval-error:0.265957	train-error:0.038263
[64]	eval-error:0.287234	train-error:0.038263
[65]	eval-error:0.276596	train-error:0.038263
[66]	eval-error:0.276596	train-error:0.038263
[67]	eval-error:0.287234	train-error:0.038263
[68]	eval-error:0.265957	train-error:0.038263
[69]	eval-error:0.276596	train-error:0.038263
[0]	eval-error:0.202128	train-erro

[18]	eval-error:0.244275	train-error:0.047619
[19]	eval-error:0.244275	train-error:0.047619
[20]	eval-error:0.236641	train-error:0.047619
[21]	eval-error:0.232824	train-error:0.047619
[22]	eval-error:0.232824	train-error:0.047619
[23]	eval-error:0.236641	train-error:0.047619
[24]	eval-error:0.236641	train-error:0.047619
[25]	eval-error:0.236641	train-error:0.047619
[26]	eval-error:0.244275	train-error:0.047619
[27]	eval-error:0.240458	train-error:0.047619
[28]	eval-error:0.240458	train-error:0.047619
[29]	eval-error:0.240458	train-error:0.047619
[30]	eval-error:0.236641	train-error:0.047619
[31]	eval-error:0.232824	train-error:0.047619
[32]	eval-error:0.236641	train-error:0.047619
[33]	eval-error:0.232824	train-error:0.047619
[34]	eval-error:0.244275	train-error:0.047619
[35]	eval-error:0.236641	train-error:0.047619
[36]	eval-error:0.236641	train-error:0.047619
[37]	eval-error:0.236641	train-error:0.047619
[38]	eval-error:0.229008	train-error:0.047619
[39]	eval-error:0.240458	train-err

[57]	eval-error:0.240458	train-error:0.047619
[58]	eval-error:0.236641	train-error:0.047619
[59]	eval-error:0.240458	train-error:0.047619
[60]	eval-error:0.232824	train-error:0.047619
[61]	eval-error:0.236641	train-error:0.047619
[62]	eval-error:0.232824	train-error:0.047619
[63]	eval-error:0.236641	train-error:0.047619
[64]	eval-error:0.236641	train-error:0.047619
[65]	eval-error:0.236641	train-error:0.047619
[66]	eval-error:0.236641	train-error:0.047619
[67]	eval-error:0.240458	train-error:0.047619
[68]	eval-error:0.236641	train-error:0.047619
[69]	eval-error:0.240458	train-error:0.047619
[0]	eval-error:0.263359	train-error:0.145287
[1]	eval-error:0.274809	train-error:0.126336
[2]	eval-error:0.267176	train-error:0.088435
[3]	eval-error:0.251908	train-error:0.074344
[4]	eval-error:0.240458	train-error:0.064626
[5]	eval-error:0.236641	train-error:0.057337
[6]	eval-error:0.240458	train-error:0.052964
[7]	eval-error:0.236641	train-error:0.052478
[8]	eval-error:0.236641	train-error:0.0505

[26]	eval-error:0.181818	train-error:0.043175
[27]	eval-error:0.181818	train-error:0.043175
[28]	eval-error:0.181818	train-error:0.043175
[29]	eval-error:0.181818	train-error:0.043175
[30]	eval-error:0.181818	train-error:0.043175
[31]	eval-error:0.181818	train-error:0.043175
[32]	eval-error:0.181818	train-error:0.043175
[33]	eval-error:0.181818	train-error:0.043175
[34]	eval-error:0.181818	train-error:0.043175
[35]	eval-error:0.181818	train-error:0.043175
[36]	eval-error:0.181818	train-error:0.043175
[37]	eval-error:0.181818	train-error:0.043175
[38]	eval-error:0.181818	train-error:0.043175
[39]	eval-error:0.181818	train-error:0.043175
[40]	eval-error:0.181818	train-error:0.043175
[41]	eval-error:0.181818	train-error:0.043175
[42]	eval-error:0.181818	train-error:0.043175
[43]	eval-error:0.181818	train-error:0.043175
[44]	eval-error:0.181818	train-error:0.043175
[45]	eval-error:0.181818	train-error:0.043175
[46]	eval-error:0.181818	train-error:0.043175
[47]	eval-error:0.181818	train-err

[68]	eval-error:0.193182	train-error:0.011142
[69]	eval-error:0.193182	train-error:0.011142
[0]	eval-error:0.319149	train-error:0.182006
[1]	eval-error:0.319149	train-error:0.144778
[2]	eval-error:0.319149	train-error:0.096174
[3]	eval-error:0.276596	train-error:0.073423
[4]	eval-error:0.297872	train-error:0.056877
[5]	eval-error:0.319149	train-error:0.04757
[6]	eval-error:0.329787	train-error:0.042399
[7]	eval-error:0.329787	train-error:0.043433
[8]	eval-error:0.329787	train-error:0.039297
[9]	eval-error:0.329787	train-error:0.039297
[10]	eval-error:0.319149	train-error:0.039297
[11]	eval-error:0.319149	train-error:0.038263
[12]	eval-error:0.308511	train-error:0.038263
[13]	eval-error:0.319149	train-error:0.038263
[14]	eval-error:0.308511	train-error:0.038263
[15]	eval-error:0.308511	train-error:0.038263
[16]	eval-error:0.319149	train-error:0.038263
[17]	eval-error:0.308511	train-error:0.038263
[18]	eval-error:0.319149	train-error:0.038263
[19]	eval-error:0.319149	train-error:0.038263

[38]	eval-error:0.287234	train-error:0.1303
[39]	eval-error:0.297872	train-error:0.1303
[40]	eval-error:0.287234	train-error:0.1303
[41]	eval-error:0.287234	train-error:0.1303
[42]	eval-error:0.297872	train-error:0.1303
[43]	eval-error:0.287234	train-error:0.1303
[44]	eval-error:0.297872	train-error:0.1303
[45]	eval-error:0.287234	train-error:0.1303
[46]	eval-error:0.297872	train-error:0.1303
[47]	eval-error:0.297872	train-error:0.1303
[48]	eval-error:0.297872	train-error:0.1303
[49]	eval-error:0.297872	train-error:0.1303
[50]	eval-error:0.297872	train-error:0.1303
[51]	eval-error:0.287234	train-error:0.1303
[52]	eval-error:0.287234	train-error:0.1303
[53]	eval-error:0.287234	train-error:0.1303
[54]	eval-error:0.287234	train-error:0.1303
[55]	eval-error:0.287234	train-error:0.1303
[56]	eval-error:0.287234	train-error:0.1303
[57]	eval-error:0.287234	train-error:0.1303
[58]	eval-error:0.287234	train-error:0.1303
[59]	eval-error:0.287234	train-error:0.1303
[60]	eval-error:0.287234	train-e

[10]	eval-error:0.267176	train-error:0.060739
[11]	eval-error:0.28626	train-error:0.059767
[12]	eval-error:0.278626	train-error:0.059767
[13]	eval-error:0.278626	train-error:0.059767
[14]	eval-error:0.270992	train-error:0.059767
[15]	eval-error:0.274809	train-error:0.059767
[16]	eval-error:0.274809	train-error:0.059767
[17]	eval-error:0.278626	train-error:0.059767
[18]	eval-error:0.274809	train-error:0.059767
[19]	eval-error:0.267176	train-error:0.059767
[20]	eval-error:0.270992	train-error:0.059767
[21]	eval-error:0.267176	train-error:0.059767
[22]	eval-error:0.267176	train-error:0.059767
[23]	eval-error:0.270992	train-error:0.059767
[24]	eval-error:0.270992	train-error:0.059767
[25]	eval-error:0.263359	train-error:0.059767
[26]	eval-error:0.270992	train-error:0.059767
[27]	eval-error:0.270992	train-error:0.059767
[28]	eval-error:0.267176	train-error:0.059767
[29]	eval-error:0.267176	train-error:0.059767
[30]	eval-error:0.274809	train-error:0.059767
[31]	eval-error:0.270992	train-erro

[49]	eval-error:0.274809	train-error:0.028669
[50]	eval-error:0.278626	train-error:0.028669
[51]	eval-error:0.274809	train-error:0.028669
[52]	eval-error:0.270992	train-error:0.028669
[53]	eval-error:0.270992	train-error:0.028669
[54]	eval-error:0.274809	train-error:0.028669
[55]	eval-error:0.274809	train-error:0.028669
[56]	eval-error:0.278626	train-error:0.028669
[57]	eval-error:0.270992	train-error:0.028669
[58]	eval-error:0.270992	train-error:0.028669
[59]	eval-error:0.270992	train-error:0.028669
[60]	eval-error:0.270992	train-error:0.028669
[61]	eval-error:0.270992	train-error:0.028669
[62]	eval-error:0.270992	train-error:0.028669
[63]	eval-error:0.270992	train-error:0.028669
[64]	eval-error:0.270992	train-error:0.028669
[65]	eval-error:0.274809	train-error:0.028669
[66]	eval-error:0.270992	train-error:0.028669
[67]	eval-error:0.267176	train-error:0.028669
[68]	eval-error:0.267176	train-error:0.028669
[69]	eval-error:0.270992	train-error:0.028669
[0]	eval-error:0.238636	train-erro

[19]	eval-error:0.261364	train-error:0.002786
[20]	eval-error:0.25	train-error:0.002786
[21]	eval-error:0.261364	train-error:0.002786
[22]	eval-error:0.261364	train-error:0.002786
[23]	eval-error:0.261364	train-error:0.002786
[24]	eval-error:0.261364	train-error:0.002786
[25]	eval-error:0.261364	train-error:0.002786
[26]	eval-error:0.261364	train-error:0.002786
[27]	eval-error:0.261364	train-error:0.002786
[28]	eval-error:0.261364	train-error:0.002786
[29]	eval-error:0.261364	train-error:0.002786
[30]	eval-error:0.261364	train-error:0.002786
[31]	eval-error:0.261364	train-error:0.002786
[32]	eval-error:0.261364	train-error:0.002786
[33]	eval-error:0.261364	train-error:0.002786
[34]	eval-error:0.261364	train-error:0.002786
[35]	eval-error:0.261364	train-error:0.002786
[36]	eval-error:0.272727	train-error:0.002786
[37]	eval-error:0.272727	train-error:0.002786
[38]	eval-error:0.261364	train-error:0.002786
[39]	eval-error:0.261364	train-error:0.002786
[40]	eval-error:0.261364	train-error:0

[58]	eval-error:0.234043	train-error:0.014478
[59]	eval-error:0.234043	train-error:0.014478
[60]	eval-error:0.223404	train-error:0.014478
[61]	eval-error:0.223404	train-error:0.014478
[62]	eval-error:0.223404	train-error:0.014478
[63]	eval-error:0.223404	train-error:0.014478
[64]	eval-error:0.223404	train-error:0.014478
[65]	eval-error:0.223404	train-error:0.014478
[66]	eval-error:0.223404	train-error:0.014478
[67]	eval-error:0.212766	train-error:0.014478
[68]	eval-error:0.212766	train-error:0.014478
[69]	eval-error:0.212766	train-error:0.014478
[0]	eval-error:0.297872	train-error:0.110652
[1]	eval-error:0.297872	train-error:0.054809
[2]	eval-error:0.276596	train-error:0.028956
[3]	eval-error:0.255319	train-error:0.020683
[4]	eval-error:0.265957	train-error:0.01758
[5]	eval-error:0.255319	train-error:0.014478
[6]	eval-error:0.265957	train-error:0.014478
[7]	eval-error:0.223404	train-error:0.014478
[8]	eval-error:0.234043	train-error:0.014478
[9]	eval-error:0.234043	train-error:0.014478

[27]	eval-error:0.255319	train-error:0.014478
[28]	eval-error:0.255319	train-error:0.014478
[29]	eval-error:0.255319	train-error:0.014478
[30]	eval-error:0.255319	train-error:0.014478
[31]	eval-error:0.244681	train-error:0.014478
[32]	eval-error:0.234043	train-error:0.014478
[33]	eval-error:0.234043	train-error:0.014478
[34]	eval-error:0.234043	train-error:0.014478
[35]	eval-error:0.234043	train-error:0.014478
[36]	eval-error:0.234043	train-error:0.014478
[37]	eval-error:0.223404	train-error:0.014478
[38]	eval-error:0.212766	train-error:0.014478
[39]	eval-error:0.212766	train-error:0.014478
[40]	eval-error:0.223404	train-error:0.014478
[41]	eval-error:0.223404	train-error:0.014478
[42]	eval-error:0.223404	train-error:0.014478
[43]	eval-error:0.223404	train-error:0.014478
[44]	eval-error:0.223404	train-error:0.014478
[45]	eval-error:0.223404	train-error:0.014478
[46]	eval-error:0.223404	train-error:0.014478
[47]	eval-error:0.223404	train-error:0.014478
[48]	eval-error:0.223404	train-err

[67]	eval-error:0.232824	train-error:0.01895
[68]	eval-error:0.232824	train-error:0.01895
[69]	eval-error:0.225191	train-error:0.01895
[0]	eval-error:0.267176	train-error:0.095724
[1]	eval-error:0.278626	train-error:0.058795
[2]	eval-error:0.232824	train-error:0.032556
[3]	eval-error:0.236641	train-error:0.027211
[4]	eval-error:0.236641	train-error:0.023324
[5]	eval-error:0.221374	train-error:0.020894
[6]	eval-error:0.221374	train-error:0.019922
[7]	eval-error:0.229008	train-error:0.019922
[8]	eval-error:0.229008	train-error:0.019922
[9]	eval-error:0.236641	train-error:0.019922
[10]	eval-error:0.236641	train-error:0.019922
[11]	eval-error:0.236641	train-error:0.019922
[12]	eval-error:0.232824	train-error:0.019436
[13]	eval-error:0.236641	train-error:0.019436
[14]	eval-error:0.229008	train-error:0.019436
[15]	eval-error:0.229008	train-error:0.019436
[16]	eval-error:0.229008	train-error:0.019436
[17]	eval-error:0.225191	train-error:0.019436
[18]	eval-error:0.232824	train-error:0.019436
[

[36]	eval-error:0.204545	train-error:0.147632
[37]	eval-error:0.193182	train-error:0.147632
[38]	eval-error:0.181818	train-error:0.147632
[39]	eval-error:0.181818	train-error:0.147632
[40]	eval-error:0.193182	train-error:0.147632
[41]	eval-error:0.193182	train-error:0.147632
[42]	eval-error:0.181818	train-error:0.147632
[43]	eval-error:0.181818	train-error:0.147632
[44]	eval-error:0.193182	train-error:0.147632
[45]	eval-error:0.215909	train-error:0.147632
[46]	eval-error:0.193182	train-error:0.147632
[47]	eval-error:0.181818	train-error:0.147632
[48]	eval-error:0.193182	train-error:0.147632
[49]	eval-error:0.215909	train-error:0.147632
[50]	eval-error:0.181818	train-error:0.147632
[51]	eval-error:0.181818	train-error:0.147632
[52]	eval-error:0.204545	train-error:0.147632
[53]	eval-error:0.215909	train-error:0.147632
[54]	eval-error:0.193182	train-error:0.147632
[55]	eval-error:0.215909	train-error:0.147632
[56]	eval-error:0.193182	train-error:0.147632
[57]	eval-error:0.204545	train-err

  'precision', 'predicted', average, warn_for)


[58]	eval-error:0.159091	train-error:0.18663
[59]	eval-error:0.159091	train-error:0.18663
[60]	eval-error:0.159091	train-error:0.18663
[61]	eval-error:0.159091	train-error:0.18663
[62]	eval-error:0.159091	train-error:0.18663
[63]	eval-error:0.159091	train-error:0.18663
[64]	eval-error:0.159091	train-error:0.18663
[65]	eval-error:0.159091	train-error:0.18663
[66]	eval-error:0.159091	train-error:0.18663
[67]	eval-error:0.159091	train-error:0.18663
[68]	eval-error:0.159091	train-error:0.18663
[69]	eval-error:0.159091	train-error:0.18663
[0]	eval-error:0.193182	train-error:0.167131
[1]	eval-error:0.227273	train-error:0.151811
[2]	eval-error:0.204545	train-error:0.116992
[3]	eval-error:0.227273	train-error:0.097493
[4]	eval-error:0.227273	train-error:0.089136
[5]	eval-error:0.238636	train-error:0.077994
[6]	eval-error:0.25	train-error:0.069638
[7]	eval-error:0.284091	train-error:0.061281
[8]	eval-error:0.272727	train-error:0.05571
[9]	eval-error:0.261364	train-error:0.054318
[10]	eval-error

[29]	eval-error:0.234043	train-error:0.250259
[30]	eval-error:0.234043	train-error:0.250259
[31]	eval-error:0.234043	train-error:0.250259
[32]	eval-error:0.234043	train-error:0.250259
[33]	eval-error:0.234043	train-error:0.250259
[34]	eval-error:0.234043	train-error:0.250259
[35]	eval-error:0.234043	train-error:0.250259
[36]	eval-error:0.234043	train-error:0.250259
[37]	eval-error:0.234043	train-error:0.250259
[38]	eval-error:0.234043	train-error:0.250259
[39]	eval-error:0.234043	train-error:0.250259
[40]	eval-error:0.234043	train-error:0.250259
[41]	eval-error:0.234043	train-error:0.250259
[42]	eval-error:0.234043	train-error:0.250259
[43]	eval-error:0.234043	train-error:0.250259
[44]	eval-error:0.234043	train-error:0.250259
[45]	eval-error:0.234043	train-error:0.250259
[46]	eval-error:0.234043	train-error:0.250259
[47]	eval-error:0.234043	train-error:0.250259
[48]	eval-error:0.234043	train-error:0.250259
[49]	eval-error:0.234043	train-error:0.250259
[50]	eval-error:0.234043	train-err

[68]	eval-error:0.297872	train-error:0.074457
[69]	eval-error:0.297872	train-error:0.074457
[0]	eval-error:0.194656	train-error:0.170554
[1]	eval-error:0.198473	train-error:0.166667
[2]	eval-error:0.198473	train-error:0.161322
[3]	eval-error:0.198473	train-error:0.160836
[4]	eval-error:0.20229	train-error:0.158892
[5]	eval-error:0.20229	train-error:0.15792
[6]	eval-error:0.20229	train-error:0.156463
[7]	eval-error:0.209924	train-error:0.155977
[8]	eval-error:0.21374	train-error:0.153547
[9]	eval-error:0.21374	train-error:0.153547
[10]	eval-error:0.21374	train-error:0.153547
[11]	eval-error:0.20229	train-error:0.152575
[12]	eval-error:0.206107	train-error:0.152575
[13]	eval-error:0.206107	train-error:0.152089
[14]	eval-error:0.206107	train-error:0.151604
[15]	eval-error:0.206107	train-error:0.152089
[16]	eval-error:0.209924	train-error:0.151604
[17]	eval-error:0.206107	train-error:0.151604
[18]	eval-error:0.20229	train-error:0.151604
[19]	eval-error:0.20229	train-error:0.151604
[20]	eva

[40]	eval-error:0.21374	train-error:0.195335
[41]	eval-error:0.21374	train-error:0.195335
[42]	eval-error:0.21374	train-error:0.195335
[43]	eval-error:0.21374	train-error:0.195335
[44]	eval-error:0.21374	train-error:0.195335
[45]	eval-error:0.21374	train-error:0.195335
[46]	eval-error:0.21374	train-error:0.195335
[47]	eval-error:0.21374	train-error:0.195335
[48]	eval-error:0.21374	train-error:0.195335
[49]	eval-error:0.21374	train-error:0.195335
[50]	eval-error:0.21374	train-error:0.195335
[51]	eval-error:0.21374	train-error:0.195335
[52]	eval-error:0.21374	train-error:0.195335
[53]	eval-error:0.21374	train-error:0.195335
[54]	eval-error:0.21374	train-error:0.195335
[55]	eval-error:0.21374	train-error:0.195335
[56]	eval-error:0.21374	train-error:0.195335
[57]	eval-error:0.21374	train-error:0.195335
[58]	eval-error:0.21374	train-error:0.195335
[59]	eval-error:0.21374	train-error:0.195335
[60]	eval-error:0.21374	train-error:0.195335
[61]	eval-error:0.21374	train-error:0.195335
[62]	eval-

[10]	eval-error:0.204545	train-error:0.001393
[11]	eval-error:0.204545	train-error:0.001393
[12]	eval-error:0.193182	train-error:0.001393
[13]	eval-error:0.193182	train-error:0.001393
[14]	eval-error:0.181818	train-error:0.001393
[15]	eval-error:0.193182	train-error:0.001393
[16]	eval-error:0.181818	train-error:0.001393
[17]	eval-error:0.181818	train-error:0.001393
[18]	eval-error:0.193182	train-error:0.001393
[19]	eval-error:0.181818	train-error:0.001393
[20]	eval-error:0.181818	train-error:0.001393
[21]	eval-error:0.181818	train-error:0.001393
[22]	eval-error:0.181818	train-error:0.001393
[23]	eval-error:0.181818	train-error:0.001393
[24]	eval-error:0.193182	train-error:0.001393
[25]	eval-error:0.193182	train-error:0.001393
[26]	eval-error:0.193182	train-error:0.001393
[27]	eval-error:0.193182	train-error:0.001393
[28]	eval-error:0.193182	train-error:0.001393
[29]	eval-error:0.204545	train-error:0.001393
[30]	eval-error:0.193182	train-error:0.001393
[31]	eval-error:0.193182	train-err

[50]	eval-error:0.261364	train-error:0.001393
[51]	eval-error:0.261364	train-error:0.001393
[52]	eval-error:0.261364	train-error:0.001393
[53]	eval-error:0.261364	train-error:0.001393
[54]	eval-error:0.261364	train-error:0.001393
[55]	eval-error:0.261364	train-error:0.001393
[56]	eval-error:0.261364	train-error:0.001393
[57]	eval-error:0.261364	train-error:0.001393
[58]	eval-error:0.261364	train-error:0.001393
[59]	eval-error:0.261364	train-error:0.001393
[60]	eval-error:0.261364	train-error:0.001393
[61]	eval-error:0.261364	train-error:0.001393
[62]	eval-error:0.261364	train-error:0.001393
[63]	eval-error:0.261364	train-error:0.001393
[64]	eval-error:0.261364	train-error:0.001393
[65]	eval-error:0.261364	train-error:0.001393
[66]	eval-error:0.261364	train-error:0.001393
[67]	eval-error:0.261364	train-error:0.001393
[68]	eval-error:0.261364	train-error:0.001393
[69]	eval-error:0.261364	train-error:0.001393
[0]	eval-error:0.329787	train-error:0.089969
[1]	eval-error:0.265957	train-error

[19]	eval-error:0.297872	train-error:0.011375
[20]	eval-error:0.297872	train-error:0.011375
[21]	eval-error:0.297872	train-error:0.011375
[22]	eval-error:0.297872	train-error:0.011375
[23]	eval-error:0.308511	train-error:0.011375
[24]	eval-error:0.297872	train-error:0.011375
[25]	eval-error:0.308511	train-error:0.011375
[26]	eval-error:0.308511	train-error:0.011375
[27]	eval-error:0.308511	train-error:0.011375
[28]	eval-error:0.319149	train-error:0.011375
[29]	eval-error:0.308511	train-error:0.011375
[30]	eval-error:0.308511	train-error:0.011375
[31]	eval-error:0.308511	train-error:0.011375
[32]	eval-error:0.308511	train-error:0.011375
[33]	eval-error:0.308511	train-error:0.011375
[34]	eval-error:0.308511	train-error:0.011375
[35]	eval-error:0.308511	train-error:0.011375
[36]	eval-error:0.308511	train-error:0.011375
[37]	eval-error:0.319149	train-error:0.011375
[38]	eval-error:0.319149	train-error:0.011375
[39]	eval-error:0.319149	train-error:0.011375
[40]	eval-error:0.319149	train-err

[58]	eval-error:0.217557	train-error:0.018465
[59]	eval-error:0.21374	train-error:0.018465
[60]	eval-error:0.221374	train-error:0.018465
[61]	eval-error:0.21374	train-error:0.018465
[62]	eval-error:0.209924	train-error:0.018465
[63]	eval-error:0.209924	train-error:0.018465
[64]	eval-error:0.21374	train-error:0.018465
[65]	eval-error:0.21374	train-error:0.018465
[66]	eval-error:0.21374	train-error:0.018465
[67]	eval-error:0.209924	train-error:0.018465
[68]	eval-error:0.206107	train-error:0.018465
[69]	eval-error:0.21374	train-error:0.018465
[0]	eval-error:0.28626	train-error:0.080661
[1]	eval-error:0.244275	train-error:0.04276
[2]	eval-error:0.244275	train-error:0.029155
[3]	eval-error:0.229008	train-error:0.02381
[4]	eval-error:0.229008	train-error:0.021866
[5]	eval-error:0.225191	train-error:0.019922
[6]	eval-error:0.229008	train-error:0.019436
[7]	eval-error:0.244275	train-error:0.01895
[8]	eval-error:0.232824	train-error:0.01895
[9]	eval-error:0.236641	train-error:0.01895
[10]	eval-

[30]	eval-error:0.240458	train-error:0.01895
[31]	eval-error:0.232824	train-error:0.01895
[32]	eval-error:0.240458	train-error:0.01895
[33]	eval-error:0.236641	train-error:0.01895
[34]	eval-error:0.229008	train-error:0.01895
[35]	eval-error:0.232824	train-error:0.01895
[36]	eval-error:0.236641	train-error:0.01895
[37]	eval-error:0.236641	train-error:0.01895
[38]	eval-error:0.232824	train-error:0.01895
[39]	eval-error:0.232824	train-error:0.01895
[40]	eval-error:0.229008	train-error:0.01895
[41]	eval-error:0.240458	train-error:0.01895
[42]	eval-error:0.236641	train-error:0.01895
[43]	eval-error:0.240458	train-error:0.01895
[44]	eval-error:0.244275	train-error:0.01895
[45]	eval-error:0.232824	train-error:0.01895
[46]	eval-error:0.240458	train-error:0.01895
[47]	eval-error:0.236641	train-error:0.01895
[48]	eval-error:0.232824	train-error:0.01895
[49]	eval-error:0.232824	train-error:0.01895
[50]	eval-error:0.244275	train-error:0.01895
[51]	eval-error:0.232824	train-error:0.01895
[52]	eval-

[0]	eval-error:0.284091	train-error:0.123955
[1]	eval-error:0.204545	train-error:0.071031
[2]	eval-error:0.204545	train-error:0.038997
[3]	eval-error:0.204545	train-error:0.019499
[4]	eval-error:0.204545	train-error:0.009749
[5]	eval-error:0.25	train-error:0.006964
[6]	eval-error:0.238636	train-error:0.005571
[7]	eval-error:0.238636	train-error:0.005571
[8]	eval-error:0.238636	train-error:0.005571
[9]	eval-error:0.227273	train-error:0.005571
[10]	eval-error:0.227273	train-error:0.005571
[11]	eval-error:0.227273	train-error:0.005571
[12]	eval-error:0.227273	train-error:0.005571
[13]	eval-error:0.227273	train-error:0.005571
[14]	eval-error:0.215909	train-error:0.005571
[15]	eval-error:0.215909	train-error:0.005571
[16]	eval-error:0.215909	train-error:0.005571
[17]	eval-error:0.204545	train-error:0.005571
[18]	eval-error:0.204545	train-error:0.005571
[19]	eval-error:0.204545	train-error:0.005571
[20]	eval-error:0.193182	train-error:0.005571
[21]	eval-error:0.193182	train-error:0.005571
[2

[39]	eval-error:0.212766	train-error:0.022751
[40]	eval-error:0.212766	train-error:0.022751
[41]	eval-error:0.212766	train-error:0.022751
[42]	eval-error:0.202128	train-error:0.022751
[43]	eval-error:0.191489	train-error:0.022751
[44]	eval-error:0.202128	train-error:0.022751
[45]	eval-error:0.180851	train-error:0.022751
[46]	eval-error:0.212766	train-error:0.022751
[47]	eval-error:0.212766	train-error:0.022751
[48]	eval-error:0.212766	train-error:0.022751
[49]	eval-error:0.212766	train-error:0.022751
[50]	eval-error:0.212766	train-error:0.022751
[51]	eval-error:0.212766	train-error:0.022751
[52]	eval-error:0.202128	train-error:0.022751
[53]	eval-error:0.223404	train-error:0.022751
[54]	eval-error:0.223404	train-error:0.022751
[55]	eval-error:0.223404	train-error:0.022751
[56]	eval-error:0.223404	train-error:0.022751
[57]	eval-error:0.223404	train-error:0.022751
[58]	eval-error:0.223404	train-error:0.022751
[59]	eval-error:0.202128	train-error:0.022751
[60]	eval-error:0.223404	train-err

[8]	eval-error:0.255319	train-error:0.022751
[9]	eval-error:0.212766	train-error:0.022751
[10]	eval-error:0.191489	train-error:0.022751
[11]	eval-error:0.212766	train-error:0.022751
[12]	eval-error:0.212766	train-error:0.022751
[13]	eval-error:0.212766	train-error:0.022751
[14]	eval-error:0.234043	train-error:0.022751
[15]	eval-error:0.223404	train-error:0.022751
[16]	eval-error:0.223404	train-error:0.022751
[17]	eval-error:0.212766	train-error:0.022751
[18]	eval-error:0.223404	train-error:0.022751
[19]	eval-error:0.202128	train-error:0.022751
[20]	eval-error:0.212766	train-error:0.022751
[21]	eval-error:0.223404	train-error:0.022751
[22]	eval-error:0.212766	train-error:0.022751
[23]	eval-error:0.223404	train-error:0.022751
[24]	eval-error:0.223404	train-error:0.022751
[25]	eval-error:0.212766	train-error:0.022751
[26]	eval-error:0.223404	train-error:0.022751
[27]	eval-error:0.223404	train-error:0.022751
[28]	eval-error:0.212766	train-error:0.022751
[29]	eval-error:0.223404	train-error

[48]	eval-error:0.194656	train-error:0.02381
[49]	eval-error:0.194656	train-error:0.02381
[50]	eval-error:0.194656	train-error:0.02381
[51]	eval-error:0.198473	train-error:0.02381
[52]	eval-error:0.198473	train-error:0.02381
[53]	eval-error:0.198473	train-error:0.02381
[54]	eval-error:0.194656	train-error:0.02381
[55]	eval-error:0.194656	train-error:0.02381
[56]	eval-error:0.20229	train-error:0.02381
[57]	eval-error:0.198473	train-error:0.02381
[58]	eval-error:0.20229	train-error:0.02381
[59]	eval-error:0.194656	train-error:0.02381
[60]	eval-error:0.194656	train-error:0.02381
[61]	eval-error:0.194656	train-error:0.02381
[62]	eval-error:0.198473	train-error:0.02381
[63]	eval-error:0.20229	train-error:0.02381
[64]	eval-error:0.198473	train-error:0.02381
[65]	eval-error:0.194656	train-error:0.02381
[66]	eval-error:0.194656	train-error:0.02381
[67]	eval-error:0.194656	train-error:0.02381
[68]	eval-error:0.198473	train-error:0.02381
[69]	eval-error:0.198473	train-error:0.02381
[0]	eval-erro

[20]	eval-error:0.193182	train-error:0
[21]	eval-error:0.181818	train-error:0
[22]	eval-error:0.181818	train-error:0
[23]	eval-error:0.181818	train-error:0
[24]	eval-error:0.181818	train-error:0
[25]	eval-error:0.181818	train-error:0
[26]	eval-error:0.193182	train-error:0
[27]	eval-error:0.181818	train-error:0
[28]	eval-error:0.181818	train-error:0
[29]	eval-error:0.193182	train-error:0
[30]	eval-error:0.193182	train-error:0
[31]	eval-error:0.181818	train-error:0
[32]	eval-error:0.181818	train-error:0
[33]	eval-error:0.193182	train-error:0
[34]	eval-error:0.193182	train-error:0
[35]	eval-error:0.181818	train-error:0
[36]	eval-error:0.193182	train-error:0
[37]	eval-error:0.193182	train-error:0
[38]	eval-error:0.181818	train-error:0
[39]	eval-error:0.193182	train-error:0
[40]	eval-error:0.193182	train-error:0
[41]	eval-error:0.193182	train-error:0
[42]	eval-error:0.193182	train-error:0
[43]	eval-error:0.193182	train-error:0
[44]	eval-error:0.193182	train-error:0
[45]	eval-error:0.193182	

[20]	eval-error:0.227273	train-error:0
[21]	eval-error:0.227273	train-error:0
[22]	eval-error:0.227273	train-error:0
[23]	eval-error:0.227273	train-error:0
[24]	eval-error:0.227273	train-error:0
[25]	eval-error:0.25	train-error:0
[26]	eval-error:0.238636	train-error:0
[27]	eval-error:0.25	train-error:0
[28]	eval-error:0.25	train-error:0
[29]	eval-error:0.238636	train-error:0
[30]	eval-error:0.25	train-error:0
[31]	eval-error:0.25	train-error:0
[32]	eval-error:0.238636	train-error:0
[33]	eval-error:0.238636	train-error:0
[34]	eval-error:0.227273	train-error:0
[35]	eval-error:0.238636	train-error:0
[36]	eval-error:0.238636	train-error:0
[37]	eval-error:0.227273	train-error:0
[38]	eval-error:0.238636	train-error:0
[39]	eval-error:0.261364	train-error:0
[40]	eval-error:0.25	train-error:0
[41]	eval-error:0.25	train-error:0
[42]	eval-error:0.25	train-error:0
[43]	eval-error:0.238636	train-error:0
[44]	eval-error:0.25	train-error:0
[45]	eval-error:0.238636	train-error:0
[46]	eval-error:0.2386

[68]	eval-error:0.255319	train-error:0.001034
[69]	eval-error:0.255319	train-error:0.001034
[0]	eval-error:0.329787	train-error:0.086867
[1]	eval-error:0.329787	train-error:0.031024
[2]	eval-error:0.297872	train-error:0.010341
[3]	eval-error:0.287234	train-error:0.003102
[4]	eval-error:0.287234	train-error:0.001034
[5]	eval-error:0.276596	train-error:0.001034
[6]	eval-error:0.276596	train-error:0.001034
[7]	eval-error:0.287234	train-error:0.001034
[8]	eval-error:0.255319	train-error:0.001034
[9]	eval-error:0.255319	train-error:0.001034
[10]	eval-error:0.265957	train-error:0.001034
[11]	eval-error:0.276596	train-error:0.001034
[12]	eval-error:0.265957	train-error:0.001034
[13]	eval-error:0.276596	train-error:0.001034
[14]	eval-error:0.265957	train-error:0.001034
[15]	eval-error:0.265957	train-error:0.001034
[16]	eval-error:0.265957	train-error:0.001034
[17]	eval-error:0.255319	train-error:0.001034
[18]	eval-error:0.255319	train-error:0.001034
[19]	eval-error:0.244681	train-error:0.00103

[37]	eval-error:0.221374	train-error:0.005831
[38]	eval-error:0.221374	train-error:0.005831
[39]	eval-error:0.21374	train-error:0.005831
[40]	eval-error:0.217557	train-error:0.005831
[41]	eval-error:0.221374	train-error:0.005831
[42]	eval-error:0.229008	train-error:0.005831
[43]	eval-error:0.225191	train-error:0.005831
[44]	eval-error:0.225191	train-error:0.005831
[45]	eval-error:0.225191	train-error:0.005831
[46]	eval-error:0.229008	train-error:0.005831
[47]	eval-error:0.221374	train-error:0.005831
[48]	eval-error:0.221374	train-error:0.005831
[49]	eval-error:0.221374	train-error:0.005831
[50]	eval-error:0.225191	train-error:0.005831
[51]	eval-error:0.221374	train-error:0.005831
[52]	eval-error:0.225191	train-error:0.005831
[53]	eval-error:0.225191	train-error:0.005831
[54]	eval-error:0.229008	train-error:0.005831
[55]	eval-error:0.229008	train-error:0.005831
[56]	eval-error:0.221374	train-error:0.005831
[57]	eval-error:0.221374	train-error:0.005831
[58]	eval-error:0.229008	train-erro

[7]	eval-error:0.217557	train-error:0.005831
[8]	eval-error:0.217557	train-error:0.005831
[9]	eval-error:0.209924	train-error:0.005831
[10]	eval-error:0.20229	train-error:0.005831
[11]	eval-error:0.209924	train-error:0.005831
[12]	eval-error:0.198473	train-error:0.005831
[13]	eval-error:0.20229	train-error:0.005831
[14]	eval-error:0.206107	train-error:0.005831
[15]	eval-error:0.206107	train-error:0.005831
[16]	eval-error:0.209924	train-error:0.005831
[17]	eval-error:0.206107	train-error:0.005831
[18]	eval-error:0.206107	train-error:0.005831
[19]	eval-error:0.206107	train-error:0.005831
[20]	eval-error:0.198473	train-error:0.005831
[21]	eval-error:0.198473	train-error:0.005831
[22]	eval-error:0.198473	train-error:0.005831
[23]	eval-error:0.198473	train-error:0.005831
[24]	eval-error:0.206107	train-error:0.005831
[25]	eval-error:0.194656	train-error:0.005831
[26]	eval-error:0.20229	train-error:0.005831
[27]	eval-error:0.20229	train-error:0.005831
[28]	eval-error:0.20229	train-error:0.005

In [204]:
feature_eval_data_xg = create_eval_df_from_results_macro(results, False)
feature_eval_data_xg

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.474851,0.473277,0.48166,linguistic
1,max,Wikipedia,0.574879,0.597046,0.566602,linguistic
2,min,Wikipedia,0.43949,0.415663,0.466216,linguistic
3,weighted_mean,Wikipedia,0.540399,0.568783,0.537645,linguistic
4,mean,WikiNews,0.571228,0.584936,0.566919,linguistic
5,max,WikiNews,0.64668,0.678059,0.633207,linguistic
6,min,WikiNews,0.533231,0.556911,0.535354,linguistic
7,weighted_mean,WikiNews,0.680329,0.717722,0.662879,linguistic
8,mean,News,0.577685,0.628605,0.570735,linguistic
9,max,News,0.634292,0.673442,0.619452,linguistic


In [205]:
idx = feature_eval_data_xg.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_xg['f1']
feature_eval_data_xg[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
11,weighted_mean,News,0.669999,0.748187,0.645371,linguistic
99,"(weighted_mean, <function <lambda> at 0x000000FF52232E18>)",Wikipedia,0.648541,0.723986,0.622587,semantic
151,"(weighted_mean, <function <lambda> at 0x000000FF52232E18>)",WikiNews,0.712758,0.795732,0.683712,all


In [206]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*xgboost(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets_dswp]

[0]	eval-error:0.295455	train-error:0.115655
[1]	eval-error:0.25	train-error:0.062151
[2]	eval-error:0.227273	train-error:0.026842
[3]	eval-error:0.238636	train-error:0.008647
[4]	eval-error:0.227273	train-error:0.002882
[5]	eval-error:0.215909	train-error:0.001081
[6]	eval-error:0.215909	train-error:0.00036
[7]	eval-error:0.227273	train-error:0
[8]	eval-error:0.215909	train-error:0
[9]	eval-error:0.215909	train-error:0
[10]	eval-error:0.238636	train-error:0
[11]	eval-error:0.238636	train-error:0
[12]	eval-error:0.215909	train-error:0
[13]	eval-error:0.238636	train-error:0
[14]	eval-error:0.25	train-error:0
[15]	eval-error:0.227273	train-error:0
[16]	eval-error:0.227273	train-error:0
[17]	eval-error:0.238636	train-error:0
[18]	eval-error:0.193182	train-error:0
[19]	eval-error:0.215909	train-error:0
[20]	eval-error:0.215909	train-error:0
[21]	eval-error:0.227273	train-error:0
[22]	eval-error:0.215909	train-error:0
[23]	eval-error:0.215909	train-error:0
[24]	eval-error:0.204545	train-err

[1]	eval-error:0.227273	train-error:0.058368
[2]	eval-error:0.25	train-error:0.026482
[3]	eval-error:0.227273	train-error:0.008647
[4]	eval-error:0.25	train-error:0.002882
[5]	eval-error:0.215909	train-error:0.00054
[6]	eval-error:0.204545	train-error:0.00018
[7]	eval-error:0.193182	train-error:0
[8]	eval-error:0.204545	train-error:0
[9]	eval-error:0.204545	train-error:0
[10]	eval-error:0.204545	train-error:0
[11]	eval-error:0.215909	train-error:0
[12]	eval-error:0.193182	train-error:0
[13]	eval-error:0.204545	train-error:0
[14]	eval-error:0.204545	train-error:0
[15]	eval-error:0.204545	train-error:0
[16]	eval-error:0.204545	train-error:0
[17]	eval-error:0.215909	train-error:0
[18]	eval-error:0.204545	train-error:0
[19]	eval-error:0.215909	train-error:0
[20]	eval-error:0.204545	train-error:0
[21]	eval-error:0.193182	train-error:0
[22]	eval-error:0.193182	train-error:0
[23]	eval-error:0.193182	train-error:0
[24]	eval-error:0.193182	train-error:0
[25]	eval-error:0.193182	train-error:0
[2

[50]	eval-error:0.244681	train-error:0.003098
[51]	eval-error:0.244681	train-error:0.003098
[52]	eval-error:0.244681	train-error:0.003098
[53]	eval-error:0.244681	train-error:0.003098
[54]	eval-error:0.223404	train-error:0.003098
[55]	eval-error:0.234043	train-error:0.003098
[56]	eval-error:0.234043	train-error:0.003098
[57]	eval-error:0.234043	train-error:0.003098
[58]	eval-error:0.223404	train-error:0.003098
[59]	eval-error:0.234043	train-error:0.003098
[60]	eval-error:0.234043	train-error:0.003098
[61]	eval-error:0.234043	train-error:0.003098
[62]	eval-error:0.234043	train-error:0.003098
[63]	eval-error:0.223404	train-error:0.003098
[64]	eval-error:0.234043	train-error:0.003098
[65]	eval-error:0.234043	train-error:0.003098
[66]	eval-error:0.223404	train-error:0.003098
[67]	eval-error:0.223404	train-error:0.003098
[68]	eval-error:0.223404	train-error:0.003098
[69]	eval-error:0.234043	train-error:0.003098
[0]	eval-error:0.393617	train-error:0.108185
[1]	eval-error:0.351064	train-error

[19]	eval-error:0.217557	train-error:0.007785
[20]	eval-error:0.221374	train-error:0.007785
[21]	eval-error:0.221374	train-error:0.007785
[22]	eval-error:0.221374	train-error:0.007785
[23]	eval-error:0.221374	train-error:0.007785
[24]	eval-error:0.221374	train-error:0.007785
[25]	eval-error:0.229008	train-error:0.007785
[26]	eval-error:0.221374	train-error:0.007785
[27]	eval-error:0.225191	train-error:0.007785
[28]	eval-error:0.217557	train-error:0.007785
[29]	eval-error:0.21374	train-error:0.007785
[30]	eval-error:0.221374	train-error:0.007785
[31]	eval-error:0.217557	train-error:0.007785
[32]	eval-error:0.217557	train-error:0.007785
[33]	eval-error:0.221374	train-error:0.007785
[34]	eval-error:0.221374	train-error:0.007785
[35]	eval-error:0.21374	train-error:0.007785
[36]	eval-error:0.217557	train-error:0.007785
[37]	eval-error:0.217557	train-error:0.007785
[38]	eval-error:0.225191	train-error:0.007785
[39]	eval-error:0.221374	train-error:0.007785
[40]	eval-error:0.221374	train-error

[59]	eval-error:0.21374	train-error:0.007785
[60]	eval-error:0.21374	train-error:0.007785
[61]	eval-error:0.217557	train-error:0.007785
[62]	eval-error:0.217557	train-error:0.007785
[63]	eval-error:0.217557	train-error:0.007785
[64]	eval-error:0.217557	train-error:0.007785
[65]	eval-error:0.217557	train-error:0.007785
[66]	eval-error:0.217557	train-error:0.007785
[67]	eval-error:0.217557	train-error:0.007785
[68]	eval-error:0.217557	train-error:0.007785
[69]	eval-error:0.217557	train-error:0.007785
[0]	eval-error:0.259542	train-error:0.088916
[1]	eval-error:0.229008	train-error:0.054278
[2]	eval-error:0.209924	train-error:0.030353
[3]	eval-error:0.209924	train-error:0.019997
[4]	eval-error:0.225191	train-error:0.013498
[5]	eval-error:0.236641	train-error:0.010713
[6]	eval-error:0.240458	train-error:0.00907
[7]	eval-error:0.251908	train-error:0.00807
[8]	eval-error:0.236641	train-error:0.00807
[9]	eval-error:0.229008	train-error:0.007927
[10]	eval-error:0.225191	train-error:0.007856
[11

[31]	eval-error:0.295455	train-error:0.058908
[32]	eval-error:0.295455	train-error:0.058908
[33]	eval-error:0.295455	train-error:0.058908
[34]	eval-error:0.295455	train-error:0.058908
[35]	eval-error:0.306818	train-error:0.058908
[36]	eval-error:0.284091	train-error:0.058908
[37]	eval-error:0.295455	train-error:0.058908
[38]	eval-error:0.295455	train-error:0.058908
[39]	eval-error:0.295455	train-error:0.058908
[40]	eval-error:0.295455	train-error:0.058908
[41]	eval-error:0.295455	train-error:0.058908
[42]	eval-error:0.284091	train-error:0.058908
[43]	eval-error:0.295455	train-error:0.058908
[44]	eval-error:0.284091	train-error:0.058908
[45]	eval-error:0.284091	train-error:0.058908
[46]	eval-error:0.284091	train-error:0.058908
[47]	eval-error:0.295455	train-error:0.058908
[48]	eval-error:0.284091	train-error:0.058908
[49]	eval-error:0.295455	train-error:0.058908
[50]	eval-error:0.295455	train-error:0.058908
[51]	eval-error:0.295455	train-error:0.058908
[52]	eval-error:0.284091	train-err

[2]	eval-error:0.234043	train-error:0.076168
[3]	eval-error:0.244681	train-error:0.071263
[4]	eval-error:0.265957	train-error:0.068939
[5]	eval-error:0.244681	train-error:0.068422
[6]	eval-error:0.244681	train-error:0.068035
[7]	eval-error:0.255319	train-error:0.067648
[8]	eval-error:0.276596	train-error:0.067648
[9]	eval-error:0.265957	train-error:0.067519
[10]	eval-error:0.265957	train-error:0.067519
[11]	eval-error:0.265957	train-error:0.067519
[12]	eval-error:0.276596	train-error:0.067519
[13]	eval-error:0.276596	train-error:0.067519
[14]	eval-error:0.276596	train-error:0.067519
[15]	eval-error:0.276596	train-error:0.067519
[16]	eval-error:0.276596	train-error:0.067519
[17]	eval-error:0.287234	train-error:0.067519
[18]	eval-error:0.287234	train-error:0.067519
[19]	eval-error:0.287234	train-error:0.067519
[20]	eval-error:0.276596	train-error:0.067519
[21]	eval-error:0.276596	train-error:0.067519
[22]	eval-error:0.287234	train-error:0.067519
[23]	eval-error:0.287234	train-error:0.067

[41]	eval-error:0.297872	train-error:0.071004
[42]	eval-error:0.297872	train-error:0.071004
[43]	eval-error:0.308511	train-error:0.071004
[44]	eval-error:0.308511	train-error:0.071004
[45]	eval-error:0.308511	train-error:0.071004
[46]	eval-error:0.319149	train-error:0.071004
[47]	eval-error:0.319149	train-error:0.071004
[48]	eval-error:0.308511	train-error:0.071004
[49]	eval-error:0.329787	train-error:0.071004
[50]	eval-error:0.319149	train-error:0.071004
[51]	eval-error:0.319149	train-error:0.071004
[52]	eval-error:0.308511	train-error:0.071004
[53]	eval-error:0.308511	train-error:0.071004
[54]	eval-error:0.308511	train-error:0.071004
[55]	eval-error:0.308511	train-error:0.071004
[56]	eval-error:0.308511	train-error:0.071004
[57]	eval-error:0.308511	train-error:0.071004
[58]	eval-error:0.308511	train-error:0.071004
[59]	eval-error:0.308511	train-error:0.071004
[60]	eval-error:0.308511	train-error:0.071004
[61]	eval-error:0.308511	train-error:0.071004
[62]	eval-error:0.308511	train-err

[11]	eval-error:0.263359	train-error:0.067348
[12]	eval-error:0.240458	train-error:0.067276
[13]	eval-error:0.240458	train-error:0.067276
[14]	eval-error:0.251908	train-error:0.067276
[15]	eval-error:0.240458	train-error:0.067276
[16]	eval-error:0.236641	train-error:0.067276
[17]	eval-error:0.248092	train-error:0.067276
[18]	eval-error:0.267176	train-error:0.067276
[19]	eval-error:0.263359	train-error:0.067276
[20]	eval-error:0.259542	train-error:0.067276
[21]	eval-error:0.255725	train-error:0.067276
[22]	eval-error:0.248092	train-error:0.067276
[23]	eval-error:0.248092	train-error:0.067276
[24]	eval-error:0.248092	train-error:0.067276
[25]	eval-error:0.251908	train-error:0.067276
[26]	eval-error:0.248092	train-error:0.067276
[27]	eval-error:0.251908	train-error:0.067276
[28]	eval-error:0.255725	train-error:0.067276
[29]	eval-error:0.251908	train-error:0.067276
[30]	eval-error:0.248092	train-error:0.067276
[31]	eval-error:0.248092	train-error:0.067276
[32]	eval-error:0.248092	train-err

[50]	eval-error:0.248092	train-error:0.066776
[51]	eval-error:0.244275	train-error:0.066776
[52]	eval-error:0.251908	train-error:0.066776
[53]	eval-error:0.248092	train-error:0.066776
[54]	eval-error:0.248092	train-error:0.066776
[55]	eval-error:0.248092	train-error:0.066776
[56]	eval-error:0.248092	train-error:0.066776
[57]	eval-error:0.240458	train-error:0.066776
[58]	eval-error:0.248092	train-error:0.066776
[59]	eval-error:0.236641	train-error:0.066776
[60]	eval-error:0.244275	train-error:0.066776
[61]	eval-error:0.248092	train-error:0.066776
[62]	eval-error:0.244275	train-error:0.066776
[63]	eval-error:0.248092	train-error:0.066776
[64]	eval-error:0.248092	train-error:0.066776
[65]	eval-error:0.248092	train-error:0.066776
[66]	eval-error:0.251908	train-error:0.066776
[67]	eval-error:0.244275	train-error:0.066776
[68]	eval-error:0.251908	train-error:0.066776
[69]	eval-error:0.248092	train-error:0.066776
[0]	eval-error:0.409091	train-error:0.124482
[1]	eval-error:0.284091	train-error

[20]	eval-error:0.329545	train-error:0.054405
[21]	eval-error:0.318182	train-error:0.054405
[22]	eval-error:0.318182	train-error:0.054405
[23]	eval-error:0.329545	train-error:0.054405
[24]	eval-error:0.329545	train-error:0.054405
[25]	eval-error:0.306818	train-error:0.054405
[26]	eval-error:0.306818	train-error:0.054405
[27]	eval-error:0.318182	train-error:0.054405
[28]	eval-error:0.329545	train-error:0.054405
[29]	eval-error:0.329545	train-error:0.054405
[30]	eval-error:0.329545	train-error:0.054405
[31]	eval-error:0.318182	train-error:0.054405
[32]	eval-error:0.329545	train-error:0.054405
[33]	eval-error:0.329545	train-error:0.054405
[34]	eval-error:0.306818	train-error:0.054405
[35]	eval-error:0.306818	train-error:0.054405
[36]	eval-error:0.318182	train-error:0.054405
[37]	eval-error:0.306818	train-error:0.054405
[38]	eval-error:0.318182	train-error:0.054405
[39]	eval-error:0.318182	train-error:0.054405
[40]	eval-error:0.306818	train-error:0.054405
[41]	eval-error:0.318182	train-err

[59]	eval-error:0.255319	train-error:0.066357
[60]	eval-error:0.255319	train-error:0.066357
[61]	eval-error:0.255319	train-error:0.066357
[62]	eval-error:0.255319	train-error:0.066357
[63]	eval-error:0.255319	train-error:0.066357
[64]	eval-error:0.244681	train-error:0.066357
[65]	eval-error:0.244681	train-error:0.066357
[66]	eval-error:0.244681	train-error:0.066357
[67]	eval-error:0.244681	train-error:0.066357
[68]	eval-error:0.255319	train-error:0.066357
[69]	eval-error:0.255319	train-error:0.066357
[0]	eval-error:0.244681	train-error:0.12174
[1]	eval-error:0.244681	train-error:0.095275
[2]	eval-error:0.244681	train-error:0.078879
[3]	eval-error:0.212766	train-error:0.071521
[4]	eval-error:0.234043	train-error:0.06881
[5]	eval-error:0.234043	train-error:0.067777
[6]	eval-error:0.244681	train-error:0.067261
[7]	eval-error:0.255319	train-error:0.066873
[8]	eval-error:0.255319	train-error:0.066744
[9]	eval-error:0.265957	train-error:0.066615
[10]	eval-error:0.234043	train-error:0.066486


[28]	eval-error:0.255319	train-error:0.066357
[29]	eval-error:0.255319	train-error:0.066357
[30]	eval-error:0.244681	train-error:0.066357
[31]	eval-error:0.244681	train-error:0.066357
[32]	eval-error:0.255319	train-error:0.066357
[33]	eval-error:0.244681	train-error:0.066357
[34]	eval-error:0.244681	train-error:0.066357
[35]	eval-error:0.244681	train-error:0.066357
[36]	eval-error:0.255319	train-error:0.066357
[37]	eval-error:0.255319	train-error:0.066357
[38]	eval-error:0.244681	train-error:0.066357
[39]	eval-error:0.244681	train-error:0.066357
[40]	eval-error:0.234043	train-error:0.066357
[41]	eval-error:0.244681	train-error:0.066357
[42]	eval-error:0.234043	train-error:0.066357
[43]	eval-error:0.234043	train-error:0.066357
[44]	eval-error:0.244681	train-error:0.066357
[45]	eval-error:0.244681	train-error:0.066357
[46]	eval-error:0.234043	train-error:0.066357
[47]	eval-error:0.255319	train-error:0.066357
[48]	eval-error:0.255319	train-error:0.066357
[49]	eval-error:0.255319	train-err

[68]	eval-error:0.259542	train-error:0.066776
[69]	eval-error:0.259542	train-error:0.066776
[0]	eval-error:0.290076	train-error:0.101628
[1]	eval-error:0.244275	train-error:0.086845
[2]	eval-error:0.251908	train-error:0.075418
[3]	eval-error:0.255725	train-error:0.071204
[4]	eval-error:0.259542	train-error:0.068919
[5]	eval-error:0.251908	train-error:0.067776
[6]	eval-error:0.270992	train-error:0.067133
[7]	eval-error:0.267176	train-error:0.066848
[8]	eval-error:0.282443	train-error:0.066919
[9]	eval-error:0.270992	train-error:0.066776
[10]	eval-error:0.282443	train-error:0.066776
[11]	eval-error:0.278626	train-error:0.066776
[12]	eval-error:0.278626	train-error:0.066776
[13]	eval-error:0.274809	train-error:0.066776
[14]	eval-error:0.278626	train-error:0.066776
[15]	eval-error:0.270992	train-error:0.066776
[16]	eval-error:0.274809	train-error:0.066776
[17]	eval-error:0.270992	train-error:0.066776
[18]	eval-error:0.293893	train-error:0.066776
[19]	eval-error:0.263359	train-error:0.06677

[38]	eval-error:0.261364	train-error:0.053684
[39]	eval-error:0.261364	train-error:0.053684
[40]	eval-error:0.25	train-error:0.053684
[41]	eval-error:0.261364	train-error:0.053684
[42]	eval-error:0.261364	train-error:0.053684
[43]	eval-error:0.272727	train-error:0.053684
[44]	eval-error:0.261364	train-error:0.053684
[45]	eval-error:0.261364	train-error:0.053684
[46]	eval-error:0.261364	train-error:0.053684
[47]	eval-error:0.261364	train-error:0.053684
[48]	eval-error:0.261364	train-error:0.053684
[49]	eval-error:0.261364	train-error:0.053684
[50]	eval-error:0.261364	train-error:0.053684
[51]	eval-error:0.272727	train-error:0.053684
[52]	eval-error:0.272727	train-error:0.053684
[53]	eval-error:0.272727	train-error:0.053684
[54]	eval-error:0.272727	train-error:0.053684
[55]	eval-error:0.272727	train-error:0.053684
[56]	eval-error:0.272727	train-error:0.053684
[57]	eval-error:0.272727	train-error:0.053684
[58]	eval-error:0.272727	train-error:0.053684
[59]	eval-error:0.272727	train-error:0

[8]	eval-error:0.306818	train-error:0.053684
[9]	eval-error:0.318182	train-error:0.053684
[10]	eval-error:0.306818	train-error:0.053684
[11]	eval-error:0.318182	train-error:0.053684
[12]	eval-error:0.329545	train-error:0.053684
[13]	eval-error:0.329545	train-error:0.053684
[14]	eval-error:0.329545	train-error:0.053684
[15]	eval-error:0.318182	train-error:0.053684
[16]	eval-error:0.318182	train-error:0.053684
[17]	eval-error:0.329545	train-error:0.053684
[18]	eval-error:0.318182	train-error:0.053684
[19]	eval-error:0.318182	train-error:0.053684
[20]	eval-error:0.340909	train-error:0.053684
[21]	eval-error:0.329545	train-error:0.053684
[22]	eval-error:0.329545	train-error:0.053684
[23]	eval-error:0.318182	train-error:0.053684
[24]	eval-error:0.329545	train-error:0.053684
[25]	eval-error:0.318182	train-error:0.053684
[26]	eval-error:0.306818	train-error:0.053684
[27]	eval-error:0.306818	train-error:0.053684
[28]	eval-error:0.306818	train-error:0.053684
[29]	eval-error:0.306818	train-error

[47]	eval-error:0.287234	train-error:0.064162
[48]	eval-error:0.287234	train-error:0.064162
[49]	eval-error:0.276596	train-error:0.064162
[50]	eval-error:0.276596	train-error:0.064162
[51]	eval-error:0.276596	train-error:0.064162
[52]	eval-error:0.276596	train-error:0.064162
[53]	eval-error:0.276596	train-error:0.064162
[54]	eval-error:0.276596	train-error:0.064162
[55]	eval-error:0.276596	train-error:0.064162
[56]	eval-error:0.265957	train-error:0.064162
[57]	eval-error:0.265957	train-error:0.064162
[58]	eval-error:0.276596	train-error:0.064162
[59]	eval-error:0.265957	train-error:0.064162
[60]	eval-error:0.265957	train-error:0.064162
[61]	eval-error:0.265957	train-error:0.064162
[62]	eval-error:0.265957	train-error:0.064162
[63]	eval-error:0.276596	train-error:0.064162
[64]	eval-error:0.276596	train-error:0.064162
[65]	eval-error:0.276596	train-error:0.064162
[66]	eval-error:0.276596	train-error:0.064162
[67]	eval-error:0.276596	train-error:0.064162
[68]	eval-error:0.276596	train-err

[16]	eval-error:0.217557	train-error:0.065776
[17]	eval-error:0.209924	train-error:0.065776
[18]	eval-error:0.20229	train-error:0.065776
[19]	eval-error:0.21374	train-error:0.065776
[20]	eval-error:0.198473	train-error:0.065776
[21]	eval-error:0.20229	train-error:0.065776
[22]	eval-error:0.21374	train-error:0.065776
[23]	eval-error:0.209924	train-error:0.065776
[24]	eval-error:0.194656	train-error:0.065776
[25]	eval-error:0.198473	train-error:0.065776
[26]	eval-error:0.198473	train-error:0.065776
[27]	eval-error:0.194656	train-error:0.065776
[28]	eval-error:0.194656	train-error:0.065776
[29]	eval-error:0.198473	train-error:0.065776
[30]	eval-error:0.198473	train-error:0.065776
[31]	eval-error:0.194656	train-error:0.065776
[32]	eval-error:0.198473	train-error:0.065776
[33]	eval-error:0.187023	train-error:0.065776
[34]	eval-error:0.194656	train-error:0.065776
[35]	eval-error:0.187023	train-error:0.065776
[36]	eval-error:0.187023	train-error:0.065776
[37]	eval-error:0.194656	train-error:0

[55]	eval-error:0.248092	train-error:0.065776
[56]	eval-error:0.251908	train-error:0.065776
[57]	eval-error:0.244275	train-error:0.065776
[58]	eval-error:0.255725	train-error:0.065776
[59]	eval-error:0.248092	train-error:0.065776
[60]	eval-error:0.248092	train-error:0.065776
[61]	eval-error:0.248092	train-error:0.065776
[62]	eval-error:0.248092	train-error:0.065776
[63]	eval-error:0.251908	train-error:0.065776
[64]	eval-error:0.259542	train-error:0.065776
[65]	eval-error:0.248092	train-error:0.065776
[66]	eval-error:0.248092	train-error:0.065776
[67]	eval-error:0.255725	train-error:0.065776
[68]	eval-error:0.259542	train-error:0.065776
[69]	eval-error:0.255725	train-error:0.065776
[0]	eval-error:0.278626	train-error:0.090844
[1]	eval-error:0.244275	train-error:0.077132
[2]	eval-error:0.267176	train-error:0.070347
[3]	eval-error:0.251908	train-error:0.067133
[4]	eval-error:0.263359	train-error:0.065919
[5]	eval-error:0.251908	train-error:0.065848
[6]	eval-error:0.248092	train-error:0.06

[25]	eval-error:0.443182	train-error:0.18285
[26]	eval-error:0.477273	train-error:0.18285
[27]	eval-error:0.477273	train-error:0.18285
[28]	eval-error:0.477273	train-error:0.18285
[29]	eval-error:0.477273	train-error:0.18285
[30]	eval-error:0.477273	train-error:0.18285
[31]	eval-error:0.477273	train-error:0.18285
[32]	eval-error:0.477273	train-error:0.18285
[33]	eval-error:0.477273	train-error:0.18285
[34]	eval-error:0.477273	train-error:0.18285
[35]	eval-error:0.477273	train-error:0.18285
[36]	eval-error:0.477273	train-error:0.18285
[37]	eval-error:0.477273	train-error:0.18285
[38]	eval-error:0.477273	train-error:0.18285
[39]	eval-error:0.477273	train-error:0.18285
[40]	eval-error:0.477273	train-error:0.18285
[41]	eval-error:0.454545	train-error:0.18285
[42]	eval-error:0.454545	train-error:0.18285
[43]	eval-error:0.477273	train-error:0.18285
[44]	eval-error:0.454545	train-error:0.18285
[45]	eval-error:0.454545	train-error:0.18285
[46]	eval-error:0.465909	train-error:0.18285
[47]	eval-

[65]	eval-error:0.409091	train-error:0.177265
[66]	eval-error:0.409091	train-error:0.177265
[67]	eval-error:0.409091	train-error:0.177265
[68]	eval-error:0.409091	train-error:0.177265
[69]	eval-error:0.409091	train-error:0.177265
[0]	eval-error:0.361702	train-error:0.214691
[1]	eval-error:0.297872	train-error:0.193648
[2]	eval-error:0.329787	train-error:0.183062
[3]	eval-error:0.340426	train-error:0.178286
[4]	eval-error:0.319149	train-error:0.175574
[5]	eval-error:0.297872	train-error:0.1748
[6]	eval-error:0.319149	train-error:0.174154
[7]	eval-error:0.297872	train-error:0.174025
[8]	eval-error:0.329787	train-error:0.174025
[9]	eval-error:0.329787	train-error:0.174025
[10]	eval-error:0.319149	train-error:0.174025
[11]	eval-error:0.329787	train-error:0.174025
[12]	eval-error:0.340426	train-error:0.174025
[13]	eval-error:0.329787	train-error:0.174025
[14]	eval-error:0.329787	train-error:0.174025
[15]	eval-error:0.329787	train-error:0.174025
[16]	eval-error:0.329787	train-error:0.174025


[34]	eval-error:0.319149	train-error:0.185257
[35]	eval-error:0.319149	train-error:0.185257
[36]	eval-error:0.319149	train-error:0.185257
[37]	eval-error:0.319149	train-error:0.185257
[38]	eval-error:0.319149	train-error:0.185257
[39]	eval-error:0.329787	train-error:0.185257
[40]	eval-error:0.329787	train-error:0.185257
[41]	eval-error:0.329787	train-error:0.185257
[42]	eval-error:0.329787	train-error:0.185257
[43]	eval-error:0.329787	train-error:0.185257
[44]	eval-error:0.329787	train-error:0.185257
[45]	eval-error:0.329787	train-error:0.185257
[46]	eval-error:0.329787	train-error:0.185257
[47]	eval-error:0.329787	train-error:0.185257
[48]	eval-error:0.329787	train-error:0.185257
[49]	eval-error:0.329787	train-error:0.185257
[50]	eval-error:0.329787	train-error:0.185257
[51]	eval-error:0.329787	train-error:0.185257
[52]	eval-error:0.329787	train-error:0.185257
[53]	eval-error:0.329787	train-error:0.185257
[54]	eval-error:0.329787	train-error:0.185257
[55]	eval-error:0.329787	train-err

[3]	eval-error:0.312977	train-error:0.159192
[4]	eval-error:0.332061	train-error:0.156978
[5]	eval-error:0.347328	train-error:0.155835
[6]	eval-error:0.320611	train-error:0.155264
[7]	eval-error:0.305344	train-error:0.154692
[8]	eval-error:0.312977	train-error:0.154621
[9]	eval-error:0.305344	train-error:0.154621
[10]	eval-error:0.301527	train-error:0.154549
[11]	eval-error:0.30916	train-error:0.154549
[12]	eval-error:0.301527	train-error:0.154549
[13]	eval-error:0.305344	train-error:0.154549
[14]	eval-error:0.320611	train-error:0.154549
[15]	eval-error:0.30916	train-error:0.154549
[16]	eval-error:0.301527	train-error:0.154549
[17]	eval-error:0.316794	train-error:0.154549
[18]	eval-error:0.320611	train-error:0.154549
[19]	eval-error:0.305344	train-error:0.154549
[20]	eval-error:0.312977	train-error:0.154549
[21]	eval-error:0.305344	train-error:0.154549
[22]	eval-error:0.30916	train-error:0.154549
[23]	eval-error:0.301527	train-error:0.154549
[24]	eval-error:0.30916	train-error:0.154549

[43]	eval-error:0.28626	train-error:0.148122
[44]	eval-error:0.270992	train-error:0.148122
[45]	eval-error:0.274809	train-error:0.148122
[46]	eval-error:0.282443	train-error:0.148122
[47]	eval-error:0.274809	train-error:0.148122
[48]	eval-error:0.282443	train-error:0.148122
[49]	eval-error:0.267176	train-error:0.148122
[50]	eval-error:0.267176	train-error:0.148122
[51]	eval-error:0.278626	train-error:0.148122
[52]	eval-error:0.267176	train-error:0.148122
[53]	eval-error:0.274809	train-error:0.148122
[54]	eval-error:0.270992	train-error:0.148122
[55]	eval-error:0.274809	train-error:0.148122
[56]	eval-error:0.278626	train-error:0.148122
[57]	eval-error:0.278626	train-error:0.148122
[58]	eval-error:0.274809	train-error:0.148122
[59]	eval-error:0.270992	train-error:0.148122
[60]	eval-error:0.278626	train-error:0.148122
[61]	eval-error:0.274809	train-error:0.148122
[62]	eval-error:0.274809	train-error:0.148122
[63]	eval-error:0.278626	train-error:0.148122
[64]	eval-error:0.274809	train-erro

[16]	eval-error:0.272727	train-error:0.062872
[17]	eval-error:0.284091	train-error:0.062691
[18]	eval-error:0.284091	train-error:0.062691
[19]	eval-error:0.272727	train-error:0.062691
[20]	eval-error:0.272727	train-error:0.062691
[21]	eval-error:0.261364	train-error:0.062691
[22]	eval-error:0.272727	train-error:0.062691
[23]	eval-error:0.272727	train-error:0.062691
[24]	eval-error:0.272727	train-error:0.062691
[25]	eval-error:0.284091	train-error:0.062691
[26]	eval-error:0.284091	train-error:0.062691
[27]	eval-error:0.284091	train-error:0.062691
[28]	eval-error:0.284091	train-error:0.062691
[29]	eval-error:0.272727	train-error:0.062691
[30]	eval-error:0.272727	train-error:0.062691
[31]	eval-error:0.272727	train-error:0.062691
[32]	eval-error:0.272727	train-error:0.062691
[33]	eval-error:0.272727	train-error:0.062691
[34]	eval-error:0.272727	train-error:0.062691
[35]	eval-error:0.272727	train-error:0.062691
[36]	eval-error:0.272727	train-error:0.062691
[37]	eval-error:0.272727	train-err

[57]	eval-error:0.276596	train-error:0.065066
[58]	eval-error:0.265957	train-error:0.065066
[59]	eval-error:0.265957	train-error:0.065066
[60]	eval-error:0.265957	train-error:0.065066
[61]	eval-error:0.265957	train-error:0.065066
[62]	eval-error:0.265957	train-error:0.065066
[63]	eval-error:0.255319	train-error:0.065066
[64]	eval-error:0.255319	train-error:0.065066
[65]	eval-error:0.255319	train-error:0.065066
[66]	eval-error:0.255319	train-error:0.065066
[67]	eval-error:0.255319	train-error:0.065066
[68]	eval-error:0.255319	train-error:0.065066
[69]	eval-error:0.255319	train-error:0.065066
[0]	eval-error:0.319149	train-error:0.115802
[1]	eval-error:0.329787	train-error:0.090369
[2]	eval-error:0.37234	train-error:0.076685
[3]	eval-error:0.319149	train-error:0.070101
[4]	eval-error:0.361702	train-error:0.067648
[5]	eval-error:0.351064	train-error:0.066228
[6]	eval-error:0.329787	train-error:0.066099
[7]	eval-error:0.308511	train-error:0.065582
[8]	eval-error:0.319149	train-error:0.06545

[26]	eval-error:0.287234	train-error:0.064937
[27]	eval-error:0.276596	train-error:0.064937
[28]	eval-error:0.276596	train-error:0.064937
[29]	eval-error:0.276596	train-error:0.064937
[30]	eval-error:0.276596	train-error:0.064937
[31]	eval-error:0.276596	train-error:0.064937
[32]	eval-error:0.265957	train-error:0.064937
[33]	eval-error:0.276596	train-error:0.064937
[34]	eval-error:0.276596	train-error:0.064937
[35]	eval-error:0.276596	train-error:0.064937
[36]	eval-error:0.276596	train-error:0.064937
[37]	eval-error:0.265957	train-error:0.064937
[38]	eval-error:0.265957	train-error:0.064937
[39]	eval-error:0.265957	train-error:0.064937
[40]	eval-error:0.265957	train-error:0.064937
[41]	eval-error:0.265957	train-error:0.064937
[42]	eval-error:0.265957	train-error:0.064937
[43]	eval-error:0.297872	train-error:0.064937
[44]	eval-error:0.297872	train-error:0.064937
[45]	eval-error:0.276596	train-error:0.064937
[46]	eval-error:0.265957	train-error:0.064937
[47]	eval-error:0.265957	train-err

[65]	eval-error:0.240458	train-error:0.066776
[66]	eval-error:0.244275	train-error:0.066776
[67]	eval-error:0.240458	train-error:0.066776
[68]	eval-error:0.244275	train-error:0.066776
[69]	eval-error:0.244275	train-error:0.066776
[0]	eval-error:0.28626	train-error:0.100914
[1]	eval-error:0.274809	train-error:0.086845
[2]	eval-error:0.267176	train-error:0.076632
[3]	eval-error:0.270992	train-error:0.071133
[4]	eval-error:0.263359	train-error:0.069133
[5]	eval-error:0.251908	train-error:0.068347
[6]	eval-error:0.255725	train-error:0.06799
[7]	eval-error:0.251908	train-error:0.067776
[8]	eval-error:0.259542	train-error:0.067705
[9]	eval-error:0.259542	train-error:0.067705
[10]	eval-error:0.259542	train-error:0.06749
[11]	eval-error:0.263359	train-error:0.067419
[12]	eval-error:0.263359	train-error:0.067419
[13]	eval-error:0.259542	train-error:0.067419
[14]	eval-error:0.259542	train-error:0.067419
[15]	eval-error:0.278626	train-error:0.067419
[16]	eval-error:0.270992	train-error:0.067419
[

[35]	eval-error:0.375	train-error:0.195821
[36]	eval-error:0.375	train-error:0.195821
[37]	eval-error:0.363636	train-error:0.195821
[38]	eval-error:0.352273	train-error:0.195821
[39]	eval-error:0.352273	train-error:0.195821
[40]	eval-error:0.352273	train-error:0.195821
[41]	eval-error:0.386364	train-error:0.195821
[42]	eval-error:0.386364	train-error:0.195821
[43]	eval-error:0.375	train-error:0.195821
[44]	eval-error:0.375	train-error:0.195821
[45]	eval-error:0.375	train-error:0.195821
[46]	eval-error:0.386364	train-error:0.195821
[47]	eval-error:0.375	train-error:0.195821
[48]	eval-error:0.375	train-error:0.195821
[49]	eval-error:0.375	train-error:0.195821
[50]	eval-error:0.375	train-error:0.195821
[51]	eval-error:0.375	train-error:0.195821
[52]	eval-error:0.375	train-error:0.195821
[53]	eval-error:0.375	train-error:0.195821
[54]	eval-error:0.375	train-error:0.195821
[55]	eval-error:0.375	train-error:0.195821
[56]	eval-error:0.375	train-error:0.195821
[57]	eval-error:0.375	train-error

[11]	eval-error:0.363636	train-error:0.196721
[12]	eval-error:0.352273	train-error:0.196361
[13]	eval-error:0.375	train-error:0.196361
[14]	eval-error:0.375	train-error:0.196361
[15]	eval-error:0.363636	train-error:0.196181
[16]	eval-error:0.375	train-error:0.196181
[17]	eval-error:0.363636	train-error:0.196001
[18]	eval-error:0.375	train-error:0.196001
[19]	eval-error:0.363636	train-error:0.196001
[20]	eval-error:0.363636	train-error:0.196001
[21]	eval-error:0.352273	train-error:0.196001
[22]	eval-error:0.352273	train-error:0.196001
[23]	eval-error:0.363636	train-error:0.196001
[24]	eval-error:0.363636	train-error:0.196001
[25]	eval-error:0.363636	train-error:0.195821
[26]	eval-error:0.363636	train-error:0.196001
[27]	eval-error:0.375	train-error:0.196001
[28]	eval-error:0.386364	train-error:0.195821
[29]	eval-error:0.386364	train-error:0.195821
[30]	eval-error:0.375	train-error:0.195821
[31]	eval-error:0.375	train-error:0.195821
[32]	eval-error:0.363636	train-error:0.195821
[33]	eval

[52]	eval-error:0.361702	train-error:0.175445
[53]	eval-error:0.361702	train-error:0.175445
[54]	eval-error:0.361702	train-error:0.175445
[55]	eval-error:0.361702	train-error:0.175445
[56]	eval-error:0.361702	train-error:0.175445
[57]	eval-error:0.361702	train-error:0.175445
[58]	eval-error:0.361702	train-error:0.175445
[59]	eval-error:0.361702	train-error:0.175445
[60]	eval-error:0.361702	train-error:0.175445
[61]	eval-error:0.361702	train-error:0.175445
[62]	eval-error:0.361702	train-error:0.175445
[63]	eval-error:0.361702	train-error:0.175445
[64]	eval-error:0.351064	train-error:0.175445
[65]	eval-error:0.351064	train-error:0.175445
[66]	eval-error:0.361702	train-error:0.175445
[67]	eval-error:0.361702	train-error:0.175445
[68]	eval-error:0.361702	train-error:0.175445
[69]	eval-error:0.361702	train-error:0.175445
[0]	eval-error:0.361702	train-error:0.235347
[1]	eval-error:0.361702	train-error:0.214433
[2]	eval-error:0.361702	train-error:0.203589
[3]	eval-error:0.361702	train-error:0

[21]	eval-error:0.354962	train-error:0.166048
[22]	eval-error:0.358779	train-error:0.166048
[23]	eval-error:0.377863	train-error:0.166048
[24]	eval-error:0.370229	train-error:0.166119
[25]	eval-error:0.366412	train-error:0.166048
[26]	eval-error:0.374046	train-error:0.166048
[27]	eval-error:0.366412	train-error:0.166048
[28]	eval-error:0.370229	train-error:0.166048
[29]	eval-error:0.374046	train-error:0.166048
[30]	eval-error:0.374046	train-error:0.166048
[31]	eval-error:0.377863	train-error:0.166048
[32]	eval-error:0.377863	train-error:0.166048
[33]	eval-error:0.377863	train-error:0.166048
[34]	eval-error:0.385496	train-error:0.166048
[35]	eval-error:0.385496	train-error:0.166048
[36]	eval-error:0.381679	train-error:0.166048
[37]	eval-error:0.385496	train-error:0.166048
[38]	eval-error:0.385496	train-error:0.166048
[39]	eval-error:0.377863	train-error:0.166048
[40]	eval-error:0.385496	train-error:0.166048
[41]	eval-error:0.381679	train-error:0.166048
[42]	eval-error:0.377863	train-err

[60]	eval-error:0.385496	train-error:0.166048
[61]	eval-error:0.385496	train-error:0.166048
[62]	eval-error:0.370229	train-error:0.166048
[63]	eval-error:0.370229	train-error:0.166048
[64]	eval-error:0.377863	train-error:0.166048
[65]	eval-error:0.374046	train-error:0.166048
[66]	eval-error:0.370229	train-error:0.166048
[67]	eval-error:0.366412	train-error:0.166048
[68]	eval-error:0.366412	train-error:0.166048
[69]	eval-error:0.366412	train-error:0.166048
[0]	eval-error:0.358779	train-error:0.208899
[1]	eval-error:0.377863	train-error:0.194686
[2]	eval-error:0.362595	train-error:0.186902
[3]	eval-error:0.370229	train-error:0.181474
[4]	eval-error:0.358779	train-error:0.178832
[5]	eval-error:0.362595	train-error:0.175546
[6]	eval-error:0.362595	train-error:0.174404
[7]	eval-error:0.354962	train-error:0.17219
[8]	eval-error:0.354962	train-error:0.171904
[9]	eval-error:0.354962	train-error:0.171404
[10]	eval-error:0.354962	train-error:0.170761
[11]	eval-error:0.370229	train-error:0.169333

[30]	eval-error:0.352273	train-error:0.161052
[31]	eval-error:0.352273	train-error:0.161052
[32]	eval-error:0.352273	train-error:0.161052
[33]	eval-error:0.352273	train-error:0.161052
[34]	eval-error:0.352273	train-error:0.161052
[35]	eval-error:0.352273	train-error:0.161052
[36]	eval-error:0.352273	train-error:0.161052
[37]	eval-error:0.352273	train-error:0.161052
[38]	eval-error:0.363636	train-error:0.161052
[39]	eval-error:0.352273	train-error:0.161052
[40]	eval-error:0.352273	train-error:0.161052
[41]	eval-error:0.352273	train-error:0.161052
[42]	eval-error:0.352273	train-error:0.161052
[43]	eval-error:0.352273	train-error:0.161052
[44]	eval-error:0.363636	train-error:0.161052
[45]	eval-error:0.363636	train-error:0.161052
[46]	eval-error:0.363636	train-error:0.161052
[47]	eval-error:0.363636	train-error:0.161052
[48]	eval-error:0.363636	train-error:0.161052
[49]	eval-error:0.363636	train-error:0.161052
[50]	eval-error:0.363636	train-error:0.161052
[51]	eval-error:0.363636	train-err

[69]	eval-error:0.261364	train-error:0.142677
[0]	eval-error:0.404255	train-error:0.164988
[1]	eval-error:0.361702	train-error:0.139556
[2]	eval-error:0.351064	train-error:0.123419
[3]	eval-error:0.382979	train-error:0.11606
[4]	eval-error:0.308511	train-error:0.112832
[5]	eval-error:0.297872	train-error:0.110509
[6]	eval-error:0.287234	train-error:0.110121
[7]	eval-error:0.297872	train-error:0.109347
[8]	eval-error:0.308511	train-error:0.109089
[9]	eval-error:0.287234	train-error:0.108959
[10]	eval-error:0.297872	train-error:0.108959
[11]	eval-error:0.287234	train-error:0.108959
[12]	eval-error:0.287234	train-error:0.108959
[13]	eval-error:0.308511	train-error:0.108959
[14]	eval-error:0.297872	train-error:0.108959
[15]	eval-error:0.297872	train-error:0.108959
[16]	eval-error:0.297872	train-error:0.108959
[17]	eval-error:0.276596	train-error:0.108959
[18]	eval-error:0.276596	train-error:0.108959
[19]	eval-error:0.276596	train-error:0.108959
[20]	eval-error:0.276596	train-error:0.108959

[39]	eval-error:0.37234	train-error:0.136716
[40]	eval-error:0.37234	train-error:0.136716
[41]	eval-error:0.382979	train-error:0.136716
[42]	eval-error:0.382979	train-error:0.136716
[43]	eval-error:0.382979	train-error:0.136716
[44]	eval-error:0.37234	train-error:0.136716
[45]	eval-error:0.37234	train-error:0.136716
[46]	eval-error:0.382979	train-error:0.136716
[47]	eval-error:0.382979	train-error:0.136716
[48]	eval-error:0.382979	train-error:0.136716
[49]	eval-error:0.382979	train-error:0.136716
[50]	eval-error:0.382979	train-error:0.136716
[51]	eval-error:0.37234	train-error:0.136716
[52]	eval-error:0.37234	train-error:0.136716
[53]	eval-error:0.382979	train-error:0.136716
[54]	eval-error:0.382979	train-error:0.136716
[55]	eval-error:0.382979	train-error:0.136716
[56]	eval-error:0.382979	train-error:0.136716
[57]	eval-error:0.382979	train-error:0.136716
[58]	eval-error:0.37234	train-error:0.136716
[59]	eval-error:0.382979	train-error:0.136716
[60]	eval-error:0.382979	train-error:0.13

[9]	eval-error:0.351145	train-error:0.103699
[10]	eval-error:0.343511	train-error:0.103628
[11]	eval-error:0.351145	train-error:0.103628
[12]	eval-error:0.362595	train-error:0.103557
[13]	eval-error:0.354962	train-error:0.103485
[14]	eval-error:0.362595	train-error:0.103485
[15]	eval-error:0.358779	train-error:0.103485
[16]	eval-error:0.354962	train-error:0.103485
[17]	eval-error:0.351145	train-error:0.103485
[18]	eval-error:0.358779	train-error:0.103485
[19]	eval-error:0.358779	train-error:0.103485
[20]	eval-error:0.358779	train-error:0.103485
[21]	eval-error:0.351145	train-error:0.103485
[22]	eval-error:0.366412	train-error:0.103485
[23]	eval-error:0.366412	train-error:0.103485
[24]	eval-error:0.362595	train-error:0.103485
[25]	eval-error:0.366412	train-error:0.103485
[26]	eval-error:0.370229	train-error:0.103485
[27]	eval-error:0.354962	train-error:0.103485
[28]	eval-error:0.358779	train-error:0.103485
[29]	eval-error:0.374046	train-error:0.103485
[30]	eval-error:0.366412	train-erro

[48]	eval-error:0.290076	train-error:0.086416
[49]	eval-error:0.28626	train-error:0.086416
[50]	eval-error:0.290076	train-error:0.086416
[51]	eval-error:0.28626	train-error:0.086416
[52]	eval-error:0.28626	train-error:0.086416
[53]	eval-error:0.282443	train-error:0.086416
[54]	eval-error:0.293893	train-error:0.086416
[55]	eval-error:0.290076	train-error:0.086416
[56]	eval-error:0.290076	train-error:0.086416
[57]	eval-error:0.28626	train-error:0.086416
[58]	eval-error:0.29771	train-error:0.086416
[59]	eval-error:0.290076	train-error:0.086416
[60]	eval-error:0.293893	train-error:0.086416
[61]	eval-error:0.293893	train-error:0.086416
[62]	eval-error:0.293893	train-error:0.086416
[63]	eval-error:0.290076	train-error:0.086416
[64]	eval-error:0.28626	train-error:0.086416
[65]	eval-error:0.290076	train-error:0.086416
[66]	eval-error:0.28626	train-error:0.086416
[67]	eval-error:0.29771	train-error:0.086416
[68]	eval-error:0.29771	train-error:0.086416
[69]	eval-error:0.29771	train-error:0.08641

[18]	eval-error:0.238636	train-error:0.029004
[19]	eval-error:0.238636	train-error:0.029004
[20]	eval-error:0.238636	train-error:0.029004
[21]	eval-error:0.25	train-error:0.029004
[22]	eval-error:0.238636	train-error:0.029004
[23]	eval-error:0.227273	train-error:0.029004
[24]	eval-error:0.238636	train-error:0.029004
[25]	eval-error:0.227273	train-error:0.029004
[26]	eval-error:0.238636	train-error:0.029004
[27]	eval-error:0.227273	train-error:0.029004
[28]	eval-error:0.238636	train-error:0.029004
[29]	eval-error:0.227273	train-error:0.029004
[30]	eval-error:0.261364	train-error:0.029004
[31]	eval-error:0.261364	train-error:0.029004
[32]	eval-error:0.25	train-error:0.029004
[33]	eval-error:0.238636	train-error:0.029004
[34]	eval-error:0.238636	train-error:0.029004
[35]	eval-error:0.25	train-error:0.029004
[36]	eval-error:0.25	train-error:0.029004
[37]	eval-error:0.25	train-error:0.029004
[38]	eval-error:0.25	train-error:0.029004
[39]	eval-error:0.25	train-error:0.029004
[40]	eval-error:

[61]	eval-error:0.276596	train-error:0.03873
[62]	eval-error:0.276596	train-error:0.03873
[63]	eval-error:0.276596	train-error:0.03873
[64]	eval-error:0.276596	train-error:0.03873
[65]	eval-error:0.276596	train-error:0.03873
[66]	eval-error:0.276596	train-error:0.03873
[67]	eval-error:0.276596	train-error:0.03873
[68]	eval-error:0.276596	train-error:0.03873
[69]	eval-error:0.276596	train-error:0.03873
[0]	eval-error:0.287234	train-error:0.100568
[1]	eval-error:0.265957	train-error:0.073845
[2]	eval-error:0.244681	train-error:0.056804
[3]	eval-error:0.244681	train-error:0.049574
[4]	eval-error:0.234043	train-error:0.045443
[5]	eval-error:0.212766	train-error:0.043635
[6]	eval-error:0.223404	train-error:0.042861
[7]	eval-error:0.223404	train-error:0.041441
[8]	eval-error:0.234043	train-error:0.040924
[9]	eval-error:0.223404	train-error:0.04015
[10]	eval-error:0.234043	train-error:0.040021
[11]	eval-error:0.212766	train-error:0.040021
[12]	eval-error:0.223404	train-error:0.039892
[13]	eva

[31]	eval-error:0.308511	train-error:0.03873
[32]	eval-error:0.308511	train-error:0.03873
[33]	eval-error:0.297872	train-error:0.03873
[34]	eval-error:0.297872	train-error:0.03873
[35]	eval-error:0.308511	train-error:0.03873
[36]	eval-error:0.308511	train-error:0.03873
[37]	eval-error:0.319149	train-error:0.03873
[38]	eval-error:0.319149	train-error:0.03873
[39]	eval-error:0.308511	train-error:0.03873
[40]	eval-error:0.319149	train-error:0.03873
[41]	eval-error:0.308511	train-error:0.03873
[42]	eval-error:0.308511	train-error:0.03873
[43]	eval-error:0.308511	train-error:0.03873
[44]	eval-error:0.319149	train-error:0.03873
[45]	eval-error:0.319149	train-error:0.03873
[46]	eval-error:0.319149	train-error:0.03873
[47]	eval-error:0.319149	train-error:0.03873
[48]	eval-error:0.319149	train-error:0.03873
[49]	eval-error:0.319149	train-error:0.03873
[50]	eval-error:0.319149	train-error:0.03873
[51]	eval-error:0.319149	train-error:0.03873
[52]	eval-error:0.319149	train-error:0.03873
[53]	eval-

[1]	eval-error:0.28626	train-error:0.071347
[2]	eval-error:0.263359	train-error:0.05992
[3]	eval-error:0.278626	train-error:0.054349
[4]	eval-error:0.263359	train-error:0.049921
[5]	eval-error:0.255725	train-error:0.048922
[6]	eval-error:0.263359	train-error:0.047493
[7]	eval-error:0.259542	train-error:0.04685
[8]	eval-error:0.263359	train-error:0.046565
[9]	eval-error:0.270992	train-error:0.046351
[10]	eval-error:0.270992	train-error:0.046279
[11]	eval-error:0.251908	train-error:0.046279
[12]	eval-error:0.255725	train-error:0.046279
[13]	eval-error:0.255725	train-error:0.046208
[14]	eval-error:0.263359	train-error:0.046136
[15]	eval-error:0.259542	train-error:0.046136
[16]	eval-error:0.263359	train-error:0.046208
[17]	eval-error:0.255725	train-error:0.046208
[18]	eval-error:0.267176	train-error:0.046136
[19]	eval-error:0.278626	train-error:0.046136
[20]	eval-error:0.263359	train-error:0.046136
[21]	eval-error:0.274809	train-error:0.046136
[22]	eval-error:0.263359	train-error:0.046136


[41]	eval-error:0.318182	train-error:0.29238
[42]	eval-error:0.318182	train-error:0.29238
[43]	eval-error:0.295455	train-error:0.29238
[44]	eval-error:0.295455	train-error:0.29238
[45]	eval-error:0.318182	train-error:0.29238
[46]	eval-error:0.295455	train-error:0.29238
[47]	eval-error:0.295455	train-error:0.29238
[48]	eval-error:0.295455	train-error:0.29238
[49]	eval-error:0.295455	train-error:0.29238
[50]	eval-error:0.295455	train-error:0.29238
[51]	eval-error:0.295455	train-error:0.29238
[52]	eval-error:0.295455	train-error:0.29238
[53]	eval-error:0.295455	train-error:0.29238
[54]	eval-error:0.295455	train-error:0.29238
[55]	eval-error:0.306818	train-error:0.29238
[56]	eval-error:0.318182	train-error:0.29238
[57]	eval-error:0.284091	train-error:0.29238
[58]	eval-error:0.295455	train-error:0.29238
[59]	eval-error:0.284091	train-error:0.29238
[60]	eval-error:0.295455	train-error:0.29238
[61]	eval-error:0.284091	train-error:0.29238
[62]	eval-error:0.284091	train-error:0.29238
[63]	eval-

[12]	eval-error:0.284091	train-error:0.279409
[13]	eval-error:0.295455	train-error:0.279049
[14]	eval-error:0.284091	train-error:0.278689
[15]	eval-error:0.284091	train-error:0.278869
[16]	eval-error:0.306818	train-error:0.278508
[17]	eval-error:0.295455	train-error:0.278689
[18]	eval-error:0.295455	train-error:0.278508
[19]	eval-error:0.295455	train-error:0.278508
[20]	eval-error:0.318182	train-error:0.278508
[21]	eval-error:0.306818	train-error:0.278508
[22]	eval-error:0.306818	train-error:0.278508
[23]	eval-error:0.318182	train-error:0.278508
[24]	eval-error:0.295455	train-error:0.278508
[25]	eval-error:0.306818	train-error:0.278508
[26]	eval-error:0.306818	train-error:0.278508
[27]	eval-error:0.306818	train-error:0.278508
[28]	eval-error:0.306818	train-error:0.278689
[29]	eval-error:0.318182	train-error:0.278508
[30]	eval-error:0.295455	train-error:0.278508
[31]	eval-error:0.306818	train-error:0.278508
[32]	eval-error:0.306818	train-error:0.278508
[33]	eval-error:0.295455	train-err

[52]	eval-error:0.659574	train-error:0.32236
[53]	eval-error:0.659574	train-error:0.32236
[54]	eval-error:0.659574	train-error:0.32236
[55]	eval-error:0.659574	train-error:0.32236
[56]	eval-error:0.659574	train-error:0.32236
[57]	eval-error:0.659574	train-error:0.32236
[58]	eval-error:0.659574	train-error:0.32236
[59]	eval-error:0.659574	train-error:0.32236
[60]	eval-error:0.659574	train-error:0.32236
[61]	eval-error:0.659574	train-error:0.32236
[62]	eval-error:0.659574	train-error:0.32236
[63]	eval-error:0.659574	train-error:0.32236
[64]	eval-error:0.659574	train-error:0.32236
[65]	eval-error:0.659574	train-error:0.32236
[66]	eval-error:0.659574	train-error:0.32236
[67]	eval-error:0.659574	train-error:0.32236
[68]	eval-error:0.659574	train-error:0.32236
[69]	eval-error:0.659574	train-error:0.32236
[0]	eval-error:0.56383	train-error:0.31668
[1]	eval-error:0.56383	train-error:0.316292
[2]	eval-error:0.56383	train-error:0.315905
[3]	eval-error:0.56383	train-error:0.315647
[4]	eval-error:

[23]	eval-error:0.328244	train-error:0.257106
[24]	eval-error:0.332061	train-error:0.257106
[25]	eval-error:0.335878	train-error:0.257106
[26]	eval-error:0.332061	train-error:0.257106
[27]	eval-error:0.328244	train-error:0.257106
[28]	eval-error:0.328244	train-error:0.257106
[29]	eval-error:0.339695	train-error:0.257106
[30]	eval-error:0.335878	train-error:0.257106
[31]	eval-error:0.332061	train-error:0.257106
[32]	eval-error:0.328244	train-error:0.257106
[33]	eval-error:0.332061	train-error:0.257106
[34]	eval-error:0.335878	train-error:0.257106
[35]	eval-error:0.328244	train-error:0.257106
[36]	eval-error:0.324427	train-error:0.257106
[37]	eval-error:0.324427	train-error:0.257106
[38]	eval-error:0.328244	train-error:0.257106
[39]	eval-error:0.332061	train-error:0.257106
[40]	eval-error:0.332061	train-error:0.257106
[41]	eval-error:0.335878	train-error:0.257106
[42]	eval-error:0.332061	train-error:0.257106
[43]	eval-error:0.332061	train-error:0.257106
[44]	eval-error:0.324427	train-err

[62]	eval-error:0.568702	train-error:0.308313
[63]	eval-error:0.568702	train-error:0.308313
[64]	eval-error:0.568702	train-error:0.308313
[65]	eval-error:0.568702	train-error:0.308313
[66]	eval-error:0.568702	train-error:0.308313
[67]	eval-error:0.568702	train-error:0.308313
[68]	eval-error:0.568702	train-error:0.308313
[69]	eval-error:0.568702	train-error:0.308313
[0]	eval-error:0.362595	train-error:0.256178
[1]	eval-error:0.354962	train-error:0.253178
[2]	eval-error:0.358779	train-error:0.250893
[3]	eval-error:0.358779	train-error:0.249322
[4]	eval-error:0.362595	train-error:0.247893
[5]	eval-error:0.366412	train-error:0.246251
[6]	eval-error:0.362595	train-error:0.245751
[7]	eval-error:0.362595	train-error:0.244679
[8]	eval-error:0.351145	train-error:0.244108
[9]	eval-error:0.343511	train-error:0.243751
[10]	eval-error:0.339695	train-error:0.243037
[11]	eval-error:0.351145	train-error:0.243037
[12]	eval-error:0.351145	train-error:0.242537
[13]	eval-error:0.343511	train-error:0.24253

[33]	eval-error:0.215909	train-error:0.022699
[34]	eval-error:0.204545	train-error:0.022699
[35]	eval-error:0.215909	train-error:0.022699
[36]	eval-error:0.215909	train-error:0.022699
[37]	eval-error:0.215909	train-error:0.022699
[38]	eval-error:0.215909	train-error:0.022699
[39]	eval-error:0.215909	train-error:0.022699
[40]	eval-error:0.215909	train-error:0.022699
[41]	eval-error:0.204545	train-error:0.022699
[42]	eval-error:0.204545	train-error:0.022699
[43]	eval-error:0.204545	train-error:0.022699
[44]	eval-error:0.204545	train-error:0.022699
[45]	eval-error:0.215909	train-error:0.022699
[46]	eval-error:0.215909	train-error:0.022699
[47]	eval-error:0.215909	train-error:0.022699
[48]	eval-error:0.204545	train-error:0.022699
[49]	eval-error:0.215909	train-error:0.022699
[50]	eval-error:0.215909	train-error:0.022699
[51]	eval-error:0.215909	train-error:0.022699
[52]	eval-error:0.215909	train-error:0.022699
[53]	eval-error:0.215909	train-error:0.022699
[54]	eval-error:0.227273	train-err

[4]	eval-error:0.287234	train-error:0.039117
[5]	eval-error:0.287234	train-error:0.037697
[6]	eval-error:0.287234	train-error:0.036406
[7]	eval-error:0.265957	train-error:0.035502
[8]	eval-error:0.265957	train-error:0.035115
[9]	eval-error:0.276596	train-error:0.034728
[10]	eval-error:0.276596	train-error:0.034599
[11]	eval-error:0.265957	train-error:0.034599
[12]	eval-error:0.276596	train-error:0.034599
[13]	eval-error:0.276596	train-error:0.034599
[14]	eval-error:0.265957	train-error:0.034469
[15]	eval-error:0.276596	train-error:0.034469
[16]	eval-error:0.265957	train-error:0.034469
[17]	eval-error:0.265957	train-error:0.034469
[18]	eval-error:0.265957	train-error:0.034469
[19]	eval-error:0.287234	train-error:0.034469
[20]	eval-error:0.297872	train-error:0.034469
[21]	eval-error:0.297872	train-error:0.034469
[22]	eval-error:0.297872	train-error:0.034469
[23]	eval-error:0.297872	train-error:0.034469
[24]	eval-error:0.287234	train-error:0.034469
[25]	eval-error:0.308511	train-error:0.0

[43]	eval-error:0.255319	train-error:0.034469
[44]	eval-error:0.265957	train-error:0.034469
[45]	eval-error:0.265957	train-error:0.034469
[46]	eval-error:0.265957	train-error:0.034469
[47]	eval-error:0.265957	train-error:0.034469
[48]	eval-error:0.265957	train-error:0.034469
[49]	eval-error:0.265957	train-error:0.034469
[50]	eval-error:0.265957	train-error:0.034469
[51]	eval-error:0.287234	train-error:0.034469
[52]	eval-error:0.287234	train-error:0.034469
[53]	eval-error:0.287234	train-error:0.034469
[54]	eval-error:0.287234	train-error:0.034469
[55]	eval-error:0.287234	train-error:0.034469
[56]	eval-error:0.287234	train-error:0.034469
[57]	eval-error:0.287234	train-error:0.034469
[58]	eval-error:0.287234	train-error:0.034469
[59]	eval-error:0.297872	train-error:0.034469
[60]	eval-error:0.287234	train-error:0.034469
[61]	eval-error:0.287234	train-error:0.034469
[62]	eval-error:0.297872	train-error:0.034469
[63]	eval-error:0.297872	train-error:0.034469
[64]	eval-error:0.287234	train-err

[13]	eval-error:0.251908	train-error:0.044137
[14]	eval-error:0.248092	train-error:0.044137
[15]	eval-error:0.244275	train-error:0.044137
[16]	eval-error:0.240458	train-error:0.044137
[17]	eval-error:0.229008	train-error:0.044137
[18]	eval-error:0.240458	train-error:0.044137
[19]	eval-error:0.236641	train-error:0.044137
[20]	eval-error:0.248092	train-error:0.044137
[21]	eval-error:0.248092	train-error:0.044137
[22]	eval-error:0.240458	train-error:0.044137
[23]	eval-error:0.236641	train-error:0.044137
[24]	eval-error:0.248092	train-error:0.044137
[25]	eval-error:0.251908	train-error:0.044137
[26]	eval-error:0.255725	train-error:0.044137
[27]	eval-error:0.244275	train-error:0.044137
[28]	eval-error:0.244275	train-error:0.044137
[29]	eval-error:0.248092	train-error:0.044137
[30]	eval-error:0.244275	train-error:0.044137
[31]	eval-error:0.240458	train-error:0.044137
[32]	eval-error:0.248092	train-error:0.044137
[33]	eval-error:0.232824	train-error:0.044137
[34]	eval-error:0.244275	train-err

[52]	eval-error:0.255725	train-error:0.044065
[53]	eval-error:0.259542	train-error:0.044065
[54]	eval-error:0.259542	train-error:0.044065
[55]	eval-error:0.251908	train-error:0.044065
[56]	eval-error:0.251908	train-error:0.044065
[57]	eval-error:0.259542	train-error:0.044065
[58]	eval-error:0.259542	train-error:0.044065
[59]	eval-error:0.259542	train-error:0.044065
[60]	eval-error:0.255725	train-error:0.044065
[61]	eval-error:0.255725	train-error:0.044065
[62]	eval-error:0.259542	train-error:0.044065
[63]	eval-error:0.255725	train-error:0.044065
[64]	eval-error:0.251908	train-error:0.044065
[65]	eval-error:0.248092	train-error:0.044065
[66]	eval-error:0.251908	train-error:0.044065
[67]	eval-error:0.255725	train-error:0.044065
[68]	eval-error:0.255725	train-error:0.044065
[69]	eval-error:0.251908	train-error:0.044065
[0]	eval-error:0.295455	train-error:0.10953
[1]	eval-error:0.261364	train-error:0.082508
[2]	eval-error:0.272727	train-error:0.069177
[3]	eval-error:0.261364	train-error:0.

[25]	eval-error:0.318182	train-error:0.059629
[26]	eval-error:0.306818	train-error:0.059629
[27]	eval-error:0.295455	train-error:0.059629
[28]	eval-error:0.295455	train-error:0.059629
[29]	eval-error:0.295455	train-error:0.059629
[30]	eval-error:0.295455	train-error:0.059629
[31]	eval-error:0.306818	train-error:0.059629
[32]	eval-error:0.318182	train-error:0.059629
[33]	eval-error:0.329545	train-error:0.059629
[34]	eval-error:0.295455	train-error:0.059629
[35]	eval-error:0.318182	train-error:0.059629
[36]	eval-error:0.295455	train-error:0.059629
[37]	eval-error:0.306818	train-error:0.059629
[38]	eval-error:0.306818	train-error:0.059629
[39]	eval-error:0.318182	train-error:0.059629
[40]	eval-error:0.318182	train-error:0.059629
[41]	eval-error:0.318182	train-error:0.059629
[42]	eval-error:0.295455	train-error:0.059629
[43]	eval-error:0.306818	train-error:0.059629
[44]	eval-error:0.306818	train-error:0.059629
[45]	eval-error:0.306818	train-error:0.059629
[46]	eval-error:0.306818	train-err

[66]	eval-error:0.329787	train-error:0.064033
[67]	eval-error:0.329787	train-error:0.064033
[68]	eval-error:0.329787	train-error:0.064033
[69]	eval-error:0.329787	train-error:0.064033
[0]	eval-error:0.340426	train-error:0.108314
[1]	eval-error:0.37234	train-error:0.085205
[2]	eval-error:0.37234	train-error:0.074619
[3]	eval-error:0.340426	train-error:0.068939
[4]	eval-error:0.308511	train-error:0.066486
[5]	eval-error:0.319149	train-error:0.065453
[6]	eval-error:0.329787	train-error:0.064808
[7]	eval-error:0.287234	train-error:0.064937
[8]	eval-error:0.319149	train-error:0.064679
[9]	eval-error:0.308511	train-error:0.064679
[10]	eval-error:0.340426	train-error:0.064679
[11]	eval-error:0.329787	train-error:0.064549
[12]	eval-error:0.308511	train-error:0.064549
[13]	eval-error:0.329787	train-error:0.064549
[14]	eval-error:0.329787	train-error:0.064549
[15]	eval-error:0.319149	train-error:0.064549
[16]	eval-error:0.329787	train-error:0.064549
[17]	eval-error:0.329787	train-error:0.064549


[35]	eval-error:0.255319	train-error:0.064033
[36]	eval-error:0.255319	train-error:0.064033
[37]	eval-error:0.255319	train-error:0.064033
[38]	eval-error:0.255319	train-error:0.064033
[39]	eval-error:0.276596	train-error:0.064033
[40]	eval-error:0.255319	train-error:0.064033
[41]	eval-error:0.265957	train-error:0.064033
[42]	eval-error:0.276596	train-error:0.064033
[43]	eval-error:0.276596	train-error:0.064033
[44]	eval-error:0.276596	train-error:0.064033
[45]	eval-error:0.255319	train-error:0.064033
[46]	eval-error:0.255319	train-error:0.064033
[47]	eval-error:0.265957	train-error:0.064033
[48]	eval-error:0.255319	train-error:0.064033
[49]	eval-error:0.255319	train-error:0.064033
[50]	eval-error:0.255319	train-error:0.064033
[51]	eval-error:0.255319	train-error:0.064033
[52]	eval-error:0.255319	train-error:0.064033
[53]	eval-error:0.265957	train-error:0.064033
[54]	eval-error:0.265957	train-error:0.064033
[55]	eval-error:0.276596	train-error:0.064033
[56]	eval-error:0.276596	train-err

[5]	eval-error:0.263359	train-error:0.066419
[6]	eval-error:0.259542	train-error:0.066205
[7]	eval-error:0.251908	train-error:0.065991
[8]	eval-error:0.255725	train-error:0.065919
[9]	eval-error:0.255725	train-error:0.065848
[10]	eval-error:0.259542	train-error:0.065776
[11]	eval-error:0.274809	train-error:0.065776
[12]	eval-error:0.270992	train-error:0.065776
[13]	eval-error:0.263359	train-error:0.065848
[14]	eval-error:0.255725	train-error:0.065776
[15]	eval-error:0.267176	train-error:0.065776
[16]	eval-error:0.259542	train-error:0.065776
[17]	eval-error:0.255725	train-error:0.065776
[18]	eval-error:0.248092	train-error:0.065776
[19]	eval-error:0.251908	train-error:0.065776
[20]	eval-error:0.255725	train-error:0.065776
[21]	eval-error:0.255725	train-error:0.065776
[22]	eval-error:0.248092	train-error:0.065776
[23]	eval-error:0.270992	train-error:0.065776
[24]	eval-error:0.255725	train-error:0.065776
[25]	eval-error:0.259542	train-error:0.065776
[26]	eval-error:0.263359	train-error:0.

[51]	eval-error:0.170455	train-error:0
[52]	eval-error:0.170455	train-error:0
[53]	eval-error:0.170455	train-error:0
[54]	eval-error:0.170455	train-error:0
[55]	eval-error:0.170455	train-error:0
[56]	eval-error:0.170455	train-error:0
[57]	eval-error:0.170455	train-error:0
[58]	eval-error:0.159091	train-error:0
[59]	eval-error:0.170455	train-error:0
[60]	eval-error:0.170455	train-error:0
[61]	eval-error:0.170455	train-error:0
[62]	eval-error:0.170455	train-error:0
[63]	eval-error:0.170455	train-error:0
[64]	eval-error:0.170455	train-error:0
[65]	eval-error:0.170455	train-error:0
[66]	eval-error:0.159091	train-error:0
[67]	eval-error:0.159091	train-error:0
[68]	eval-error:0.159091	train-error:0
[69]	eval-error:0.159091	train-error:0
[0]	eval-error:0.306818	train-error:0.081787
[1]	eval-error:0.261364	train-error:0.033507
[2]	eval-error:0.238636	train-error:0.015673
[3]	eval-error:0.204545	train-error:0.005765
[4]	eval-error:0.204545	train-error:0.002342
[5]	eval-error:0.204545	train-erro

[49]	eval-error:0.181818	train-error:0
[50]	eval-error:0.181818	train-error:0
[51]	eval-error:0.181818	train-error:0
[52]	eval-error:0.181818	train-error:0
[53]	eval-error:0.181818	train-error:0
[54]	eval-error:0.193182	train-error:0
[55]	eval-error:0.193182	train-error:0
[56]	eval-error:0.193182	train-error:0
[57]	eval-error:0.193182	train-error:0
[58]	eval-error:0.193182	train-error:0
[59]	eval-error:0.181818	train-error:0
[60]	eval-error:0.193182	train-error:0
[61]	eval-error:0.193182	train-error:0
[62]	eval-error:0.193182	train-error:0
[63]	eval-error:0.204545	train-error:0
[64]	eval-error:0.193182	train-error:0
[65]	eval-error:0.193182	train-error:0
[66]	eval-error:0.193182	train-error:0
[67]	eval-error:0.193182	train-error:0
[68]	eval-error:0.193182	train-error:0
[69]	eval-error:0.204545	train-error:0
[0]	eval-error:0.244681	train-error:0.072166
[1]	eval-error:0.276596	train-error:0.035889
[2]	eval-error:0.234043	train-error:0.018978
[3]	eval-error:0.255319	train-error:0.01007
[4

[21]	eval-error:0.308511	train-error:0.003098
[22]	eval-error:0.308511	train-error:0.003098
[23]	eval-error:0.297872	train-error:0.003098
[24]	eval-error:0.287234	train-error:0.003098
[25]	eval-error:0.308511	train-error:0.003098
[26]	eval-error:0.308511	train-error:0.003098
[27]	eval-error:0.308511	train-error:0.003098
[28]	eval-error:0.308511	train-error:0.003098
[29]	eval-error:0.287234	train-error:0.003098
[30]	eval-error:0.287234	train-error:0.003098
[31]	eval-error:0.297872	train-error:0.003098
[32]	eval-error:0.297872	train-error:0.003098
[33]	eval-error:0.308511	train-error:0.003098
[34]	eval-error:0.308511	train-error:0.003098
[35]	eval-error:0.297872	train-error:0.003098
[36]	eval-error:0.319149	train-error:0.003098
[37]	eval-error:0.308511	train-error:0.003098
[38]	eval-error:0.297872	train-error:0.003098
[39]	eval-error:0.287234	train-error:0.003098
[40]	eval-error:0.265957	train-error:0.003098
[41]	eval-error:0.265957	train-error:0.003098
[42]	eval-error:0.265957	train-err

[60]	eval-error:0.217557	train-error:0.007785
[61]	eval-error:0.217557	train-error:0.007785
[62]	eval-error:0.217557	train-error:0.007785
[63]	eval-error:0.217557	train-error:0.007785
[64]	eval-error:0.21374	train-error:0.007785
[65]	eval-error:0.217557	train-error:0.007785
[66]	eval-error:0.21374	train-error:0.007785
[67]	eval-error:0.21374	train-error:0.007785
[68]	eval-error:0.21374	train-error:0.007785
[69]	eval-error:0.206107	train-error:0.007785
[0]	eval-error:0.21374	train-error:0.065562
[1]	eval-error:0.206107	train-error:0.038709
[2]	eval-error:0.194656	train-error:0.025068
[3]	eval-error:0.21374	train-error:0.016283
[4]	eval-error:0.198473	train-error:0.012498
[5]	eval-error:0.20229	train-error:0.010427
[6]	eval-error:0.194656	train-error:0.008784
[7]	eval-error:0.198473	train-error:0.008713
[8]	eval-error:0.198473	train-error:0.008427
[9]	eval-error:0.209924	train-error:0.008142
[10]	eval-error:0.209924	train-error:0.00807
[11]	eval-error:0.20229	train-error:0.007927
[12]	ev

[30]	eval-error:0.21374	train-error:0.007785
[31]	eval-error:0.21374	train-error:0.007785
[32]	eval-error:0.209924	train-error:0.007785
[33]	eval-error:0.209924	train-error:0.007785
[34]	eval-error:0.209924	train-error:0.007785
[35]	eval-error:0.206107	train-error:0.007785
[36]	eval-error:0.21374	train-error:0.007785
[37]	eval-error:0.217557	train-error:0.007785
[38]	eval-error:0.21374	train-error:0.007785
[39]	eval-error:0.21374	train-error:0.007785
[40]	eval-error:0.21374	train-error:0.007785
[41]	eval-error:0.217557	train-error:0.007785
[42]	eval-error:0.217557	train-error:0.007785
[43]	eval-error:0.21374	train-error:0.007785
[44]	eval-error:0.209924	train-error:0.007785
[45]	eval-error:0.217557	train-error:0.007785
[46]	eval-error:0.217557	train-error:0.007785
[47]	eval-error:0.217557	train-error:0.007785
[48]	eval-error:0.225191	train-error:0.007785
[49]	eval-error:0.221374	train-error:0.007785
[50]	eval-error:0.221374	train-error:0.007785
[51]	eval-error:0.217557	train-error:0.00

In [207]:
feature_eval_data_xg_dswp = create_eval_df_from_results_macro(results, False)
feature_eval_data_xg_dswp

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.532468,0.55,0.530888,linguistic
1,max,Wikipedia,0.435897,0.414634,0.459459,linguistic
2,min,Wikipedia,0.428571,0.4125,0.445946,linguistic
3,weighted_mean,Wikipedia,0.486329,0.491182,0.495174,linguistic
4,mean,WikiNews,0.637193,0.660256,0.626263,linguistic
5,max,WikiNews,0.605042,0.652439,0.594697,linguistic
6,min,WikiNews,0.563242,0.572574,0.559975,linguistic
7,weighted_mean,WikiNews,0.545731,0.559072,0.544192,linguistic
8,mean,News,0.620979,0.671484,0.606449,linguistic
9,max,News,0.614258,0.655088,0.601595,linguistic


In [208]:
idx = feature_eval_data_xg_dswp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_xg_dswp['f1']
feature_eval_data_xg_dswp[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
20,mean,News,0.66997,0.692099,0.656813,frequency
67,weighted_mean,WikiNews,0.664025,0.661666,0.666667,wordnet
99,"(weighted_mean, <function <lambda> at 0x000000FF52232E18>)",Wikipedia,0.690359,0.685388,0.695946,semantic


## X.2.2 Random Forest

In [209]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*random_forest(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets]

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    4.0s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    3.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    3.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    5.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    5.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    5.5s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    5.0s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s f

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    7.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    5.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    3.9s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    8.9s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    5.0s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    3.9s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done 1800 out of 180

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   13.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   11.0s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   19.8s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 1800 out of 180

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    6.7s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   10.5s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   11.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    9.8s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    8.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   13.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   27.5s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s f

In [210]:
feature_eval_data_rf = create_eval_df_from_results_macro(results, False)
feature_eval_data_rf

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.453416,0.41954,0.493243,linguistic
1,max,Wikipedia,0.511654,0.590196,0.522201,linguistic
2,min,Wikipedia,0.504923,0.547619,0.515444,linguistic
3,weighted_mean,Wikipedia,0.51875,0.674419,0.528958,linguistic
4,mean,WikiNews,0.538393,0.642045,0.547348,linguistic
5,max,WikiNews,0.580357,0.731061,0.57702,linguistic
6,min,WikiNews,0.502646,0.58764,0.524621,linguistic
7,weighted_mean,WikiNews,0.502646,0.58764,0.524621,linguistic
8,mean,News,0.574391,0.751776,0.570648,linguistic
9,max,News,0.613569,0.785569,0.597434,linguistic


In [211]:
idx = feature_eval_data_rf.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_rf['f1']
feature_eval_data_rf[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
38,"(min, <function agg_feat_num_min at 0x000000FF52232950>)",Wikipedia,0.540399,0.568783,0.537645,corpus
54,min,WikiNews,0.633039,0.739216,0.61553,psycholinguistic
153,"(max, <function agg_feat_num_max at 0x000000FF522328C8>)",News,0.646029,0.790289,0.621793,all


In [212]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*random_forest(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets_dswp]

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   22.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   21.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   21.0s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   23.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   30.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   28.8s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   28.6s finished
[Parallel(n_jobs=

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   28.0s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   24.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   24.0s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   31.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   32.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   27.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    4.5s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    5.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    4.9s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    4.8s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    6.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    9.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    8.7s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.2s f

In [213]:
feature_eval_data_rf_dswp = create_eval_df_from_results_macro(results, False)
feature_eval_data_rf_dswp

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.453416,0.41954,0.493243,linguistic
1,max,Wikipedia,0.45679,0.420455,0.5,linguistic
2,min,Wikipedia,0.446541,0.417647,0.47973,linguistic
3,weighted_mean,Wikipedia,0.45,0.418605,0.486486,linguistic
4,mean,WikiNews,0.469448,0.551282,0.508838,linguistic
5,max,WikiNews,0.426829,0.380435,0.486111,linguistic
6,min,WikiNews,0.469448,0.551282,0.508838,linguistic
7,weighted_mean,WikiNews,0.426829,0.380435,0.486111,linguistic
8,mean,News,0.489785,0.696887,0.521931,linguistic
9,max,News,0.506075,0.731771,0.53086,linguistic


In [214]:
idx = feature_eval_data_rf_dswp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_rf_dswp['f1']
feature_eval_data_rf_dswp[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
14,min,Wikipedia,0.607788,0.605023,0.611004,frequency
20,mean,News,0.620979,0.671484,0.606449,frequency
29,max,WikiNews,0.6834,0.686562,0.680556,language_model


## X.2.3 Random Forest (Extra)

In [215]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*random_forest_extra(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [216]:
feature_eval_data_rfe = create_eval_df_from_results_macro(results, False)
feature_eval_data_rfe

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.45679,0.420455,0.5,linguistic
1,max,Wikipedia,0.51875,0.674419,0.528958,linguistic
2,min,Wikipedia,0.446541,0.417647,0.47973,linguistic
3,weighted_mean,Wikipedia,0.45679,0.420455,0.5,linguistic
4,mean,WikiNews,0.469448,0.551282,0.508838,linguistic
5,max,WikiNews,0.469448,0.551282,0.508838,linguistic
6,min,WikiNews,0.469448,0.551282,0.508838,linguistic
7,weighted_mean,WikiNews,0.430303,0.38172,0.493056,linguistic
8,mean,News,0.521881,0.757143,0.539788,linguistic
9,max,News,0.534573,0.734519,0.54629,linguistic


In [217]:
idx = feature_eval_data_rfe.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_rfe['f1']
feature_eval_data_rfe[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
1,max,Wikipedia,0.51875,0.674419,0.528958,linguistic
137,"(max, <function agg_feat_num_max at 0x000000FF522328C8>)",WikiNews,0.598291,0.9,0.590909,wordnet+psycholinguistic
154,"(min, <function agg_feat_num_min at 0x000000FF52232950>)",News,0.600846,0.775574,0.588506,all


In [218]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*random_forest_extra(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets_dswp]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [219]:
feature_eval_data_rfe_dswp = create_eval_df_from_results_macro(results, False)
feature_eval_data_rfe_dswp

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.45679,0.420455,0.5,linguistic
1,max,Wikipedia,0.45679,0.420455,0.5,linguistic
2,min,Wikipedia,0.45,0.418605,0.486486,linguistic
3,weighted_mean,Wikipedia,0.45679,0.420455,0.5,linguistic
4,mean,WikiNews,0.545894,0.693258,0.554293,linguistic
5,max,WikiNews,0.545894,0.693258,0.554293,linguistic
6,min,WikiNews,0.502646,0.58764,0.524621,linguistic
7,weighted_mean,WikiNews,0.538393,0.642045,0.547348,linguistic
8,mean,News,0.466033,0.518701,0.503294,linguistic
9,max,News,0.481884,0.561924,0.512223,linguistic


In [220]:
idx = feature_eval_data_rfe_dswp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_rfe_dswp['f1']
feature_eval_data_rfe_dswp[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
20,mean,News,0.633566,0.66063,0.621099,frequency
39,"(weighted_mean, <function <lambda> at 0x000000FF52232E18>)",Wikipedia,0.579861,0.574603,0.590734,corpus
66,min,WikiNews,0.687586,0.779299,0.660985,wordnet


## X.2.4 AdaBoost

In [255]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*adaboost(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [256]:
feature_eval_data_ada = create_eval_df_from_results_macro(results, False)
feature_eval_data_ada

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.447876,0.447876,0.447876,linguistic
1,max,Wikipedia,0.50359,0.504386,0.503861,linguistic
2,min,Wikipedia,0.453144,0.451795,0.454633,linguistic
3,weighted_mean,Wikipedia,0.447876,0.447876,0.447876,linguistic
4,mean,WikiNews,0.539498,0.53869,0.541035,linguistic
5,max,WikiNews,0.570776,0.573649,0.568813,linguistic
6,min,WikiNews,0.578895,0.584211,0.575758,linguistic
7,weighted_mean,WikiNews,0.693412,0.700676,0.6875,linguistic
8,mean,News,0.624373,0.664474,0.610524,linguistic
9,max,News,0.604653,0.624662,0.59596,linguistic


In [257]:
idx = feature_eval_data_ada.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_ada['f1']
feature_eval_data_ada[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
7,weighted_mean,WikiNews,0.693412,0.700676,0.6875,linguistic
132,"(mean, <function agg_feat_num_average at 0x000000FF52232D90>)",Wikipedia,0.618056,0.609524,0.633205,wordnet+psycholinguistic
152,"(mean, <function agg_feat_num_average at 0x000000FF52232D90>)",News,0.677394,0.705827,0.661668,all


In [None]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*adaboost(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets_dswp]

In [None]:
feature_eval_data_ada_dswp = create_eval_df_from_results_macro(results)
feature_eval_data_ada_dswp

In [None]:
idx = feature_eval_data_ada_dswp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_ada_dswp['f1']
feature_eval_data_ada_dswp[idx]

## X.2.5 Decision Tree

In [221]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*decision_tree(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets]

  'precision', 'predicted', average, warn_for)


In [222]:
feature_eval_data_dt = create_eval_df_from_results_macro(results, False)
feature_eval_data_dt

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.456485,0.472698,0.458494,linguistic
1,max,Wikipedia,0.526098,0.526471,0.534749,linguistic
2,min,Wikipedia,0.503472,0.504762,0.505792,linguistic
3,weighted_mean,Wikipedia,0.496568,0.505208,0.507722,linguistic
4,mean,WikiNews,0.645797,0.638528,0.661616,linguistic
5,max,WikiNews,0.572163,0.569652,0.579545,linguistic
6,min,WikiNews,0.540117,0.541892,0.539141,linguistic
7,weighted_mean,WikiNews,0.605594,0.604103,0.607323,linguistic
8,mean,News,0.574253,0.578019,0.571689,linguistic
9,max,News,0.583092,0.586423,0.580617,linguistic


In [223]:
idx = feature_eval_data_dt.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_dt['f1']
feature_eval_data_dt[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
14,min,Wikipedia,0.628031,0.620133,0.639961,frequency
105,"(max, <function agg_feat_num_max at 0x000000FF522328C8>)",News,0.657516,0.65187,0.665049,semantic
136,"(mean, <function agg_feat_num_average at 0x000000FF52232D90>)",WikiNews,0.714189,0.733187,0.701389,wordnet+psycholinguistic


In [224]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*decision_tree(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets_dswp]

In [225]:
feature_eval_data_dt_dswp = create_eval_df_from_results_macro(results, False)
feature_eval_data_dt_dswp

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.5338,0.532799,0.541506,linguistic
1,max,Wikipedia,0.525217,0.524658,0.526062,linguistic
2,min,Wikipedia,0.388889,0.4,0.378378,linguistic
3,weighted_mean,Wikipedia,0.571096,0.566362,0.583977,linguistic
4,mean,WikiNews,0.563242,0.572574,0.559975,linguistic
5,max,WikiNews,0.58294,0.580106,0.595328,linguistic
6,min,WikiNews,0.521493,0.523958,0.52904,linguistic
7,weighted_mean,WikiNews,0.623095,0.625245,0.621212,linguistic
8,mean,News,0.596406,0.59525,0.597694,linguistic
9,max,News,0.537863,0.536892,0.541782,linguistic


In [226]:
idx = feature_eval_data_dt_dswp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_dt_dswp['f1']
feature_eval_data_dt_dswp[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
44,"(mean, <function agg_feat_num_average at 0x000000FF52232D90>)",News,0.640014,0.634088,0.648838,corpus
54,min,WikiNews,0.653841,0.660147,0.719697,psycholinguistic
61,max,Wikipedia,0.589868,0.583156,0.612934,wordnet


## X.2.6 Logistic Regression

In [227]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*logistic_regression(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets]

  'precision', 'predicted', average, warn_for)


In [228]:
feature_eval_data_lr = create_eval_df_from_results_macro(results, False)
feature_eval_data_lr

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.51875,0.674419,0.528958,linguistic
1,max,Wikipedia,0.548718,0.593496,0.544402,linguistic
2,min,Wikipedia,0.504923,0.547619,0.515444,linguistic
3,weighted_mean,Wikipedia,0.511654,0.590196,0.522201,linguistic
4,mean,WikiNews,0.509022,0.638889,0.531566,linguistic
5,max,WikiNews,0.545894,0.693258,0.554293,linguistic
6,min,WikiNews,0.469448,0.551282,0.508838,linguistic
7,weighted_mean,WikiNews,0.545894,0.693258,0.554293,linguistic
8,mean,News,0.587791,0.764401,0.579577,linguistic
9,max,News,0.584554,0.740216,0.57715,linguistic


In [229]:
idx = feature_eval_data_lr.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_lr['f1']
feature_eval_data_lr[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
1,max,Wikipedia,0.548718,0.593496,0.544402,linguistic
11,weighted_mean,News,0.61006,0.763145,0.595007,linguistic
125,"[(max, <function agg_feat_num_max at 0x000000FF522328C8>), (max, <function agg_feat_num_max at 0x000000FF522328C8>)]",WikiNews,0.603376,0.713663,0.592803,corpus+semantic


In [230]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*logistic_regression(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets_dswp]

  'precision', 'predicted', average, warn_for)


In [231]:
feature_eval_data_lr_dswp = create_eval_df_from_results_macro(results, False)
feature_eval_data_lr_dswp

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.616493,0.733735,0.593629,linguistic
1,max,Wikipedia,0.557086,0.564935,0.553089,linguistic
2,min,Wikipedia,0.45679,0.420455,0.5,linguistic
3,weighted_mean,Wikipedia,0.576767,0.762745,0.564672,linguistic
4,mean,WikiNews,0.614268,0.614268,0.614268,linguistic
5,max,WikiNews,0.63481,0.632884,0.636995,linguistic
6,min,WikiNews,0.614268,0.614268,0.614268,linguistic
7,weighted_mean,WikiNews,0.65497,0.647595,0.668561,linguistic
8,mean,News,0.447734,0.443615,0.454837,linguistic
9,max,News,0.458075,0.456405,0.460558,linguistic


In [232]:
idx = feature_eval_data_lr_dswp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_lr_dswp['f1']
feature_eval_data_lr_dswp[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
7,weighted_mean,WikiNews,0.65497,0.647595,0.668561,linguistic
23,weighted_mean,News,0.553484,0.551663,0.559639,frequency
121,"[(max, <function agg_feat_num_max at 0x000000FF522328C8>), (max, <function agg_feat_num_max at 0x000000FF522328C8>)]",Wikipedia,0.630816,0.619792,0.677606,corpus+semantic


## X.2.7 SVM

In [251]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*svm(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets]

  'precision', 'predicted', average, warn_for)


In [252]:
feature_eval_data_svm = create_eval_df_from_results_macro(results, False)
feature_eval_data_svm

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.498491,0.521687,0.508687,linguistic
1,max,Wikipedia,0.469298,0.466667,0.474903,linguistic
2,min,Wikipedia,0.557086,0.564935,0.553089,linguistic
3,weighted_mean,Wikipedia,0.532468,0.55,0.530888,linguistic
4,mean,WikiNews,0.627947,0.644385,0.619318,linguistic
5,max,WikiNews,0.63481,0.632884,0.636995,linguistic
6,min,WikiNews,0.42293,0.419298,0.427399,linguistic
7,weighted_mean,WikiNews,0.553363,0.572321,0.551136,linguistic
8,mean,News,0.579631,0.616604,0.572382,linguistic
9,max,News,0.5781,0.601524,0.571602,linguistic


In [253]:
idx = feature_eval_data_svm.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_svm['f1']
feature_eval_data_svm[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
2,min,Wikipedia,0.557086,0.564935,0.553089,linguistic
5,max,WikiNews,0.63481,0.632884,0.636995,linguistic
10,min,News,0.597542,0.630676,0.587812,linguistic


In [None]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*svm(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets_dswp]

In [None]:
feature_eval_data_svm_dswp = create_eval_df_from_results_macro(results)
feature_eval_data_svm_dswp

In [None]:
idx = feature_eval_data_svm_dswp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_svm_dswp['f1']
feature_eval_data_svm_dswp[idx]

## X.2.8 Naive Bayes

In [233]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*naive_bayes(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets]

  'precision', 'predicted', average, warn_for)


In [234]:
feature_eval_data_nb = create_eval_df_from_results_macro(results, False)
feature_eval_data_nb

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.233788,0.58642,0.547297,linguistic
1,max,Wikipedia,0.194139,0.583333,0.527027,linguistic
2,min,Wikipedia,0.365079,0.527302,0.541506,linguistic
3,weighted_mean,Wikipedia,0.207621,0.584337,0.533784,linguistic
4,mean,WikiNews,0.197724,0.36413,0.484217,linguistic
5,max,WikiNews,0.212507,0.448718,0.491162,linguistic
6,min,WikiNews,0.219141,0.41236,0.475379,linguistic
7,weighted_mean,WikiNews,0.220009,0.619565,0.513889,linguistic
8,mean,News,0.176101,0.10687,0.5,linguistic
9,max,News,0.176101,0.10687,0.5,linguistic


In [235]:
idx = feature_eval_data_nb.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_nb['f1']
feature_eval_data_nb[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
15,weighted_mean,Wikipedia,0.59447,0.646384,0.580116,frequency
39,"(weighted_mean, <function <lambda> at 0x000000FF52232E18>)",Wikipedia,0.59447,0.646384,0.580116,corpus
52,mean,WikiNews,0.673759,0.667536,0.682449,psycholinguistic
58,min,News,0.644597,0.643688,0.645544,psycholinguistic


In [236]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*naive_bayes(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets_dswp]

  'precision', 'predicted', average, warn_for)


In [237]:
feature_eval_data_nb_dswp = create_eval_df_from_results_macro(results, False)
feature_eval_data_nb_dswp

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.453416,0.41954,0.493243,linguistic
1,max,Wikipedia,0.498491,0.521687,0.508687,linguistic
2,min,Wikipedia,0.45679,0.420455,0.5,linguistic
3,weighted_mean,Wikipedia,0.45,0.418605,0.486486,linguistic
4,mean,WikiNews,0.479842,0.887097,0.522727,linguistic
5,max,WikiNews,0.474593,0.63587,0.515783,linguistic
6,min,WikiNews,0.433735,0.382979,0.5,linguistic
7,weighted_mean,WikiNews,0.479842,0.887097,0.522727,linguistic
8,mean,News,0.437768,0.392308,0.495146,linguistic
9,max,News,0.436559,0.391892,0.492718,linguistic


In [238]:
idx = feature_eval_data_nb_dswp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_nb_dswp['f1']
feature_eval_data_nb_dswp[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
58,min,News,0.609113,0.603927,0.618845,psycholinguistic
123,"[(weighted_mean, <function <lambda> at 0x000000FF52232E18>), (weighted_mean, <function <lambda> at 0x000000FF52232E18>)]",Wikipedia,0.628138,0.632308,0.624517,corpus+semantic
124,"[(mean, <function agg_feat_num_average at 0x000000FF52232D90>), (mean, <function agg_feat_num_average at 0x000000FF52232D90>)]",WikiNews,0.585737,0.64881,0.578914,corpus+semantic
147,"(weighted_mean, <function <lambda> at 0x000000FF52232E18>)",Wikipedia,0.628138,0.632308,0.624517,all
148,"(mean, <function agg_feat_num_average at 0x000000FF52232D90>)",WikiNews,0.585737,0.64881,0.578914,all


## X.2.9 kNN

In [239]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*knn(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets]

  'precision', 'predicted', average, warn_for)


In [240]:
feature_eval_data_kn = create_eval_df_from_results_macro(results, False)
feature_eval_data_kn

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.511654,0.590196,0.522201,linguistic
1,max,Wikipedia,0.511654,0.590196,0.522201,linguistic
2,min,Wikipedia,0.526294,0.925287,0.535714,linguistic
3,weighted_mean,Wikipedia,0.446541,0.417647,0.47973,linguistic
4,mean,WikiNews,0.439901,0.432026,0.467172,linguistic
5,max,WikiNews,0.472756,0.480952,0.489899,linguistic
6,min,WikiNews,0.415936,0.404464,0.432449,linguistic
7,weighted_mean,WikiNews,0.484388,0.508721,0.503788,linguistic
8,mean,News,0.533063,0.619157,0.540655,linguistic
9,max,News,0.516962,0.585874,0.5293,linguistic


In [241]:
idx = feature_eval_data_kn.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_kn['f1']
feature_eval_data_kn[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
86,min,Wikipedia,0.605128,0.682927,0.586873,brown_clustering
91,weighted_mean,WikiNews,0.713979,0.757384,0.692551,brown_clustering
129,"[(max, <function agg_feat_num_max at 0x000000FF522328C8>), (max, <function agg_feat_num_max at 0x000000FF522328C8>)]",News,0.627793,0.672423,0.612951,corpus+semantic
153,"(max, <function agg_feat_num_max at 0x000000FF522328C8>)",News,0.627793,0.672423,0.612951,all


In [242]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*knn(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets_dswp]

In [243]:
feature_eval_data_kn_dswp = create_eval_df_from_results_macro(results, False)
feature_eval_data_kn_dswp

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.492308,0.504065,0.501931,linguistic
1,max,Wikipedia,0.498491,0.521687,0.508687,linguistic
2,min,Wikipedia,0.511654,0.590196,0.522201,linguistic
3,weighted_mean,Wikipedia,0.435897,0.414634,0.459459,linguistic
4,mean,WikiNews,0.435096,0.425,0.460227,linguistic
5,max,WikiNews,0.519006,0.530357,0.521465,linguistic
6,min,WikiNews,0.411133,0.400422,0.425505,linguistic
7,weighted_mean,WikiNews,0.510417,0.536905,0.519571,linguistic
8,mean,News,0.53048,0.605882,0.538228,linguistic
9,max,News,0.502932,0.532197,0.514736,linguistic


In [244]:
idx = feature_eval_data_kn_dswp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_kn_dswp['f1']
feature_eval_data_kn_dswp[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
23,weighted_mean,News,0.657516,0.678188,0.645458,frequency
47,"(weighted_mean, <function <lambda> at 0x000000FF52232E18>)",News,0.657516,0.678188,0.645458,corpus
98,"(min, <function agg_feat_num_min at 0x000000FF52232950>)",Wikipedia,0.636364,0.632184,0.722008,semantic
138,"(min, <function agg_feat_num_min at 0x000000FF52232950>)",WikiNews,0.688226,0.677923,0.722854,wordnet+psycholinguistic


## X.2.10 MLP

In [245]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*mlp(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets]

  'precision', 'predicted', average, warn_for)


In [246]:
feature_eval_data_mlp = create_eval_df_from_results_macro(results, False)
feature_eval_data_mlp

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.45679,0.420455,0.5,linguistic
1,max,Wikipedia,0.43949,0.415663,0.466216,linguistic
2,min,Wikipedia,0.43949,0.415663,0.466216,linguistic
3,weighted_mean,Wikipedia,0.45679,0.420455,0.5,linguistic
4,mean,WikiNews,0.538393,0.642045,0.547348,linguistic
5,max,WikiNews,0.603376,0.713663,0.592803,linguistic
6,min,WikiNews,0.496429,0.55303,0.517677,linguistic
7,weighted_mean,WikiNews,0.496429,0.55303,0.517677,linguistic
8,mean,News,0.51701,0.676987,0.534934,linguistic
9,max,News,0.5577,0.711307,0.559293,linguistic


In [247]:
idx = feature_eval_data_mlp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_mlp['f1']
feature_eval_data_mlp[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
5,max,WikiNews,0.603376,0.713663,0.592803,linguistic
45,"(max, <function agg_feat_num_max at 0x000000FF522328C8>)",News,0.662398,0.652386,0.683773,corpus
97,"(max, <function agg_feat_num_max at 0x000000FF522328C8>)",Wikipedia,0.617761,0.617761,0.617761,semantic


In [248]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*mlp(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))), average='macro')) for fs in all_fc_datasets_dswp]

  'precision', 'predicted', average, warn_for)


In [249]:
feature_eval_data_mlp_dswp = create_eval_df_from_results_macro(results, False)
feature_eval_data_mlp_dswp

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,mean,Wikipedia,0.45679,0.420455,0.5,linguistic
1,max,Wikipedia,0.45679,0.420455,0.5,linguistic
2,min,Wikipedia,0.45679,0.420455,0.5,linguistic
3,weighted_mean,Wikipedia,0.45679,0.420455,0.5,linguistic
4,mean,WikiNews,0.545894,0.693258,0.554293,linguistic
5,max,WikiNews,0.563713,0.645349,0.563131,linguistic
6,min,WikiNews,0.515583,0.723443,0.53851,linguistic
7,weighted_mean,WikiNews,0.580357,0.731061,0.57702,linguistic
8,mean,News,0.537228,0.776575,0.548717,linguistic
9,max,News,0.531957,0.700794,0.543863,linguistic


In [250]:
idx = feature_eval_data_mlp_dswp.groupby(['dataset'])['f1'].transform(max) == feature_eval_data_mlp_dswp['f1']
feature_eval_data_mlp_dswp[idx]

Unnamed: 0,agg,dataset,f1,prec,rec,zc
47,"(weighted_mean, <function <lambda> at 0x000000FF52232E18>)",News,0.676067,0.665181,0.697555,corpus
102,"(min, <function agg_feat_num_min at 0x000000FF52232950>)",WikiNews,0.641281,0.650175,0.635101,semantic
108,mean,Wikipedia,0.563692,0.5625,0.592664,dictionary


# X. CPI Model for final CWPI System

Here we first load the DS-P datasets

In [83]:
# Load datasets and preprocess data
datasets_original_phrases = load_datasets(['Wikipedia', 'WikiNews', 'News'], 'TrainDev', 'Test', \
                         type_train='phrase', type_test='phrase', header=None)

In [81]:
def compute_predictions(dataset, model):
    test, preds = model(*transform_feat_to_num(
                remove_labels_for_binary_df(dataset.train), 
                remove_labels_for_binary_df(dataset.test)))
    return preds

def concatenate_preds_and_dataframe(dataframe, preds):
    df = dataframe.copy()
    df['prediction'] = preds
    return df

# X.1 Wikipedia

In [61]:
datasets = load_datasets(['Wikipedia'], 'TrainDev', 'Test', type_train='both', type_test='phrase')
datasets = preprocess_datasets(datasets)
datasets_fc_wordnet_wp = compute_features_wordnet(datasets, aggs=agg_weighted)
datasets_fc_dbpedia_wp = compute_features_dbpedia(datasets, aggs=agg_weighted)
datasets_fc_brown_clustering_wp = compute_features_brown_clustering(datasets, aggs=agg_weighted)
datasets_fc_semantic_wp = compute_features_semantic([datasets_fc_wordnet_wp, datasets_fc_dbpedia_wp, 
                                            datasets_fc_brown_clustering_wp])

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [None]:
results_wiki = [Result(fs, fs.fc, fs.agg,
                precision_recall_fscore_support(*xgboost(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
                remove_labels_for_binary_df(fs.test))), average='macro')) for fs in datasets_fc_semantic_wp]

In [70]:
create_eval_df_from_results_macro(results_wiki, False)

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,"(weighted_mean, <function <lambda> at 0x000000AF2B768AE8>)",Wikipedia,0.531006,0.538462,0.529685,semantic


In [None]:
predictions_wiki = compute_predictions(datasets_fc_semantic_wp[0], xgboost)
dataframe_preds_wiki_phrases = concatenate_preds_and_dataframe(datasets_original_phrases[0].test, predictions_wiki)

In [98]:
MAIN_PATH_DATASET = "../cwishareddataset/traindevset/english/"
# Save best Wikipedia features
dataframe_preds_wiki_phrases.to_csv(MAIN_PATH_DATASET+'WikipediaPhrase_Test.tsv', sep='\t', \
                                encoding='utf-8', index=False)

# X.2 WikiNews

In [72]:
datasets = load_datasets(['WikiNews'], 'TrainDev', 'Test', type_train='phrase', type_test='phrase')
datasets = preprocess_datasets(datasets)
datasets_fc_wordnet_wikinews = compute_features_wordnet(datasets, aggs=agg_max)
datasets_fc_dbpedia_wikinews = compute_features_dbpedia(datasets, aggs=agg_max)
datasets_fc_brown_clustering_wikinews = compute_features_brown_clustering(datasets, aggs=agg_max)
datasets_fc_semantic_wikinews = compute_features_semantic([datasets_fc_wordnet_wikinews, 
                                    datasets_fc_dbpedia_wikinews, datasets_fc_brown_clustering_wikinews])

  out=out, **kwargs)
  return umr_maximum(a, axis, None, out, keepdims)
  ret = ret.dtype.type(ret / rcount)


In [None]:
results_wikinews = [Result(fs, fs.fc, fs.agg,
                precision_recall_fscore_support(*xgboost(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
                remove_labels_for_binary_df(fs.test))), average='macro')) for fs in datasets_fc_semantic_wikinews]

In [74]:
create_eval_df_from_results_macro(results_wikinews, False)

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,"(max, <function agg_feat_num_max at 0x000000AF2B768A60>)",WikiNews,0.604191,0.632785,0.594885,semantic


In [None]:
predictions_wikinews = compute_predictions(datasets_fc_semantic_wikinews[0], xgboost)
dataframe_preds_wikinews_phrases = concatenate_preds_and_dataframe(datasets_original_phrases[1].test, predictions_wikinews)

In [99]:
MAIN_PATH_DATASET = "../cwishareddataset/traindevset/english/"
# Save best Wikipedia features
dataframe_preds_wikinews_phrases.to_csv(MAIN_PATH_DATASET+'WikiNewsPhrase_Test.tsv', sep='\t', \
                                encoding='utf-8', index=False)

# X.3 News

In [76]:
datasets = load_datasets(['News'], 'TrainDev', 'Test', type_train='phrase', type_test='phrase')
datasets = preprocess_datasets(datasets)
# 1. Linguistic Features
datasets_fc_linguistic = compute_features_linguistic(datasets, aggs=agg_mean)
# 2. Corpus Features
datasets_fc_frequency = compute_features_frequency(datasets, aggs=agg_mean)
datasets_fc_language_model = compute_features_language_model(datasets, aggs=agg_mean)
datasets_fc_corpus = compute_features_corpus([datasets_fc_frequency, datasets_fc_language_model])
# 3. Psycholinguistic
datasets_fc_psycholinguistic = compute_features_psycholinguistic(datasets, aggs=agg_mean)
# 4. Semantic Features
datasets_fc_wordnet = compute_features_wordnet(datasets, aggs=agg_mean)
datasets_fc_dbpedia = compute_features_dbpedia(datasets, aggs=agg_mean)
datasets_fc_brown_clustering = compute_features_brown_clustering(datasets, aggs=agg_mean)
datasets_fc_semantic = compute_features_semantic([datasets_fc_wordnet, datasets_fc_dbpedia, datasets_fc_brown_clustering])
# 5. Dictionary Features
datasets_fc_dictionary = compute_features_dictionary(datasets, aggs=agg_mean)
#(3) All categories
datasets_fc_all = concat_feature_datasets(datasets_fc_linguistic, datasets_fc_psycholinguistic, \
                            datasets_fc_semantic, datasets_fc_corpus, datasets_fc_dictionary, name='all')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [78]:
results_news = [Result(fs, fs.fc, fs.agg,
                precision_recall_fscore_support(*adaboost(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
                remove_labels_for_binary_df(fs.test))), average='macro')) for fs in datasets_fc_all]

In [79]:
create_eval_df_from_results_macro(results_news, False)

Unnamed: 0,agg,dataset,f1,prec,rec,zc
0,"(mean, <function agg_feat_num_average at 0x000000AF2B768D08>)",News,0.519678,0.520635,0.525341,all


In [94]:
predictions_news = compute_predictions(datasets_fc_all[0], adaboost)
dataframe_preds_news_phrases = concatenate_preds_and_dataframe(datasets_original_phrases[2].test, predictions_news)

In [100]:
MAIN_PATH_DATASET = "../cwishareddataset/traindevset/english/"
# Save best Wikipedia features
dataframe_preds_news_phrases.to_csv(MAIN_PATH_DATASET+'NewsPhrase_Test.tsv', sep='\t', \
                                encoding='utf-8', index=False)

# Misc

In [101]:
len(datasets)

3

In [463]:
# Create correlation matrix
corr_matrix = datasets_fc_all[1].train.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.80) and column not in columns]
# Drop features 
#df.drop(df.columns[to_drop], axis=1)

In [464]:
datasets_fc_all_corr_feats = [FeatureDataset(ds.name, ds.fc, ds.agg, ds.train.drop(to_drop, axis=1),
                 ds.test.drop(to_drop, axis=1)) for ds in datasets_fc_all]

In [None]:
results = [Result(fs, fs.fc, fs.agg,
        precision_recall_fscore_support(*xgboost(*transform_feat_to_num(remove_labels_for_binary_df(fs.train), 
        remove_labels_for_binary_df(fs.test))))) for fs in datasets_fc_all_corr_feats]

In [466]:
feature_eval_data = create_eval_df_from_results(results)
feature_eval_data

Unnamed: 0,dataset,f1,prec,rec,zc
0,Wikipedia,0.718929,0.728682,0.709434,all
1,WikiNews,0.773619,0.775,0.772242,all
2,News,0.806551,0.809035,0.804082,all
