# Context-Aware Complex Word Identification
Here we devise and implement all the relevant methods for evaluating the influence of context words for the complexity of a given target word. Thus, we implement various context definition methods that extract context words for a target based on different ideas (e.g. local context, grammatical context and semantic context). Afterwards we compute features for the context and use these features to represent the context in the classification task.

## 1. Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
import nltk

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 200)

In [2]:
from collections import namedtuple
from collections import defaultdict

Model = namedtuple('Model', 'type, name, dimension, corpus, model')
Dataset = namedtuple('Dataset', 'name, train, test')
FeatureDataset = namedtuple('FeatureDataset', 'name, fc, agg, train, test')
FeatureCategory = namedtuple('FeatureCategory', 'name, func')
Aggregation = namedtuple('Aggregation', 'name, agg')

In [3]:
from nltk import word_tokenize

columns = ['id', 'sentence', "start", "end", "target", 
           "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]

def load_df(path, d_type, header):
    df = pd.read_csv(path, header=header, sep = "\t")
    if len(df.columns) == len(columns):
        df.columns = columns
    if d_type == 'word':
        df = df.loc[df.target.map(lambda target : len(word_tokenize(target)))<=1,]
    elif d_type == 'phrase':
        df = df.loc[df.target.map(lambda target : len(word_tokenize(target)))>1,]
    return df

def load_datasets(names, train_name, test_name, type_train = None, type_test = None, header=None):
    MAIN_PATH_DATASET = "../cwishareddataset/traindevset/english/"
    datasets = [Dataset(name, load_df(MAIN_PATH_DATASET + name + '_' + train_name + '.tsv', type_train, header),
                              load_df(MAIN_PATH_DATASET + name + '_' + test_name + '.tsv', type_test, header))
                              for name in names]
    return datasets

## 2.1 Preprocessing

In [4]:
from nltk.stem.wordnet import *
from nltk import word_tokenize
from functools import lru_cache
from utils import penn_to_wn
import re
import unicodedata
import sys

wordNetLemmatizer = WordNetLemmatizer()

def overlaps(start1, end1, start2, end2):
    return bool(range(max(start1, start2), min(end1, end2)+1))

tbl = dict.fromkeys(i for i in range(sys.maxunicode)
                      if unicodedata.category(chr(i)).startswith('P'))

def remove_punctuation(text):
    return text.translate(tbl)

@lru_cache(maxsize=None)
def targets_with_index(start, end, context):
    curr_pos = 0
    targets = []
    j = 0
    w = 0
    curr_split = ''
    ctx_split = context.split()
    whitespaces = re.findall('\s+', context)
    num_whitespaces = [len(token) for token in whitespaces]
    num_whitespaces.append(1)
    tokens = word_tokenize(context)
    tokens = ['"' if token not in context else token for token in tokens]
    for index, token in enumerate(tokens, 1):
        targets.append((token, index, curr_pos, (curr_pos + len(token))))
        curr_pos += len(token)
        curr_split += token
        if ctx_split[j] == curr_split:
            curr_pos += num_whitespaces[w]
            j += 1
            w += 1
            curr_split = ''
    vals = [(target[0], target[1]) for target in targets \
            if overlaps(start, end, target[2], target[3])]
    return [val for val in vals if val[0] != '"']

@lru_cache(maxsize=None)
def wordnet_pos_tagging(sentence):
    tokens = word_tokenize(sentence)
    return nltk.pos_tag(tokens)

def pos_tags(start, end, sentence):
    wordPOSPairs = wordnet_pos_tagging(sentence)
    targets_index = targets_with_index(start, end, sentence)
    results = [wordPOSPairs[tpl[1]-1][1] for tpl in targets_index]
    filtered_results = [result for result in results 
                        if remove_punctuation(result).strip() and result != 'POS']
    return filtered_results if len(filtered_results) > 0 else None

def wordnet_lemma(target, pos):
    tokens = nltk.word_tokenize(target)
    if pos:
        pos = [penn_to_wn(poss) if penn_to_wn(poss) else 'n' for poss in pos]
        lemmas = [wordNetLemmatizer.lemmatize(token, poss)
                     for token, poss in zip(tokens, pos)]
        return ' '.join(lemmas)
    return target

def preprocessing(dataframe):
    df = dataframe.copy()
    df['p_sentence'] = df.sentence.apply(lambda sent : sent.strip().lower())
    df['sentence'] = df.sentence.apply(lambda sent : sent.replace("''", "``"))
    df['p_target'] = df.target.apply(lambda target : target.strip().lower())
    df['pos_tags'] = df[['start', 'end', 'sentence']].apply(lambda vals : pos_tags(*vals), axis = 1)
    df['pos_tags_pt'] = df.pos_tags.apply(lambda pos : [penn_to_wn(poss) if penn_to_wn(poss) else 'n' for poss in pos])
    df['lemma'] = df[['target', 'pos_tags']].apply(lambda vals : wordnet_lemma(*vals), axis = 1)
    df['p_lemma'] = df.lemma.apply(lambda lemma : lemma.strip().lower())
    return df

In [5]:
def preprocess_datasets(datasets):
    return [Dataset(ds.name, preprocessing(ds.train), 
                             preprocessing(ds.test)) 
                             for ds in datasets]

## 2.2 Context-Token Aggregation
First we define how feature values of multiple context-tokens should be aggreagated.

In [6]:
from nltk.tokenize import word_tokenize

word_freq_wiki = {}
sum_counts = 0
with open("resources/word-freq-dumps/enwiki-20150602-words-frequency.txt", encoding="utf8") as file:
    for line in file:
        word, freq = line.partition(" ")[::2]
        sum_counts+=int(freq)
        word_freq_wiki[word.strip()] = int(freq)
        
def get_unigram_probability(word):
    return word_freq_wiki.get(word,1) / (sum_counts + len(word_freq_wiki))

def agg_ctx_feat_num_average(tokens, func_feature, *args, **kwargs):
    if all(isinstance(tpl, tuple) for tpl in tokens):
        return np.mean([func_feature(token, *args) for token, dist in tokens])
    return np.mean([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_weighted_average(tokens, func_feature, alpha, *args):
    if all(isinstance(tpl, tuple) for tpl in tokens):
        if len(tokens)==1:
            return np.mean([func_feature(token, *args) for token, dist in tokens])
        prob_sum = np.sum([(alpha/(alpha+get_unigram_probability(token))) for token, dist in tokens])
        return np.mean([((alpha/(alpha+get_unigram_probability(token)))/prob_sum) * 
                func_feature(token, *args) for token, dist in tokens])
    prob_sum = np.sum([(alpha/(alpha+get_unigram_probability(token))) for token in tokens])
    return np.mean([((alpha/(alpha+get_unigram_probability(token)))/prob_sum) * 
                func_feature(token, *args) for token in tokens])

agg_ctx_feat_num_weighted_average_medium = lambda target, func_feature, *args: \
                        agg_ctx_feat_num_weighted_average(target, func_feature, 0.0001, *args)

def agg_ctx_feat_num_distance(tokens, func_feature, *args):
    if all(isinstance(tpl, tuple) for tpl in tokens):
        if len(tokens)==1:
            return np.mean([func_feature(token, *args) for token, dist in tokens])
        dist_sum = np.sum(dist for token, dist in tokens)
        probs_sum = np.sum([((dist_sum-dist)/dist_sum) for token, dist in tokens])
        return np.sum([(func_feature(token, *args) * ((1/probs_sum)*((dist_sum-dist)/dist_sum)))
                    for token, dist in tokens])
    return np.mean([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_median(tokens, func_feature, *args):
    if all(isinstance(tpl, tuple) for tpl in tokens):
        return np.median([func_feature(token, *args) for token, dist in tokens])
    return np.median([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_max(tokens, func_feature, *args):
    if all(isinstance(tpl, tuple) for tpl in tokens):
        vals = [func_feature(token, *args) for token, dist in tokens]
        return np.max(vals) if vals else 0
    return np.max([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_min(tokens, func_feature, *args):
    if all(isinstance(tpl, tuple) for tpl in tokens):
        vals = [func_feature(token, *args) for token, dist in tokens]
        return np.min(vals) if vals else 0
    return np.min([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_sum(tokens, func_feature, *args):
    if all(isinstance(tpl, tuple) for tpl in tokens):
        return np.sum([func_feature(token, *args) for token, dist in tokens])
    return np.sum([func_feature(token, *args) for token in tokens])

In [7]:
agg_default = [Aggregation('mean', agg_ctx_feat_num_average)]
agg_distance = [Aggregation('dist', agg_ctx_feat_num_distance)]
agg_weighted = [Aggregation('weighted', agg_ctx_feat_num_weighted_average_medium)]
agg_max = [Aggregation('max', agg_ctx_feat_num_max)]
agg_min = [Aggregation('min', agg_ctx_feat_num_min)]
aggs_small = [Aggregation('mean', agg_ctx_feat_num_average), Aggregation('max', agg_ctx_feat_num_max)]
aggs_all = [Aggregation('mean', agg_ctx_feat_num_average),
            Aggregation('max', agg_ctx_feat_num_max), Aggregation('min', agg_ctx_feat_num_min),
            Aggregation('weighted', agg_ctx_feat_num_weighted_average_medium),
            Aggregation('dist', agg_ctx_feat_num_distance)]

In [8]:
def concat_feature_datasets(*args):
    zipped = zip(*args)
    concat_features = []
    for dataset in zipped:
        df_train = None
        df_test = None
        ctxs = []
        fcs = []
        aggs = []
        for tpl in dataset:
            if not fcs:
                df_train = tpl.train.copy()
                df_test = tpl.test.copy()
            else:
                df_train = pd.concat([df_train, tpl.train.copy()], axis = 1)
                df_test = pd.concat([df_test, tpl.test.copy()], axis = 1)
            ctxs.append(tpl.context)
            fcs.append(tpl.fc)
            aggs.append(tpl.agg)
        concat_features.append(ContextFeatureDataset(tpl.name, ctxs, fcs, aggs,
                    df_train.loc[:,~df_train.columns.duplicated()], 
                    df_test.loc[:,~df_test.columns.duplicated()]))
    return concat_features

## 3. Context Definition and Extraction
Here we compute different kinds of context definitions. For example, as a baseline we extract all tokens from the sentence except the target. A second approach is to use a n preceeding or n succeding tokens, or a combined window apporach were we extract n tokens preceeding and succeding of the target. A more sophisticated apporach involves dependency parsing of the sentence and applying different extraction heuristics. Finally we also implement a context extraction approach exploting FrameNet semantic parsing.

### 3.1 Context Definition

In [9]:
from nltk.tokenize import word_tokenize
from nltk.parse.corenlp import *
import os
from functools import lru_cache

# First make sure that the StanfordCoreNLP Server is running under port 9010
# cd to stanfordCoreNLP directory
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9011 -timeout 15000
parser = CoreNLPDependencyParser(url='http://localhost:9011/')

with open("resources/dictionaries/stopwords_en.txt", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    stop_words = set(content)
    
def overlaps(start1, end1, start2, end2):
    return bool(range(max(start1, start2), min(end1, end2)+1))

def post_process_ctx(context, filtering=True):
    return [token for token in context if 
            (token.isalnum() and (not filtering
        or preprocess_target(token).lower() not in stop_words))]

def preprocess_target(target):
    return target.strip()

def target_index_char_based(start, end, ctx_tokens):
    size = np.sum([len(token) for token in ctx_tokens]) + len(ctx_tokens)
    target_pos = (start + end) / 2
    target_pos_rel = target_pos / size
    return int(target_pos_rel * len(post_process_ctx(ctx_tokens)))

@lru_cache(maxsize=None)
def targets_with_index(start, end, context):
    curr_pos = 0
    targets = []
    j = 0
    w = 0
    curr_split = ''
    ctx_split = context.split()
    whitespaces = re.findall('\s+', context)
    num_whitespaces = [len(token) for token in whitespaces]
    num_whitespaces.append(1)
    tokens = word_tokenize(context)
    tokens = ['"' if token not in context else token for token in tokens]
    for index, token in enumerate(tokens, 1):
        targets.append((token, index, curr_pos, (curr_pos + len(token))))
        curr_pos += len(token)
        curr_split += token
        if ctx_split[j] == curr_split:
            curr_pos += num_whitespaces[w]
            j += 1
            w += 1
            curr_split = ''
    vals = [(target[0], target[1]) for target in targets \
            if overlaps(start, end, target[2], target[3])]
    return [val for val in vals if val[0] != '"']

from joblib import Memory
memory = Memory(location='resources/dependency-cache-corenlp', verbose=0)
@memory.cache
def dependency_parse_with_root(sentence):
    try:
        dependency_parser = parser.raw_parse(sentence)
        dependencies = []
        parsetree = list(dependency_parser)[0]
        for index, node in parsetree.nodes.items():
            for relation, dependant in parsetree.nodes[index]['deps'].items():
                for dep in dependant:
                    triple = ((node['word'], index), relation, \
                              (parsetree.nodes[dep]['word'], dep))
                    dependencies.append(triple)
        return dependencies
    except:
        return []

@lru_cache(maxsize=None)
def dependency_parse(sentence):
    dependencies = dependency_parse_with_root(sentence)
    filtered_dependencies = [triple for triple in dependencies if triple[1] != 'ROOT']
    return filtered_dependencies

def ctx_extraction_sentence(context, target):
    ctx_tokens = word_tokenize(context)
    if target in ctx_tokens:
        ctx_tokens.remove(target)
    return ctx_tokens

def ctx_extraction_sentence_filtered(context, target, start, end, filtering = True):
    context = context[:start] + context[end:]
    ctx_tokens = word_tokenize(context)
    post_ctx_tokens = post_process_ctx(ctx_tokens, filtering)
    return ctx_tokens

def ctx_extraction_hit(context, filtering = True):
    hit_tokens = [token for sentence in context for token in word_tokenize(sentence)]
    post_ctx_tokens = post_process_ctx(hit_tokens, filtering)
    return post_ctx_tokens

def ctx_extraction_window_pre_n(context, target, start, end, 
                            filtering = True, n = 3, dist = True):
    target = preprocess_target(target)
    ctx_tokens = word_tokenize(context[:start])
    post_ctx_tokens = post_process_ctx(ctx_tokens, filtering)
    return [(elem, index) for index, elem in zip(range(n, 0, -1), post_ctx_tokens[-n:])] \
                if dist else post_ctx_tokens[-n:]

def ctx_extraction_window_suc_n(context, target, start, end, 
                            filtering = True, n = 3, dist = True):
    target = preprocess_target(target)
    ctx_tokens = word_tokenize(context[end:])
    post_ctx_tokens = post_process_ctx(ctx_tokens, filtering)
    return [(elem, index) for index, elem in zip(range(1, (n+1)), post_ctx_tokens[:n])] \
                if dist else post_ctx_tokens[:n]

def ctx_extraction_window_pre_suc_n(context, target, start, end, 
                                filtering = True, n = 3, dist = True):
    ctx_tokens_pre = ctx_extraction_window_pre_n(context, target, start, end, filtering, n, dist)
    ctx_tokens_suc = ctx_extraction_window_suc_n(context, target, start, end, filtering, n, dist)
    ctx_tokens_pre.extend(ctx_tokens_suc)
    return ctx_tokens_pre

def ctx_extraction_dep_in(context, target, start, end, dist = True):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    selec_tuples = list(set([triple for triple in triples \
                if triple[2] in targets and triple[0] not in targets]))
    return [(triple[0][0], np.abs(triple[0][1]-triple[2][1])) for triple in selec_tuples] if dist \
                else [triple[0][0] for triple in selec_tuples]

def ctx_extraction_dep_out(context, target, start, end, dist = True):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    selec_tuples = list(set([triple for triple in triples \
                if triple[0] in targets and triple[2] not in targets]))
    return [(triple[2][0], np.abs(triple[0][1]-triple[2][1])) for triple in selec_tuples] if dist \
                else [triple[2][0] for triple in selec_tuples]

def ctx_extraction_dep_in_out(context, target, start, end, dist = True):
    ctx_tokens_in = ctx_extraction_dep_in(context, target, start, end, dist)
    ctx_tokens_out = ctx_extraction_dep_out(context, target, start, end, dist)
    ctx_tokens_in.extend(ctx_tokens_out)
    return list(set(ctx_tokens_in))

def ctx_extraction_dep_recu_in_n_steps(context, target, start, 
                            end, n = 2, dist = True):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    result_tokens = []
    curr_target = targets
    for step in range(0, n):
        step_result = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
    mean_target_index = np.mean([tgt[1] for tgt in targets])
    unique = list(set([result for result in result_tokens if result not in targets]))
    return [(token[0], np.abs(token[1]-mean_target_index))
                    for token in unique] if dist \
                else [token[0] for token in unique]

def ctx_extraction_dep_recu_out_n_steps(context, target, start, 
                        end, n = 2, dist = True):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    result_tokens = []
    curr_target = targets
    for step in range(0, n):
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
    mean_target_index = np.mean([tgt[1] for tgt in targets])
    unique = list(set([result for result in result_tokens if result not in targets]))
    return [(token[0], np.abs(token[1]-mean_target_index))
                    for token in unique] if dist \
                else [token[0] for token in unique]

def ctx_extraction_dep_recu_in_out_n_steps(context, target, start, 
                            end, n = 2, dist = True):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    result_tokens = []
    curr_target = targets
    for step in range(0, n):
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        step_result_out = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        step_result.extend(step_result_out)
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
    mean_target_index = np.mean([tgt[1] for tgt in targets])
    unique = list(set([result for result in result_tokens if result not in targets]))
    return [(token[0], np.abs(token[1]-mean_target_index))
                    for token in unique] if dist \
                else [token[0] for token in unique]

def ctx_extraction_dep_recu_in_cover(context, target, start, 
                        end, cover = 0.1, dist = True):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    ctx_tokens = word_tokenize(context)
    ctx_tokens_post = post_process_ctx(ctx_tokens)
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        if set(step_result) == set(curr_target):
                break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        curr_cover = len(result_tokens) / len(ctx_tokens_post)
    mean_target_index = np.mean([tgt[1] for tgt in targets])
    unique = list(set([result for result in result_tokens if result not in targets]))
    return [(token[0], np.abs(token[1]-mean_target_index))
                    for token in unique] if dist \
                else [token[0] for token in unique]

def ctx_extraction_dep_recu_out_cover(context, target, start, 
                        end, cover = 0.1, dist = True):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    ctx_tokens = word_tokenize(context)
    ctx_tokens_post = post_process_ctx(ctx_tokens)
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        if set(step_result) == set(curr_target):
                break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        curr_cover = len(result_tokens) / len(ctx_tokens_post)
    mean_target_index = np.mean([tgt[1] for tgt in targets])
    unique = list(set([result for result in result_tokens if result not in targets]))
    return [(token[0], np.abs(token[1]-mean_target_index))
                    for token in unique] if dist \
                else [token[0] for token in unique]

def ctx_extraction_dep_recu_in_out_cover(context, target, start,
                        end, cover = 0.1, dist = True):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    ctx_tokens = word_tokenize(context)
    ctx_tokens_post = post_process_ctx(ctx_tokens)
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        step_result_out = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        step_result.extend(step_result_out)
        if set(step_result) == set(curr_target):
                break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        curr_cover = len(result_tokens) / len(ctx_tokens_post)
    mean_target_index = np.mean([tgt[1] for tgt in targets])
    unique = list(set([result for result in result_tokens if result not in targets]))
    return [(token[0], np.abs(token[1]-mean_target_index))
                    for token in unique] if dist \
                else [token[0] for token in unique]

In [10]:
sentence = "Normally, the land will be passed down by future generations in a way " + \
             "that recognizes the community's traditional connection to that country ."
target = 'passed'

print('ctx_etraction_all:')
print(ctx_extraction_sentence_filtered(sentence, target, 28, 34,))

print('ctx_extraction_window_pre_n:')
print(ctx_extraction_window_pre_n(sentence, "Normally", 0, 8, filtering=False))
print(ctx_extraction_window_pre_n(sentence, "the", 11, 14, filtering=False))
print(ctx_extraction_window_pre_n(sentence, "land", 15, 19, filtering=False))
print(ctx_extraction_window_pre_n(sentence, "to", 126, 128, filtering=False))
print(ctx_extraction_window_pre_n(sentence, target, 28, 34, n = 5, filtering=False))

print('ctx_extraction_window_suc_n:')
print(ctx_extraction_window_suc_n(sentence, "country", 135, 142, filtering=False))
print(ctx_extraction_window_suc_n(sentence, "to", 126, 128, filtering=False))
print(ctx_extraction_window_suc_n(sentence, "connection", 115, 125, filtering=False))
print(ctx_extraction_window_suc_n(sentence, "community", 91, 100, n = 5, filtering=False))

print('ctx_extraction_window_pre_suc_n:')
print(ctx_extraction_window_pre_suc_n(sentence, "passed", 28, 34, filtering=False))
print(ctx_extraction_window_pre_suc_n(sentence, "the", 11, 14, filtering=False))
print(ctx_extraction_window_pre_suc_n(sentence, "to", 127, 129, filtering=False))

print('ctx_extraction_dep_in:')
print(ctx_extraction_dep_in(sentence, "land", 15, 19))

print('ctx_extraction_dep_out:')
print(ctx_extraction_dep_out(sentence, target, 28, 34))
print(ctx_extraction_dep_out(sentence, "land", 15, 19))

print('ctx_extraction_dep_in_out:')
print(ctx_extraction_dep_in_out(sentence, "land", 15, 19))

print('ctx_extraction_dep_recu_in_n_steps:')
print(ctx_extraction_dep_recu_in_n_steps(sentence, "the", 11, 14, n = 3))

print('ctx_extraction_dep_recu_out_n_steps:')
print(ctx_extraction_dep_recu_out_n_steps(sentence, "the", 11, 14))

print('ctx_extraction_dep_recu_in_out_n_steps:')
print(ctx_extraction_dep_recu_in_out_n_steps(sentence, "the", 11, 14))

print('ctx_extraction_dep_recu_in_cover:')
print(ctx_extraction_dep_recu_in_cover(sentence, "the", 11, 14, cover=0.1))

print('ctx_extraction_dep_recu_out_cover:')
print(ctx_extraction_dep_recu_out_cover(sentence, "the", 11, 14, cover=0.1))

print('ctx_extraction_dep_recu_in_out_cover:')
print(ctx_extraction_dep_recu_in_out_cover(sentence, "the", 11, 14, cover=0.1))

ctx_etraction_all:
['Normally', ',', 'the', 'land', 'will', 'be', 'pdown', 'by', 'future', 'generations', 'in', 'a', 'way', 'that', 'recognizes', 'the', 'community', "'s", 'traditional', 'connection', 'to', 'that', 'country', '.']
ctx_extraction_window_pre_n:
[]
[('Normally', 3), ('t', 2)]
[('Normally', 3), ('the', 2), ('l', 1)]
[('traditional', 3), ('connection', 2), ('t', 1)]
[('the', 5), ('land', 4), ('will', 3), ('be', 2), ('p', 1)]
ctx_extraction_window_suc_n:
[]
[('that', 1), ('country', 2)]
[('to', 1), ('that', 2), ('country', 3)]
[('s', 1), ('traditional', 2), ('connection', 3), ('to', 4), ('that', 5)]
ctx_extraction_window_pre_suc_n:
[('will', 3), ('be', 2), ('p', 1), ('down', 1), ('by', 2), ('future', 3)]
[('Normally', 3), ('t', 2), ('land', 1), ('will', 2), ('be', 3)]
[('traditional', 3), ('connection', 2), ('to', 1), ('hat', 1), ('country', 2)]
ctx_extraction_dep_in:
[]
ctx_extraction_dep_out:
[]
[]
ctx_extraction_dep_in_out:
[]
ctx_extraction_dep_recu_in_n_steps:
[]
ctx_ex

### 3.2 Context Extraction

After we defined all the context extraction approaches, we can apply them on the actual dataset. To do so, we first extract all the distinct sentences from the actual training set and create a new dataframe containing only the sentence ids, the sentence, the target and all the computed contexts. This also makes it easier to integrate context extraction functions implemented in other languages. Afterwards we can compute the context features and join them back with the target features dataframe.

In [11]:
Context = namedtuple('Context', 'name, params, func')
ContextFeatureCategory = namedtuple('ContextFeatureCategory', 'name, func')
ContextDataset = namedtuple('ContextDataset', 'name, context, train, test')
ContextFeatureDataset = namedtuple('ContextFeatureDataset', 'name, context, fc, agg, train, test')

### (3.2.1) Extraction functions

In [12]:
def ctx_window(dataframe, n, filtering, window_func):
    df = dataframe.copy()
    df['context'] = df.apply(lambda columns : 
                window_func(columns['sentence'], columns['target'], \
                columns['start'], columns['end'],  n = n, filtering = filtering), axis = 1)
    return df

ctx_window_pre_3_nf = Context('ctx_window_pre_n', {'n':3, 'filtering':False}, \
                              lambda dataframe : ctx_window(dataframe, 3, False, ctx_extraction_window_pre_n))
ctx_window_pre_3_f = Context('ctx_window_pre_n', {'n':3, 'filtering':True}, \
                             lambda dataframe : ctx_window(dataframe, 3, True, ctx_extraction_window_pre_n))
ctx_window_suc_n_2_nf = Context('ctx_window_suc_n',  {'n':2, 'filtering':False}, \
                            lambda dataframe : ctx_window(dataframe, 2, False, ctx_extraction_window_suc_n))
ctx_window_pre_suc_n_2_nf = Context('ctx_window_pre_suc_n',  {'n':2, 'filtering':False}, \
                            lambda dataframe : ctx_window(dataframe, 2, False, ctx_extraction_window_pre_suc_n))
ctx_window_pre_suc_n_2_f = Context('ctx_window_pre_suc_n',  {'n':2, 'filtering':True}, \
                            lambda dataframe : ctx_window(dataframe, 2, True, ctx_extraction_window_pre_suc_n))
ctx_window_pre_suc_n_3_f = Context('ctx_window_pre_suc_n',  {'n':3, 'filtering':True}, \
                            lambda dataframe : ctx_window(dataframe, 3, True, ctx_extraction_window_pre_suc_n))
ctx_window_pre_suc_n_4_f = Context('ctx_window_pre_suc_n',  {'n':4, 'filtering':True}, \
                            lambda dataframe : ctx_window(dataframe, 4, True, ctx_extraction_window_pre_suc_n))
ctx_window_pre_suc_n_5_f = Context('ctx_window_pre_suc_n',  {'n':5, 'filtering':True}, \
                            lambda dataframe : ctx_window(dataframe, 5, True, ctx_extraction_window_pre_suc_n))
ctx_window_pre_suc_n_6_f = Context('ctx_window_pre_suc_n',  {'n':6, 'filtering':True}, \
                            lambda dataframe : ctx_window(dataframe, 6, True, ctx_extraction_window_pre_suc_n))


def ctx_dependency(dataframe, filtering, dep_func):
    df = dataframe.copy()
    df['context'] = df.apply(lambda columns : 
            dep_func(columns['sentence'], columns['target'], \
            columns['start'], columns['end']), axis = 1)
    return df

ctx_dep_in_1_nf = Context('ctx_dep_in', {'n':1, 'filtering':False}, \
                              lambda dataframe : ctx_dependency(dataframe, False, ctx_extraction_dep_in))
ctx_dep_out_1_nf = Context('ctx_dep_out', {'n':1, 'filtering':False}, \
                              lambda dataframe : ctx_dependency(dataframe, False, ctx_extraction_dep_out))
ctx_dep_in_out_1_nf = Context('ctx_dep_in_out', {'n':1, 'filtering':False}, \
                              lambda dataframe : ctx_dependency(dataframe, False, ctx_extraction_dep_in_out))

ctx_dep_in_1_f = Context('ctx_dep_in', {'n':1,'filtering':True}, \
                              lambda dataframe : ctx_dependency(dataframe, True, ctx_extraction_dep_in))
ctx_dep_out_1_f = Context('ctx_dep_out', {'n':1,'filtering':True}, \
                              lambda dataframe : ctx_dependency(dataframe, True, ctx_extraction_dep_out))
ctx_dep_in_out_1_f = Context('ctx_dep_in_out', {'n':1,'filtering':True}, \
                              lambda dataframe : ctx_dependency(dataframe, True, ctx_extraction_dep_in_out))

def ctx_dependency_recu_steps(dataframe, n, filtering, dep_func):
    df = dataframe.copy()
    df['context'] = df.apply(lambda columns : 
            dep_func(columns['sentence'], columns['target'], \
            columns['start'], columns['end'], n), axis = 1)
    return df

ctx_dep_in_2_f = Context('ctx_dep_in', {'n':2,'filtering':True}, \
                              lambda dataframe : ctx_dependency_recu_steps(dataframe, 2, True, ctx_extraction_dep_recu_in_n_steps))
ctx_dep_out_2_f = Context('ctx_dep_out', {'n':2,'filtering':True}, \
                              lambda dataframe : ctx_dependency_recu_steps(dataframe, 2, True, ctx_extraction_dep_recu_out_n_steps))
ctx_dep_in_out_2_f = Context('ctx_dep_in_out', {'n':2,'filtering':True}, \
                              lambda dataframe : ctx_dependency_recu_steps(dataframe, 2, True, ctx_extraction_dep_recu_in_out_n_steps))



def ctx_dependency_recu_steps(dataframe, n, filtering, dep_func):
    df = dataframe.copy()
    df['context']  = df.apply(lambda columns : 
                dep_func(columns['sentence'], columns['target'], \
                columns['start'], columns['end'], n=n), axis = 1)
    return df

ctx_dep_rec_in_2_nf = Context('ctx_dep_rec_in_n', {'n':2, 'filtering':False}, \
                lambda dataframe : ctx_dependency_recu_steps(dataframe, 2, False, ctx_extraction_dep_recu_in_n_steps))
ctx_dep_rec_out_2_nf = Context('ctx_dep_rec_out_n', {'n':2, 'filtering':False}, \
                lambda dataframe : ctx_dependency_recu_steps(dataframe, 2, False, ctx_extraction_dep_recu_out_n_steps))
ctx_dep_rec_in_out_2_nf = Context('ctx_dep_rec_in_out_n', {'n':2, 'filtering':False}, \
                lambda dataframe : ctx_dependency_recu_steps(dataframe, 2, False, ctx_extraction_dep_recu_in_out_n_steps))



def ctx_dependency_recu_cover(dataframe, cover, filtering, dep_func):
    df = dataframe.copy()
    df['context']  = df.apply(lambda columns : 
                dep_func(columns['sentence'], columns['target'], \
                columns['start'], columns['end'], cover=cover), axis = 1)
    return df

ctx_dep_rec_in_02_nf = Context('ctx_dep_rec_in_02', {'cover': 0.2, 'filtering':False}, \
                lambda dataframe : ctx_dependency_recu_cover(dataframe, 0.2, False, ctx_extraction_dep_recu_in_cover))
ctx_dep_rec_out_02_nf = Context('ctx_dep_rec_out_02', {'cover': 0.2, 'filtering':False}, \
                lambda dataframe : ctx_dependency_recu_cover(dataframe, 0.2, False, ctx_extraction_dep_recu_out_cover))
ctx_dep_rec_in_out_02_nf = Context('ctx_dep_rec_in_out_02', {'cover': 0.2, 'filtering':False}, \
                lambda dataframe : ctx_dependency_recu_cover(dataframe, 0.2, False, ctx_extraction_dep_recu_in_out_cover))



def ctx_sentence(dataframe, filtering):
    df = dataframe.copy()
    df['context']  = df.apply(lambda columns : 
                ctx_extraction_sentence_filtered(columns['sentence'], columns['target'], \
                columns['start'], columns['end'], filtering=filtering), axis = 1)
    return df

ctx_sentence_nf = Context('ctx_sentence', {'filtering':False}, lambda dataframe : ctx_sentence(dataframe, False))
ctx_sentence_f = Context('ctx_sentence', {'filtering':True}, lambda dataframe : ctx_sentence(dataframe, True))



def ctx_hit(dataframe, filtering):
    df = dataframe.copy()
    df = df.join(df.groupby('id')['sentence'].apply(lambda sentences : \
                    tuple(ctx_extraction_hit(list(set(sentences))))), on='id', rsuffix='_hits')
    df['sentence_hits'] = df.sentence_hits.apply(lambda hits : list(hits))
    df.rename(columns={'sentence_hits':'context'}, inplace=True)
    return df

ctx_hit_nf = Context('ctx_sentence', {'filtering':False}, lambda dataframe : ctx_hit(dataframe, False))

### (3.2.2) Context Preprocessing

In [13]:
import re
import unicodedata
import sys

tbl = dict.fromkeys(i for i in range(sys.maxunicode)
                      if unicodedata.category(chr(i)).startswith('P'))

def remove_punctuation(text):
    return text.translate(tbl)

def preprocess_ctx(context):
    if all(isinstance(tpl, tuple) for tpl in context):
        stripped = [(token.strip().lower(), dist) for token, dist in context]
        return [(token, dist) for token, dist in stripped if remove_punctuation(token)]
    stripped = [token.strip().lower() for token in context]
    return [token for token in stripped if remove_punctuation(token)]

def preprocess_ctx_df(dataframe):
    df = dataframe.copy()
    df['p_context_dist'] = df.context.apply(lambda context : preprocess_ctx(context))
    df['p_context'] = df.context.apply(lambda context : [token for token, dist in preprocess_ctx(context)] \
                                      if all(isinstance(tpl, tuple) for tpl in context) else preprocess_ctx(context))
    return df

In [14]:
def compute_context_datasets(datasets, contexts):
    return [ContextDataset(ds.name, ctx, preprocess_ctx_df(ctx.func(ds.train)), 
                preprocess_ctx_df(ctx.func(ds.test)))
                for ctx in contexts
                for ds in datasets]

## 4. Context Features
After defining all the context definitions and extracting the different kinds of contexts from the sentence, we compute features on the context words. Therefore we first define which of the precomputed contexts to use.

### (4.1) Context (Aggregated Word-Level) Complexity Features
Here we compute features that measure the complexity of the extracted context itself. These features are divded into two categories. First, we compute the most important target features as found in the other notebook on feature importance on the context. The target features are already adapted to MWE, which makes it straightforward to apply them to context of any length and apply proper aggregation. Second, we compute features that are computed on the context alone as, for example, several traditional readability metrics and the number of characters.

In [15]:
from wordmodel import Word
import textatistic

words_mrc_database = {}
with open("resources/mrc-database/mrc2.dct", encoding="utf8") as file:
    for index, line in enumerate(file):
        line = line.strip()
        word, phon, dphon, stress = line[51:].split('|')
        w = Word(
                wid = index,
                nlet = int(line[0:2]),
                nphon = int(line[2:4]),
                nsyl = int(line[4]),
                kf_freq = int(line[5:10]),
                kf_ncats = int(line[10:12]),
                kf_nsamp = int(line[12:15]),
                tl_freq = int(line[15:21]),
                brown_freq = int(line[21:25]),
                fam = int(line[25:28]),
                conc = int(line[28:31]),
                imag = int(line[31:34]),
                meanc = int(line[34:37]),
                meanp = int(line[37:40]),
                aoa = int(line[40:43]),
                tq2 = line[43],
                wtype = line[44],
                pdwtype = line[45],
                alphasyl = line[46],
                status = line[47],
                var = line[48],
                cap = line[49],
                irreg = line[50],
                word=word,
                phon=phon,
                dphon=dphon,
                stress=stress)
        words_mrc_database[w.word.strip().lower()] = w

def mrc_database(target, func, missing_val):
    word = words_mrc_database.get(target.strip().lower())
    val = func(word) if word else missing_val
    return val if val != 0 else missing_val

word_concreteness = {}
with open("resources/word-freq-dumps/concreteness_brysbaert_et_al.txt", encoding="utf8") as file:
    for line in file:
        word, bigram, conc_m, conc_sd, \
        unknown, total, percent_known, \
        subtlex, dom_pos = line.split('\t')
        word_concreteness[word.strip()] = float(conc_m)

word_freq_wiki = {}
freq_sum_wiki = 0
with open("resources/word-freq-dumps/enwiki-20150602-words-frequency.txt", encoding="utf8") as file:
    for line in file:
        word, freq = line.partition(" ")[::2]
        word_freq_wiki[word.strip()] = int(freq)
        freq_sum_wiki+=int(freq)
        
word_freq_simple_wiki = {}
freq_sum_simple_wiki = 0
with open("resources/word-freq-dumps/simple_wiki_word_freqs.txt", encoding="ISO-8859-1") as file:
    for line in file:
        word, freq = line.split()
        word_freq_simple_wiki[word.strip()] = int(freq)
        freq_sum_simple_wiki+=int(freq)
        
word_freq_lang8 = {}
freq_sum_lang8 = 0
with open("resources/word-freq-dumps/word_freqs_lang8.txt", encoding="ISO-8859-1") as file:
    for line in file:
        word, freq = line.split()
        word_freq_lang8[word.strip()] = int(freq)
        freq_sum_lang8+=int(freq)
        
def ratio_cap_letters(target):
    return np.sum([1 for letter in target if letter.isupper()]) / len(target)

def num_vowels(target):
    return np.sum([target.lower().count(vowel) for vowel in 'aeiouy'])
        
def get_dict_count(target, freqs):
    return freqs.get(target.strip().lower(), 0)

subtlex_us = {}
with open("resources/dictionaries/SUBTLEXus.txt", encoding="utf8") as file:
    for line in file:
        word, freq, cd_count, freq_low, cd_low, subtl_wf, lg10_wf, Subtlcd, lg10_cd = line.split('\t')
        subtlex_us[word.strip().lower()] = (int(freq), int(cd_count))
        
subtlex_uk = pd.read_csv("resources/dictionaries/SUBTLEXuk.txt", sep = "\t")
subtlex_uk_dict = dict(zip(subtlex_uk['Spelling'], subtlex_uk['CD_count']))

brown_cluster_word2cluster = {}
brown_cluster_cluster2words = defaultdict(list)
with open("resources/brown-clustering/paths/rcv1.clean-c6000-p1.paths", encoding="utf8") as file:
    for line in file:
        binary_cluster, word, _ = line.split()
        brown_cluster_word2cluster[word] = binary_cluster
        brown_cluster_cluster2words[binary_cluster].append(word)

def brown_clustering_cluster_depth_simple(target):
    cluster = brown_cluster_word2cluster.get(target)
    return int(cluster, 2) if cluster else 0

word_age_of_aquisition = {}
with open("resources/word-freq-dumps/AoA_ratings_Kuperman_et_al_BRM.csv", encoding="utf8") as file:
    for line in file:
        word, occur_total, occur_num, freq_pm, rating_Mean, rating_SD, dunno = line.split()
        word_age_of_aquisition[word.strip()] = float(rating_Mean.replace(',', '.')) if rating_Mean != 'NA' else 0

        
WEIGHT_WIKI_LANG_8 = freq_sum_wiki / freq_sum_lang8
WEIGHT_WIKI_SIMPLE_WIKI = freq_sum_wiki / freq_sum_simple_wiki

def weighted_freq_ratio(target, word_freq_n, word_freq_m, weight):
    freq_n = word_freq_n.get(target.strip().lower(), 1)
    freq_m = word_freq_m.get(target.strip().lower(), 1)
    return -1 + (2 * (freq_n / ((freq_m * weight) + freq_n)))

def wn_synset_avg_lemma_freq(target):
    return np.nan_to_num(np.mean([len(synset.lemmas()) 
            for synset in wn.synsets(target)]))

import pickle

with open('resources/language-models/ngram_char_1.json', 'rb') as fp:
    ngram_char_1 = pickle.load(fp)
    
with open('resources/language-models/ngram_word_1.json', 'rb') as fp:
    ngram_word_1 = pickle.load(fp)

with open('resources/language-models/ngram_char_2.json', 'rb') as fp:
    ngram_char_2 = pickle.load(fp)
    
with open('resources/language-models/ngram_word_2.json', 'rb') as fp:
    ngram_word_2 = pickle.load(fp)
    
with open('resources/language-models/ngram_char_3.json', 'rb') as fp:
    ngram_char_3 = pickle.load(fp)
    
with open('resources/language-models/ngram_char_2_complex.json', 'rb') as fp:
    ngram_char_2_complex = pickle.load(fp)
    
with open('resources/language-models/ngram_char_2_non_complex.json', 'rb') as fp:
    ngram_char_2_non_complex = pickle.load(fp)
    
with open('resources/language-models/ngram_char_2_complex_cleaned.json', 'rb') as fp:
    ngram_char_2_complex_cleaned = pickle.load(fp)
    
with open('resources/language-models/ngram_char_2_non_complex_cleaned.json', 'rb') as fp:
    ngram_char_2_non_complex_cleaned = pickle.load(fp)
    
def kneser_ney_word_uni_gram(target):
    return ngram_word_1.cond_prob(target)

def kneser_ney_word_bi_gram(target):
    words = target.split()
    if len(words) <= 1:
        return ngram_word_2.cond_prob(target)
    return np.mean([ngram_word_2.cond_prob(words[index+1], (word,)) 
                for index, word in enumerate(words) 
                if index <= len(words)-2])
    
def kneser_ney_char_uni_gram_avg(target):
    return np.mean([ngram_char_1.cond_prob(character) 
            for character in target])

def kneser_ney_char_bi_gram_avg(target):
    return np.mean([ngram_char_2.cond_prob(target[index+1], (character,)) 
            for index, character in enumerate(target) if index <= len(target)-2])

def kneser_ney_char_bi_gram_avg_model(target, kn_model):
    return np.mean([kn_model.cond_prob(target[index+1], (character,)) 
            for index, character in enumerate(target) if index <= len(target)-2])

def kneser_ney_char_tri_gram_avg(target):
    return np.mean([ngram_char_3.cond_prob(target[index+2], (character, target[index+1])) 
            for index, character in enumerate(target) if index <= len(target)-3])

  interactivity=interactivity, compiler=compiler, result=result)


In [16]:
def ctx_features_target(dataframe, agg):
    df = dataframe.copy()
    df['ctx_length'] = df.p_context_dist.apply(lambda context : agg(context, len))
    df['ctx_freq_wiki'] = df.p_context_dist.apply(lambda context : agg(context, get_dict_count, word_freq_wiki))
    df['ctx_freq_simple_wiki'] = df.p_context_dist.apply(lambda context : agg(context, get_dict_count, word_freq_simple_wiki))
    df['ctx_ratio_cap_letters'] = df.context.apply(lambda target : agg(target, ratio_cap_letters))
    df['ctx_dict_dale_chall'] = df.p_context_dist.apply(lambda target : agg(target, \
                            lambda target :  0 if textatistic.notdalechall_count(target) >= 1 else 1))
    df['ctx_num_vowels'] = df.p_context_dist.apply(lambda target : agg(target, num_vowels))
    df['ctx_subtlex_cd_us'] = df.p_context_dist.apply(lambda target : agg(target, \
                    lambda target : subtlex_us[target.strip().lower()][1] if subtlex_us.get(target.strip().lower()) else 0))
    df['ctx_subtlex_cd_uk'] = df.p_context_dist.apply(lambda target : agg(target, \
                                    lambda target : subtlex_uk_dict.get(target, 0)))
    df['ctx_is_title'] = df.context.apply(lambda target : agg(target, lambda t : t.istitle()))
    df['ctx_brown_clustering_cluster_depth_simple'] = df.p_context_dist.apply(lambda target : agg(target, \
                                                            brown_clustering_cluster_depth_simple))
    df['ctx_mrc_fam'] = df.p_context_dist.apply(lambda target : agg(target, mrc_database, lambda word : word.fam, 400))
    df['ctx_concreteness'] = df.p_context_dist.apply(lambda target : agg(target, \
                                                 lambda target : word_concreteness.get(target, 2.5)))
    df['ctx_age_of_aquisition'] = df.p_context_dist.apply(lambda target : agg(target, \
                                                    lambda target : word_age_of_aquisition.get(target, 8.5)))
    df['ctx_weighted_wiki_lang8_ratio'] = df.p_context_dist.apply(lambda target : agg(target, \
                                    weighted_freq_ratio, word_freq_wiki, word_freq_lang8, WEIGHT_WIKI_SIMPLE_WIKI))
    df['ctx_wn_synset_avg_lemma_freq'] = df.p_context_dist.apply(lambda target : agg(target, wn_synset_avg_lemma_freq))
    df['ctx_kneser_ney_char_bi_complex'] = df.p_context_dist.apply(lambda target : agg(target, \
                                                    kneser_ney_char_bi_gram_avg_model, ngram_char_2_complex))
    df['ctx_kneser_ney_char_bi_non_complex'] = df.p_context_dist.apply(lambda target : agg(target, \
                                                    kneser_ney_char_bi_gram_avg_model, ngram_char_2_non_complex))
    df['ctx_kneser_ney_char_bi_c_nc_ratio'] = df['ctx_kneser_ney_char_bi_complex'] / \
                                                    df['ctx_kneser_ney_char_bi_non_complex']
    df = df.fillna(0)
    return df

ctx_fc_target = ContextFeatureCategory('context_complexity_from_target', \
                                ctx_features_target)

In [17]:
def compute_ctx_features_target(datasets, aggs = agg_default):
    return [ContextFeatureDataset(ctx_ds.name, ctx_ds.context, 
                    ctx_fc_target, agg, 
                    ctx_fc_target.func(ctx_ds.train, agg.agg),
                    ctx_fc_target.func(ctx_ds.test, agg.agg)) 
                    for ctx_ds in datasets for agg in aggs]

### (4.2) Context-only Complexity Features
Here we compute features that measure the complexity of the extracted context itself. These features are divded into two categories. First, we compute the most important target features as found in the other notebook on feature importance on the context. The target features are already adapted to MWE, which makes it straightforward to apply them to context of any length and apply proper aggregation. Second, we compute features that are computed on the context alone as, for example, several traditional readability metrics and the number of characters.

In [18]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

MAIN_PATH = 'D:/workspace_python/CoWoReId/python/resources/word-embeddings/'

glove_defs = [Model('glove', 'glove.6B.50d.txt', 50, 'wikipedia+gigaword5', None),
              Model('glove', 'glove.6B.300d.txt', 300, 'wikipedia+gigaword5', None)]

glove_models = []
for model in glove_defs:
    glove_file = datapath(MAIN_PATH + model.name)
    tmp_file = get_tmpfile(model.name + '-temp')
    glove2word2vec(glove_file, tmp_file)
    vecs = KeyedVectors.load_word2vec_format(tmp_file)
    glove_models.append(Model(model.type, model.name, model.dimension, model.corpus, vecs))
    print('load model : {}'.format(model.name))
    
print(glove_models)
models = glove_models



load model : glove.6B.50d.txt
load model : glove.6B.300d.txt
[Model(type='glove', name='glove.6B.50d.txt', dimension=50, corpus='wikipedia+gigaword5', model=<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x0000008E33810CF8>), Model(type='glove', name='glove.6B.300d.txt', dimension=300, corpus='wikipedia+gigaword5', model=<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x0000008F3A160BE0>)]


In [19]:
from ngram_representation import ngram_repr_bow_max
from ngram_representation import ngram_repr_wiki_weighted_bow
from ngram_representation import ngram_repr_bow_min
from ngram_representation import missing_strat_random

def word_embedding_ngram_repr(target, model, missing_strat, ngram_repr, *args):
    tokens = word_tokenize(target)
    if len(tokens) > 1:
        # First construct multi-word query
        query = '_'.join(tokens)
        if query in model.vocab:
            return model[query]
        query = '_'.join([token.strip().lower() for token in tokens])
        if query in model.vocab:
            return model[query]
        all_embeddings = [model[word.strip().lower()] 
                              if word.strip().lower() in model.vocab 
                              else missing_strat(word, model.vector_size) 
                              for word in tokens]
        return ngram_repr(all_embeddings, tokens)
    else:
        if target in model.vocab:
            return ngram_repr(model[target], [target])
        else:
            return ngram_repr(model[target.strip().lower()], [target]) \
                if target.strip().lower() in model.vocab \
                else ngram_repr([missing_strat(target, model.vector_size)], [target])

def cosine_similarity(vec_l, vec_r):
    return np.dot(vec_l,vec_r) / (np.linalg.norm(vec_l) \
                * np.linalg.norm(vec_r))

@lru_cache(maxsize=None)
def ctx_target_embed_cos_sims(context, target, model):
    ngram_repr = ngram_repr_bow_max
    missing_strat = missing_strat_random
    embed_target = word_embedding_ngram_repr(target, model, missing_strat, ngram_repr)
    embed_contexts = [word_embedding_ngram_repr(ctx, model, 
                        missing_strat_random, ngram_repr) for ctx in context]
    return [cosine_similarity(embed_target, ctx_embed) for ctx_embed in embed_contexts]

def ctx_target_embed_cos_max(context, target, model):
    cos_sims = ctx_target_embed_cos_sims(tuple(context), target, model)
    return np.min(cos_sims) if cos_sims else 0

def ctx_target_embed_cos_min(context, target, model):
    cos_sims = ctx_target_embed_cos_sims(tuple(context), target, model)
    return np.max(cos_sims) if cos_sims else 0

def ctx_target_embed_cos_mean(context, target, model):
    cos_sims = ctx_target_embed_cos_sims(tuple(context), target, model)
    return np.mean(cos_sims) if cos_sims else 0

In [20]:
from textatistic import Textatistic
    
def ctx_features_context(dataframe, agg):
    df = dataframe.copy()
    df['ctx_len_chars'] = df.p_context.apply(lambda context : np.sum([len(word) for word in context]))
    df['ctx_len_words'] = df.p_context.apply(lambda context : len(context))
    df['ctx_norm_chars'] = df.ctx_len_chars / df.ctx_len_words
#     df['ctx_rb_dalechall_score'] = df.p_context.apply(lambda context : \
#                             Textatistic(' '.join(context) + '.').dalechall_score if context else 7)
#     df['ctx_rb_flesch_score'] = df.p_context.apply(lambda context : \
#                             Textatistic(' '.join(context) + '.').flesch_score if context else 50)
#     df['ctx_rb_fleschkincaid_score'] = df.p_context.apply(lambda context : \
#                             Textatistic(' '.join(context) + '.').fleschkincaid_score if context else 50)
#     df['ctx_rb_gunningfog_score'] = df.p_context.apply(lambda context : \
#                             Textatistic(' '.join(context) + '.').gunningfog_score if context else 14)
#     df['ctx_rb_polysyblword_count'] = df.p_context.apply(lambda context : \
#                             Textatistic(' '.join(context) + '.').polysyblword_count if context else 0)
#     df['ctx_rb_sybl_count'] = df.p_context.apply(lambda context : \
#                             Textatistic(' '.join(context) + '.').sybl_count if context else 0)
#     df['ctx_rb_sybl_count_ratio'] = df.p_context.apply(lambda context : \
#                             Textatistic(' '.join(context) + '.').sybl_count / len(context) if context else 0)
    df['ctx_target_embed_cos_glove_300_min'] = df[['p_context', 'p_target']].apply(lambda vals : \
                                                        ctx_target_embed_cos_min(*vals, models[0].model), axis=1)
    df['ctx_target_embed_cos_glove_300_max'] = df[['p_context', 'p_target']].apply(lambda vals : \
                                                        ctx_target_embed_cos_max(*vals, models[0].model), axis=1)
    df['ctx_target_embed_cos_glove_300_mean'] = df[['p_context', 'p_target']].apply(lambda vals : \
                                                        ctx_target_embed_cos_mean(*vals, models[0].model), axis=1)
    df = df.fillna(0)
    return df

ctx_fc_context = ContextFeatureCategory('context_complexity_from_context', \
                                ctx_features_context)

In [21]:
def compute_ctx_features_context(datasets, aggs = agg_default):
    return [ContextFeatureDataset(ctx_ds.name, ctx_ds.context, 
                    ctx_fc_context, agg, 
                    ctx_fc_context.func(ctx_ds.train, agg.agg),
                    ctx_fc_context.func(ctx_ds.test, agg.agg)) 
                    for ctx_ds in datasets for agg in aggs]

In [22]:
ctx_fc_context_complexity = ContextFeatureCategory('context_complexity', \
                        [ctx_fc_target, ctx_fc_context])

def compute_ctx_features(datasets):
    return [ContextFeatureDataset(ds.name, ds.context, ctx_fc_context_complexity, ds.agg,
                ds.train, ds.test) for ds in concat_feature_datasets(*datasets)]

## 5. Evaluation

In [23]:
from collections import namedtuple
Result = namedtuple('Result', 'dataset, fc, agg, measure')
ContextResult = namedtuple('ContextFeatureDataset', 'name, fc, agg, context, measure')
Dataset = namedtuple('Dataset', 'name, train, test')
FeatureDataset = namedtuple('FeatureDataset', 'name, fc, agg, train, test')
FeatureCategory = namedtuple('FeatureCategory', 'name, func')
Feature = namedtuple('Feature', 'name, fc_name, train, test')
Metric = namedtuple('Metric', 'name, func')

## 5.1 Utility Functions
Here we provide several utility functions for working with the datasets and classification algorithms. For example, we provide functions to clean the datasets from all non-features (such as id, sentence, the annotator information etc.) and functions to transform the feature datasets into a proper representation for the algorithms (such as one-hot-encoding of categorical attributes).

In [24]:
def remove_labels_ctx_for_binary_df(dataframe, drop=[]):
    drop_list = ['id', 'sentence', 'target', 'nat', 'non_nat', 
                 'nat_marked', 'non_nat_marked', 'prob', 'start', 
                 'end', 'p_target', 'lemma', 'p_lemma', 'pos_tags', 'pos_tags_pt',
                 'p_sentence', 'context', 'p_context_dist', 'p_context']
    drop_list.extend(drop)
    df = dataframe.copy()
    df = df.drop(drop_list, axis = 1)
    return df

def remove_labels_ctx_for_regr_df(dataframe, drop=[]):
    drop_list = ['id', 'sentence', 'target', 'nat', 'non_nat', 
                 'nat_marked', 'non_nat_marked', 'binary', 'start', 
                 'end', 'p_target', 'lemma', 'p_lemma', 'pos_tags', 'pos_tags_pt',
                 'p_sentence', 'context', 'p_context_dist', 'p_context']
    drop_list.extend(drop)
    df = dataframe.copy()
    df = df.drop(drop_list, axis = 1)
    return df

def remove_labels_for_binary_df(dataframe, drop=[]):
    drop_list = ['id', 'sentence', 'target', 'nat', 'non_nat', 
                  'nat_marked', 'non_nat_marked', 'prob', 'start', 
                  'end', 'p_target', 'lemma', 'p_lemma', 'pos_tags', 'pos_tags_pt', 'p_sentence']
    drop_list.extend(drop)
    df = dataframe.copy()
    df = df.drop(drop_list, axis = 1)
    return df

def remove_labels_for_regr_df(dataframe, drop=[]):
    drop_list = ['id', 'sentence', 'target', 'nat', 'non_nat', 
                  'nat_marked', 'non_nat_marked', 'binary', 'start', 
                  'end', 'p_target', 'lemma', 'p_lemma', 'pos_tags', 'pos_tags_pt', 'p_sentence']
    drop_list.extend(drop)
    df = dataframe.copy()
    df = df.drop(drop_list, axis = 1)
    return df
    
def transform_feat_to_num(train, test):
    train_copy = train.copy()
    test_copy = test.copy()
    train_copy = train_copy.replace([np.inf, -np.inf], np.nan)
    train_copy = train_copy.fillna(0)
    test_copy = test_copy.replace([np.inf, -np.inf], np.nan)
    test_copy = test_copy.fillna(0)
    shape_train = train.shape
    shape_test = test.shape
    df = train_copy.append(test_copy, ignore_index=True)
    df = pd.get_dummies(df)
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)
    df = df.applymap(lambda x: 1 if x == True else x)
    df = df.applymap(lambda x: 0 if x == False else x)
    return (df.loc[0:(shape_train[0]-1),], 
            df.loc[shape_train[0]:df.shape[0],])

def prep_data(train, test):
    x_train = train.loc[:, train.columns != 'binary']
    y_train = train['binary'].values
    x_test = test.loc[:, test.columns != 'binary']
    y_test = test.binary.values
    return x_train, y_train, x_test, y_test

def create_eval_df_from_results(results, remove_agg=True):
    if remove_agg:
        evaluation = [{'dataset' : result.dataset.name,
                        'zc' : result.fc[0], 'prec' : result.measure[0][1],
                   'rec' : result.measure[1][1], 'f1' : result.measure[2][1]} 
                       for result in results]
    else:
        evaluation = [{'dataset' : result.dataset.name, 'agg' : result.agg[0],
                        'zc' : result.fc[0], 'prec' : result.measure[0][1],
                   'rec' : result.measure[1][1], 'f1' : result.measure[2][1]} 
                       for result in results]
    return pd.DataFrame.from_records(evaluation)

## 5.3 Classification Models

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn import model_selection
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

def xgboost(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    xgtrain = xgb.DMatrix(x_train.values, label=y_train)
    xgtest = xgb.DMatrix(x_test.values, label=y_test)
    xg_test_x = xgb.DMatrix(x_test.values)
    param = {'max_depth': 30, 'eta': 1, 'silent': 1, \
             'objective': 'binary:logistic',  'n_estimators':5000}
    evallist = [(xgtest, 'eval'), (xgtrain, 'train')]
    num_round = 70
    bst = xgb.train(param, xgtrain, num_round, evallist)
    prediction = bst.predict(xg_test_x)
    prediction_binary = list(map(lambda val: 1 if val>0.5 else 0, prediction))
    return y_test, prediction_binary

def adaboost(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    adab = AdaBoostClassifier(base_estimator=None, n_estimators=5000, 
                          learning_rate=1.0, algorithm='SAMME.R',
                          random_state=61231)
    adab.fit(x_train, y_train) 
    prediction = adab.predict(x_test)
    return y_test, prediction

def random_forest(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    x_train = x_train.values.astype(np.float)
    x_test = x_test.values.astype(np.float)
    clf = RandomForestClassifier(max_depth=10, random_state=14521, n_estimators=1800, \
                    verbose=1, min_samples_split=5, min_samples_leaf=4, bootstrap=False)
    clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    prediction_binary = list(map(lambda val: 1 if val>0.5 else 0, prediction))
    return y_test, prediction_binary

def random_forest_extra(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    x_train = x_train.values.astype(np.float)
    x_test = x_test.values.astype(np.float)
    clf = ExtraTreesClassifier(n_estimators=1800, criterion='gini', max_depth=None,
                     min_samples_split=5, min_samples_leaf=4, min_weight_fraction_leaf=0.0,
                     max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
                     min_impurity_split=None, bootstrap=False, oob_score=False,
                     random_state=15325, verbose=0, warm_start=False)
    clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    prediction_binary = list(map(lambda val: 1 if val>0.5 else 0, prediction))
    return y_test, prediction_binary

def svm(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    seed = 7
    svc = SVC(C=10, kernel='rbf', degree=3, gamma='auto', 
            coef0=0.0, shrinking=True, probability=False, tol=0.001, 
            cache_size=200, class_weight=None, verbose=False, max_iter=-1, 
            decision_function_shape='ovr', random_state=41231)
    svc.fit(x_train, y_train) 
    prediction = svc.predict(x_test)
    f1score = f1_score(y_test, prediction)
    return y_test, prediction


# def mlp(train, test):
#     x_train, y_train, x_test, y_test = prep_data(train, test)
#     mlp = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
#           beta_1=0.9, beta_2=0.999, early_stopping=False,
#           epsilon=1e-08, hidden_layer_sizes=(50, 20), learning_rate='constant',
#           learning_rate_init=0.001, max_iter=200, momentum=0.9,
#           nesterovs_momentum=True, power_t=0.5, random_state=54123, shuffle=True,
#           solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
#           warm_start=False)
#     mlp.fit(x_train, y_train) 
#     prediction = mlp.predict(x_test)
#     return y_test, prediction

def mlp(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    mlp = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
          beta_1=0.9, beta_2=0.999, early_stopping=False,
          epsilon=1e-08, hidden_layer_sizes=(50, 20), learning_rate='constant',
          learning_rate_init=0.001, max_iter=200, momentum=0.9,
          nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
          solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
          warm_start=False)
    mlp.fit(x_train, y_train) 
    prediction = mlp.predict(x_test)
    return y_test, prediction

def decision_tree(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    seed = 7
    dt = DecisionTreeClassifier(criterion='gini', splitter='best', 
                                 max_depth=None, min_samples_split=2, 
                                 min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                 max_features=None, random_state=81982, max_leaf_nodes=None, 
                                 min_impurity_decrease=0.0, min_impurity_split=None, 
                                 class_weight=None, presort=False)
    dt.fit(x_train, y_train) 
    prediction = dt.predict(x_test)
    return y_test, prediction


def knn(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', 
                     leaf_size=30, p=2, metric='minkowski')
    knn.fit(x_train, y_train) 
    prediction = knn.predict(x_test)
    return y_test, prediction

def naive_bayes(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    naive_bayes = GaussianNB(priors=None)
    naive_bayes.fit(x_train, y_train) 
    prediction = naive_bayes.predict(x_test)
    return y_test, prediction

def logistic_regression(train, test):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    logistic_regression = LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                                     C=1.0, fit_intercept=True, intercept_scaling=1, 
                                     class_weight=None, random_state=89101, solver='lbfgs',
                                     max_iter=100, verbose=0, 
                                     warm_start=False)
    logistic_regression.fit(x_train, y_train) 
    prediction = logistic_regression.predict(x_test)
    return y_test, prediction


def xgboost_with_bst(train, test, silent):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    xgtrain = xgb.DMatrix(x_train.values, label=y_train, feature_names=x_train.columns.values)
    xgtest = xgb.DMatrix(x_test.values, label=y_test, feature_names=x_test.columns.values)
    xg_test_x = xgb.DMatrix(x_test.values, feature_names=x_test.columns.values)
    param = {'max_depth': 30, 'eta': 1, 'silent': silent, 'objective': 'binary:logistic',  'n_estimators':5000}
    evallist = [(xgtest, 'eval'), (xgtrain, 'train')]
    num_round = 70
    bst = xgb.train(param, xgtrain, num_round, evallist)
    prediction = bst.predict(xg_test_x)
    prediction_binary = list(map(lambda val: 1 if val>0.5 else 0, prediction))
    f1score = precision_recall_fscore_support(y_test, prediction_binary)
    return f1score, bst

def random_forest_with_forest(train, test, label):
    x_train, y_train, x_test, y_test = prep_data(train, test)
    x_train = x_train.as_matrix().astype(np.float)
    x_test = x_test.as_matrix().astype(np.float)
    clf = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=1800, \
                            verbose=1, min_samples_split=5, min_samples_leaf=4, bootstrap=False)
    clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    prediction_binary = list(map(lambda val: 1 if val>0.5 else 0, prediction))
    f1score = precision_recall_fscore_support(y_test, prediction_binary)
    return f1score, clf

In [26]:
def compute_results_target_only(datasets, model):
    results = [Result(fs, '', '', precision_recall_fscore_support(*model(*transform_feat_to_num(\
            remove_labels_for_binary_df(fs.train), 
            remove_labels_for_binary_df(fs.test))))) for fs in datasets]
    evaluation = [{'dataset' : result.dataset.name,
                'prec' : result.measure[0][1],
               'rec' : result.measure[1][1], 'f1' : result.measure[2][1]} 
                   for result in results]
    return pd.DataFrame.from_records(evaluation)
    
def compute_results_with_context(datasets, model):
    results = [ContextResult(fs, fs.fc, fs.agg, fs.context,
    precision_recall_fscore_support(*model(*transform_feat_to_num(
        remove_labels_ctx_for_binary_df(fs.train), 
        remove_labels_ctx_for_binary_df(fs.test))))) for fs in datasets]
    evaluation = [{'dataset' : result.name, 'agg' : result.agg[0],
                    'zc' : result.fc, 'context':result.context , 'prec' : result.measure[0][1],
               'rec' : result.measure[1][1], 'f1' : result.measure[2][1]} 
                   for result in results]
    return pd.DataFrame.from_records(evaluation)

In [27]:
def compute_ctx_with_higher_f1_fraction(target_results, ctx_results):
    domains = ['WikipediaFA', 'WikiNewsFA', 'NewsFA']
    curr = ctx_results
    curr_feature_res = target_results
    for domain in domains:
        target_f1 = curr_feature_res.loc[curr_feature_res.dataset==domain,'f1'].values[0]
        curr_domain = curr.loc[curr.dataset.map(lambda val : val.name)==domain,]
        rows = curr_domain.shape[0]
        ctx_higher_f1 = curr_domain.loc[curr_domain['f1']>target_f1,]
        better_rows = ctx_higher_f1.shape[0]
        print('{} : {} ({}/{})'.format(domain, better_rows/rows, better_rows, rows))

In [28]:
def compute_ctx_with_higher_f1(ctx_results):
    domains = ['WikipediaFA', 'WikiNewsFA', 'NewsFA']
    curr = ctx_results
    for domain in domains:
        curr_domain = curr.loc[curr.dataset.map(lambda val : val.name)==domain,]
        ctx_higher = curr_domain.loc[curr_domain['f1']>=np.max(curr_domain['f1']),['agg','context', 'f1']]
        print('{} : {}, {}, {}'.format(domain, ctx_higher['agg'], ctx_higher['context'], ctx_higher['f1']))

In [123]:
fs = datasets[0]
train, test = (remove_labels_for_binary_df(fs.train), 
            remove_labels_for_binary_df(fs.test))

## 5.4 Experiments

In [29]:
# Load datasets and preprocess data
datasets = load_datasets(['WikipediaFA', 'WikiNewsFA', 'NewsFA'], 'Train', 'Dev', \
                         type_train='word', type_test='word', header=0)
datasets = preprocess_datasets(datasets)
# Define contexts that should be used and compute them
# Also include only pre and suc window
contexts = [ctx_sentence_f, ctx_window_pre_suc_n_2_f,ctx_window_pre_suc_n_3_f, \
            ctx_window_pre_suc_n_4_f,ctx_window_pre_suc_n_5_f, ctx_window_pre_suc_n_6_f, \
            ctx_dep_in_1_f, ctx_dep_out_1_f, ctx_dep_in_out_1_f,
            ctx_dep_in_2_f, ctx_dep_out_2_f, ctx_dep_in_out_2_f]
print('Datasets loaded')
context_datasets = compute_context_datasets(datasets, contexts)
print('Contexts loaded')
datasets_fc_target = compute_ctx_features_target(context_datasets, aggs=aggs_all)
print('targetsfes loaded')
datasets_fc_context = compute_ctx_features_context(context_datasets, aggs=aggs_all)
print('contextfes loaded')
datasets_fc_all = compute_ctx_features([datasets_fc_target, datasets_fc_context])

Datasets loaded
Contexts loaded


  out=out, **kwargs)


targetsfes loaded
contextfes loaded


In [49]:
train = datasets_fc_context[30].train
train.loc[train.target=='theater',['target', 'p_context', \
    'ctx_target_embed_cos_glove_300_min', 'ctx_target_embed_cos_glove_300_max', 'ctx_target_embed_cos_glove_300_mean']]

Unnamed: 0,target,p_context,ctx_target_embed_cos_glove_300_min,ctx_target_embed_cos_glove_300_max,ctx_target_embed_cos_glove_300_mean
393,theater,"[tradition, comic, film]",0.702274,0.359093,0.519586
708,theater,"[carpets, wallpaper, sets, termed]",0.354014,0.120082,0.185138
4831,theater,"[film, television, radio, capacity]",0.702274,0.37506,0.546904


## (5.4.1) Random Forest

In [51]:
feature_results_rf = compute_results_target_only(datasets, random_forest)
feature_results_rf

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   40.2s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   53.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s finished


Unnamed: 0,dataset,f1,prec,rec
0,WikipediaFA,0.723404,0.742063,0.70566
1,WikiNewsFA,0.794326,0.791519,0.797153
2,NewsFA,0.832661,0.822709,0.842857


In [41]:
feature_context_results_rf = compute_results_with_context(datasets_fc_all, random_forest)

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   50.8s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   45.0s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   43.7s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   52.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   50.9s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.0min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s f

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   51.0s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   47.5s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   46.1s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   52.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s f

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.0min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   49.4s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s f

[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.0min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.0min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.9min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    0.6s f

In [106]:
feature_context_results_rf

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.726214,0.748000,0.705660,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
1,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.722330,0.744000,0.701887,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
2,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.718447,0.740000,0.698113,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
3,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.720930,0.741036,0.701887,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
4,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.726214,0.748000,0.705660,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.793594,0.793594,0.793594,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.797153,0.797153,0.797153,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.797153,0.797153,0.797153,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.792857,0.795699,0.790036,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.793594,0.793594,0.793594,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [109]:
res['agg'].dtype

dtype('O')

In [114]:
res = feature_context_results_rf
res.loc[(res.dataset.map(lambda val : val.name)=='NewsFA'),].groupby('agg').mean()

Unnamed: 0_level_0,f1,prec,rec
agg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)",0.828968,0.823822,0.834184
"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)",0.82986,0.823269,0.836565
"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)",0.829546,0.823642,0.835544
"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)",0.829832,0.823377,0.836395
"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)",0.829054,0.823502,0.834694


In [96]:
res = feature_context_results_rf
res.loc[(res.dataset.map(lambda val : val.name)=='WikiNewsFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.793594,0.793594,0.793594,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.797153,0.797153,0.797153,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.797153,0.797153,0.797153,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.792857,0.795699,0.790036,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.793594,0.793594,0.793594,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
20,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.793594,0.793594,0.793594,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
21,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.795737,0.794326,0.797153,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
22,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.795737,0.794326,0.797153,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
23,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.797153,0.797153,0.797153,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
24,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.79929,0.797872,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [147]:
res = feature_context_results_rf
res.loc[(res.dataset.map(lambda val : val.name)=='NewsFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
10,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.828077,0.825558,0.830612,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
11,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.828077,0.825558,0.830612,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
12,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.829268,0.825911,0.832653,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
13,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.827586,0.822581,0.832653,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
14,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.828077,0.825558,0.830612,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
25,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.831143,0.823647,0.838776,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
26,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.829465,0.820359,0.838776,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
27,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.830303,0.822,0.838776,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
28,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.827795,0.817097,0.838776,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
29,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.827098,0.819639,0.834694,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [52]:
compute_ctx_with_higher_f1_fraction(feature_results_rf, feature_context_results_rf)

WikipediaFA : 0.8 (48/60)
WikiNewsFA : 0.6833333333333333 (41/60)
NewsFA : 0.08333333333333333 (5/60)


In [71]:
compute_ctx_with_higher_f1(feature_context_results_rf)

WikipediaFA : 49    (dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)
Name: agg, dtype: object, 49    [(ctx_window_pre_suc_n, {'n': 4, 'filtering': True}, <function <lambda> at 0x0000008E330636A8>), (ctx_window_pre_suc_n, {'n': 4, 'filtering': True}, <function <lambda> at 0x0000008E330636A8>)]
Name: context, dtype: object, 49    0.735409
Name: f1, dtype: float64
WikiNewsFA : 158    (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
Name: agg, dtype: object, 158    [(ctx_dep_out, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063D08>), (ctx_dep_out, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063D08>)]
Name: context, dtype: object, 158    0.802139
Name: f1, dtype: float64
NewsFA : 148    (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
Name: agg, dtype: object, 148    [(ctx_dep_in, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063C80>), (ctx_dep_in, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E3

## (5.4.2) Random Forest Extra

In [54]:
feature_results_rfe = compute_results_target_only(datasets, random_forest_extra)
feature_results_rfe

Unnamed: 0,dataset,f1,prec,rec
0,WikipediaFA,0.744722,0.757812,0.732075
1,WikiNewsFA,0.79929,0.797872,0.800712
2,NewsFA,0.832821,0.837113,0.828571


In [122]:
res = feature_context_results_rfe
res.loc[(res.dataset.map(lambda val : val.name)=='WikiNewsFA')&(res.context.map(lambda val : val[0].name)=='ctx_dep_in_out'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
125,"(mean, <function agg_ctx_feat_num_average at 0x000000E29297FD08>)","[(ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>), (ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>)]","(WikiNewsFA, [(ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>), (ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>)], (context_complexi...",0.800712,0.800712,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x000000E2C6D7BAE8>), (context_complexity_from_context, <function ctx_features_context at 0x000000E3C89F6378..."
126,"(max, <function agg_ctx_feat_num_max at 0x000000E298B13048>)","[(ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>), (ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>)]","(WikiNewsFA, [(ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>), (ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>)], (context_complexi...",0.801418,0.798587,0.80427,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x000000E2C6D7BAE8>), (context_complexity_from_context, <function ctx_features_context at 0x000000E3C89F6378..."
127,"(min, <function agg_ctx_feat_num_min at 0x000000E298B130D0>)","[(ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>), (ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>)]","(WikiNewsFA, [(ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>), (ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>)], (context_complexi...",0.800712,0.800712,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x000000E2C6D7BAE8>), (context_complexity_from_context, <function ctx_features_context at 0x000000E3C89F6378..."
128,"(weighted, <function <lambda> at 0x000000E29297FE18>)","[(ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>), (ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>)]","(WikiNewsFA, [(ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>), (ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>)], (context_complexi...",0.797853,0.802158,0.793594,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x000000E2C6D7BAE8>), (context_complexity_from_context, <function ctx_features_context at 0x000000E3C89F6378..."
129,"(dist, <function agg_ctx_feat_num_distance at 0x000000E29297FEA0>)","[(ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>), (ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>)]","(WikiNewsFA, [(ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>), (ctx_dep_in_out, {'filtering': True}, <function <lambda> at 0x000000E299309A60>)], (context_complexi...",0.798574,0.8,0.797153,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x000000E2C6D7BAE8>), (context_complexity_from_context, <function ctx_features_context at 0x000000E3C89F6378..."


In [43]:
feature_context_results_rfe = compute_results_with_context(datasets_fc_all, random_forest_extra)
feature_context_results_rfe

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.743295,0.754864,0.732075,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
1,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.743738,0.759843,0.728302,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
2,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.742308,0.756863,0.728302,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
3,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.740883,0.753906,0.728302,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
4,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.743295,0.754864,0.732075,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.805704,0.807143,0.804270,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.801431,0.805755,0.797153,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.799290,0.797872,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.809947,0.808511,0.811388,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.805704,0.807143,0.804270,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [155]:
res = feature_context_results_rfe
res.loc[(res.dataset.map(lambda val : val.name)=='WikiNewsFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.805704,0.807143,0.80427,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.801431,0.805755,0.797153,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.79929,0.797872,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.809947,0.808511,0.811388,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.805704,0.807143,0.80427,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
20,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.804965,0.80212,0.807829,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
21,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.804965,0.80212,0.807829,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
22,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.80212,0.796491,0.807829,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
23,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.802842,0.801418,0.80427,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
24,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.80776,0.800699,0.814947,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [156]:
res = feature_context_results_xg
res.loc[(res.dataset.map(lambda val : val.name)=='WikiNewsFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.790861,0.78125,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.779541,0.772727,0.786477,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.791738,0.766667,0.818505,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.793594,0.793594,0.793594,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.790861,0.78125,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
20,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.790614,0.802198,0.779359,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
21,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.783542,0.78777,0.779359,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
22,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.806394,0.804965,0.807829,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
23,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.779599,0.798507,0.761566,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
24,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.805654,0.8,0.811388,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [55]:
compute_ctx_with_higher_f1_fraction(feature_results_rfe, feature_context_results_rfe)

WikipediaFA : 0.2833333333333333 (17/60)
WikiNewsFA : 0.8166666666666667 (49/60)
NewsFA : 0.5333333333333333 (32/60)


In [72]:
compute_ctx_with_higher_f1(feature_context_results_rfe)

WikipediaFA : 109    (dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)
Name: agg, dtype: object, 109    [(ctx_dep_out, {'n': 1, 'filtering': True}, <function <lambda> at 0x0000008E33063AE8>), (ctx_dep_out, {'n': 1, 'filtering': True}, <function <lambda> at 0x0000008E33063AE8>)]
Name: context, dtype: object, 109    0.751445
Name: f1, dtype: float64
WikiNewsFA : 35    (mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)
Name: agg, dtype: object, 35    [(ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000008E33063620>), (ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000008E33063620>)]
Name: context, dtype: object, 35    0.812721
Name: f1, dtype: float64
NewsFA : 149    (dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)
Name: agg, dtype: object, 149    [(ctx_dep_in, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063C80>), (ctx_dep_in, {'n': 2, 'filtering': True}, <functio

## (5.4.3) XGBoost

In [None]:
feature_results_xg = compute_results_target_only(datasets, xgboost)

In [119]:
feature_results_xg

Unnamed: 0,dataset,f1,prec,rec
0,WikipediaFA,0.708571,0.715385,0.701887
1,WikiNewsFA,0.804921,0.795139,0.814947
2,NewsFA,0.815416,0.810484,0.820408


In [None]:
feature_context_results_xg = compute_results_with_context(datasets_fc_all, xgboost)

In [34]:
feature_context_results_xg

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.746154,0.760784,0.732075,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
1,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.744722,0.757812,0.732075,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
2,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.731429,0.738462,0.724528,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
3,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.741996,0.740602,0.743396,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
4,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.746154,0.760784,0.732075,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.790861,0.781250,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.779541,0.772727,0.786477,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.791738,0.766667,0.818505,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.793594,0.793594,0.793594,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.790861,0.781250,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [117]:
res = feature_context_results_xg
res.loc[(res.dataset.map(lambda val : val.name)=='NewsFA'),].groupby('agg').mean()

Unnamed: 0_level_0,f1,prec,rec
agg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)",0.805721,0.812788,0.79881
"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)",0.806726,0.808942,0.804592
"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)",0.804514,0.80948,0.79966
"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)",0.804107,0.808007,0.80034
"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)",0.808608,0.814792,0.802551


In [35]:
res = feature_context_results_xg
res.loc[(res.dataset.map(lambda val : val.name)=='WikipediaFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.746154,0.760784,0.732075,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
1,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.744722,0.757812,0.732075,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
2,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.731429,0.738462,0.724528,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
3,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.741996,0.740602,0.743396,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
4,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.746154,0.760784,0.732075,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
15,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E3...",0.726577,0.736434,0.716981,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
16,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E3...",0.752852,0.758621,0.74717,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
17,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E3...",0.739884,0.755906,0.724528,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
18,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E3...",0.734226,0.744186,0.724528,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
19,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E3...",0.717557,0.725869,0.709434,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [36]:
res = feature_context_results_xg
res.loc[(res.dataset.map(lambda val : val.name)=='WikiNewsFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.790861,0.78125,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.779541,0.772727,0.786477,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.791738,0.766667,0.818505,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.793594,0.793594,0.793594,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.790861,0.78125,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
20,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.790614,0.802198,0.779359,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
21,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.783542,0.78777,0.779359,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
22,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.806394,0.804965,0.807829,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
23,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.779599,0.798507,0.761566,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
24,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.805654,0.8,0.811388,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [38]:
res = feature_context_results_xg
res.loc[(res.dataset.map(lambda val : val.name)=='NewsFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
10,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.800821,0.805785,0.795918,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
11,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.812183,0.808081,0.816327,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
12,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.804435,0.794821,0.814286,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
13,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.810645,0.813142,0.808163,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
14,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.800821,0.805785,0.795918,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
25,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.804145,0.816842,0.791837,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
26,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.816495,0.825,0.808163,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
27,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.799591,0.80123,0.797959,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
28,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.799176,0.806653,0.791837,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
29,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.803719,0.813808,0.793878,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [154]:
res = feature_context_results_xg
res.loc[(res.dataset.map(lambda val : val.name)=='WikiNewsFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.790861,0.78125,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.779541,0.772727,0.786477,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.791738,0.766667,0.818505,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.793594,0.793594,0.793594,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.790861,0.78125,0.800712,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
20,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.790614,0.802198,0.779359,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
21,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.783542,0.78777,0.779359,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
22,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.806394,0.804965,0.807829,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
23,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.779599,0.798507,0.761566,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
24,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.805654,0.8,0.811388,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [39]:
compute_ctx_with_higher_f1_fraction(feature_results_xg, feature_context_results_xg)

WikipediaFA : 0.9666666666666667 (58/60)
WikiNewsFA : 0.1 (6/60)
NewsFA : 0.1 (6/60)


In [40]:
compute_ctx_with_higher_f1(feature_context_results_xg)

WikipediaFA : 168    (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
Name: agg, dtype: object, 168    [(ctx_dep_in_out, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063D90>), (ctx_dep_in_out, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063D90>)]
Name: context, dtype: object, 168    0.768939
Name: f1, dtype: float64
WikiNewsFA : 114    (dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)
Name: agg, dtype: object, 114    [(ctx_dep_out, {'n': 1, 'filtering': True}, <function <lambda> at 0x0000008E33063AE8>), (ctx_dep_out, {'n': 1, 'filtering': True}, <function <lambda> at 0x0000008E33063AE8>)]
Name: context, dtype: object, 114    0.812721
Name: f1, dtype: float64
NewsFA : 88    (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
Name: agg, dtype: object, 88    [(ctx_window_pre_suc_n, {'n': 6, 'filtering': True}, <function <lambda> at 0x0000008E330637B8>), (ctx_window_pre_suc_n, {'n': 6, 'filtering': True}, <function <lambda> at 0x

## (5.4.4) AdaBoost

In [96]:
feature_results_ab = compute_results_target_only(datasets, adaboost)
feature_results_ab

Unnamed: 0,dataset,f1,prec,rec
0,WikipediaFA,0.716475,0.727626,0.70566
1,WikiNewsFA,0.787346,0.777778,0.797153
2,NewsFA,0.825593,0.835073,0.816327


In [97]:
feature_context_results_ab = compute_results_with_context(datasets_fc_all, adaboost)
feature_context_results_ab

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)]","(WikipediaFA, [(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)], (context_complexit...",0.734127,0.774059,0.698113,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
1,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)], (context_complexity...",0.769231,0.773381,0.765125,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
2,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)]","(NewsFA, [(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)], (context_complexity, [C...",0.8107,0.817427,0.804082,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
3,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)], (context_complexity,...",0.734127,0.774059,0.698113,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
4,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)], (context_complexity, ...",0.769231,0.773381,0.765125,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
5,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)], (context_complexity, [Con...",0.8107,0.817427,0.804082,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
6,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2...",0.720307,0.731518,0.709434,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
7,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F...",0.767857,0.770609,0.765125,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
8,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F45...",0.80829,0.821053,0.795918,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
9,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000006DD50CD7B8>), (ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000006DD50CD7B8>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000006DD50CD7B8>), (ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000006DD...",0.725564,0.722846,0.728302,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."


## (5.4.5) Decision Tree

In [56]:
feature_results_dt = compute_results_target_only(datasets, decision_tree)
feature_results_dt

Unnamed: 0,dataset,f1,prec,rec
0,WikipediaFA,0.661538,0.67451,0.649057
1,WikiNewsFA,0.738617,0.701923,0.779359
2,NewsFA,0.773931,0.772358,0.77551


In [44]:
feature_context_results_dt = compute_results_with_context(datasets_fc_all, decision_tree)
feature_context_results_dt

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.651601,0.650376,0.652830,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
1,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.674074,0.661818,0.686792,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
2,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.645161,0.648855,0.641509,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
3,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.653846,0.666667,0.641509,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
4,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.651601,0.650376,0.652830,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.723842,0.698675,0.750890,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.713551,0.688742,0.740214,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.712565,0.690000,0.736655,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.686207,0.665552,0.708185,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.723842,0.698675,0.750890,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [142]:
res = feature_context_results_dt
res.loc[(res.dataset.map(lambda val : val.name)=='NewsFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
10,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.756269,0.74359,0.769388,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
11,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.771255,0.76506,0.777551,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
12,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.764293,0.751479,0.777551,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
13,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.749239,0.745455,0.753061,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
14,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.756269,0.74359,0.769388,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
25,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.761026,0.764948,0.757143,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
26,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.765005,0.762677,0.767347,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
27,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.783231,0.784836,0.781633,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
28,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.768443,0.771605,0.765306,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
29,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.773797,0.776181,0.771429,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [57]:
compute_ctx_with_higher_f1_fraction(feature_results_dt, feature_context_results_dt)

WikipediaFA : 0.5333333333333333 (32/60)
WikiNewsFA : 0.06666666666666667 (4/60)
NewsFA : 0.25 (15/60)


In [73]:
compute_ctx_with_higher_f1(feature_context_results_dt)

WikipediaFA : 150    (mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)
Name: agg, dtype: object, 150    [(ctx_dep_out, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063D08>), (ctx_dep_out, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063D08>)]
Name: context, dtype: object, 150    0.698529
Name: f1, dtype: float64
WikiNewsFA : 156    (max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)
Name: agg, dtype: object, 156    [(ctx_dep_out, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063D08>), (ctx_dep_out, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063D08>)]
Name: context, dtype: object, 156    0.752166
Name: f1, dtype: float64
NewsFA : 134    (dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)
Name: agg, dtype: object, 134    [(ctx_dep_in_out, {'n': 1, 'filtering': True}, <function <lambda> at 0x0000008E33063B70>), (ctx_dep_in_out, {'n': 1, 'filtering': True}, <function <lambda> at

## (5.4.6) MLP

In [59]:
feature_results_mlp = compute_results_target_only(datasets, mlp)
feature_results_mlp

Unnamed: 0,dataset,f1,prec,rec
0,WikipediaFA,0.612855,0.507426,0.773585
1,WikiNewsFA,0.562232,0.402458,0.932384
2,NewsFA,0.691002,0.59157,0.830612


In [45]:
feature_context_results_mlp = compute_results_with_context(datasets_fc_all, mlp)
feature_context_results_mlp

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.611018,0.547904,0.690566,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
1,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.638718,0.474359,0.977358,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
2,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.517413,0.759124,0.392453,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
3,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.520681,0.732877,0.403774,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
4,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.611018,0.547904,0.690566,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.070968,0.379310,0.039146,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.593088,0.431818,0.946619,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.690909,0.601583,0.811388,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.626556,0.751244,0.537367,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.070968,0.379310,0.039146,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [137]:
res = feature_context_results_mlp
res.loc[(res.dataset.map(lambda val : val.name)=='WikipediaFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.611018,0.547904,0.690566,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
1,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.638718,0.474359,0.977358,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
2,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.517413,0.759124,0.392453,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
3,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.520681,0.732877,0.403774,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
4,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.611018,0.547904,0.690566,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
15,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E3...",0.557604,0.715976,0.456604,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
16,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E3...",0.60146,0.490476,0.777358,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
17,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E3...",0.56092,0.717647,0.460377,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
18,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E3...",0.57931,0.741176,0.475472,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
19,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E3...",0.678233,0.582656,0.811321,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [60]:
compute_ctx_with_higher_f1_fraction(feature_results_mlp, feature_context_results_mlp)

WikipediaFA : 0.35 (21/60)
WikiNewsFA : 0.75 (45/60)
NewsFA : 0.06666666666666667 (4/60)


In [74]:
compute_ctx_with_higher_f1(feature_context_results_mlp)

WikipediaFA : 19    (dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)
Name: agg, dtype: object, 19    [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]
Name: context, dtype: object, 19    0.678233
Name: f1, dtype: float64
WikiNewsFA : 7    (min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)
Name: agg, dtype: object, 7    [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]
Name: context, dtype: object, 7    0.690909
Name: f1, dtype: float64
NewsFA : 12    (min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)
Name: agg, dtype: object, 12    [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]
Name: con

## (5.4.7) Logistic Regression

In [62]:
feature_results_lr = compute_results_target_only(datasets, logistic_regression)
feature_results_lr

Unnamed: 0,dataset,f1,prec,rec
0,WikipediaFA,0.666667,0.651079,0.683019
1,WikiNewsFA,0.619718,0.712963,0.548043
2,NewsFA,0.590853,0.749216,0.487755


In [46]:
feature_context_results_lr = compute_results_with_context(datasets_fc_all, logistic_regression)
feature_context_results_lr

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.672598,0.636364,0.713208,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
1,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.671480,0.643599,0.701887,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
2,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.684492,0.648649,0.724528,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
3,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.705263,0.659016,0.758491,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
4,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.672598,0.636364,0.713208,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.630691,0.599359,0.665480,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.619565,0.630996,0.608541,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.665406,0.709677,0.626335,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.659200,0.598837,0.733096,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.630691,0.599359,0.665480,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [130]:
res = feature_context_results_lr
res.loc[(res.dataset.map(lambda val : val.name)=='WikiNewsFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.630691,0.599359,0.66548,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.619565,0.630996,0.608541,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.665406,0.709677,0.626335,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.6592,0.598837,0.733096,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.630691,0.599359,0.66548,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
20,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.660929,0.64,0.683274,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
21,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.616896,0.688596,0.558719,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
22,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.640884,0.664122,0.619217,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
23,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.659686,0.64726,0.672598,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
24,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33...",0.595194,0.619231,0.572954,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [63]:
compute_ctx_with_higher_f1_fraction(feature_results_lr, feature_context_results_lr)

WikipediaFA : 0.6333333333333333 (38/60)
WikiNewsFA : 0.8666666666666667 (52/60)
NewsFA : 0.5 (30/60)


In [75]:
compute_ctx_with_higher_f1(feature_context_results_lr)

WikipediaFA : 3    (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
Name: agg, dtype: object, 3    [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]
Name: context, dtype: object, 3    0.705263
Name: f1, dtype: float64
WikiNewsFA : 125    (mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)
Name: agg, dtype: object, 125    [(ctx_dep_in_out, {'n': 1, 'filtering': True}, <function <lambda> at 0x0000008E33063B70>), (ctx_dep_in_out, {'n': 1, 'filtering': True}, <function <lambda> at 0x0000008E33063B70>)]
Name: context, dtype: object, 125    0.668842
Name: f1, dtype: float64
NewsFA : 13    (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
Name: agg, dtype: object, 13    [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]
Name: context, dtype: object, 13 

## (5.4.8) Naive Bayes

In [65]:
feature_results_nb = compute_results_target_only(datasets, naive_bayes)
feature_results_nb

Unnamed: 0,dataset,f1,prec,rec
0,WikipediaFA,0.687237,0.546875,0.924528
1,WikiNewsFA,0.634434,0.474427,0.957295
2,NewsFA,0.609883,0.447519,0.957143


In [47]:
feature_context_results_nb = compute_results_with_context(datasets_fc_all, naive_bayes)
feature_context_results_nb

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.686275,0.545657,0.924528,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
1,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.675900,0.533917,0.920755,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
2,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.687237,0.546875,0.924528,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
3,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.689076,0.547884,0.928302,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
4,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.686275,0.545657,0.924528,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.634434,0.474427,0.957295,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.621089,0.460481,0.953737,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.634434,0.474427,0.957295,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.634434,0.474427,0.957295,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.634434,0.474427,0.957295,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [133]:
res = feature_context_results_nb
res.loc[(res.dataset.map(lambda val : val.name)=='NewsFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
10,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.609883,0.447519,0.957143,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
11,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.609314,0.446023,0.961224,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
12,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.611872,0.449664,0.957143,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
13,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.609883,0.447519,0.957143,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
14,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.609883,0.447519,0.957143,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
25,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.610677,0.448375,0.957143,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
26,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.61028,0.447947,0.957143,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
27,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.608241,0.447546,0.94898,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
28,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.609436,0.448842,0.94898,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
29,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.611075,0.448804,0.957143,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [66]:
compute_ctx_with_higher_f1_fraction(feature_results_nb, feature_context_results_nb)

WikipediaFA : 0.16666666666666666 (10/60)
WikiNewsFA : 0.13333333333333333 (8/60)
NewsFA : 0.3 (18/60)


In [76]:
compute_ctx_with_higher_f1(feature_context_results_nb)

WikipediaFA : 33     (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
138    (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
Name: agg, dtype: object, 33     [(ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000008E33063620>), (ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000008E33063620>)]
138                        [(ctx_dep_in, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063C80>), (ctx_dep_in, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063C80>)]
Name: context, dtype: object, 33     0.690042
138    0.690042
Name: f1, dtype: float64
WikiNewsFA : 52    (min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)
53           (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
68           (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
83           (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
Name: agg, dtype: object, 52    [(ctx_window_pre_suc_n, {'n': 4, 'filt

## (5.4.9) SVM

In [115]:
feature_results_svm = compute_results_target_only(datasets, svm)
feature_results_svm

Unnamed: 0,dataset,f1,prec,rec
0,WikipediaFA,0.256098,0.666667,0.158491
1,WikiNewsFA,0.385417,0.718447,0.263345
2,NewsFA,0.599746,0.794613,0.481633


In [116]:
feature_context_results_svm = compute_results_with_context(datasets_fc_all, svm)
feature_context_results_svm

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)]","(WikipediaFA, [(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)], (context_complexit...",0.0,0.0,0.0,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
1,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)], (context_complexity...",0.0,0.0,0.0,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
2,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)]","(NewsFA, [(ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>), (ctx_sentence, {'filtering': False}, <function <lambda> at 0x0000006DD50CDBF8>)], (context_complexity, [C...",0.221831,0.807692,0.128571,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
3,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)], (context_complexity,...",0.0,0.0,0.0,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
4,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)], (context_complexity, ...",0.0,0.0,0.0,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
5,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000006C2F62F158>)], (context_complexity, [Con...",0.221831,0.807692,0.128571,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
6,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2...",0.0,0.0,0.0,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
7,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>)]","(WikiNewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F...",0.0,0.0,0.0,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
8,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F4510>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000006C2F7F45...",0.30033,0.784483,0.185714,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."
9,"(mean, <function agg_ctx_feat_num_average at 0x0000006C2E594E18>)","[(ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000006DD50CD7B8>), (ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000006DD50CD7B8>)]","(WikipediaFA, [(ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000006DD50CD7B8>), (ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x0000006DD...",0.0,0.0,0.0,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000006C5C38A9D8>), (context_complexity_from_context, <function ctx_features_context at 0x0000006D49EC9BF8..."


## (5.4.10) kNN

In [68]:
feature_results_kn = compute_results_target_only(datasets, knn)
feature_results_kn

Unnamed: 0,dataset,f1,prec,rec
0,WikipediaFA,0.662768,0.685484,0.641509
1,WikiNewsFA,0.683544,0.694853,0.672598
2,NewsFA,0.767296,0.788793,0.746939


In [48]:
feature_context_results_kn = compute_results_with_context(datasets_fc_all, knn)
feature_context_results_kn

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.573614,0.581395,0.566038,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
1,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.624254,0.659664,0.592453,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
2,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.653620,0.678862,0.630189,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
3,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.637097,0.683983,0.596226,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
4,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikipediaFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity,...",0.573614,0.581395,0.566038,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
5,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.583026,0.605364,0.562278,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
6,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.618375,0.614035,0.622776,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
7,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.678700,0.688645,0.669039,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
8,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.673874,0.682482,0.665480,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(WikiNewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, ...",0.583026,0.605364,0.562278,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [144]:
res = feature_context_results_kn
res.loc[(res.dataset.map(lambda val : val.name)=='NewsFA'),]

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
10,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.602198,0.652381,0.559184,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
11,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.677182,0.698482,0.657143,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
12,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.759201,0.78308,0.736735,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
13,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.769072,0.777083,0.761224,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
14,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)]","(NewsFA, [(ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>), (ctx_sentence, {'filtering': True}, <function <lambda> at 0x0000008E33802378>)], (context_complexity, [Con...",0.602198,0.652381,0.559184,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
25,"(mean, <function agg_ctx_feat_num_average at 0x0000008E2CDFBD08>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.709609,0.73523,0.685714,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
26,"(max, <function agg_ctx_feat_num_max at 0x0000008E2CE1A048>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.686504,0.716186,0.659184,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
27,"(min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.740434,0.750524,0.730612,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
28,"(weighted, <function <lambda> at 0x0000008E2CDFBE18>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.750524,0.771552,0.730612,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."
29,"(dist, <function agg_ctx_feat_num_distance at 0x0000008E2CDFBEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>)]","(NewsFA, [(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063598>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E330635...",0.714588,0.741228,0.689796,"(context_complexity, [(context_complexity_from_target, <function ctx_features_target at 0x0000008E611B2C80>), (context_complexity_from_context, <function ctx_features_context at 0x0000008F488EC620..."


In [69]:
compute_ctx_with_higher_f1_fraction(feature_results_kn, feature_context_results_kn)

WikipediaFA : 0.016666666666666666 (1/60)
WikiNewsFA : 0.06666666666666667 (4/60)
NewsFA : 0.03333333333333333 (2/60)


In [77]:
compute_ctx_with_higher_f1(feature_context_results_kn)

WikipediaFA : 77    (min, <function agg_ctx_feat_num_min at 0x0000008E2CE1A0D0>)
Name: agg, dtype: object, 77    [(ctx_window_pre_suc_n, {'n': 6, 'filtering': True}, <function <lambda> at 0x0000008E330637B8>), (ctx_window_pre_suc_n, {'n': 6, 'filtering': True}, <function <lambda> at 0x0000008E330637B8>)]
Name: context, dtype: object, 77    0.664062
Name: f1, dtype: float64
WikiNewsFA : 158    (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
Name: agg, dtype: object, 158    [(ctx_dep_out, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063D08>), (ctx_dep_out, {'n': 2, 'filtering': True}, <function <lambda> at 0x0000008E33063D08>)]
Name: context, dtype: object, 158    0.692998
Name: f1, dtype: float64
NewsFA : 88    (weighted, <function <lambda> at 0x0000008E2CDFBE18>)
Name: agg, dtype: object, 88    [(ctx_window_pre_suc_n, {'n': 6, 'filtering': True}, <function <lambda> at 0x0000008E330637B8>), (ctx_window_pre_suc_n, {'n': 6, 'filtering': True}, <function <lambda> at

## Context-Dependent Complex words Fractions

In [83]:
from collections import Counter

def build_weighted_vocabulary(train):
    targets_complex = [mwe.strip().lower() for mwe in 
                train.loc[train['binary'] == 1,]['target'].tolist()]
    targets_non_complex = [mwe.strip().lower() for mwe in
                train.loc[train['binary'] == 0,]['target'].tolist()]
    counts_complex = Counter(targets_complex)
    counts_non_complex = Counter(targets_non_complex)
    vocabulary = {}
    for word, count in counts_complex.items():
        count_nc = counts_non_complex.get(word, None)
        if count_nc:
            vocabulary[word] = count / (count + count_nc)
        else:
            vocabulary[word] = 1
    for word, count in counts_non_complex.items():
        if word not in vocabulary:
            vocabulary[word] = 0
    return vocabulary

def compute_ctx_dep_words(vocabulary):
    return [word for word, score in vocabulary.items() 
            if (score > 0) and (score < 1)]

In [93]:
vocabulary_wiki = build_weighted_vocabulary(datasets[0].train)
words = compute_ctx_dep_words(vocabulary_wiki)
wiki_fraction = len(words) / len(vocabulary_wiki)
print('Wikipedia Ctx-Dep. Complex Words Fraction : {}'.format(wiki_fraction))

vocabulary_wikinews = build_weighted_vocabulary(datasets[1].train)
words = compute_ctx_dep_words(vocabulary_wikinews)
wikinews_fraction = len(words) / len(vocabulary_wikinews)
print('WikiNews Ctx-Dep. Complex Words Fraction : {}'.format(wikinews_fraction))

vocabulary_news = build_weighted_vocabulary(datasets[2].train)
words = compute_ctx_dep_words(vocabulary_news)
news_fraction = len(words) / len(vocabulary_news)
print('News Ctx-Dep. Complex Words Fraction : {}'.format(news_fraction))

Wikipedia Ctx-Dep. Complex Words Fraction : 0.08784722222222222
WikiNews Ctx-Dep. Complex Words Fraction : 0.11452604335166612
News Ctx-Dep. Complex Words Fraction : 0.14963847846589123


# Misc

In [40]:
evaluation = [{'dataset' : result.dataset.name, 'agg' : result.agg[0],
                    'zc' : result.fc, 'context':result.context , 'prec' : result.measure[0][1],
               'rec' : result.measure[1][1], 'f1' : result.measure[2][1]} 
                   for result in results]
feature_eval_data = pd.DataFrame.from_records(evaluation)
feature_eval_data

Unnamed: 0,agg,context,dataset,f1,prec,rec,zc
0,"(dist, <function agg_ctx_feat_num_distance at 0x000000FC01E1DEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD620>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD620>)]",Wikipedia,0.764622,0.740331,0.79056,"(context_complexity, [(context_complexity_from_target, <function ctx_features_context_complexity_from_target at 0x000000FC25B7FD90>), (context_complexity_from_context, <function ctx_features_conte..."
1,"(dist, <function agg_ctx_feat_num_distance at 0x000000FC01E1DEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD620>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD620>)]",WikiNews,0.789474,0.772358,0.807365,"(context_complexity, [(context_complexity_from_target, <function ctx_features_context_complexity_from_target at 0x000000FC25B7FD90>), (context_complexity_from_context, <function ctx_features_conte..."
2,"(dist, <function agg_ctx_feat_num_distance at 0x000000FC01E1DEA0>)","[(ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD620>), (ctx_window_pre_suc_n, {'n': 2, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD620>)]",News,0.838122,0.818057,0.859195,"(context_complexity, [(context_complexity_from_target, <function ctx_features_context_complexity_from_target at 0x000000FC25B7FD90>), (context_complexity_from_context, <function ctx_features_conte..."
3,"(dist, <function agg_ctx_feat_num_distance at 0x000000FC01E1DEA0>)","[(ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD6A8>), (ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD6A8>)]",Wikipedia,0.758226,0.736111,0.781711,"(context_complexity, [(context_complexity_from_target, <function ctx_features_context_complexity_from_target at 0x000000FC25B7FD90>), (context_complexity_from_context, <function ctx_features_conte..."
4,"(dist, <function agg_ctx_feat_num_distance at 0x000000FC01E1DEA0>)","[(ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD6A8>), (ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD6A8>)]",WikiNews,0.787795,0.771739,0.804533,"(context_complexity, [(context_complexity_from_target, <function ctx_features_context_complexity_from_target at 0x000000FC25B7FD90>), (context_complexity_from_context, <function ctx_features_conte..."
5,"(dist, <function agg_ctx_feat_num_distance at 0x000000FC01E1DEA0>)","[(ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD6A8>), (ctx_window_pre_suc_n, {'n': 3, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD6A8>)]",News,0.837307,0.817808,0.857759,"(context_complexity, [(context_complexity_from_target, <function ctx_features_context_complexity_from_target at 0x000000FC25B7FD90>), (context_complexity_from_context, <function ctx_features_conte..."
6,"(dist, <function agg_ctx_feat_num_distance at 0x000000FC01E1DEA0>)","[(ctx_window_pre_suc_n, {'n': 4, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD730>), (ctx_window_pre_suc_n, {'n': 4, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD730>)]",Wikipedia,0.758523,0.731507,0.787611,"(context_complexity, [(context_complexity_from_target, <function ctx_features_context_complexity_from_target at 0x000000FC25B7FD90>), (context_complexity_from_context, <function ctx_features_conte..."
7,"(dist, <function agg_ctx_feat_num_distance at 0x000000FC01E1DEA0>)","[(ctx_window_pre_suc_n, {'n': 4, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD730>), (ctx_window_pre_suc_n, {'n': 4, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD730>)]",WikiNews,0.787204,0.773224,0.8017,"(context_complexity, [(context_complexity_from_target, <function ctx_features_context_complexity_from_target at 0x000000FC25B7FD90>), (context_complexity_from_context, <function ctx_features_conte..."
8,"(dist, <function agg_ctx_feat_num_distance at 0x000000FC01E1DEA0>)","[(ctx_window_pre_suc_n, {'n': 4, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD730>), (ctx_window_pre_suc_n, {'n': 4, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD730>)]",News,0.837079,0.818681,0.856322,"(context_complexity, [(context_complexity_from_target, <function ctx_features_context_complexity_from_target at 0x000000FC25B7FD90>), (context_complexity_from_context, <function ctx_features_conte..."
9,"(dist, <function agg_ctx_feat_num_distance at 0x000000FC01E1DEA0>)","[(ctx_window_pre_suc_n, {'n': 5, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD7B8>), (ctx_window_pre_suc_n, {'n': 5, 'filtering': True}, <function <lambda> at 0x000000FC7A0AD7B8>)]",Wikipedia,0.759602,0.733516,0.787611,"(context_complexity, [(context_complexity_from_target, <function ctx_features_context_complexity_from_target at 0x000000FC25B7FD90>), (context_complexity_from_context, <function ctx_features_conte..."


In [152]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
fs = ctx_datasets_fc_context_complexity[0]
train, test = transform_feat_to_num(remove_label_for_binary_df_and_ctx_features(fs.train), \
        remove_label_for_binary_df_and_ctx_features(fs.test))
x_train = train.loc[:, train.columns != 'binary']
y_train = train.binary.values
rf_random.fit(x_train, y_train)
rf_random.best_params_

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 13.5min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)