# Load Data
First, we load all the data we need into pandas dataframes.

In [265]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
import nltk

In [266]:
# Path of the English Wikipedia training file (tab-separated).
TRAIN_ENGLISH_WIKIPEDIA = (
    "../cwishareddataset/traindevset/"
    "english/Wikipedia_Train.tsv"
)

# Read the file and give the eleven columns descriptive names.
# NOTE(review): read_csv is called without header=None, so the first line of
# the file is consumed as a header row before the columns are renamed --
# confirm that the TSV really starts with a header line.
df = pd.read_csv(TRAIN_ENGLISH_WIKIPEDIA, sep="\t")
df.columns = [
    'id',
    'sentence',
    "start",
    "end",
    "target",
    "nat",
    "non_nat",
    "nat_marked",
    "non_nat_marked",
    "binary",
    "prob",
]

# Aggregation
Since many labels are multi-word expressions, we first define some aggregation functions that aggregate feature values over multiple tokens. Implementing this separately makes it easy to exchange the aggregation function used and keeps the feature computation functions clean. These feature computation functions should only compute features for a single target word.

In [268]:
from nltk.tokenize import word_tokenize

def agg_feat_num_average(target, func_feature, *args):
    """Average a per-token feature over the tokens of ``target``.

    ``target`` is split with ``word_tokenize``; ``func_feature`` is called as
    ``func_feature(token, *args)`` for every token and the values are reduced
    with ``np.mean``.
    """
    token_values = [func_feature(tok, *args) for tok in word_tokenize(target)]
    return np.mean(token_values)

def agg_feat_num_median(target, func_feature, *args):
    """Median of a per-token feature over the tokens of ``target``.

    ``target`` is split with ``word_tokenize``; ``func_feature`` is called as
    ``func_feature(token, *args)`` for every token and the values are reduced
    with ``np.median``.
    """
    token_values = [func_feature(tok, *args) for tok in word_tokenize(target)]
    return np.median(token_values)

def agg_feat_num_max(target, func_feature, *args):
    """Maximum of a per-token feature over the tokens of ``target``.

    ``target`` is split with ``word_tokenize``; ``func_feature`` is called as
    ``func_feature(token, *args)`` for every token and the values are reduced
    with ``np.max``.
    """
    token_values = [func_feature(tok, *args) for tok in word_tokenize(target)]
    return np.max(token_values)

def agg_feat_num_min(target, func_feature, *args):
    """Minimum of a per-token feature over the tokens of ``target``.

    ``target`` is split with ``word_tokenize``; ``func_feature`` is called as
    ``func_feature(token, *args)`` for every token and the values are reduced
    with ``np.min``.
    """
    token_values = [func_feature(tok, *args) for tok in word_tokenize(target)]
    return np.min(token_values)

# Orthographic features
Here we start computing simple features like the length of the target word.

In [269]:
def _relative_token_position(row):
    """Token index of the target's first word divided by the sentence token count.

    ``row`` must provide the ``'sentence'`` and ``'target'`` fields. The
    sentence is tokenized once; ``list.index`` raises ``ValueError`` when the
    first whitespace-separated word of the target is not among the tokens.
    """
    sentence_tokens = nltk.word_tokenize(row['sentence'])
    first_target_word = row['target'].split()[0]
    return sentence_tokens.index(first_target_word) / len(sentence_tokens)


# Average token length of the target (targets may span several tokens).
df['length'] = df.target.apply(lambda target: agg_feat_num_average(target, len))

# Relative position of the target word based on tokens.
# Row fields are read by column label: integer keys on a label-indexed row
# Series are treated as positions only through a deprecated pandas fallback.
df['relative_position'] = df[['sentence', 'target']].apply(
    _relative_token_position, axis=1)

# Relative positions of the target word based on character counting
df['relative_position_left'] = df[['sentence', 'start']].apply(
    lambda row: row['start'] / len(row['sentence']), axis=1)
df['relative_position_centered'] = df[['sentence', 'start', 'end']].apply(
    lambda row: ((row['start'] + row['end']) / 2) / len(row['sentence']), axis=1)
df['relative_position_right'] = df[['sentence', 'end']].apply(
    lambda row: row['end'] / len(row['sentence']), axis=1)

# WordNet Features
Here we implement all the relevant features based on WordNet and SentiWordNet. For example, the number of synsets the target word is contained in or the average length of the lemmas of all the synsets the target word is contained in. Note that all features that are computed in the following exploit neither the POS-Tag of the target word nor Word Sense Disambiguation by e.g. UKB-Algorithm.

In [None]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk

def wn_synset_freq(target):
    """Return how many WordNet synsets ``wn.synsets`` yields for ``target``."""
    synsets = wn.synsets(target)
    return len(synsets)

def wn_synset_avg_lemma_freq(target):
    """Average number of lemmas per synset of ``target``.

    Returns ``0.0`` when ``target`` has no synsets.
    """
    lemma_counts = [len(synset.lemmas()) for synset in wn.synsets(target)]
    # Handle "no synsets" explicitly instead of relying on np.mean([]) -> NaN
    # (which emits a RuntimeWarning) followed by np.nan_to_num.
    if not lemma_counts:
        return 0.0
    return np.mean(lemma_counts)

def wn_synset_avg_lemma_len(target):
    """Average name length over all lemmas of all synsets of ``target``.

    Returns ``0.0`` when there is no lemma to average over.
    """
    lemma_lengths = [len(lemma.name())
                     for synset in wn.synsets(target)
                     for lemma in synset.lemmas()]
    # Handle the empty case explicitly instead of relying on
    # np.nanmean([]) -> NaN (with a RuntimeWarning) followed by np.nan_to_num.
    if not lemma_lengths:
        return 0.0
    return np.nanmean(lemma_lengths)

def wn_synset_avg_hypernyms(target):
    """Average number of direct hypernyms per synset of ``target``.

    Returns ``0.0`` when ``target`` has no synsets.
    """
    hypernym_counts = [len(synset.hypernyms()) for synset in wn.synsets(target)]
    # Handle "no synsets" explicitly instead of relying on
    # np.nanmean([]) -> NaN (with a RuntimeWarning) followed by np.nan_to_num.
    if not hypernym_counts:
        return 0.0
    return np.nanmean(hypernym_counts)

def wn_synset_avg_hyponyms(target):
    """Average number of direct hyponyms per synset of ``target``.

    Returns ``0.0`` when ``target`` has no synsets.
    """
    hyponym_counts = [len(synset.hyponyms()) for synset in wn.synsets(target)]
    # Handle "no synsets" explicitly instead of relying on np.mean([]) -> NaN
    # (which emits a RuntimeWarning) followed by np.nan_to_num.
    if not hyponym_counts:
        return 0.0
    return np.mean(hyponym_counts)

def wn_synset_sum_hypernyms(target):
    """Total number of direct hypernyms summed over all synsets of ``target``."""
    hypernym_counts = []
    for synset in wn.synsets(target):
        hypernym_counts.append(len(synset.hypernyms()))
    return np.sum(hypernym_counts)

def wn_synset_avg_definition_len(target):
    """Average character length of the definitions of the synsets of ``target``.

    Returns ``0.0`` when ``target`` has no synsets.
    """
    definition_lengths = [len(str(synset.definition()))
                          for synset in wn.synsets(target)]
    # Handle "no synsets" explicitly instead of relying on np.mean([]) -> NaN
    # (which emits a RuntimeWarning) followed by np.nan_to_num.
    if not definition_lengths:
        return 0.0
    return np.mean(definition_lengths)

def wn_synset_avg_hyptree_depth(target):
    """Average ``max_depth()`` over the synsets of ``target``.

    Returns ``0.0`` when ``target`` has no synsets.
    """
    depths = [synset.max_depth() for synset in wn.synsets(target)]
    # Handle "no synsets" explicitly instead of relying on np.mean([]) -> NaN
    # (which emits a RuntimeWarning) followed by np.nan_to_num.
    if not depths:
        return 0.0
    return np.mean(depths)

def wn_synset_num_distinct_pos(target):
    """Number of distinct ``pos()`` values among the synsets of ``target``."""
    distinct_pos = {synset.pos() for synset in wn.synsets(target)}
    return len(distinct_pos)

def wn_synset_avg_num_relations(target):
    """Average number of related synsets per synset of ``target``.

    For every synset the sizes of its hypernym, hyponym, instance
    hypernym/hyponym, holonym (member, substance, part) and meronym (member,
    substance, part) lists are added up; the per-synset totals are averaged.
    Returns ``0.0`` when ``target`` has no synsets.
    """
    relation_totals = []
    for synset in wn.synsets(target):
        related_groups = (
            synset.hypernyms(), synset.hyponyms(),
            synset.instance_hypernyms(), synset.instance_hyponyms(),
            synset.member_holonyms(), synset.substance_holonyms(),
            synset.part_holonyms(), synset.member_meronyms(),
            synset.substance_meronyms(), synset.part_meronyms(),
        )
        relation_totals.append(sum(len(group) for group in related_groups))
    # Handle "no synsets" explicitly instead of relying on np.mean([]) -> NaN
    # (which emits a RuntimeWarning) followed by np.nan_to_num.
    if not relation_totals:
        return 0.0
    return np.mean(relation_totals)

def wn_synset_avg_freq_pos(target, pos):
    """Count the synsets of ``target`` restricted to part of speech ``pos``.

    Despite the ``avg`` in its name, this returns a plain count for a single
    token; averaging happens in the aggregation helpers.
    """
    synsets_for_pos = wn.synsets(target, pos=pos)
    return len(synsets_for_pos)

def wn_synset_sense_entropy_uniform(target):
    """Entropy (base 2) of a uniform distribution over the senses of ``target``.

    One ``p * log2(p)`` term with ``p = 1 / num_senses`` is added per sense
    and the sum is negated. With no senses the sum is empty.
    """
    num_senses = len(wn.synsets(target))
    terms = []
    for _ in range(num_senses):
        sense_prob = 1 / num_senses
        terms.append(sense_prob * np.log2(sense_prob))
    return -np.sum(terms)

def wn_synset_sense_entropy_pos_uniform(target):
    """Entropy (base 2) of the part-of-speech distribution of the senses of ``target``.

    The probability of each part of speech (noun, verb, adjective, adverb) is
    its synset count divided by the total synset count of ``target``.
    Returns ``0`` when ``target`` has no synsets.
    """
    num_senses = len(wn.synsets(target))
    if num_senses == 0:
        return 0
    pos_counts = [len(wn.synsets(target, pos=pos))
                  for pos in (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV)]
    # Parts of speech without any sense contribute nothing to the entropy;
    # skipping them avoids evaluating 0 * log2(0), which yields NaN plus
    # RuntimeWarnings that previously had to be masked with np.nan_to_num.
    probabilities = [count / num_senses for count in pos_counts if count > 0]
    return -np.sum([prob * np.log2(prob) for prob in probabilities])

def wn_synsets_sense_entropy_pos_central(target, pos):
    """Entropy (base 2) of a uniform distribution over the ``pos`` senses of ``target``.

    Only the synsets returned by ``wn.synsets(target, pos=pos)`` are counted.
    One ``p * log2(p)`` term with ``p = 1 / count`` is added per such synset
    and the sum is negated. With no matching synsets the sum is empty.
    """
    num_senses_pos = len(wn.synsets(target, pos=pos))
    terms = []
    for _ in range(num_senses_pos):
        sense_prob = 1 / num_senses_pos
        terms.append(sense_prob * np.log2(sense_prob))
    return -np.sum(terms)
     

def swn_avg_objective_score(target):
    """Average SentiWordNet objectivity score over the senti-synsets of ``target``.

    Returns ``0.0`` when ``target`` has no senti-synsets.
    """
    objective_scores = [senti_synset.obj_score()
                        for senti_synset in swn.senti_synsets(target)]
    # Handle "no senti-synsets" explicitly instead of relying on
    # np.mean([]) -> NaN (with a RuntimeWarning) followed by np.nan_to_num.
    if not objective_scores:
        return 0.0
    return np.mean(objective_scores)

def pos_tag(sentence, target):
    """Return the POS tag of the first token of ``sentence`` equal to ``target``.

    The sentence is tokenized and tagged with NLTK. ``None`` is returned when
    no token equals ``target`` exactly.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    for word, tag in tagged_tokens:
        if word == target:
            return tag
    return None

# TODO: consider using the Stanford lemmatizer and computing a word
# similarity metric to the original target
def wordnet_lemma_len(target):
    """Return the length of ``wordNetLemmatizer.lemmatize(target)``."""
    lemma = wordNetLemmatizer.lemmatize(target)
    return len(lemma)

def penn_to_wn(tag):
    """Map a Penn Treebank tag to a one-letter WordNet POS code.

    Tags starting with ``N``, ``V``, ``J`` and ``R`` map to ``'n'``, ``'v'``,
    ``'a'`` and ``'r'``. Any other tag, and any falsy ``tag``, gives ``None``.
    """
    if not tag:
        return None
    first_letter_to_wn = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
    return first_letter_to_wn.get(tag[0])

In [248]:
# Per-token WordNet features; each is averaged over the tokens of the target.
df['wn_synset_freq'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_freq,))
df['wn_synset_avg_lemma_freq'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_avg_lemma_freq,))
df['wn_synset_avg_lemma_len'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_avg_lemma_len,))

# How much longer the average lemma is than the target itself.
df['wn_synset_diff_len_avg_lemma_len'] = df['wn_synset_avg_lemma_len'] - df['length']
df['wn_synset_avg_hypernyms'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_avg_hypernyms,))
df['wn_synset_sum_hypernyms'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_sum_hypernyms,))
df['wn_synset_avg_hyponyms'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_avg_hyponyms,))

df['wn_synset_avg_definition_len'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_avg_definition_len,))
df['wn_synset_avg_hyptree_depth'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_avg_hyptree_depth,))
df['wn_synset_num_distinct_pos'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_num_distinct_pos,))
df['wn_synset_avg_num_relations'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_avg_num_relations,))

# Synset counts per part of speech.
df['wn_synset_avg_freq_pos_noun'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_avg_freq_pos, wn.NOUN))
df['wn_synset_avg_freq_pos_verb'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_avg_freq_pos, wn.VERB))
df['wn_synset_avg_freq_pos_adj'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_avg_freq_pos, wn.ADJ))
df['wn_synset_avg_freq_pos_adv'] = df['target'].apply(
    agg_feat_num_average, args=(wn_synset_avg_freq_pos, wn.ADV))

# The same counts normalised by the overall synset count; nan_to_num replaces
# the NaN produced by 0 / 0 for targets without any synset.
df['wn_synset_avg_freq_pos_noun_norm'] = np.nan_to_num(
    df['wn_synset_avg_freq_pos_noun'] / df['wn_synset_freq'])
df['wn_synset_avg_freq_pos_verb_norm'] = np.nan_to_num(
    df['wn_synset_avg_freq_pos_verb'] / df['wn_synset_freq'])
df['wn_synset_avg_freq_pos_adj_norm'] = np.nan_to_num(
    df['wn_synset_avg_freq_pos_adj'] / df['wn_synset_freq'])
df['wn_synset_avg_freq_pos_adv_norm'] = np.nan_to_num(
    df['wn_synset_avg_freq_pos_adv'] / df['wn_synset_freq'])

# POS tag of the target within its sentence (None when the target does not
# match a single sentence token exactly).
# Row fields are read by column label: integer keys on a label-indexed row
# Series are treated as positions only through a deprecated pandas fallback.
df['pos_tag'] = df[['sentence', 'target']].apply(
    lambda row: pos_tag(row['sentence'], row['target']), axis=1)

# Sense entropies, averaged over the tokens of the target.
df['wn_synset_sense_entropy_uniform'] = df.target.apply(
    lambda target: agg_feat_num_average(target, wn_synset_sense_entropy_uniform))
df['wn_synset_sense_entropy_pos_uniform'] = df.target.apply(
    lambda target: agg_feat_num_average(target, wn_synset_sense_entropy_pos_uniform))

# Sense entropy restricted to the tagged part of speech. This one is computed
# on the whole target string, not aggregated over its tokens.
df['wn_synsets_sense_entropy_pos_central'] = df[['target', 'pos_tag']].apply(
    lambda row: wn_synsets_sense_entropy_pos_central(
        row['target'], penn_to_wn(row['pos_tag'])), axis=1)

# SentiWordNet objectivity score, averaged over the tokens of the target.
df['swn_avg_objective_score'] = df['target'].apply(
    lambda tgt: agg_feat_num_average(tgt, swn_avg_objective_score))

# Lemma length, its absolute difference to the target length, and the
# relative reduction achieved by lemmatization.
df['wordnet_lemma_len'] = df['target'].apply(
    lambda tgt: agg_feat_num_average(tgt, wordnet_lemma_len))
df['diff_len_wordnet_lemma_len'] = df['length'] - df['wordnet_lemma_len']
df['reduction_lemma_len'] = 1 - df['wordnet_lemma_len'] / df['length']

  out=out, **kwargs)
  


In [None]:
# Show the lemma-length features next to the target.
df.loc[:, [
    'target',
    'length',
    'wordnet_lemma_len',
    'diff_len_wordnet_lemma_len',
    'reduction_lemma_len',
]]

# PorterStemmer, StanfordNLP and Dependency Tree Features
Here we implement features based on the PorterStemmer library from nltk.

In [240]:
from nltk.stem.porter import *
from nltk.stem.wordnet import *
from nltk.tag.stanford import StanfordNERTagger
from nltk.parse.stanford import StanfordDependencyParser
from nltk.tokenize import word_tokenize
import os

# Java executable used for the Stanford tools; exported as JAVAHOME
# (presumably read by NLTK's Stanford wrappers -- verify).
java_path = "C:/Program Files (x86)/Java/jdk1.8.0_144/bin/java.exe"
os.environ['JAVAHOME'] = java_path

# Jar files of the Stanford dependency parser.
path_to_jar = 'resources/stanford-dependency-parser/stanford-parser.jar'
path_to_models_jar = 'resources/stanford-dependency-parser/stanford-parser-3.9.1-models.jar'

# Shared tool instances used by the feature functions.
porterStemmer = PorterStemmer()
wordNetLemmatizer = WordNetLemmatizer()
nerTagger = StanfordNERTagger(
    'resources/stanford-ner-tagger/classifiers/english.all.3class.distsim.crf.ser.gz',
    'resources/stanford-ner-tagger/stanford-ner.jar',
    encoding='utf-8',
)
dependencyParser = StanfordDependencyParser(
    path_to_jar=path_to_jar,
    path_to_models_jar=path_to_models_jar,
)

def porter_stem_len(target):
    """Return the length of the Porter stem of ``target``."""
    stem = str(porterStemmer.stem(target))
    return len(stem)

def porter_stemmer_num_steps(target):
    """Count how many Porter stemmer steps actually change ``target``.

    The lower-cased target is passed through the stemmer's step functions
    (1a, 1b, 1c, 2, 3, 4, 5a, 5b) in order, each exactly once; a step counts
    when its output differs from its input. ``0`` is returned without
    stemming when the stemmer runs in ``NLTK_EXTENSIONS`` mode and ``target``
    is in its ``pool``, or when the mode is not ``ORIGINAL_ALGORITHM`` and
    ``target`` has at most two characters.
    """
    if porterStemmer.mode == porterStemmer.NLTK_EXTENSIONS and target in porterStemmer.pool:
        return 0
    if porterStemmer.mode != porterStemmer.ORIGINAL_ALGORITHM and len(target) <= 2:
        return 0

    # Each step appears once; step 3 used to be listed twice, which ran it a
    # second time and could count an extra step.
    step_funcs = (
        porterStemmer._step1a, porterStemmer._step1b, porterStemmer._step1c,
        porterStemmer._step2, porterStemmer._step3, porterStemmer._step4,
        porterStemmer._step5a, porterStemmer._step5b,
    )
    stem = target.lower()
    applied_steps = 0
    for step_func in step_funcs:
        stepped = step_func(stem)
        if stepped != stem:
            stem = stepped
            applied_steps += 1
    return applied_steps

def is_named_entity(sentence, target):
    """Return 1 if some token equal to ``target`` gets a NER tag other than ``'O'``, else 0.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``nerTagger``.
    """
    tagged_tokens = nerTagger.tag(word_tokenize(sentence))
    tagged_as_entity = any(token == target and tag != 'O'
                           for token, tag in tagged_tokens)
    return 1 if tagged_as_entity else 0

def named_entity_type(sentence, target):
    """Return the NER tag of the first token of ``sentence`` equal to ``target``.

    Raises ``IndexError`` when no token equals ``target``.
    """
    tags_of_target = []
    for token, tag in nerTagger.tag(word_tokenize(sentence)):
        if token == target:
            tags_of_target.append(tag)
    return tags_of_target[0]


In [244]:
# Porter stemmer features: stem length and number of applied steps (both
# averaged over the target's tokens), the absolute difference between target
# length and stem length, and the relative reduction.
df['porter_stem_len'] = df['target'].apply(
    lambda tgt: agg_feat_num_average(tgt, porter_stem_len))
df['porter_stemmer_num_steps'] = df['target'].apply(
    lambda tgt: agg_feat_num_average(tgt, porter_stemmer_num_steps))
df['diff_len_stem_len'] = df['length'] - df['porter_stem_len']
df['reduction_stem_len'] = 1 - df['porter_stem_len'] / df['length']

In [245]:
# Show the Porter stemmer features next to the target.
df.loc[:, [
    'target',
    'length',
    'porter_stem_len',
    'porter_stemmer_num_steps',
    'diff_len_stem_len',
    'reduction_stem_len',
]]

Unnamed: 0,target,length,porter_stem_len,porter_stemmer_num_steps,diff_len_stem_len,reduction_stem_len
0,passed,6.000000,4.000000,1.000000,2.000000,0.333333
1,land,4.000000,4.000000,0.000000,0.000000,0.000000
2,future,6.000000,5.000000,1.000000,1.000000,0.166667
3,future generations,8.500000,5.000000,2.000000,3.500000,0.411765
4,generations,11.000000,5.000000,3.000000,6.000000,0.545455
5,recognizes,10.000000,6.000000,2.000000,4.000000,0.400000
6,community,9.000000,6.000000,2.000000,3.000000,0.333333
7,traditional,11.000000,6.000000,2.000000,5.000000,0.454545
8,traditional connection to that country,6.800000,5.200000,0.800000,1.600000,0.235294
9,country,7.000000,7.000000,1.000000,0.000000,0.000000


# Context-Aware Features
Here we not only extract/define the context in the first place, but also compute the corresponding context features afterwards. We also need to implement proper strategies to cope with the target occurring multiple times in the sentence. To avoid mistakes, we should use the actual start and end offsets from the dataset.

### a. Context-Token Aggregation
First we define how feature values of multiple context tokens should be aggregated.

In [288]:
from nltk.tokenize import word_tokenize

def agg_feat_num_average(tokens, func_feature, *args):
    """Average ``func_feature(token, *args)`` over an iterable of ``tokens``.

    Unlike the earlier definition of the same name, which tokenized a target
    string itself, this version expects already-tokenized input.
    """
    feature_values = []
    for token in tokens:
        feature_values.append(func_feature(token, *args))
    return np.mean(feature_values)

def agg_feat_num_median(tokens, func_feature, *args):
    """Median of ``func_feature(token, *args)`` over an iterable of ``tokens``.

    Unlike the earlier definition of the same name, which tokenized a target
    string itself, this version expects already-tokenized input.
    """
    feature_values = []
    for token in tokens:
        feature_values.append(func_feature(token, *args))
    return np.median(feature_values)

def agg_feat_num_max(tokens, func_feature, *args):
    """Maximum of ``func_feature(token, *args)`` over an iterable of ``tokens``.

    Unlike the earlier definition of the same name, which tokenized a target
    string itself, this version expects already-tokenized input.
    """
    feature_values = []
    for token in tokens:
        feature_values.append(func_feature(token, *args))
    return np.max(feature_values)

def agg_feat_num_min(tokens, func_feature, *args):
    """Minimum of ``func_feature(token, *args)`` over an iterable of ``tokens``.

    Unlike the earlier definition of the same name, which tokenized a target
    string itself, this version expects already-tokenized input.
    """
    feature_values = []
    for token in tokens:
        feature_values.append(func_feature(token, *args))
    return np.min(feature_values)

### b. Context Definition
Here we compute different kinds of context definitions. For example, as a baseline we extract all tokens from the sentence except the target. A second approach is to use the n preceding or the n succeeding tokens, or a combined window approach where we extract n tokens preceding and n tokens succeeding the target. A more sophisticated approach involves dependency parsing of the sentence and applying different extraction heuristics. Finally, we also implement a context extraction approach exploiting FrameNet semantic parsing.

In [328]:
from nltk.tokenize import word_tokenize
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordNeuralDependencyParser
import os

# Java executable used for the Stanford tools; exported as JAVAHOME
# (presumably read by NLTK's Stanford wrappers -- verify).
java_path = "C:/Program Files (x86)/Java/jdk1.8.0_144/bin/java.exe"
os.environ['JAVAHOME'] = java_path

# Jar files of the Stanford dependency parser.
path_to_jar = 'resources/stanford-dependency-parser/stanford-parser.jar'
path_to_models_jar = 'resources/stanford-dependency-parser/stanford-parser-3.9.1-models.jar'

dependencyParser = StanfordDependencyParser(
    path_to_jar=path_to_jar,
    path_to_models_jar=path_to_models_jar,
)

def post_process_ctx(context):
    """Return the tokens of ``context`` without ``","``, ``"'"`` and ``"'s"`` tokens."""
    ignored_tokens = (",", "'", "'s")
    kept_tokens = []
    for token in context:
        if token in ignored_tokens:
            continue
        kept_tokens.append(token)
    return kept_tokens

def preprocess_target(target):
    """Return ``target`` with leading and trailing whitespace removed."""
    stripped_target = target.strip()
    return stripped_target
    
def ctx_extraction_all(context, target):
    """Tokenize ``context`` and drop the first token equal to ``target``, if any."""
    ctx_tokens = word_tokenize(context)
    try:
        ctx_tokens.remove(target)
    except ValueError:
        # target is not a token of the context: nothing to remove
        pass
    return ctx_tokens

def ctx_extraction_all_filtered(context, target):
    """Tokenize ``context`` and filter the tokens through ``post_process_ctx``.

    ``target`` is accepted but not used; the target token is not removed.
    """
    return post_process_ctx(word_tokenize(context))

def ctx_extraction_window_pre_n(context, target, start, end, n = 3):
    """Return up to ``n`` filtered context tokens directly preceding the target.

    The target is located by the first occurrence of its (first) token in the
    filtered token list of ``context``; ``start`` and ``end`` are accepted
    but not used. Raises ``ValueError`` when that token is not found.
    """
    stripped_target = preprocess_target(target)
    filtered_tokens = post_process_ctx(word_tokenize(context))
    target_tokens = word_tokenize(stripped_target)
    if len(target_tokens) == 1:
        anchor = stripped_target
    else:
        # multi-word target: anchor the window on its first token
        anchor = target_tokens[0]
    target_index = filtered_tokens.index(anchor)
    window_start = max(target_index - n, 0)
    return filtered_tokens[window_start:target_index]

def ctx_extraction_window_suc_n(context, target, start, end, n = 3):
    """Return up to ``n`` filtered context tokens directly following the target.

    Mirrors ``ctx_extraction_window_pre_n``: the target is stripped and then
    located by the first occurrence of its (first) token in the filtered
    token list of ``context``. For a multi-word target the window starts
    after the target's last token. ``start`` and ``end`` are accepted but not
    used. Raises ``ValueError`` when the anchor token is not found.
    """
    target = preprocess_target(target)
    post_ctx_tokens = post_process_ctx(word_tokenize(context))
    target_tokens = word_tokenize(target)
    if len(target_tokens) == 1:
        first_index = post_ctx_tokens.index(target)
        target_length = 1
    else:
        # Multi-word targets never equal a single context token, so anchor on
        # the first target token and skip all target tokens that survive the
        # same filtering that was applied to the context.
        first_index = post_ctx_tokens.index(target_tokens[0])
        target_length = len(post_process_ctx(target_tokens))
    window_start = first_index + target_length
    # Slicing clamps at the end of the list, so no explicit bound is needed.
    return post_ctx_tokens[window_start:window_start + n]

def ctx_extraction_window_pre_suc_n(context, target, start, end, n = 3):
    """Return the preceding window followed by the succeeding window of the target.

    Combines ``ctx_extraction_window_pre_n`` and
    ``ctx_extraction_window_suc_n``, each called with the same ``n``.
    """
    preceding = ctx_extraction_window_pre_n(context, target, start, end, n)
    succeeding = ctx_extraction_window_suc_n(context, target, start, end, n)
    return preceding + succeeding

def ctx_extraction_dep_in(context, target):
    """Return the first-element word of every dependency triple whose third-element word is ``target``.

    The triples come from parsing ``context`` with ``dependencyParser``.
    """
    governors = []
    for parse in dependencyParser.raw_parse(context):
        for triple in parse.triples():
            if triple[2][0] == target:
                governors.append(triple[0][0])
    return governors

def ctx_extraction_dep_out(context, target):
    """Return the third-element word of every dependency triple whose first-element word is ``target``.

    The triples come from parsing ``context`` with ``dependencyParser``.
    """
    dependents = []
    for parse in dependencyParser.raw_parse(context):
        for triple in parse.triples():
            if triple[0][0] == target:
                dependents.append(triple[2][0])
    return dependents

def ctx_extraction_dep_in_out(context, target):
    """Return the words linked to ``target`` by a dependency triple, in either direction.

    The result lists first the first-element words of triples whose
    third-element word is ``target`` (as ``ctx_extraction_dep_in`` does),
    then the third-element words of triples whose first-element word is
    ``target`` (as ``ctx_extraction_dep_out`` does).
    """
    # Parse once and reuse the triples for both directions instead of sending
    # the same sentence through the external parser twice.
    triples = [triple for parse in dependencyParser.raw_parse(context)
               for triple in parse.triples()]
    ctx_tokens_in = [triple[0][0] for triple in triples if triple[2][0] == target]
    ctx_tokens_out = [triple[2][0] for triple in triples if triple[0][0] == target]
    return ctx_tokens_in + ctx_tokens_out

def ctx_extraction_dep_recu_in_n_steps(context, target, n = 2):
    """Collect words reached from ``target`` in ``n`` steps over dependency triples.

    In every step the first-element words of all triples whose third-element
    word is in the current frontier are collected and become the next
    frontier. Matching is done on word strings. Returns the distinct
    collected words as a list (order not defined).
    """
    triples = []
    for parse in dependencyParser.raw_parse(context):
        triples.extend(parse.triples())

    collected = []
    frontier = {target}
    for _ in range(n):
        reached = []
        for triple in triples:
            if triple[2][0] in frontier:
                reached.append(triple[0][0])
        frontier = set(reached)
        collected.extend(reached)
    return list(set(collected))

def ctx_extraction_dep_recu_out_n_steps(context, target, n = 2):
    """Collect words reached from ``target`` in ``n`` steps over dependency triples.

    In every step the third-element words of all triples whose first-element
    word is in the current frontier are collected and become the next
    frontier. Matching is done on word strings. Returns the distinct
    collected words as a list (order not defined).
    """
    triples = []
    for parse in dependencyParser.raw_parse(context):
        triples.extend(parse.triples())

    collected = []
    frontier = {target}
    for _ in range(n):
        reached = []
        for triple in triples:
            if triple[0][0] in frontier:
                reached.append(triple[2][0])
        frontier = set(reached)
        collected.extend(reached)
    return list(set(collected))

def ctx_extraction_dep_recu_in_out_n_steps(context, target, n = 2):
    """Collect words reached from ``target`` in ``n`` steps, following triples both ways.

    In every step two groups are collected for the current frontier: the
    third-element words of triples whose first-element word is in the
    frontier, then the first-element words of triples whose third-element
    word is in the frontier. Both groups together form the next frontier.
    Matching is done on word strings. Returns the distinct collected words as
    a list (order not defined).
    """
    triples = []
    for parse in dependencyParser.raw_parse(context):
        triples.extend(parse.triples())

    collected = []
    frontier = {target}
    for _ in range(n):
        via_first = [triple[2][0] for triple in triples if triple[0][0] in frontier]
        via_third = [triple[0][0] for triple in triples if triple[2][0] in frontier]
        reached = via_first + via_third
        frontier = set(reached)
        collected.extend(reached)
    return list(set(collected))

def ctx_extraction_dep_recu_in_cover(context, target, cover = 0.1):
    """Expand from ``target`` over dependency triples until a coverage ratio is reached.

    In every step the first-element words of all triples whose third-element
    word is in the current frontier are collected and become the next
    frontier. Expansion stops when a step reproduces the frontier, or when
    the number of collected words (duplicates included) divided by the number
    of filtered context tokens reaches ``cover``. Returns the distinct
    collected words as a list (order not defined).
    """
    triples = []
    for parse in dependencyParser.raw_parse(context):
        triples.extend(parse.triples())
    filtered_ctx_tokens = post_process_ctx(word_tokenize(context))

    collected = []
    frontier = {target}
    coverage = 0
    while coverage < cover:
        reached = [triple[0][0] for triple in triples if triple[2][0] in frontier]
        if set(reached) == frontier:
            # fixed point: another step would not discover anything new
            break
        frontier = set(reached)
        collected.extend(reached)
        coverage = len(collected) / len(filtered_ctx_tokens)
    return list(set(collected))

def ctx_extraction_dep_recu_out_cover(context, target, cover = 0.1):
    """Expand from ``target`` over dependency triples until a coverage ratio is reached.

    In every step the third-element words of all triples whose first-element
    word is in the current frontier are collected and become the next
    frontier. Expansion stops when a step reproduces the frontier, or when
    the number of collected words (duplicates included) divided by the number
    of filtered context tokens reaches ``cover``. Returns the distinct
    collected words as a list (order not defined).
    """
    triples = []
    for parse in dependencyParser.raw_parse(context):
        triples.extend(parse.triples())
    filtered_ctx_tokens = post_process_ctx(word_tokenize(context))

    collected = []
    frontier = {target}
    coverage = 0
    while coverage < cover:
        reached = [triple[2][0] for triple in triples if triple[0][0] in frontier]
        if set(reached) == frontier:
            # fixed point: another step would not discover anything new
            break
        frontier = set(reached)
        collected.extend(reached)
        coverage = len(collected) / len(filtered_ctx_tokens)
    return list(set(collected))

def ctx_extraction_dep_recu_in_out_cover(context, target, cover = 0.1):
    """Expand from ``target`` over dependency triples in both directions until a coverage ratio is reached.

    In every step two groups are collected for the current frontier: the
    third-element words of triples whose first-element word is in the
    frontier, then the first-element words of triples whose third-element
    word is in the frontier. Both groups together form the next frontier.
    Expansion stops when a step reproduces the frontier, or when the number
    of collected words (duplicates included) divided by the number of
    filtered context tokens reaches ``cover``. Returns the distinct collected
    words as a list (order not defined).
    """
    deps = [triple for parse in dependencyParser.raw_parse(context)
            for triple in parse.triples()]
    ctx_tokens_post = post_process_ctx(word_tokenize(context))
    result_tokens = []
    curr_target = [target]
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[2][0] for triple in deps
                       if triple[0][0] in curr_target]
        step_result.extend(triple[0][0] for triple in deps
                           if triple[2][0] in curr_target)
        if set(step_result) == set(curr_target):
            # fixed point: another step would not discover anything new
            break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        curr_cover = len(result_tokens) / len(ctx_tokens_post)
    # The leftover debug print of the reached coverage was removed so that this
    # function, like its siblings, has no output side effect.
    return list(set(result_tokens))

In [None]:
# Smoke test of the context extraction functions on one example sentence.
sentence = "Normally, the land will be passed down by future generations in a way " + \
             "that recognizes the community's traditional connection to that country "
target = 'passed'


def _char_span(token):
    """Return the (start, end) character offsets of the first occurrence of ``token`` in ``sentence``."""
    span_start = sentence.index(token)
    return span_start, span_start + len(token)


print('ctx_etraction_all:')
print(ctx_extraction_all_filtered(sentence, target))

# The window functions require the start and end offsets as positional
# arguments; calling them with only (sentence, target) raises a TypeError.
print('ctx_extraction_window_pre_n:')
print(ctx_extraction_window_pre_n(sentence, "Normally", *_char_span("Normally")))
print(ctx_extraction_window_pre_n(sentence, "the", *_char_span("the")))
print(ctx_extraction_window_pre_n(sentence, "land", *_char_span("land")))
print(ctx_extraction_window_pre_n(sentence, target, *_char_span(target), n = 5))

print('ctx_extraction_window_suc_n:')
print(ctx_extraction_window_suc_n(sentence, "country", *_char_span("country")))
print(ctx_extraction_window_suc_n(sentence, "to", *_char_span("to")))
print(ctx_extraction_window_suc_n(sentence, "connection", *_char_span("connection")))
print(ctx_extraction_window_suc_n(sentence, "community", *_char_span("community"), n = 5))

print('ctx_extraction_window_pre_suc_n:')
print(ctx_extraction_window_pre_suc_n(sentence, "passed", *_char_span("passed")))
print(ctx_extraction_window_pre_suc_n(sentence, "the", *_char_span("the")))
print(ctx_extraction_window_pre_suc_n(sentence, "to", *_char_span("to")))

print('ctx_extraction_dep_in:')
print(ctx_extraction_dep_in(sentence, "land"))

print('ctx_extraction_dep_out:')
print(ctx_extraction_dep_out(sentence, target))
print(ctx_extraction_dep_out(sentence, "land"))

print('ctx_extraction_dep_in_out:')
print(ctx_extraction_dep_in_out(sentence, "land"))

print('ctx_extraction_dep_recu_in_n_steps:')
print(ctx_extraction_dep_recu_in_n_steps(sentence, "the", n = 3))

print('ctx_extraction_dep_recu_out_n_steps:')
print(ctx_extraction_dep_recu_out_n_steps(sentence, "the"))

print('ctx_extraction_dep_recu_in_out_n_steps:')
print(ctx_extraction_dep_recu_in_out_n_steps(sentence, "the"))

print('ctx_extraction_dep_recu_in_cover:')
print(ctx_extraction_dep_recu_in_cover(sentence, "the", cover=0.1))

print('ctx_extraction_dep_recu_out_cover:')
print(ctx_extraction_dep_recu_out_cover(sentence, "the", cover=0.1))

print('ctx_extraction_dep_recu_in_out_cover:')
print(ctx_extraction_dep_recu_in_out_cover(sentence, "the", cover=0.1))

### c. Context Extraction

After we defined all the context extraction approaches, we can apply them on the actual dataset. To do so, we first extract all the distinct sentences from the actual training set and create a new dataframe containing only the sentence ids, the sentence, the target and all the computed contexts. This also makes it easier to integrate context extraction functions implemented in other languages. Afterwards we can compute the context features and join them back with the target features dataframe.

In [329]:
# Context dataframe: sentence id, sentence, target and its character offsets.
df_context = df.loc[:, ['id', 'sentence', 'target', 'start', 'end']]
#df_context['ctx_avg_word_length'] = \
#    df_context[['sentence', 'target']].apply(lambda vals : 
#                               ctx_extraction_window_pre_suc_n(vals[0], vals[1]), axis = 1)

# Preceding-window context (default window size) for every row.
df_context['ctx_extraction_window_pre_n'] = df.apply(
    lambda row: ctx_extraction_window_pre_n(
        row['sentence'], row['target'], row['start'], row['end']),
    axis=1)
df_context.head()

Unnamed: 0,id,sentence,target,start,end,ctx_extraction_window_pre_n
0,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",passed,28,34,"[land, will, be]"
1,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",land,15,19,"[Normally, the]"
2,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",future,43,49,"[passed, down, to]"
3,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",future generations,43,61,"[passed, down, to]"
4,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",generations,50,61,"[down, to, future]"


# Feature Importance
Here we compute individual feature importance based on different metrics. For example, we implement and compute the F-Score, providing an idea of the discrimination power the feature has.

In [None]:
def feat_importance_f_score(dataframe, feat_name, label_name):
    """Compute the F-score of one feature with respect to a binary label.

    The score is the sum of the squared deviations of the two class means
    from the overall mean, divided by the sum of the two within-class sample
    variances (``n - 1`` in the denominator).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data with one column for the feature and one for the label.
    feat_name : str
        Name of the feature column.
    label_name : str
        Name of the label column; rows with label ``0`` form the negative
        class and rows with label ``1`` the positive class.

    Returns
    -------
    float
        The F-score. NaN results when a class has fewer than two rows.
    """
    values = dataframe[feat_name]
    labels = dataframe[label_name]

    # Select the classes with boolean masks so the result depends neither on
    # the dataframe's index labels nor on the row order of a groupby result.
    negative_values = values[labels == 0]
    positive_values = values[labels == 1]

    overall_mean = values.mean()
    mean_negative = negative_values.mean()
    mean_positive = positive_values.mean()

    # Sum of squared deviations of the class means from the overall mean
    class_mean_devs = ((mean_positive - overall_mean) ** 2
                       + (mean_negative - overall_mean) ** 2)

    # Within-class sample variances
    variance_negative = negative_values.var(ddof=1)
    variance_positive = positive_values.var(ddof=1)
    return class_mean_devs / (variance_negative + variance_positive)

def compute_all_feat_importance_metrics(dataframe, label_name):
    """Placeholder: not implemented yet, always returns ``None``."""
    return None
    

# Keep only the computed features and the binary label.
df_feat = df.drop(['id', 'sentence', 'target', 'nat', 'non_nat',
                   'nat_marked', 'non_nat_marked', 'prob'], axis=1)

# Restrict the means to numeric columns: string-valued columns such as
# 'pos_tag' cannot be averaged and make mean() fail on recent pandas versions.
print(df_feat.mean(numeric_only=True))
print(df_feat.groupby('binary').mean(numeric_only=True))

In [None]:
'id', 'sentence', "start", "end", "target", 
              "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]