# Context-Aware Complex Word Identification
Here we devise and implement all the relevant methods for evaluating the influence of context words on the complexity of a given target word. To this end, we implement various context definition methods that extract context words for a target based on different ideas (e.g. local context, grammatical context and semantic context). Afterwards, we compute features for the context and use these features to represent the context in the classification task.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
import nltk

In [2]:
from collections import namedtuple
from collections import defaultdict

# Lightweight record types shared by the notebook cells.
# Dataset: a corpus name together with its train and test parts.
Dataset = namedtuple('Dataset', 'name, train, test')
# FeatureDataset: train/test parts tagged with a feature category (fc) and an aggregation (agg).
FeatureDataset = namedtuple('FeatureDataset', 'name, fc, agg, train, test')
# FeatureCategory: a label plus the callable stored in `func`.
FeatureCategory = namedtuple('FeatureCategory', 'name, func')
# Aggregation: a label plus the aggregation callable stored in `agg`.
Aggregation = namedtuple('Aggregation', 'name, agg')

In [3]:
# Show up to 500 columns when a dataframe is displayed.
pd.set_option('display.max_columns', 500)
# Directory prefix of the dataset files (joined by plain string concatenation below).
MAIN_PATH_DATASET = "../cwishareddataset/traindevset/english/"
genres = ['Wikipedia', 'WikiNews', 'News']
# NOTE(review): this binding is overwritten by the Dataset list a few lines below.
datasets = ['Train', 'Dev']
# Column labels assigned by load_df() to the header-less TSV files.
columns = ['id', 'sentence', "start", "end", "target", 
           "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]


# One descriptor per corpus; train/test hold the split names used to build the file names.
datasets = [Dataset('Wikipedia', 'Train', 'Dev'),
            Dataset('WikiNews', 'Train', 'Dev'),
            Dataset('News', 'Train', 'Dev')]

# Registry of feature categories; filled by later cells.
feature_categories = []

def load_df(path):
    """Read a header-less, tab-separated file into a dataframe.

    The column labels are taken from the module-level ``columns`` list.
    """
    frame = pd.read_csv(path, sep="\t", header=None)
    frame.columns = columns
    return frame

# Replace every descriptor by a Dataset holding the loaded dataframes.
# File names are built as MAIN_PATH_DATASET + <name> + '_' + <split> + '.tsv'.
datasets = [Dataset(d.name, load_df(MAIN_PATH_DATASET + d.name + '_' + d.train + '.tsv'),
                            load_df(MAIN_PATH_DATASET + d.name + '_' + d.test + '.tsv'))
                            for d in datasets]

## 1. Preprocessing

In [7]:
from nltk.stem.wordnet import *
from nltk import word_tokenize
from functools import lru_cache
from utils import penn_to_wn
import re
import unicodedata
import sys

# Single lemmatizer instance reused by wordnet_lemma().
wordNetLemmatizer = WordNetLemmatizer()

def overlaps(start1, end1, start2, end2):
    """Tell whether the span start1..end1 meets the span start2..end2.

    The upper bound is extended by one, so two spans that merely touch
    (one ends exactly where the other starts) also count as overlapping.
    """
    lower = max(start1, start2)
    upper = min(end1, end2) + 1
    return len(range(lower, upper)) > 0

# Translation table: every code point whose Unicode general category
# starts with 'P' (punctuation) is mapped to None, i.e. deleted.
tbl = {
    codepoint: None
    for codepoint in range(sys.maxunicode)
    if unicodedata.category(chr(codepoint))[:1] == 'P'
}

def remove_punctuation(text):
    """Return ``text`` with every character listed in ``tbl`` removed."""
    stripped = text.translate(tbl)
    return stripped

@lru_cache(maxsize=None)
def targets_with_index(start, end, context):
    """Find the tokens of ``context`` that overlap the span ``start``..``end``.

    The sentence is tokenized with ``word_tokenize``. Character offsets are
    assigned to the tokens by rebuilding the whitespace-separated chunks of
    ``context`` from consecutive tokens and skipping the whitespace run after
    each completed chunk.

    Returns a list of ``(token, index)`` tuples, where ``index`` is the
    1-based position of the token in the ``word_tokenize`` output. Tokens
    that were replaced by the ``'"'`` placeholder are not returned.
    Results are memoized per ``(start, end, context)``.
    """
    curr_pos = 0
    targets = []
    j = 0  # index of the whitespace-separated chunk currently being rebuilt
    w = 0  # index of the whitespace run that follows that chunk
    curr_split = ''
    ctx_split = context.split()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    whitespaces = re.findall(r'\s+', context)
    num_whitespaces = [len(token) for token in whitespaces]
    # Extra entry so the last chunk (with no whitespace after it) can be looked up too.
    num_whitespaces.append(1)
    tokens = word_tokenize(context)
    # Tokens that do not occur verbatim in the sentence become a one-character
    # '"' placeholder; placeholders are dropped from the result below.
    tokens = ['"' if token not in context else token for token in tokens]
    for index, token in enumerate(tokens, 1):
        targets.append((token, index, curr_pos, (curr_pos + len(token))))
        curr_pos += len(token)
        curr_split += token
        if ctx_split[j] == curr_split:
            # Chunk complete: jump over the whitespace that follows it.
            curr_pos += num_whitespaces[w]
            j += 1
            w += 1
            curr_split = ''
    vals = [(target[0], target[1]) for target in targets
            if overlaps(start, end, target[2], target[3])]
    return [val for val in vals if val[0] != '"']

@lru_cache(maxsize=None)
def wordnet_pos_tagging(sentence):
    """Tokenize ``sentence`` and return the result of ``nltk.pos_tag`` (memoized)."""
    return nltk.pos_tag(word_tokenize(sentence))

def pos_tags(start, end, sentence):
    """Collect the POS tags of the tokens overlapping ``start``..``end``.

    Tags that are empty after punctuation removal and the tag ``'POS'`` are
    discarded. Returns the remaining tags as a list, or ``None`` when no tag
    is left.
    """
    tagged = wordnet_pos_tagging(sentence)
    located = targets_with_index(start, end, sentence)
    # Token positions are 1-based, the tagged list is 0-based.
    candidate_tags = [tagged[position - 1][1] for _, position in located]
    kept = []
    for tag in candidate_tags:
        if tag != 'POS' and remove_punctuation(tag).strip():
            kept.append(tag)
    return kept or None

def wordnet_lemma(target, pos):
    """Lemmatize ``target`` token by token with the given POS tags.

    Every tag is converted with ``penn_to_wn``; when the conversion yields a
    falsy value, ``'n'`` is used instead. Tokens and tags are paired with
    ``zip``. When ``pos`` is empty or ``None`` the target is returned as is.
    """
    words = nltk.word_tokenize(target)
    if not pos:
        return target
    wn_tags = [penn_to_wn(tag) or 'n' for tag in pos]
    lemmas = []
    for word, wn_tag in zip(words, wn_tags):
        lemmas.append(wordNetLemmatizer.lemmatize(word, wn_tag))
    return ' '.join(lemmas)

def preprocessing(dataframe):
    """Return a copy of ``dataframe`` enriched with normalized text columns.

    Added / rewritten columns:
      * ``p_sentence`` – stripped, lower-cased sentence (taken before the
        quote replacement below).
      * ``sentence``   – sentence with ``''`` replaced by two backticks.
      * ``p_target``   – stripped, lower-cased target.
      * ``pos_tags``   – result of ``pos_tags`` for the target span
        (a list of tags or ``None``).
      * ``pos_tags_pt`` – ``penn_to_wn`` conversion of each tag, ``'n'`` when
        the conversion is falsy; an empty list when no tags are available.
      * ``lemma`` / ``p_lemma`` – output of ``wordnet_lemma`` and its
        stripped, lower-cased form.
    """
    def to_wn_tags(pos):
        # pos_tags() returns None when no usable tag was found; iterating
        # over it directly would raise a TypeError.
        if not isinstance(pos, (list, tuple)):
            return []
        return [penn_to_wn(poss) or 'n' for poss in pos]

    df = dataframe.copy()
    df['p_sentence'] = df.sentence.apply(lambda sent : sent.strip().lower())
    df['sentence'] = df.sentence.apply(lambda sent : sent.replace("''", "``"))
    df['p_target'] = df.target.apply(lambda target : target.strip().lower())
    df['pos_tags'] = df[['start', 'end', 'sentence']].apply(lambda vals : pos_tags(*vals), axis = 1)
    df['pos_tags_pt'] = df.pos_tags.apply(to_wn_tags)
    df['lemma'] = df[['target', 'pos_tags']].apply(lambda vals : wordnet_lemma(*vals), axis = 1)
    df['p_lemma'] = df.lemma.apply(lambda lemma : lemma.strip().lower())
    return df

In [8]:
# Run preprocessing() on the train and test part of every dataset.
preprocessed_datasets = [Dataset(ds.name, preprocessing(ds.train), 
                               preprocessing(ds.test)) for ds in datasets]

In [9]:
# From here on `datasets` refers to the preprocessed dataframes.
datasets = preprocessed_datasets

### a. Context-Token Aggregation
First we define how the feature values of multiple context tokens should be aggregated.

In [186]:
from nltk.tokenize import word_tokenize

def agg_ctx_feat_num_average(tokens, func_feature, *args, **kwargs):
    """Arithmetic mean of ``func_feature(token, *args)`` over ``tokens``.

    Keyword arguments are accepted but not used (a POS-aware variant that
    consumed a ``pos`` keyword is currently disabled).
    """
    values = []
    for token in tokens:
        values.append(func_feature(token, *args))
    return np.mean(values)

def agg_ctx_feat_num_weighted_average(target, func_feature, alpha, *args):
    """Mean of the weighted feature values of the tokens in ``target``.

    ``target`` is split with ``word_tokenize``; each token's feature value is
    multiplied by ``alpha / (alpha + get_unigram_probability(token))``.
    """
    weighted = []
    for token in word_tokenize(target):
        weight = alpha / (alpha + get_unigram_probability(token))
        weighted.append(weight * func_feature(token, *args))
    return np.mean(weighted)

def agg_ctx_feat_num_distance(target, func_feature, *args):
    """Placeholder: no aggregation is implemented yet; always returns ``None``."""
    pass

def agg_ctx_feat_num_median(tokens, func_feature, *args):
    """Median of ``func_feature(token, *args)`` over ``tokens``."""
    scores = list(map(lambda token: func_feature(token, *args), tokens))
    return np.median(scores)

def agg_ctx_feat_num_max(tokens, func_feature, *args):
    """Maximum of ``func_feature(token, *args)`` over ``tokens``.

    Returns ``numpy.nan`` for an empty token list. ``np.max`` raises a
    ``ValueError`` on an empty sequence, and empty contexts do occur (for
    example a target with no preceding window tokens). NaN matches what the
    mean and median aggregations produce for an empty list.
    """
    values = [func_feature(token, *args) for token in tokens]
    if not values:
        return np.nan
    return np.max(values)

def agg_ctx_feat_num_min(tokens, func_feature, *args):
    """Minimum of ``func_feature(token, *args)`` over ``tokens``.

    Returns ``numpy.nan`` for an empty token list. ``np.min`` raises a
    ``ValueError`` on an empty sequence, and empty contexts do occur. NaN
    matches what the mean and median aggregations produce for an empty list.
    """
    values = [func_feature(token, *args) for token in tokens]
    if not values:
        return np.nan
    return np.min(values)

def agg_ctx_feat_num_sum(tokens, func_feature, *args):
    """Sum of ``func_feature(token, *args)`` over ``tokens``."""
    contributions = []
    for token in tokens:
        contributions.append(func_feature(token, *args))
    return np.sum(contributions)

In [187]:
# Pre-built aggregation sets of increasing size: mean only, mean + max,
# and mean / median / max / min.
agg_default = [Aggregation('mean', agg_ctx_feat_num_average)]
aggs_small = [Aggregation('mean', agg_ctx_feat_num_average), Aggregation('max', agg_ctx_feat_num_max)]
aggs_all = [Aggregation('mean', agg_ctx_feat_num_average), Aggregation('median', agg_ctx_feat_num_median),
            Aggregation('max', agg_ctx_feat_num_max), Aggregation('min', agg_ctx_feat_num_min)]
           #Aggregation('weighted_mean', agg_ctx_feat_num_weighted_average_medium)]

In [188]:
# Aggregations actually applied when the context features are computed.
aggs = agg_default

In [None]:
def concat_feature_datasets(*args):
    """Merge several lists of feature datasets position by position.

    The argument lists are zipped; for every group the train frames and the
    test frames are concatenated column-wise and duplicated column labels are
    dropped (first occurrence kept). The resulting ``FeatureDataset`` carries
    the name of the last member of the group and the lists of all feature
    categories and aggregations of the group.
    """
    merged = []
    for group in zip(*args):
        train_frame = None
        test_frame = None
        categories = []
        aggregations = []
        for tpl in group:
            train_part = tpl.train.copy()
            test_part = tpl.test.copy()
            if categories:
                train_frame = pd.concat([train_frame, train_part], axis=1)
                test_frame = pd.concat([test_frame, test_part], axis=1)
            else:
                # First member of the group: start from its frames.
                train_frame, test_frame = train_part, test_part
            categories.append(tpl.fc)
            aggregations.append(tpl.agg)
        unique_train = train_frame.loc[:, ~train_frame.columns.duplicated()]
        unique_test = test_frame.loc[:, ~test_frame.columns.duplicated()]
        merged.append(FeatureDataset(tpl.name, categories, aggregations,
                                     unique_train, unique_test))
    return merged

### b. Context Definition
Here we compute different kinds of context definitions. For example, as a baseline we extract all tokens from the sentence except the target. A second approach is to use the n preceding or n succeeding tokens, or a combined window approach where we extract n tokens preceding and n tokens succeeding the target. A more sophisticated approach involves dependency parsing of the sentence and applying different extraction heuristics. Finally, we also implement a context extraction approach exploiting FrameNet semantic parsing.

In [146]:
from nltk.tokenize import word_tokenize
from nltk.parse.corenlp import *
import os
from functools import lru_cache

# First make sure that the StanfordCoreNLP Server is running under port 9010
parser = CoreNLPDependencyParser(url='http://localhost:9010/')

# Load the stop-word list: one entry per line, stripped and lower-cased,
# stored as a set for membership tests in post_process_ctx().
with open("resources/dictionaries/stopwords_en.txt", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    stop_words = set(content)
    
def overlaps(start1, end1, start2, end2):
    """Tell whether the span start1..end1 meets the span start2..end2.

    The upper bound is extended by one, so two spans that merely touch
    (one ends exactly where the other starts) also count as overlapping.
    """
    lower = max(start1, start2)
    upper = min(end1, end2) + 1
    return len(range(lower, upper)) > 0

def post_process_ctx(context, filtering=True):
    """Keep only the alphanumeric tokens of ``context``.

    With ``filtering`` enabled, tokens whose stripped, lower-cased form is in
    ``stop_words`` are dropped as well. Token order is preserved.
    """
    kept = []
    for token in context:
        if not token.isalnum():
            continue
        if filtering and preprocess_target(token).lower() in stop_words:
            continue
        kept.append(token)
    return kept

def preprocess_target(target):
    """Return ``target`` without leading and trailing whitespace."""
    cleaned = target.strip()
    return cleaned

def target_index_char_based(start, end, ctx_tokens):
    """Estimate the target's position among the filtered context tokens.

    The midpoint of ``start``/``end`` is divided by the summed token lengths
    plus one per token, and that ratio is scaled by the number of tokens that
    remain after ``post_process_ctx``. The result is truncated to ``int``.
    """
    char_total = np.sum([len(token) for token in ctx_tokens]) + len(ctx_tokens)
    midpoint = (start + end) / 2
    relative = midpoint / char_total
    n_kept = len(post_process_ctx(ctx_tokens))
    return int(relative * n_kept)

@lru_cache(maxsize=None)
def targets_with_index(start, end, context):
    """Find the tokens of ``context`` that overlap the span ``start``..``end``.

    The sentence is tokenized with ``word_tokenize``. Character offsets are
    assigned to the tokens by rebuilding the whitespace-separated chunks of
    ``context`` from consecutive tokens and skipping the whitespace run after
    each completed chunk.

    Returns a list of ``(token, index)`` tuples, where ``index`` is the
    1-based position of the token in the ``word_tokenize`` output. Tokens
    that were replaced by the ``'"'`` placeholder are not returned.
    Results are memoized per ``(start, end, context)``.
    """
    curr_pos = 0
    targets = []
    j = 0  # index of the whitespace-separated chunk currently being rebuilt
    w = 0  # index of the whitespace run that follows that chunk
    curr_split = ''
    ctx_split = context.split()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    whitespaces = re.findall(r'\s+', context)
    num_whitespaces = [len(token) for token in whitespaces]
    # Extra entry so the last chunk (with no whitespace after it) can be looked up too.
    num_whitespaces.append(1)
    tokens = word_tokenize(context)
    # Tokens that do not occur verbatim in the sentence become a one-character
    # '"' placeholder; placeholders are dropped from the result below.
    tokens = ['"' if token not in context else token for token in tokens]
    for index, token in enumerate(tokens, 1):
        targets.append((token, index, curr_pos, (curr_pos + len(token))))
        curr_pos += len(token)
        curr_split += token
        if ctx_split[j] == curr_split:
            # Chunk complete: jump over the whitespace that follows it.
            curr_pos += num_whitespaces[w]
            j += 1
            w += 1
            curr_split = ''
    vals = [(target[0], target[1]) for target in targets
            if overlaps(start, end, target[2], target[3])]
    return [val for val in vals if val[0] != '"']

from joblib import Memory
memory = Memory(location='resources/dependency-cache', verbose=0)
@memory.cache
def dependency_parse_with_root(sentence):
    """Dependency-parse ``sentence`` and return all relation triples.

    Uses the first parse produced by ``parser.raw_parse``. Every entry of a
    node's ``'deps'`` mapping becomes one triple
    ``((head_word, head_index), relation, (dependent_word, dependent_index))``.

    Returns an empty list when parsing or triple extraction fails.

    NOTE(review): the function is wrapped by ``memory.cache``, so the empty
    fallback is presumably persisted like any other result and a transient
    parser failure would stay cached. Confirm this and clear the cache
    directory after an outage.
    """
    try:
        parses = parser.raw_parse(sentence)
        parsetree = list(parses)[0]
        dependencies = []
        for index, node in parsetree.nodes.items():
            for relation, dependants in parsetree.nodes[index]['deps'].items():
                for dep in dependants:
                    head = (node['word'], index)
                    dependent = (parsetree.nodes[dep]['word'], dep)
                    dependencies.append((head, relation, dependent))
        return dependencies
    except Exception:
        # Best effort: a failed parse yields no dependencies. Unlike a bare
        # `except:`, this lets KeyboardInterrupt / SystemExit propagate.
        return []

@lru_cache(maxsize=None)
def dependency_parse(sentence):
    """Dependency triples of ``sentence`` without those labelled ``'ROOT'`` (memoized)."""
    all_triples = dependency_parse_with_root(sentence)
    return list(filter(lambda triple: triple[1] != 'ROOT', all_triples))

def ctx_extraction_all(context, target):
    """Tokenize ``context`` and drop the first token equal to ``target``, if any."""
    tokens = word_tokenize(context)
    try:
        tokens.remove(target)
    except ValueError:
        # target is not a token of the sentence: nothing to remove
        pass
    return tokens

def ctx_extraction_all_filtered(context, target, filtering = True):
    """Return the post-processed tokens of ``context`` without the target.

    The sentence is tokenized, the first token equal to ``target`` is removed
    and the remainder is passed through ``post_process_ctx`` with the given
    ``filtering`` flag.

    Fixes two defects of the previous version: it returned the function
    object ``post_process_ctx`` instead of the token list, and it removed the
    target only after filtering, from a list that was never returned.
    """
    ctx_tokens = word_tokenize(context)
    if target in ctx_tokens:
        ctx_tokens.remove(target)
    return post_process_ctx(ctx_tokens, filtering)

def ctx_extraction_window_pre_n(context, target, start, end, filtering = True , n = 3):
    """Return up to ``n`` post-processed tokens that precede the target.

    Only the text before character offset ``start`` is tokenized; ``target``
    and ``end`` are unused and kept for a signature shared with the other
    window extractors.

    A non-positive ``n`` yields an empty list. Previously ``n = 0`` returned
    the complete preceding context, because ``tokens[-0:]`` is the whole list.
    """
    if n <= 0:
        return []
    ctx_tokens = word_tokenize(context[:start])
    post_ctx_tokens = post_process_ctx(ctx_tokens, filtering)
    return post_ctx_tokens[-n:]

def ctx_extraction_window_suc_n(context, target, start, end, filtering = True, n = 3):
    """Return the first ``n`` post-processed tokens that follow the target.

    Only the text from character offset ``end`` onwards is tokenized.
    """
    target = preprocess_target(target)  # stripped, but not used any further
    following = word_tokenize(context[end:])
    return post_process_ctx(following, filtering)[:n]

def ctx_extraction_window_pre_suc_n(context, target, start, end, filtering = True, n = 3):
    """Return the preceding window followed by the succeeding window of the target."""
    before = ctx_extraction_window_pre_n(context, target, start, end, filtering, n)
    after = ctx_extraction_window_suc_n(context, target, start, end, filtering, n)
    return [*before, *after]

def ctx_extraction_dep_in(context, target, start, end):
    """Distinct head words of dependency triples whose dependent is a target token."""
    target = preprocess_target(target)  # stripped, but not used any further
    target_nodes = targets_with_index(start, end, context)
    heads = set()
    for head, _, dependent in dependency_parse(context):
        if dependent in target_nodes:
            heads.add(head[0])
    return list(heads)

def ctx_extraction_dep_out(context, target, start, end):
    """Distinct dependent words of dependency triples whose head is a target token."""
    target = preprocess_target(target)  # stripped, but not used any further
    target_nodes = targets_with_index(start, end, context)
    dependents = set()
    for head, _, dependent in dependency_parse(context):
        if head in target_nodes:
            dependents.add(dependent[0])
    return list(dependents)

def ctx_extraction_dep_in_out(context, target, start, end):
    """Distinct words linked to the target as head or as dependent."""
    incoming = ctx_extraction_dep_in(context, target, start, end)
    outgoing = ctx_extraction_dep_out(context, target, start, end)
    return list(set([*incoming, *outgoing]))

def ctx_extraction_dep_recu_in_n_steps(context, target, start, end, n = 2):
    """Follow head links from the target for ``n`` steps.

    In each step the heads of the current frontier are collected and become
    the next frontier. Returns the distinct words of all collected nodes.
    """
    target = preprocess_target(target)  # stripped, but not used any further
    frontier = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    visited = []
    for _ in range(n):
        heads = [head for head, _rel, dependent in triples
                 if dependent in frontier]
        frontier = list(set(heads))
        visited += heads
    return list({node[0] for node in visited})

def ctx_extraction_dep_recu_out_n_steps(context, target, start, end, n = 2):
    """Follow dependent links from the target for ``n`` steps.

    In each step the dependents of the current frontier are collected and
    become the next frontier. Returns the distinct words of all collected nodes.
    """
    target = preprocess_target(target)  # stripped, but not used any further
    frontier = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    visited = []
    for _ in range(n):
        dependents = [dependent for head, _rel, dependent in triples
                      if head in frontier]
        frontier = list(set(dependents))
        visited += dependents
    return list({node[0] for node in visited})

def ctx_extraction_dep_recu_in_out_n_steps(context, target, start, end, n = 2):
    """Follow head and dependent links from the target for ``n`` steps.

    In each step the dependents of the current frontier, followed by its
    heads, are collected and become the next frontier. Returns the distinct
    words of all collected nodes.
    """
    target = preprocess_target(target)  # stripped, but not used any further
    frontier = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    visited = []
    for _ in range(n):
        dependents = [dependent for head, _rel, dependent in triples
                      if head in frontier]
        heads = [head for head, _rel, dependent in triples
                 if dependent in frontier]
        neighbours = dependents + heads
        frontier = list(set(neighbours))
        visited += neighbours
    return list({node[0] for node in visited})

def ctx_extraction_dep_recu_in_cover(context, target, start, end, cover = 0.1):
    """Follow head links from the target until ``cover`` is reached.

    Coverage is the number of collected nodes (repetitions included) divided
    by the number of tokens that survive ``post_process_ctx``. Expansion also
    stops as soon as a step yields the same node set as the current frontier.
    ``target`` is unused and kept for the shared extractor signature.

    If no token survives post-processing, coverage counts as reached after the
    first step instead of raising ``ZeroDivisionError``.

    Returns the distinct words of all collected nodes.
    """
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    total = len(post_process_ctx(word_tokenize(context)))
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[0] for triple in triples
                       if triple[2] in curr_target]
        if set(step_result) == set(curr_target):
            break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        # Infinite coverage ends the loop for any finite `cover`.
        curr_cover = len(result_tokens) / total if total else float('inf')
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_out_cover(context, target, start, end, cover = 0.1):
    """Follow dependent links from the target until ``cover`` is reached.

    Coverage is the number of collected nodes (repetitions included) divided
    by the number of tokens that survive ``post_process_ctx``. Expansion also
    stops as soon as a step yields the same node set as the current frontier.
    ``target`` is unused and kept for the shared extractor signature.

    If no token survives post-processing, coverage counts as reached after the
    first step instead of raising ``ZeroDivisionError``.

    Returns the distinct words of all collected nodes.
    """
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    total = len(post_process_ctx(word_tokenize(context)))
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[2] for triple in triples
                       if triple[0] in curr_target]
        if set(step_result) == set(curr_target):
            break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        # Infinite coverage ends the loop for any finite `cover`.
        curr_cover = len(result_tokens) / total if total else float('inf')
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_in_out_cover(context, target, start, end, cover = 0.1):
    """Follow head and dependent links from the target until ``cover`` is reached.

    Each step collects the dependents of the current frontier followed by its
    heads. Coverage is the number of collected nodes (repetitions included)
    divided by the number of tokens that survive ``post_process_ctx``.
    Expansion also stops as soon as a step yields the same node set as the
    current frontier. ``target`` is unused and kept for the shared extractor
    signature.

    If no token survives post-processing, coverage counts as reached after the
    first step instead of raising ``ZeroDivisionError``.

    Returns the distinct words of all collected nodes.
    """
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    total = len(post_process_ctx(word_tokenize(context)))
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[2] for triple in triples
                       if triple[0] in curr_target]
        step_result_in = [triple[0] for triple in triples
                          if triple[2] in curr_target]
        step_result.extend(step_result_in)
        if set(step_result) == set(curr_target):
            break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        # Infinite coverage ends the loop for any finite `cover`.
        curr_cover = len(result_tokens) / total if total else float('inf')
    return list(set([token[0] for token in result_tokens]))

In [None]:
# Manual smoke test: run every context extractor on one example sentence and
# print the result under the extractor's name.
# NOTE(review): the character offsets passed below look like they were taken
# from the dataset version of this sentence (with a space before the comma);
# in this literal the same words appear to start one character earlier - confirm.
sentence = "Normally, the land will be passed down by future generations in a way " + \
             "that recognizes the community's traditional connection to that country ."
target = 'passed'

# The call below is ctx_extraction_all_filtered, although the label says "all".
print('ctx_etraction_all:')
print(ctx_extraction_all_filtered(sentence, target))

# Window of preceding tokens, without stop-word filtering.
print('ctx_extraction_window_pre_n:')
print(ctx_extraction_window_pre_n(sentence, "Normally", 0, 8, filtering=False))
print(ctx_extraction_window_pre_n(sentence, "the", 11, 14, filtering=False))
print(ctx_extraction_window_pre_n(sentence, "land", 15, 19, filtering=False))
print(ctx_extraction_window_pre_n(sentence, "to", 126, 128, filtering=False))
print(ctx_extraction_window_pre_n(sentence, target, 28, 34, n = 5, filtering=False))

# Window of succeeding tokens, without stop-word filtering.
print('ctx_extraction_window_suc_n:')
print(ctx_extraction_window_suc_n(sentence, "country", 135, 142, filtering=False))
print(ctx_extraction_window_suc_n(sentence, "to", 126, 128, filtering=False))
print(ctx_extraction_window_suc_n(sentence, "connection", 115, 125, filtering=False))
print(ctx_extraction_window_suc_n(sentence, "community", 91, 100, n = 5, filtering=False))

# Combined preceding + succeeding window.
print('ctx_extraction_window_pre_suc_n:')
print(ctx_extraction_window_pre_suc_n(sentence, "passed", 28, 34, filtering=False))
print(ctx_extraction_window_pre_suc_n(sentence, "the", 11, 14, filtering=False))
print(ctx_extraction_window_pre_suc_n(sentence, "to", 127, 129, filtering=False))

# Dependency-based extractors (these call the dependency parser).
print('ctx_extraction_dep_in:')
print(ctx_extraction_dep_in(sentence, "land", 15, 19))

print('ctx_extraction_dep_out:')
print(ctx_extraction_dep_out(sentence, target, 28, 34))
print(ctx_extraction_dep_out(sentence, "land", 15, 19))

print('ctx_extraction_dep_in_out:')
print(ctx_extraction_dep_in_out(sentence, "land", 15, 19))

# Multi-step variants with a fixed number of steps.
print('ctx_extraction_dep_recu_in_n_steps:')
print(ctx_extraction_dep_recu_in_n_steps(sentence, "the", 11, 14, n = 3))

print('ctx_extraction_dep_recu_out_n_steps:')
print(ctx_extraction_dep_recu_out_n_steps(sentence, "the", 11, 14))

print('ctx_extraction_dep_recu_in_out_n_steps:')
print(ctx_extraction_dep_recu_in_out_n_steps(sentence, "the", 11, 14))

# Multi-step variants that stop at a coverage threshold.
print('ctx_extraction_dep_recu_in_cover:')
print(ctx_extraction_dep_recu_in_cover(sentence, "the", 11, 14, cover=0.1))

print('ctx_extraction_dep_recu_out_cover:')
print(ctx_extraction_dep_recu_out_cover(sentence, "the", 11, 14, cover=0.1))

print('ctx_extraction_dep_recu_in_out_cover:')
print(ctx_extraction_dep_recu_in_out_cover(sentence, "the", 11, 14, cover=0.1))

### c. Context Extraction

After defining all the context extraction approaches, we can apply them to the actual dataset. To do so, we first extract all the distinct sentences from the training set and create a new dataframe containing only the sentence ids, the sentence, the target and all the computed contexts. This also makes it easier to integrate context extraction functions implemented in other languages. Afterwards, we can compute the context features and join them back with the target features dataframe.

In [217]:
# Record types for the context pipeline.
# Context: a context definition - its name, its parameters and the function applying it.
Context = namedtuple('Context', 'name, params, func')
# ContextFeatureCategory: a label plus the feature function stored in `func`.
ContextFeatureCategory = namedtuple('ContextFeatureCategory', 'name, func')
# ContextDataset: train/test parts tagged with the context they were built with.
ContextDataset = namedtuple('ContextDataset', 'name, context, train, test')
# ContextFeatureDataset: as above, plus the feature category (fc) and aggregation (agg).
ContextFeatureDataset = namedtuple('ContextFeatureDataset', 'name, context, fc, agg, train, test')
# Registries that later cells may fill.
contexts = []
ctx_fcs = []
ctx_feature_datasets = []

### (1. Extraction functions)

In [218]:
import numpy as np

def ctx_window_pre_n(dataframe, n, filtering):
    """Return a copy of ``dataframe`` with a ``context`` column.

    The column holds, per row, the result of ``ctx_extraction_window_pre_n``
    for the row's ``sentence``, ``target``, ``start`` and ``end`` with the
    given window size ``n`` and ``filtering`` flag.
    """
    def extract(row):
        return ctx_extraction_window_pre_n(
            row['sentence'], row['target'], row['start'], row['end'],
            n = n, filtering = filtering)

    result = dataframe.copy()
    result['context'] = result.apply(extract, axis = 1)
    return result

# Two ready-made variants of ctx_window_pre_n with a window of 2 tokens:
# one without and one with stop-word filtering. Both Context records share the
# name 'ctx_window_pre_n' and differ only in their params.
ctx_window_pre_n_2_no_filter = lambda dataframe : ctx_window_pre_n(dataframe, 2, False)
ctx_window_pre_n_2_filter = lambda dataframe : ctx_window_pre_n(dataframe, 2, True)
ctx_window_2_nf = Context('ctx_window_pre_n', {'n':2, 'filtering':False}, ctx_window_pre_n_2_no_filter)
ctx_window_2_f = Context('ctx_window_pre_n', {'n':2, 'filtering':True}, ctx_window_pre_n_2_filter)
# Register both so they are applied to every dataset.
contexts.append(ctx_window_2_nf)
contexts.append(ctx_window_2_f)

# print('ctx_extraction_window_pre_n')
# df_context['ctx_extraction_window_suc_n'] = df_context.apply(lambda columns : 
#                                         ctx_extraction_window_suc_n(columns['sentence'], columns['target'], \
#                                                 columns['start'], columns['end'], filtering = filtering), axis = 1)
# print('ctx_extraction_window_suc_n')
# df_context['ctx_extraction_window_pre_suc_n'] = df_context.apply(lambda columns : 
#                                         ctx_extraction_window_pre_suc_n(columns['sentence'], columns['target'], \
#                                                 columns['start'], columns['end'], filtering = filtering), axis = 1)
# print('ctx_extraction_window_pre_suc_n')
# df_context['ctx_extraction_dep_in'] = df_context.apply(lambda columns : 
#                                         ctx_extraction_dep_in(columns['sentence'], columns['target'], \
#                                                 columns['start'], columns['end']), axis = 1)
# print('ctx_extraction_dep_in')
# df_context['ctx_extraction_dep_out'] = df_context.apply(lambda columns : 
#                                         ctx_extraction_dep_out(columns['sentence'], columns['target'], \
#                                                                    columns['start'], columns['end']), axis = 1)
# print('ctx_extraction_dep_out')
# # 1. Compute dep_in_out using defined function
# df_context['ctx_extraction_dep_in_out'] = df_context.apply(lambda columns : 
#                                         ctx_extraction_dep_in_out(columns['sentence'], columns['target'], \
#                                                                    columns['start'], columns['end']), axis = 1)
# print('ctx_extraction_dep_in_out')
# # 2. Compute dep_in_out by combining precomputed results
# df_context['ctx_extraction_dep_in_out_dir'] = df_context[['ctx_extraction_dep_in', \
#                                                       'ctx_extraction_dep_out']].apply(lambda vals : vals[0]+vals[1], axis=1)

# print('ctx_extraction_dep_in_out_dir')
# df_context['ctx_extraction_dep_recu_in_n_steps']  = df_context.apply(lambda columns : 
#                                         ctx_extraction_dep_recu_in_n_steps(columns['sentence'], columns['target'], \
#                                         columns['start'], columns['end'], n=2), axis = 1)

# print('ctx_extraction_dep_recu_in_n_steps')
# df_context['ctx_extraction_dep_recu_out_n_steps']  = df_context.apply(lambda columns : 
#                                         ctx_extraction_dep_recu_out_n_steps(columns['sentence'], columns['target'], \
#                                         columns['start'], columns['end'], n=2), axis = 1)

# print('ctx_extraction_dep_recu_out_n_steps')
# df_context['ctx_extraction_dep_recu_in_out_n_steps']  = df_context.apply(lambda columns : 
#                                         ctx_extraction_dep_recu_in_out_n_steps(columns['sentence'], columns['target'], \
#                                         columns['start'], columns['end'], n=2), axis = 1)

# print('ctx_extraction_dep_recu_in_out_n_steps')
# df_context['ctx_extraction_dep_recu_in_cover']  = df_context.apply(lambda columns : 
#                                         ctx_extraction_dep_recu_in_cover(columns['sentence'], columns['target'], \
#                                         columns['start'], columns['end'], cover=0.2), axis = 1)

# print('ctx_extraction_dep_recu_in_cover')
# df_context['ctx_extraction_dep_recu_out_cover']  = df_context.apply(lambda columns : 
#                                         ctx_extraction_dep_recu_out_cover(columns['sentence'], columns['target'], \
#                                         columns['start'], columns['end'], cover=0.2), axis = 1)

# print('ctx_extraction_dep_recu_out_cover')
# df_context['ctx_extraction_dep_recu_in_out_cover']  = df_context.apply(lambda columns : 
#                                         ctx_extraction_dep_recu_in_out_cover(columns['sentence'], columns['target'], \
#                                         columns['start'], columns['end'], cover=0.2), axis = 1)

# print('ctx_extraction_dep_recu_in_out_cover')
# df_context

### (2) Context Preprocessing

In [219]:
def preprocess_ctx(context):
    """Return the context tokens stripped of surrounding whitespace and lower-cased."""
    normalised = []
    for token in context:
        normalised.append(token.strip().lower())
    return normalised

def preprocess_ctx_df(dataframe):
    """Return a copy of ``dataframe`` with ``p_context`` = ``preprocess_ctx(context)`` per row."""
    result = dataframe.copy()
    result['p_context'] = result.context.apply(preprocess_ctx)
    return result

In [228]:
# Apply every registered context definition to every dataset (train and test)
# and normalize the extracted context tokens. The result is ordered by context
# first, then by dataset.
ctx_datasets = [ContextDataset(ds.name, ctx, preprocess_ctx_df(ctx.func(ds.train)), 
                preprocess_ctx_df(ctx.func(ds.test)))
                for ctx in contexts
                for ds in datasets]

### d. Context Features
After defining all the context definitions and extracting the different kinds of contexts from the sentence, we compute features on the context words. Therefore we first define which of the precomputed contexts to use.

### (d.1) Context Complexity Features

In [169]:
# Word -> frequency lookup read from the Wikipedia frequency dump, plus the
# running total of all frequencies.
word_freq_wiki = {}
freq_sum_wiki = 0
with open("resources/word-freq-dumps/enwiki-20150602-words-frequency.txt", encoding="utf8") as file:
    for line in file:
        # Split at the first space: text before it is the word, text after it the count.
        word, freq = line.partition(" ")[::2]
        word_freq_wiki[word.strip()] = int(freq)
        freq_sum_wiki+=int(freq)
        
def get_dict_count(target, freqs):
    """Look up the stripped, lower-cased ``target`` in ``freqs``; 0 if absent."""
    key = target.strip().lower()
    return freqs[key] if key in freqs else 0

In [205]:
def ctx_features_context_complexity(dataframe, agg):
    """Add aggregated complexity features of the context tokens.

    ``ctx_length`` aggregates the token lengths and ``ctx_freq_wiki`` the
    counts from ``word_freq_wiki``, both over ``p_context`` with the given
    ``agg`` callable. All missing values of the resulting frame are then
    replaced by 0. Works on a copy of ``dataframe``.
    """
    def aggregated_length(context):
        return agg(context, len)

    def aggregated_frequency(context):
        return agg(context, get_dict_count, word_freq_wiki)

    featured = dataframe.copy()
    featured['ctx_length'] = featured.p_context.apply(aggregated_length)
    featured['ctx_freq_wiki'] = featured.p_context.apply(aggregated_frequency)
    return featured.fillna(0)

# Register the context-complexity features under the label 'baseline_1'.
# NOTE(review): this appends to feature_categories; the ctx_fcs list defined
# earlier is not touched here - confirm that is intended.
ctx_fc_context_complexity = ContextFeatureCategory('baseline_1', ctx_features_context_complexity)
feature_categories.append(ctx_fc_context_complexity)

In [235]:
# Compute the context-complexity features for every context dataset and every
# active aggregation (train and test part each).
ctx_datasets_fc_context_complexity = [ContextFeatureDataset(ctx_ds.name, ctx_ds.context, ctx_fc_context_complexity, agg, 
        ctx_fc_context_complexity.func(ctx_ds.train, agg.agg), ctx_fc_context_complexity.func(ctx_ds.test, agg.agg)) 
        for ctx_ds in ctx_datasets for agg in aggs]

  out=out, **kwargs)


In [236]:
# Inspect the training frame of the fourth computed feature dataset.
ctx_datasets_fc_context_complexity[3].train

Unnamed: 0,id,sentence,start,end,target,nat,non_nat,nat_marked,non_nat_marked,binary,prob,p_target,pos_tags,pos_tags_pt,lemma,p_lemma,p_sentence,context,p_context,ctx_length,ctx_freq_wiki
0,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",0,8,Normally,10,10,0,1,1,0.05,normally,[RB],[r],Normally,normally,"normally , the land will be passed down to fut...",[],[],0.0,0.0
1,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",28,34,passed,10,10,0,1,1,0.05,passed,[VBN],[v],pass,pass,"normally , the land will be passed down to fut...",[land],[land],4.0,501336.0
2,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",15,19,land,10,10,0,0,0,0.00,land,[NN],[n],land,land,"normally , the land will be passed down to fut...",[],[],0.0,0.0
3,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",43,49,future,10,10,1,0,1,0.05,future,[JJ],[a],future,future,"normally , the land will be passed down to fut...","[land, passed]","[land, passed]",5.0,339534.0
4,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",43,61,future generations,10,10,1,2,1,0.15,future generations,"[JJ, NNS]","[a, n]",future generation,future generation,"normally , the land will be passed down to fut...","[land, passed]","[land, passed]",5.0,339534.0
5,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",50,61,generations,10,10,3,2,1,0.25,generations,[NNS],[n],generation,generation,"normally , the land will be passed down to fut...","[passed, future]","[passed, future]",6.0,210370.5
6,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",76,86,recognizes,10,10,2,4,1,0.30,recognizes,[VBZ],[v],recognize,recognize,"normally , the land will be passed down to fut...","[future, generations]","[future, generations]",8.5,138436.5
7,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",91,100,community,10,10,0,0,0,0.00,community,[NN],[n],community,community,"normally , the land will be passed down to fut...","[generations, recognizes]","[generations, recognizes]",10.5,23922.0
8,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",104,115,traditional,10,10,1,3,1,0.20,traditional,[JJ],[a],traditional,traditional,"normally , the land will be passed down to fut...","[recognizes, community]","[recognizes, community]",9.5,282824.5
9,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",104,142,traditional connection to that country,10,10,0,0,0,0.00,traditional connection to that country,"[JJ, NN, TO, DT, NN]","[a, n, n, n, n]",traditional connection to that country,traditional connection to that country,"normally , the land will be passed down to fut...","[recognizes, community]","[recognizes, community]",9.5,282824.5


In [None]:
from joblib import Memory
memory = Memory(location='resources/dependency-cache', verbose=0)
@memory.cache
def dependency_parse_with_root(sentence):
    """Dependency-parse ``sentence`` and return all relation triples.

    Uses the first parse produced by ``dependencyParser.raw_parse``. Every
    entry of a node's ``'deps'`` mapping becomes one triple
    ``((head_word, head_index), relation, (dependent_word, dependent_index))``.

    Returns an empty list when parsing or triple extraction fails.

    NOTE(review): the function is wrapped by ``memory.cache``, so the empty
    fallback is presumably persisted like any other result and a transient
    parser failure would stay cached. Confirm this and clear the cache
    directory after an outage.
    """
    try:
        parses = dependencyParser.raw_parse(sentence)
        parsetree = list(parses)[0]
        dependencies = []
        for index, node in parsetree.nodes.items():
            for relation, dependants in parsetree.nodes[index]['deps'].items():
                for dep in dependants:
                    head = (node['word'], index)
                    dependent = (parsetree.nodes[dep]['word'], dep)
                    dependencies.append((head, relation, dependent))
        return dependencies
    except Exception:
        # Best effort: a failed parse yields no dependencies. Unlike a bare
        # `except:`, this lets KeyboardInterrupt / SystemExit propagate.
        return []

@lru_cache(maxsize=None)
def dependency_parse(sentence):
    """Dependency triples of ``sentence`` without those labelled ``'root'`` (memoized)."""
    all_triples = dependency_parse_with_root(sentence)
    return list(filter(lambda triple: triple[1] != 'root', all_triples))


def dep_dist_to_head(target, start, end, context):
    """Mean number of tokens between the target tokens and their heads.

    For every triple whose dependent is a target token, the absolute index
    difference minus one is taken. NaN (no such triple) is turned into 0 by
    ``np.nan_to_num``.
    """
    target_nodes = targets_with_index(start, end, context)
    gaps = []
    for head, _, dependent in dependency_parse(context):
        if dependent in target_nodes:
            gaps.append(np.abs(head[1] - dependent[1]) - 1)
    return np.nan_to_num(np.mean(gaps))

def dep_dist_to_root(target, start, end, context):
    """Mean number of tokens between the target token(s) and the root word.

    ``target`` is accepted for a uniform feature signature but not used.
    ``start``/``end``/``context`` are forwarded to ``targets_with_index``
    (defined elsewhere; presumably yields the target's ``(word, index)``
    pairs -- confirm). The root word is the dependent of the first arc
    labelled ``'root'``.

    Returns 0 when the parse has no root arc or when no arc points at a
    target token; otherwise the mean of the per-token distances.
    """
    targets = targets_with_index(start, end, context)
    triples = dependency_parse_with_root(context)
    root_index = next((dependent[1] for _head, relation, dependent in triples
                       if relation == 'root'), None)
    if root_index is None:
        return 0
    # A token's distance is the number of tokens strictly between it and the
    # root. The root token itself would yield -1, so clamp each distance at 0;
    # this keeps multi-token targets that contain the root from averaging to a
    # negative value.
    distances = [max(abs(root_index - dependent[1]) - 1, 0)
                 for _head, _relation, dependent in triples if dependent in targets]
    # Averaging an empty list gives NaN, which is reported as 0.
    return np.nan_to_num(np.mean(distances))

def dep_relation_to_head(target, start, end, context):
    """Dependency relation label of the arc pointing at the target.

    ``target`` is accepted for a uniform feature signature but not used.
    Collects the relation of every arc (root arc included) whose dependent
    is one of the tokens returned by ``targets_with_index``.

    Returns that label when exactly one such arc exists, otherwise the
    catch-all label ``'misc'`` (no arc, or several arcs as for multi-token
    targets).
    """
    target_tokens = targets_with_index(start, end, context)
    incoming = [relation
                for _head, relation, dependent in dependency_parse_with_root(context)
                if dependent in target_tokens]
    if len(incoming) != 1:
        return 'misc'
    return incoming[0]
    

def dep_head_word_len(target, start, end, context):
    """Mean character length of the head word(s) governing the target.

    ``target`` is accepted for a uniform feature signature but not used.
    Only non-root arcs are considered (``dependency_parse``), and only those
    whose dependent is one of the tokens from ``targets_with_index``.

    Returns the mean head-word length, or 0 when no such arc exists (the
    NaN from averaging an empty list is converted to 0).
    """
    target_tokens = targets_with_index(start, end, context)
    head_lengths = [len(head[0])
                    for head, _relation, dependent in dependency_parse(context)
                    if dependent in target_tokens]
    return np.nan_to_num(np.mean(head_lengths))

def dep_num_dependents(target, start, end, context):
    """Count the arcs whose head is one of the target tokens.

    ``target`` is accepted for a uniform feature signature but not used.
    The target tokens come from ``targets_with_index(start, end, context)``;
    the arcs come from ``dependency_parse_with_root(context)``.
    """
    target_tokens = targets_with_index(start, end, context)
    return sum(1 for head, _relation, _dependent in dependency_parse_with_root(context)
               if head in target_tokens)

def dep_max_num_dependents(context):
    """Largest number of dependents attached to a single node of the parse.

    Heads are counted per node, i.e. per ``(word, index)`` pair, so two
    occurrences of the same word in the sentence are not merged into one
    count. The head of the ``'root'`` arc is included in the count.

    Returns 0 when the parse of ``context`` yields no arcs.
    """
    triples = dependency_parse_with_root(context)
    dependents_per_head = Counter(head for head, _relation, _dependent in triples)
    return max(dependents_per_head.values(), default=0)

In [None]:
 df['dep_dist_to_head'] = df[['target', 'start', 'end', 'sentence']].apply(lambda vals : 
                                                                            dep_dist_to_head(*vals), axis=1)
    df['dep_dist_to_root'] = df[['target', 'start', 'end', 'sentence']].apply(lambda vals : 
                                                                            dep_dist_to_root(*vals), axis=1)
    df['dep_dist_to_root_norm'] = df[['dep_dist_to_root', 'sentence']].apply(lambda vals : \
                                                        float(vals[0]) / (len(word_tokenize(vals[1]))-1), axis=1)
    df['dep_relation_to_head'] = df[['target', 'start', 'end', 'sentence']].apply(lambda vals : \
                                                                dep_relation_to_head(*vals), axis = 1)
    df['dep_num_dependents'] = df[['target', 'start', 'end', 'sentence']].apply(lambda vals : \
                                                                        dep_num_dependents(*vals), axis = 1)
    df['dep_max_num_dependents'] = df.sentence.apply(lambda sentence : dep_max_num_dependents(sentence))
    df['dep_num_dependents_norm'] = df.dep_num_dependents / df.dep_max_num_dependents
    df['dep_head_word_len'] = df[['target', 'start', 'end', 'sentence']].apply(lambda vals : \
                                                                        dep_head_word_len(*vals), axis = 1)

#### (1) Readability Measures
Here we implement some of the most popular and well-known historical readability measures. Most of them need multiple sentences to be computed properly; however, we apply them to the extracted context.

In [66]:
from textatistic import Textatistic

# Analyse each context once: the context tokens are joined with spaces and
# terminated with a full stop, and every rb_* column below reads from that
# single Textatistic instance instead of re-analysing the same text per measure.
readability_stats = [Textatistic(' '.join(context) + '.')
                     for context in df_context.context]

# Each measure is stored in the column 'rb_' + <Textatistic attribute name>.
for measure in ('dalechall_score', 'flesch_score', 'fleschkincaid_score',
                'gunningfog_score', 'polysyblword_count', 'smog_score', 'sybl_count'):
    df_context['rb_' + measure] = [getattr(stats, measure) for stats in readability_stats]

# Syllables per context token.
df_context['rb_sybl_count_ratio'] = [stats.sybl_count / len(context)
                                     for stats, context
                                     in zip(readability_stats, df_context.context)]

df_context[['target', 'context', 'rb_dalechall_score', 'rb_flesch_score', 'rb_fleschkincaid_score', \
            'rb_gunningfog_score', 'rb_polysyblword_count', 'rb_smog_score', 'rb_sybl_count', 'rb_sybl_count_ratio']].head()

Unnamed: 0,target,context,rb_dalechall_score,rb_flesch_score,rb_fleschkincaid_score,rb_gunningfog_score,rb_polysyblword_count,rb_smog_score,rb_sybl_count,rb_sybl_count_ratio
0,passed,"[land, future, generations, recognizes]",15.6774,-8.725,15.47,21.6,2,11.208143,10,2.5
1,land,"[passed, future, generations]",14.311967,6.39,13.113333,14.533333,1,8.841846,7,2.333333
2,future,"[land, passed, generations, recognizes, commun...",13.3585,-1.28,14.68,26.0,3,13.023867,12,2.4
3,future generations,"[land, passed, recognizes, community, traditio...",13.3585,15.64,12.32,26.0,3,13.023867,11,2.2
4,generations,"[land, passed, future, recognizes, community, ...",14.460767,17.445,12.316667,22.4,3,13.023867,13,2.166667


#### (2) Linguistic Features
Here we compute simple linguistic statistics of the extracted context: the number of context tokens, the average length of the context tokens, and their average word frequency in the Wikipedia frequency list.

In [None]:
# Select which extracted context the features below are computed on.
df_context['context'] = df_context['ctx_extraction_window_pre_suc_n']

# Number of tokens in the context.
df_context['ctx_num_tokens'] = df_context['context'].apply(len)
# Aggregates over the context tokens via agg_ctx_feat_num_average (defined
# elsewhere; presumably averages the given per-token function -- confirm).
df_context['ctx_avg_length'] = df_context['context'].apply(
    lambda tokens: agg_ctx_feat_num_average(tokens, len))
df_context['ctx_avg_word_freq_wiki'] = df_context['context'].apply(
    lambda tokens: agg_ctx_feat_num_average(tokens, get_dict_count, word_freq_wiki))
df_context.head()