# Context-Aware Complex Word Identification
Here we devise and implement all the relevant methods for evaluating the influence of context words for the complexity of a given target word. Thus, we implement various context definition methods that extract context words for a target based on different ideas (e.g. local context, grammatical context and semantic context). Afterwards we compute features for the context and use these features to represent the context in the classification task.

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
import nltk

In [9]:
TRAIN_ENGLISH_WIKIPEDIA = "../cwishareddataset/traindevset/" + \
                           "english/Wikipedia_Train.tsv"
df = pd.read_csv(TRAIN_ENGLISH_WIKIPEDIA, sep = "\t")
df.columns = ['id', 'sentence', "start", "end", "target", 
              "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]

### a. Context-Token Aggregation
First we define how feature values of multiple context-tokens should be aggreagated.

In [10]:
from nltk.tokenize import word_tokenize

def agg_ctx_feat_num_average(tokens, func_feature, *args):
    return np.mean([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_median(tokens, func_feature, *args):
    return np.median([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_max(tokens, func_feature, *args):
    return np.max([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_min(tokens, func_feature, *args):
    return np.min([func_feature(token, *args) for token in tokens])

def agg_ctx_feat_num_sum(tokens, func_feature, *args):
    return np.sum([func_feature(token, *args) for token in tokens])

### b. Context Definition
Here we compute different kinds of context definitions. For example, as a baseline we extract all tokens from the sentence except the target. A second approach is to use a n preceeding or n succeding tokens, or a combined window apporach were we extract n tokens preceeding and succeding of the target. A more sophisticated apporach involves dependency parsing of the sentence and applying different extraction heuristics. Finally we also implement a context extraction approach exploting FrameNet semantic parsing.

In [11]:
from collections import Counter

def mult_target(sentence, target):
    counter = Counter(word_tokenize(sentence))
    targets = word_tokenize(target)
    return np.sum([counter[target] for target in targets]) / len(targets)

df['mult_target'] = df[['sentence', 'target']].apply(lambda vals : mult_target(*vals), axis = 1)
df[df.mult_target == 4].head()

Unnamed: 0,id,sentence,start,end,target,nat,non_nat,nat_marked,non_nat_marked,binary,prob,mult_target
448,32L724R85LLGOQ18KTH5U7G6HTKPID,The Russian military is divided into the follo...,4,11,Russian,10,10,0,0,0,0.0,4.0
452,32L724R85LLGOQ18KTH5U7G6HTKPID,The Russian military is divided into the follo...,66,73,Russian,10,10,0,0,0,0.0,4.0
455,32L724R85LLGOQ18KTH5U7G6HTKPID,The Russian military is divided into the follo...,94,101,Russian,10,10,0,0,0,0.0,4.0
457,32L724R85LLGOQ18KTH5U7G6HTKPID,The Russian military is divided into the follo...,117,124,Russian,10,10,0,0,0,0.0,4.0
1861,3SCKNODZ0XHJWL8ZLN0GZTL23Y8N7K,"It is the tallest building in California , the...",10,17,tallest,10,10,0,1,1,0.05,4.0


In [56]:
from nltk.tokenize import word_tokenize
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordNeuralDependencyParser
import os
from functools import lru_cache

java_path = "C:/Program Files (x86)/Java/jdk1.8.0_144/bin/java.exe"
os.environ['JAVAHOME'] = java_path
path_to_jar = 'resources/stanford-dependency-parser/stanford-parser.jar'
path_to_models_jar = 'resources/stanford-dependency-parser/stanford-parser-3.9.1-models.jar'

dependencyParser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

with open("resources/dictionaries/stopwords_en.txt", encoding="utf8") as file:
    content = [line.strip().lower() for line in file.readlines()]
    stop_words = set(content)

def post_process_ctx(context, filtering=True):
    return [token for token in context if 
            (token.isalnum() and (not filtering or preprocess_target(token).lower() not in stop_words))]

def preprocess_target(target):
    return target.strip()

def target_index_char_based(start, end, ctx_tokens):
    size = np.sum([len(token) for token in ctx_tokens]) + len(ctx_tokens)
    target_pos = (start + end) / 2
    target_pos_rel = target_pos / size
    return int(target_pos_rel * len(post_process_ctx(ctx_tokens)))

def targets_with_index(start, end, context):
    curr_pos = 0
    targets = []
    for index, token in enumerate(word_tokenize(context), 1):
        targets.append((token, index, curr_pos, (curr_pos + len(token))))
        curr_pos += len(token) + 1
    return [(target[0], target[1]) for target in targets \
            if target[2] >= start and target[3] <= end]

@lru_cache(maxsize=None)
def dependency_parse(sentence):
    dependency_parser = dependencyParser.raw_parse(sentence)
    dependencies = []
    parsetree = list(dependency_parser)[0]
    for index, node in parsetree.nodes.items():
        for relation, dependant in parsetree.nodes[index]['deps'].items():
            triple = ((node['word'], index), relation, \
                      (parsetree.nodes[dependant[0]]['word'], dependant[0]))
            if relation != 'root': dependencies.append(triple)
    return dependencies

def ctx_extraction_all(context, target):
    ctx_tokens = word_tokenize(context)
    if target in ctx_tokens:
        ctx_tokens.remove(target)
    return ctx_tokens

def ctx_extraction_all_filtered(context, target, filtering = True):
    ctx_tokens = word_tokenize(context)
    post_ctx_tokens = post_process_ctx(ctx_tokens, filtering)
    if target in ctx_tokens:
        ctx_tokens.remove(target)
    return post_process_ctx

def ctx_extraction_window_pre_n(context, target, start, end, filtering = True , n = 3):
    target = preprocess_target(target)
    ctx_tokens = word_tokenize(context[:start])
    post_ctx_tokens = post_process_ctx(ctx_tokens, filtering)
    return post_ctx_tokens[-n:]

def ctx_extraction_window_suc_n(context, target, start, end, filtering = True, n = 3):
    target = preprocess_target(target)
    ctx_tokens = word_tokenize(context[end:])
    post_ctx_tokens = post_process_ctx(ctx_tokens, filtering)
    return post_ctx_tokens[:n]

def ctx_extraction_window_pre_suc_n(context, target, start, end, filtering = True, n = 3):
    ctx_tokens_pre = ctx_extraction_window_pre_n(context, target, start, end, filtering, n)
    ctx_tokens_suc = ctx_extraction_window_suc_n(context, target, start, end, filtering, n)
    ctx_tokens_pre.extend(ctx_tokens_suc)
    return ctx_tokens_pre

def ctx_extraction_dep_in(context, target, start, end):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    return list(set([triple[0][0] for triple in triples if triple[2] in targets]))

def ctx_extraction_dep_out(context, target, start, end):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    return list(set([triple[2][0] for triple in triples if triple[0] in targets]))

def ctx_extraction_dep_in_out(context, target, start, end):
    ctx_tokens_in = ctx_extraction_dep_in(context, target, start, end)
    ctx_tokens_out = ctx_extraction_dep_out(context, target, start, end)
    ctx_tokens_in.extend(ctx_tokens_out)
    return list(set(ctx_tokens_in))

def ctx_extraction_dep_recu_in_n_steps(context, target, start, end, n = 2):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    result_tokens = []
    curr_target = targets
    for step in range(0, n):
        step_result = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_out_n_steps(context, target, start, end, n = 2):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    result_tokens = []
    curr_target = targets
    for step in range(0, n):
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_in_out_n_steps(context, target, start, end, n = 2):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    result_tokens = []
    curr_target = targets
    for step in range(0, n):
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        step_result_out = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        step_result.extend(step_result_out)
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_in_cover(context, target, start, end, cover = 0.1):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    ctx_tokens = word_tokenize(context)
    ctx_tokens_post = post_process_ctx(ctx_tokens)
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        if set(step_result) == set(curr_target):
                break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        curr_cover = len(result_tokens) / len(ctx_tokens_post)
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_out_cover(context, target, start, end, cover = 0.1):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    ctx_tokens = word_tokenize(context)
    ctx_tokens_post = post_process_ctx(ctx_tokens)
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        if set(step_result) == set(curr_target):
                break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        curr_cover = len(result_tokens) / len(ctx_tokens_post)
    return list(set([token[0] for token in result_tokens]))

def ctx_extraction_dep_recu_in_out_cover(context, target, start, end, cover = 0.1):
    target = preprocess_target(target)
    targets = targets_with_index(start, end, context)
    triples = dependency_parse(context)
    ctx_tokens = word_tokenize(context)
    ctx_tokens_post = post_process_ctx(ctx_tokens)
    result_tokens = []
    curr_target = targets
    curr_cover = 0
    while curr_cover < cover:
        step_result = [triple[2] for triple in triples 
                       if triple[0] in curr_target]
        step_result_out = [triple[0] for triple in triples 
                       if triple[2] in curr_target]
        step_result.extend(step_result_out)
        if set(step_result) == set(curr_target):
                break
        curr_target = list(set(step_result))
        result_tokens.extend(step_result)
        curr_cover = len(result_tokens) / len(ctx_tokens_post)
    return list(set([token[0] for token in result_tokens]))

In [None]:
sentence = "Normally, the land will be passed down by future generations in a way " + \
             "that recognizes the community's traditional connection to that country "
target = 'passed'

print('ctx_etraction_all:')
print(ctx_extraction_all_filtered(sentence, target))

print('ctx_extraction_window_pre_n:')
print(ctx_extraction_window_pre_n(sentence, "Normally", 0, 8))
print(ctx_extraction_window_pre_n(sentence, "the", 11, 14))
print(ctx_extraction_window_pre_n(sentence, "land", 15, 19))
print(ctx_extraction_window_pre_n(sentence, "to", 127, 129))
print(ctx_extraction_window_pre_n(sentence, target, 28, 34, n = 5))

print('ctx_extraction_window_suc_n:')
print(ctx_extraction_window_suc_n(sentence, "country", 135, 142))
print(ctx_extraction_window_suc_n(sentence, "to", 127, 129))
print(ctx_extraction_window_suc_n(sentence, "connection", 116, 126))
print(ctx_extraction_window_suc_n(sentence, "community", 91, 100, n = 5))

print('ctx_extraction_window_pre_suc_n:')
print(ctx_extraction_window_pre_suc_n(sentence, "passed", 28, 34))
print(ctx_extraction_window_pre_suc_n(sentence, "the", 11, 14))
print(ctx_extraction_window_pre_suc_n(sentence, "to", 127, 129))

print('ctx_extraction_dep_in:')
print(ctx_extraction_dep_in(sentence, "land", 15, 19))

print('ctx_extraction_dep_out:')
print(ctx_extraction_dep_out(sentence, target, 28, 34))
print(ctx_extraction_dep_out(sentence, "land", 15, 19))

print('ctx_extraction_dep_in_out:')
print(ctx_extraction_dep_in_out(sentence, "land", 15, 19))

print('ctx_extraction_dep_recu_in_n_steps:')
print(ctx_extraction_dep_recu_in_n_steps(sentence, "the", 11, 14, n = 3))

print('ctx_extraction_dep_recu_out_n_steps:')
print(ctx_extraction_dep_recu_out_n_steps(sentence, "the", 11, 14))

print('ctx_extraction_dep_recu_in_out_n_steps:')
print(ctx_extraction_dep_recu_in_out_n_steps(sentence, "the", 11, 14))

print('ctx_extraction_dep_recu_in_cover:')
print(ctx_extraction_dep_recu_in_cover(sentence, "the", 11, 14, cover=0.1))

print('ctx_extraction_dep_recu_out_cover:')
print(ctx_extraction_dep_recu_out_cover(sentence, "the", 11, 14, cover=0.1))

print('ctx_extraction_dep_recu_in_out_cover:')
print(ctx_extraction_dep_recu_in_out_cover(sentence, "the", 11, 14, cover=0.1))

### c. Context Extraction

After we defined all the context extraction approaches, we can apply them on the actual dataset. To do so, we first extract all the distinct sentences from the actual training set and create a new dataframe containing only the sentence ids, the sentence, the target and all the computed contexts. This also makes it easier to integrate context extraction functions implemented in other languages. Afterwards we can compute the context features and join them back with the target features dataframe.

In [57]:
import numpy as np

df_context = df.loc[0:30, ['id', 'sentence', 'target', 'start', 'end']]
filtering = True

df_context['ctx_extraction_window_pre_n'] = df_context.apply(lambda columns : 
                                        ctx_extraction_window_pre_n(columns['sentence'], columns['target'], \
                                                columns['start'], columns['end'], filtering = filtering), axis = 1)

print('ctx_extraction_window_pre_n')
df_context['ctx_extraction_window_suc_n'] = df_context.apply(lambda columns : 
                                        ctx_extraction_window_suc_n(columns['sentence'], columns['target'], \
                                                columns['start'], columns['end'], filtering = filtering), axis = 1)
print('ctx_extraction_window_suc_n')
df_context['ctx_extraction_window_pre_suc_n'] = df_context.apply(lambda columns : 
                                        ctx_extraction_window_pre_suc_n(columns['sentence'], columns['target'], \
                                                columns['start'], columns['end'], filtering = filtering), axis = 1)
print('ctx_extraction_window_pre_suc_n')
df_context['ctx_extraction_dep_in'] = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_in(columns['sentence'], columns['target'], \
                                                columns['start'], columns['end']), axis = 1)
print('ctx_extraction_dep_in')
df_context['ctx_extraction_dep_out'] = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_out(columns['sentence'], columns['target'], \
                                                                   columns['start'], columns['end']), axis = 1)
print('ctx_extraction_dep_out')
# 1. Compute dep_in_out using defined function
df_context['ctx_extraction_dep_in_out'] = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_in_out(columns['sentence'], columns['target'], \
                                                                   columns['start'], columns['end']), axis = 1)
print('ctx_extraction_dep_in_out')
# 2. Compute dep_in_out by combining precomputed results
df_context['ctx_extraction_dep_in_out_dir'] = df_context[['ctx_extraction_dep_in', \
                                                      'ctx_extraction_dep_out']].apply(lambda vals : vals[0]+vals[1], axis=1)

print('ctx_extraction_dep_in_out_dir')
df_context['ctx_extraction_dep_recu_in_n_steps']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_in_n_steps(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], n=2), axis = 1)

print('ctx_extraction_dep_recu_in_n_steps')
df_context['ctx_extraction_dep_recu_out_n_steps']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_out_n_steps(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], n=2), axis = 1)

print('ctx_extraction_dep_recu_out_n_steps')
df_context['ctx_extraction_dep_recu_in_out_n_steps']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_in_out_n_steps(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], n=2), axis = 1)

print('ctx_extraction_dep_recu_in_out_n_steps')
df_context['ctx_extraction_dep_recu_in_cover']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_in_cover(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], cover=0.2), axis = 1)

print('ctx_extraction_dep_recu_in_cover')
df_context['ctx_extraction_dep_recu_out_cover']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_out_cover(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], cover=0.2), axis = 1)

print('ctx_extraction_dep_recu_out_cover')
df_context['ctx_extraction_dep_recu_in_out_cover']  = df_context.apply(lambda columns : 
                                        ctx_extraction_dep_recu_in_out_cover(columns['sentence'], columns['target'], \
                                        columns['start'], columns['end'], cover=0.2), axis = 1)

print('ctx_extraction_dep_recu_in_out_cover')
df_context

ctx_extraction_window_pre_n
ctx_extraction_window_suc_n
ctx_extraction_window_pre_suc_n
ctx_extraction_dep_in
ctx_extraction_dep_out
ctx_extraction_dep_in_out
ctx_extraction_dep_in_out_dir
ctx_extraction_dep_recu_in_n_steps
ctx_extraction_dep_recu_out_n_steps
ctx_extraction_dep_recu_in_out_n_steps
ctx_extraction_dep_recu_in_cover
ctx_extraction_dep_recu_out_cover
ctx_extraction_dep_recu_in_out_cover


Unnamed: 0,id,sentence,target,start,end,ctx_extraction_window_pre_n,ctx_extraction_window_suc_n,ctx_extraction_window_pre_suc_n,ctx_extraction_dep_in,ctx_extraction_dep_out,ctx_extraction_dep_in_out,ctx_extraction_dep_in_out_dir,ctx_extraction_dep_recu_in_n_steps,ctx_extraction_dep_recu_out_n_steps,ctx_extraction_dep_recu_in_out_n_steps,ctx_extraction_dep_recu_in_cover,ctx_extraction_dep_recu_out_cover,ctx_extraction_dep_recu_in_out_cover
0,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",passed,28,34,[land],"[future, generations, recognizes]","[land, future, generations, recognizes]",[],"[land, generations, be, down, Normally, will]","[generations, land, be, down, Normally, will]","[land, generations, be, down, Normally, will]",[],"[to, the, land, generations, be, future, down,...","[to, the, passed, land, generations, be, futur...",[],"[land, generations, be, down, Normally, will]","[land, generations, be, down, Normally, will]"
1,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",land,15,19,[],"[passed, future, generations]","[passed, future, generations]",[passed],[the],"[passed, the]","[passed, the]",[passed],[the],"[the, passed, land, generations, be, down, Nor...",[passed],[the],"[the, passed]"
2,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",future,43,49,"[land, passed]","[generations, recognizes, community]","[land, passed, generations, recognizes, commun...",[generations],[],[generations],[generations],"[passed, generations]",[],"[future, to, passed, generations]","[passed, generations]",[],"[future, to, passed, generations]"
3,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",future generations,43,61,"[land, passed]","[recognizes, community, traditional]","[land, passed, recognizes, community, traditio...","[passed, generations]","[future, to]","[passed, to, future, generations]","[passed, generations, future, to]","[passed, generations]","[future, to]","[to, passed, generations, land, be, future, do...","[passed, generations]","[future, to]","[future, to, passed, generations]"
4,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",generations,50,61,"[land, passed, future]","[recognizes, community, traditional]","[land, passed, future, recognizes, community, ...",[passed],"[future, to]","[passed, to, future]","[passed, future, to]",[passed],"[future, to]","[to, passed, land, generations, be, future, do...",[passed],"[future, to]","[future, to, passed]"
5,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",recognizes,76,86,"[passed, future, generations]","[community, traditional, connection]","[passed, future, generations, community, tradi...",[way],"[connection, that]","[connection, way, that]","[way, connection, that]",[way],"[connection, that, country, community, traditi...","[in, connection, that, way, a, community, coun...",[way],"[connection, that]","[connection, that, way]"
6,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",community,91,100,"[future, generations, recognizes]","[traditional, connection, country]","[future, generations, recognizes, traditional,...",[connection],"[the, 's]","[the, connection, 's]","[connection, the, 's]","[connection, recognizes]","[the, 's]","[connection, the, 's, community, country, trad...","[connection, recognizes]","[the, 's]","[the, connection, 's]"
7,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",traditional,104,115,"[generations, recognizes, community]","[connection, country]","[generations, recognizes, community, connectio...",[connection],[],[connection],[connection],"[connection, recognizes]",[],"[connection, country, community, traditional, ...","[connection, recognizes]",[],"[connection, country, community, traditional, ..."
8,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",traditional connection to that country,104,142,"[generations, recognizes, community]",[],"[generations, recognizes, community]","[connection, recognizes, country]","[to, that, country, community, traditional]","[to, connection, that, country, community, tra...","[connection, recognizes, country, to, that, co...","[connection, recognizes, way, country]","[to, that, the, 's, country, community, tradit...","[to, that, connection, the, 's, way, country, ...","[connection, recognizes, country]","[to, that, country, community, traditional]","[to, that, connection, country, community, tra..."
9,3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...",country,135,142,"[community, traditional, connection]",[],"[community, traditional, connection]",[connection],"[to, that]","[to, connection, that]","[connection, to, that]","[connection, recognizes]","[to, that]","[to, that, connection, community, country, tra...","[connection, recognizes]","[to, that]","[to, that, connection]"


### d. Context Features
After defining all the context definitions and extracting the different kinds of contexts from the sentence, we compute features on the context words. Therefore we first define which of the precomputed contexts to use.

In [62]:
df_context['context'] = df_context.ctx_extraction_window_pre_suc_n

#### (1) Readability Measures
Here we implement some of the most popular and well-known historical readability measures. Most of them need multiple sentences to compute them properly, however, we will apply them on the extracted context.

In [66]:
from textatistic import Textatistic

df_context['rb_dalechall_score'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').dalechall_score)
df_context['rb_flesch_score'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').flesch_score)
df_context['rb_fleschkincaid_score'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').fleschkincaid_score)
df_context['rb_gunningfog_score'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').gunningfog_score)
df_context['rb_polysyblword_count'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').polysyblword_count)
df_context['rb_smog_score'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').smog_score)
df_context['rb_sybl_count'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').sybl_count)
df_context['rb_sybl_count_ratio'] = df_context.context.apply(lambda context : \
                                                Textatistic(' '.join(context) + '.').sybl_count / len(context))

df_context[['target', 'context', 'rb_dalechall_score', 'rb_flesch_score', 'rb_fleschkincaid_score', \
            'rb_gunningfog_score', 'rb_polysyblword_count', 'rb_smog_score', 'rb_sybl_count', 'rb_sybl_count_ratio']].head()

Unnamed: 0,target,context,rb_dalechall_score,rb_flesch_score,rb_fleschkincaid_score,rb_gunningfog_score,rb_polysyblword_count,rb_smog_score,rb_sybl_count,rb_sybl_count_ratio
0,passed,"[land, future, generations, recognizes]",15.6774,-8.725,15.47,21.6,2,11.208143,10,2.5
1,land,"[passed, future, generations]",14.311967,6.39,13.113333,14.533333,1,8.841846,7,2.333333
2,future,"[land, passed, generations, recognizes, commun...",13.3585,-1.28,14.68,26.0,3,13.023867,12,2.4
3,future generations,"[land, passed, recognizes, community, traditio...",13.3585,15.64,12.32,26.0,3,13.023867,11,2.2
4,generations,"[land, passed, future, recognizes, community, ...",14.460767,17.445,12.316667,22.4,3,13.023867,13,2.166667


#### (2) Linguistic Features
Here we implement some of the most popular and well-known historical readability measures. Most of them need multiple sentences to compute them properly, however, we will apply them on the extracted context.

In [None]:
df_context['context'] = df_context['ctx_extraction_window_pre_suc_n']

df_context['ctx_num_tokens'] = df_context.context.apply(lambda context : len(context))
df_context['ctx_avg_length'] = df_context.context.apply(lambda context : agg_ctx_feat_num_average(context, len))
df_context['ctx_avg_word_freq_wiki'] = df_context.context.apply(lambda context : \
                                                    agg_ctx_feat_num_average(context, get_dict_count, word_freq_wiki))
df_context.head()