# Complexity Lexicon Construction

## (1.0) Load Datasets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple

Dataset = namedtuple('Dataset', 'name, train, test')
Model = namedtuple('Model', 'type, name, dimension, corpus, model')

MAIN_PATH_DATASET = "../cwishareddataset/traindevset/english/"
genres = ['Wikipedia', 'WikiNews', 'News']
datasets = ['Train', 'Dev']
columns = ['id', 'sentence', "start", "end", "target", 
           "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]


datasets = [Dataset('Wikipedia', 'Train', 'Dev'),
            Dataset('WikiNews', 'Train', 'Dev'),
            Dataset('News', 'Train', 'Dev')]

feature_categories = []

def load_df(path):
    df = pd.read_csv(path, header=None, sep = "\t")
    df.columns = columns
    return df

datasets = [Dataset(d.name, load_df(MAIN_PATH_DATASET + d.name + '_' + d.train + '.tsv'),
                            load_df(MAIN_PATH_DATASET + d.name + '_' + d.test + '.tsv'))
                            for d in datasets]

## (1.1) Load Embedding Models

### (1.1.1) Load GloVe embeddings

In [2]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

MAIN_PATH = 'D:/workspace_python/CoWoReId/python/resources/word-embeddings/'

glove_defs = [#Model('glove', 'glove.42B.300d.txt', 300, 'cc42B', None),  
              #Model('glove', 'glove.840B.300d.txt', 300, 'cc840B', None), 
              Model('glove', 'glove.6B.50d.txt', 50, 'wikipedia+gigaword5', None), 
              #Model('glove', 'glove.6B.100d.txt',100, 'wikipedia+gigaword5', None),
              #Model('glove', 'glove.6B.200d.txt', 200, 'wikipedia+gigaword5', None), 
              #Model('glove', 'glove.6B.300d.txt', 300, 'wikipedia+gigaword5', None),
              Model('glove', 'glove.twitter.27B.25d.txt', 25, 'twitter', None)]
              #Model('glove', 'glove.twitter.27B.50d.txt', 50, 'twitter', None), 
              #Model('glove', 'glove.twitter.27B.100d.txt', 100, 'twitter', None), 
              #Model('glove', 'glove.twitter.27B.200d.txt', 200, 'twitter', None)]

glove_models = []
for model in glove_defs:
    glove_file = datapath(MAIN_PATH + model.name)
    tmp_file = get_tmpfile(model.name + '-temp')
    glove2word2vec(glove_file, tmp_file)
    vecs = KeyedVectors.load_word2vec_format(tmp_file)
    glove_models.append(Model(model.type, model.name, model.dimension, model.corpus, vecs))
    print('load model : {}'.format(model.name))
    
print(glove_models)



load model : glove.6B.50d.txt
load model : glove.twitter.27B.25d.txt
[Model(type='glove', name='glove.6B.50d.txt', dimension=50, corpus='wikipedia+gigaword5', model=<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x0000001D60287A20>), Model(type='glove', name='glove.twitter.27B.25d.txt', dimension=25, corpus='twitter', model=<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x0000001D6DCFBE80>)]


### (1.1.2) Load word2vec embeddings

In [3]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

MAIN_PATH = 'D:/workspace_python/CoWoReId/python/resources/word-embeddings/'
model_word2vec = Model('word2vec', 'GoogleNews-vectors-negative300.bin' , 300, 'GoogleNews', \
            KeyedVectors.load_word2vec_format(datapath(MAIN_PATH + 'GoogleNews-vectors-negative300.bin'), binary=True))

### (1.1.3) Load FastText embeddings

In [4]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText

MAIN_PATH = 'D:/workspace_python/CoWoReId/python/resources/word-embeddings/'
model_fastText = Model('FastText', 'wiki.en.vec', 300, 'Wikipedia', \
                 KeyedVectors.load_word2vec_format(MAIN_PATH + 'wiki.en.vec'))

In [5]:
models = []
models.extend(glove_models)
models.append(model_word2vec)
models.append(model_fastText)

## (2) Boostrapped Lexicon Construction

In [46]:
def ngram_embedding_similarity(model, word_l, word_r, missing_strat, ngram_repr):
    vecs_l = [model[word.strip().lower()] if word.strip().lower() in model.vocab 
                  else missing_strat(word, model.vector_size) 
                  for word in word_l.split()]   
    vecs_r = [model[word.strip().lower()] if word.strip().lower() in model.vocab \
                else missing_strat(word, model.vector_size)
                for word in word_r.split()]
    vec_l = ngram_repr(np.array(vecs_l))
    vec_r = ngram_repr(np.array(vecs_r))
    return np.dot(vec_l,vec_r) / (np.linalg.norm(vec_l) \
            * np.linalg.norm(vec_r))

In [55]:
from IPython.core.debugger import Tracer

def bootstrap_lexicon(model, vocab, seeds_l, seeds_r, embedding_sim, \
                      missing_strat, ref_term, ngram_repr, agg_embed=True, epochs=10, \
                      bound_l=-1, bound_r=1, thresh_l=-0.5,thresh_r=0.5):
    if not all(seed in vocab for seed in seeds_l):
        raise ValueError('Not all left seeds contained in vocabulary')
    if not all(seed in vocab for seed in seeds_r):
        raise ValueError('Not all right seeds contained in vocabulary')
    num_missing = np.sum([np.sum([1 for word in mwe.split()
                        if word.strip().lower() not in model.vocab]) for mwe in vocab])
    print('Missing vocab in model : {} / {}%'.format(num_missing, \
          (num_missing/len(vocab)*100)))
    if agg_embed:
        internal_vocab = vocab
    else:
        internal_vocab = [word for mwe in vocab for word in mwe.split()]
    # 1. Initialize the left and right seeds
    se_l = {seed : bound_l for seed in seeds_l}
    se_r = {seed : bound_r for seed in seeds_r}
    lexicon = se_l.copy()
    lexicon.update(se_r)
    for curr_epoch in range(1,epochs+1):
        # 2. Compute left and right weights
        #print(se_l)
        sum_l = np.abs(np.sum([score for word, score in se_l.items()]))
        sum_r = np.abs(np.sum([score for word, score in se_r.items()]))
        weight_l = sum_r / (sum_r + sum_l)
        weight_r = sum_l / (sum_r + sum_l)
        #print(sum_l)
        #print(sum_r)
        print('Epoch {} : Se_l_size = {}, Se_r_size = {}, weight_l = {}, weight_r = {},'.format(\
                      curr_epoch, len(se_l), len(se_r), weight_l, weight_r))
        for index, curr_word in enumerate(internal_vocab):
            if((index % 500) == 0):
                print('{}|{}:{}%   '.format(index, len(internal_vocab), 
                                    (index/len(internal_vocab))), end='')
            if curr_word in se_l or curr_word in se_r:
                continue
            # Compute the weighted left and right scores and sum them
            score_l = [(weight_l * score * \
                        embedding_sim(model, curr_word, seed, missing_strat, ngram_repr)) \
                        for seed, score in se_l.items()]
            score_r = [(weight_r * score * \
                        embedding_sim(model, curr_word, seed, missing_strat, ngram_repr)) \
                        for seed, score in se_r.items()]
            #print('Word : {}'.format(curr_word))
            #print(score_r)
            #print(score_l)
            score = np.sum(score_l) + np.sum(score_r)
            #print('final score : {}'.format(score))
            lexicon[curr_word] = score
            Tracer()()
            #print('{} : {}'.format(curr_word, score))
            # Add word to the seed set if the score is low or high enough
            if score <= thresh_l: se_l[curr_word] = score
            if score >= thresh_r: se_r[curr_word] = score
        #print(lexicon)
    # 3. Compute final scores and normalize them
    sim_ref = lexicon.get(ref_term)
    #print('SIM_REF:{}'.format(sim_ref))
    if not sim_ref:
        return ValueError('Reference term {} not found in lexicon'.format(ref_term))
    coll_l = {seed : (score - sim_ref) for seed, score in lexicon.items() \
                if (score - sim_ref) < 0}
    coll_r = {seed : (score - sim_ref) for seed, score in lexicon.items() \
                if (score - sim_ref) > 0}
    max_l = np.max(np.abs([score for _, score in coll_l.items()]))
    max_r = np.max(np.abs([score for _, score in coll_r.items()]))
    #print(coll_l)
    #print(max_l)
    lexicon[ref_term] = lexicon[ref_term] - sim_ref
    for word, score in coll_l.items():
        lexicon[word] = score / max_l
    for word, score in coll_r.items():
        lexicon[word] = score / max_r
    # 4. Aggregate MWE complexity scores
    if not agg_embed:
        for mwe in vocab:
            lexicon[mwe] = np.mean([lexicon[word] for word in mwe.split()])
    return lexicon


def bootstrap_lexicon_simple_norm(models, vocab, seeds_l, seeds_r, embedding_sim, \
                      missing_strat, ngram_repr, agg_embed=True, epochs=10, \
                      bound_l=-1, bound_r=1, thresh_l=-0.5,thresh_r=0.5):
    if not all(seed in vocab for seed in seeds_l):
        raise ValueError('Not all left seeds contained in vocabulary')
    if not all(seed in vocab for seed in seeds_r):
        raise ValueError('Not all right seeds contained in vocabulary')
    num_missing = np.sum([1 for word in vocab if word not in model.vocab])
    print('Missing vocab in model : {} / {}%'.format(num_missing, \
          (num_missing/len(vocab)*100)))
    if agg_embed:
        internal_vocab = vocab
    else:
        internal_vocab = [word for mwe in vocab for word in mwe.split()]
    # 1. Initialize the left and right seeds
    se_l = {seed : bound_l for seed in seeds_l}
    se_r = {seed : bound_r for seed in seeds_r}
    lexicon = se_l.copy()
    lexicon.update(se_r)
    for curr_epoch in range(1,epochs+1):
        # 2. Compute left and right weights
        #print(se_l)
        sum_l = np.abs(np.sum([score for word, score in se_l.items()]))
        sum_r = np.abs(np.sum([score for word, score in se_r.items()]))
        weight_l = sum_r / (sum_r + sum_l)
        weight_r = sum_l / (sum_r + sum_l)
        print('Epoch {} : Se_l_size = {}, Se_r_size = {}, weight_l = {}, weight_r = {},'.format(\
                      curr_epoch, len(se_l), len(se_r), weight_l, weight_r))
        for curr_word in internal_vocab:
            if curr_word in se_l or curr_word in se_r:
                continue
            # Compute the weighted left and right scores and sum them
            score_l = [(weight_l * score * \
                        embedding_sim(model, curr_word, seed, missing_strat, ngram_repr)) \
                        for seed, score in se_l.items()]
            score_r = [(weight_r * score * \
                        embedding_sim(model, curr_word, seed, missing_strat, ngram_repr)) \
                        for seed, score in se_r.items()]
            print('Word : {} = {}'.format(curr_word, score_l))
            print(score_r)
            score = np.sum(score_l) + np.sum(score_r)
            print('final score : {}'.format(score))
            lexicon[curr_word] = score
            #print('{} : {}'.format(curr_word, score))
            # Add word to the seed set if the score is low or high enough
            if score <= thresh_l: se_l[curr_word] = score
            if score >= thresh_r: se_r[curr_word] = score
        #print(lexicon)
    # 3. Compute final scores and normalize them
    coll_l = {seed : score for seed, score in lexicon.items() \
                if score < 0}
    coll_r = {seed : score for seed, score in lexicon.items() \
                if score > 0}
    max_l = np.max(np.abs([score for _, score in coll_l.items()]))
    max_r = np.max(np.abs([score for _, score in coll_r.items()]))
    for word, score in coll_l.items():
        lexicon[word] = score / max_l
    for word, score in coll_r.items():
        lexicon[word] = score / max_r
    # 4. Aggregate MWE complexity scores
    if not agg_embed:
        for mwe in vocab:
            lexicon[mwe] = np.mean([lexicon[word] for word in mwe.split()])
    return lexicon

def bootstrap_lexicon_multiple_embeddings(models, vocab, seeds_l, seeds_r, embedding_sim, \
                      missing_strat, ref_term, ngram_repr, agg_embed=True, epochs=10, \
                      bound_l=-1, bound_r=1, thresh_l=-0.5,thresh_r=0.5):
    if not all(seed in vocab for seed in seeds_l):
        raise ValueError('Not all left seeds contained in vocabulary')
    if not all(seed in vocab for seed in seeds_r):
        raise ValueError('Not all right seeds contained in vocabulary')
    num_missing = np.sum([1 for word in vocab if not \
                    any([word in model.vocab for model in models])])
    print('Missing vocab in models : {} / {}%'.format(num_missing, \
          (num_missing/len(vocab)*100)))
    if agg_embed:
        internal_vocab = vocab
    else:
        internal_vocab = [word for mwe in vocab for word in mwe.split()]
    # 1. Initialize the left and right seeds
    se_l = {seed : bound_l for seed in seeds_l}
    se_r = {seed : bound_r for seed in seeds_r}
    lexicon = se_l.copy()
    lexicon.update(se_r)
    for curr_epoch in range(1,epochs+1):
        # 2. Compute left and right weights
        #print(se_l)
        sum_l = np.abs(np.sum([score for word, score in se_l.items()]))
        sum_r = np.abs(np.sum([score for word, score in se_r.items()]))
        weight_l = sum_r / (sum_r + sum_l)
        weight_r = sum_l / (sum_r + sum_l)
        print(sum_l)
        print(sum_r)
        print('Epoch {} : Se_l_size = {}, Se_r_size = {}, weight_l = {}, weight_r = {},'.format(\
                      curr_epoch, len(se_l), len(se_r), weight_l, weight_r))
        for curr_word in internal_vocab:
            if curr_word in se_l or curr_word in se_r:
                continue
            # Compute the weighted left and right scores and sum them
            score_l = [(weight_l * score * \
                        np.mean([embedding_sim(model, curr_word, seed, missing_strat, ngram_repr) 
                                for model in models])) \
                        for seed, score in se_l.items()]
            score_r = [(weight_r * score * \
                        np.mean([embedding_sim(model, curr_word, seed, missing_strat, ngram_repr)
                                for model in models])) \
                        for seed, score in se_r.items()]
            print('Word : {}'.format(curr_word))
            print(score_r)
            print(score_l)
            score = np.sum(score_l) + np.sum(score_r)
            print('final score : {}'.format(score))
            lexicon[curr_word] = score
            #print('{} : {}'.format(curr_word, score))
            # Add word to the seed set if the score is low or high enough
            if score <= thresh_l: se_l[curr_word] = score
            if score >= thresh_r: se_r[curr_word] = score
        #print(lexicon)
    # 3. Compute final scores and normalize them
    sim_ref = lexicon.get(ref_term)
    print('SIM_REF:{}'.format(sim_ref))
    if not sim_ref:
        return ValueError('Reference term {} not found in lexicon'.format(ref_term))
    coll_l = {seed : (score - sim_ref) for seed, score in lexicon.items() \
                if (score - sim_ref) < 0}
    coll_r = {seed : (score - sim_ref) for seed, score in lexicon.items() \
                if (score - sim_ref) > 0}
    max_l = np.max(np.abs([score for _, score in coll_l.items()]))
    max_r = np.max(np.abs([score for _, score in coll_r.items()]))
    #print(coll_l)
    #print(max_l)
    lexicon[ref_term] = lexicon[ref_term] - sim_ref
    for word, score in coll_l.items():
        lexicon[word] = score / max_l
    for word, score in coll_r.items():
        lexicon[word] = score / max_r
    # 4. Aggregate MWE complexity scores
    if not agg_embed:
        for mwe in vocab:
            lexicon[mwe] = np.mean([lexicon[word] for word in mwe.split()])
    return lexicon

def bootstrap_lexicon_no_weight(model, vocab, seeds_l, seeds_r, embedding_sim, \
                      missing_strat, ref_term, ngram_repr, agg_embed=True, epochs=10, \
                      bound_l=-1, bound_r=1, thresh_l=-0.5,thresh_r=0.5):
    if not all(seed in vocab for seed in seeds_l):
        raise ValueError('Not all left seeds contained in vocabulary')
    if not all(seed in vocab for seed in seeds_r):
        raise ValueError('Not all right seeds contained in vocabulary')
    num_missing = np.sum([1 for word in vocab if not \
                    any([word in model.vocab for model in models])])
    print('Missing vocab in models : {} / {}%'.format(num_missing, \
          (num_missing/len(vocab)*100)))
    if agg_embed:
        internal_vocab = vocab
    else:
        internal_vocab = [word for mwe in vocab for word in mwe.split()]
    # 1. Initialize the left and right seeds
    se_l = {seed : bound_l for seed in seeds_l}
    se_r = {seed : bound_r for seed in seeds_r}
    lexicon = se_l.copy()
    lexicon.update(se_r)
    for curr_epoch in range(1,epochs+1):
        # 2. Compute left and right weights
        #print(se_l)
        sum_l = np.abs(np.sum([score for word, score in se_l.items()]))
        sum_r = np.abs(np.sum([score for word, score in se_r.items()]))
        weight_l = sum_r / (sum_r + sum_l)
        weight_r = sum_l / (sum_r + sum_l)
        print(sum_l)
        print(sum_r)
        print('Epoch {} : Se_l_size = {}, Se_r_size = {}, weight_l = {}, weight_r = {},'.format(\
                      curr_epoch, len(se_l), len(se_r), weight_l, weight_r))
        for curr_word in internal_vocab:
            if curr_word in se_l or curr_word in se_r:
                continue
            # Compute the weighted left and right scores and sum them
            score_l = [(score * \
                        np.mean([embedding_sim(model, curr_word, seed, missing_strat, ngram_repr) 
                                for model in models])) \
                        for seed, score in se_l.items()]
            score_r = [(score * \
                        np.mean([embedding_sim(model, curr_word, seed, missing_strat, ngram_repr)
                                for model in models])) \
                        for seed, score in se_r.items()]
            print('Word : {}'.format(curr_word))
            print(score_r)
            print(score_l)
            score = np.mean(score_l) + np.mean(score_r)
            print('final score : {}'.format(score))
            lexicon[curr_word] = score
            #print('{} : {}'.format(curr_word, score))
            # Add word to the seed set if the score is low or high enough
            if score <= thresh_l: se_l[curr_word] = score
            if score >= thresh_r: se_r[curr_word] = score
        #print(lexicon)
    # 3. Compute final scores and normalize them
    sim_ref = lexicon.get(ref_term)
    print('SIM_REF:{}'.format(sim_ref))
    if not sim_ref:
        return ValueError('Reference term {} not found in lexicon'.format(ref_term))
    coll_l = {seed : (score - sim_ref) for seed, score in lexicon.items() \
                if (score - sim_ref) < 0}
    coll_r = {seed : (score - sim_ref) for seed, score in lexicon.items() \
                if (score - sim_ref) > 0}
    max_l = np.max(np.abs([score for _, score in coll_l.items()]))
    max_r = np.max(np.abs([score for _, score in coll_r.items()]))
    #print(coll_l)
    #print(max_l)
    lexicon[ref_term] = lexicon[ref_term] - sim_ref
    for word, score in coll_l.items():
        lexicon[word] = score / max_l
    for word, score in coll_r.items():
        lexicon[word] = score / max_r
    # 4. Aggregate MWE complexity scores
    if not agg_embed:
        for mwe in vocab:
            lexicon[mwe] = np.mean([lexicon[word] for word in mwe.split()])
    return lexicon

## (3) Toy-Example Lexicon

In [50]:
from ngram_representation import missing_strat_random
from ngram_representation import ngram_repr_bow_mean
import gensim

seeds_complex = ['aboriginal']
seeds_non_complex = ['bad']

# Build the vocabulary
vocabulary = []
vocabulary.extend(seeds_complex)
vocabulary.extend(seeds_non_complex)
vocabulary.extend(['good', 'inuit', 'and', 'to this end'])

print('---------Complex Seeds----------------')
print(seeds_complex)
print('---------Non-Complex Seeds----------------')
print(seeds_non_complex)
print('---------Vocabulary----------------')
print(vocabulary)

model = models[0].model
ref_term = 'and'
lexicon = bootstrap_lexicon(model, vocabulary, seeds_non_complex, seeds_complex, \
                  ngram_embedding_similarity, missing_strat_random, ref_term, \
                  ngram_repr_bow_mean, agg_embed=False, epochs=3, thresh_l=-0.5, thresh_r=0.5)
print(lexicon)

---------Complex Seeds----------------
['aboriginal']
---------Non-Complex Seeds----------------
['bad']
---------Vocabulary----------------
['aboriginal', 'bad', 'good', 'inuit', 'and', 'to this end']
Missing vocab in model : 1 / 16.666666666666664%
1
1
Epoch 1 : Se_l_size = 1, Se_r_size = 1, weight_l = 0.5, weight_r = 0.5,
Word : good
[0.09518544375896454]
[-0.398244708776474]
final score : -0.30305926501750946
Word : inuit
[0.34829169511795044]
[0.06974420696496964]
final score : 0.4180359020829201
Word : and
[0.17070919275283813]
[-0.30853986740112305]
final score : -0.1378306746482849
Word : to
[0.13541758060455322]
[-0.3115454614162445]
final score : -0.17612788081169128
Word : this
[0.131220281124115]
[-0.3739129602909088]
final score : -0.24269267916679382
Word : end
[0.08204780519008636]
[-0.3172760307788849]
final score : -0.23522822558879852
1
1
Epoch 2 : Se_l_size = 1, Se_r_size = 1, weight_l = 0.5, weight_r = 0.5,
Word : good
[0.09518544375896454]
[-0.398244708776474]
fina

In [45]:
from ngram_representation import missing_strat_random
from ngram_representation import ngram_repr_bow_mean
import gensim

seeds_complex = ['aboriginal']
seeds_non_complex = ['bad']

# Build the vocabulary
vocabulary = []
vocabulary.extend(seeds_complex)
vocabulary.extend(seeds_non_complex)
vocabulary.extend(['good', 'inuit', 'and', 'to this end', 'extraordinary'])

print('---------Complex Seeds----------------')
print(seeds_complex)
print('---------Non-Complex Seeds----------------')
print(seeds_non_complex)
print('---------Vocabulary----------------')
print(vocabulary)

models_selected = [model.model for model in models if model.type == 'glove']
ref_term = 'and'
lexicon = bootstrap_lexicon_multiple_embeddings(models_selected, vocabulary, seeds_non_complex, seeds_complex, \
                  ngram_embedding_similarity, missing_strat_random, ref_term, \
                  ngram_repr_bow_mean, agg_embed=False, epochs=3, thresh_l=-0.5, thresh_r=0.5)
print(lexicon)

---------Complex Seeds----------------
['aboriginal']
---------Non-Complex Seeds----------------
['bad']
---------Vocabulary----------------
['aboriginal', 'bad', 'good', 'inuit', 'and', 'to this end', 'extraordinary']
Missing vocab in models : 1 / 14.285714285714285%
1
1
Epoch 1 : Se_l_size = 1, Se_r_size = 1, weight_l = 0.5, weight_r = 0.5,
Word : good
[0.11704779416322708]
[-0.4278191030025482]
final score : -0.31077130883932114
Word : inuit
[0.3482291102409363]
[0.0792631059885025]
final score : 0.4274922162294388
Word : and
[0.19104844331741333]
[-0.3692472577095032]
final score : -0.17819881439208984
Word : to
[0.15895500779151917]
[-0.36740386486053467]
final score : -0.2084488570690155
Word : this
[0.15309616923332214]
[-0.4176422953605652]
final score : -0.26454612612724304
Word : end
[0.1130194142460823]
[-0.3775354027748108]
final score : -0.2645159885287285
Word : extraordinary
[0.20164288580417633]
[-0.2545890212059021]
final score : -0.05294613540172577
1
1
Epoch 2 : Se_l

## (4) Complexity Lexicon

### (4.1) MWE with Embedding Aggregation

### (4.2) MWE with Complexity Score Aggregation

In [None]:
import pandas as pd
from ngram_representation import missing_strat_random
from ngram_representation import ngram_repr_bow_mean
from ngram_representation import ngram_repr_bow_max
from ngram_representation import ngram_repr_bow_min
from ngram_representation import ngram_repr_wiki_weighted_bow

TRAIN_ENGLISH_WIKIPEDIA = "../cwishareddataset/traindevset/" + \
                           "english/Wikipedia_Train.tsv"
TEST_ENGLISH_WIKIPEDIA = "../cwishareddataset/testset/" + \
                           "english/Wikipedia_Test.tsv"
df = pd.read_csv(TRAIN_ENGLISH_WIKIPEDIA, sep = "\t")
df_test = pd.read_csv(TEST_ENGLISH_WIKIPEDIA, sep = '\t')
df.columns = ['id', 'sentence', "start", "end", "target", 
              "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]
df_test.columns = ['id', 'sentence', "start", "end", "target", 
              "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]

Target = namedtuple('Target', 'target, dataset')
Lexicon = namedtuple('Lexicon', 'name, epochs, thresh_l, thresh_r, agg_embed, lexicon')

# Get the basic seeds for complex and non_complex words
# Group targets into complex and non-complex and flatten MWE
targets_complex = set([Target(mwe, ds.name) for ds in datasets 
                    for mwe in ds.train.loc[ds.train['binary'] == 1,]['target'].tolist()])
targets_non_complex = set([Target(mwe, ds.name) for ds in datasets 
                    for mwe in ds.train.loc[ds.train['binary'] == 0,]['target'].tolist()])

# Get the rest of the vocabulary as the test data to compute
# the complexity score on
targets_test = [Target(mwe, ds.name) for ds in datasets 
                for mwe in ds.test['target'].tolist()]

# Clean groups from overlapping words
targets_complex_cleaned = list(targets_complex.difference(targets_non_complex))
targets_non_complex_cleaned = list(targets_non_complex.difference(targets_complex))
targets_complex = list(targets_complex)
targets_non_complex = list(targets_non_complex)

print('---------Training Targets----------------')
print('Targets complex : {}'.format(len(targets_complex)))
print('Target non-complex : {}'.format(len(targets_non_complex)))
print('Targets clean-complex : {}'.format(len(targets_complex_cleaned)))
print('Target clean-non-complex : {}'.format(len(targets_non_complex_cleaned)))
print('---------Testing Seeds----------------')
print('Target test : {}'.format(len(targets_test)))

lexicons = []
# Parameters for lexicon construction
ref_term = 'and'
epochs=3
thresh_l=-0.5
thresh_r=0.5
agg_embed=False
embedding_sim = ngram_embedding_similarity
ngram_repr = ngram_repr_bow_mean
missing_strat = missing_strat_random
model = models[0].model

for ds in datasets:
    t_complex = [target.target for target in targets_complex_cleaned 
                 if target.dataset == ds.name]
    t_non_complex = [target.target for target in targets_non_complex_cleaned
                    if target.dataset == ds.name]
    t_test = [target.target for target in targets_test 
              if target.dataset == ds.name]
    vocabulary = []
    vocabulary.extend(t_complex)
    vocabulary.extend(t_non_complex)
    vocabulary.extend(t_test)
    print('-----------------------------------------')
    print('-----------------------------------------')
    print('Dataset :{}'.format(ds.name))
    print('---------Training Targets----------------')
    print('Targets complex : {}'.format(len(t_complex)))
    print('Target non-complex : {}'.format(len(t_non_complex)))
    print('---------Testing Seeds----------------')
    print('Target test : {}'.format(len(t_test)))
    lexicon = bootstrap_lexicon(model, vocabulary, t_non_complex, t_complex, \
                  embedding_sim=embedding_sim, \
                  missing_strat=missing_strat, ref_term=ref_term, \
                  ngram_repr=ngram_repr, agg_embed=agg_embed, epochs=epochs, \
                  thresh_l=thresh_l, thresh_r=thresh_r)
    lexicons.append(Lexicon(ds.name, epochs, thresh_l, thresh_r, agg_embed, lexicon))

print(len(lexicons))

---------Training Targets----------------
Targets complex : 7688
Target non-complex : 6316
Targets clean-complex : 6559
Target clean-non-complex : 5187
---------Testing Seeds----------------
Target test : 3328
-----------------------------------------
-----------------------------------------
Dataset :Wikipedia
---------Training Targets----------------
Targets complex : 1901
Target non-complex : 1603
---------Testing Seeds----------------
Target test : 694
Missing vocab in model : 192.0 / 4.5736064792758455%
Epoch 1 : Se_l_size = 1603, Se_r_size = 1901, weight_l = 0.5425228310502284, weight_r = 0.4574771689497717,
0|5306:0.0%   > [0;32m<ipython-input-55-d8c434c28a3a>[0m(48)[0;36mbootstrap_lexicon[0;34m()[0m
[0;32m     46 [0;31m            [1;31m#print('{} : {}'.format(curr_word, score))[0m[1;33m[0m[1;33m[0m[0m
[0m[0;32m     47 [0;31m            [1;31m# Add word to the seed set if the score is low or high enough[0m[1;33m[0m[1;33m[0m[0m
[0m[0;32m---> 48 [0;31m 

In [52]:
[(word, score) for word, score in lexicon.items() if score < -0.5]

[('knowledge', -0.6338390641618713), ('mind', -1.0)]