# Complexity Lexicon Construction

## (1.0) Load Datasets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple

# Lightweight records: a dataset (genre name plus its train/test parts) and
# an embedding model description (the loaded vectors are stored in `model`).
Dataset = namedtuple('Dataset', 'name, train, test')
Model = namedtuple('Model', 'type, name, dimension, corpus, model')

MAIN_PATH_DATASET = "../cwishareddataset/traindevset/english/"
genres = ['Wikipedia', 'WikiNews', 'News']
# Column names that load_df assigns to the header-less TSV files.
columns = ['id', 'sentence', "start", "end", "target",
           "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]

# One spec per genre. `train` and `test` hold the split names that appear in
# the file names (<genre>_<split>.tsv). The specs are derived from `genres`
# so the genre list is written down only once; the former intermediate
# `datasets = ['Train', 'Dev']` assignment was dead code (overwritten before
# it was ever read) and has been dropped.
datasets = [Dataset(genre, 'Train', 'Dev') for genre in genres]

feature_categories = []

def load_df(path):
    """Read a tab-separated file that has no header row into a DataFrame.

    The resulting frame's column labels are replaced by the module-level
    ``columns`` list.
    """
    frame = pd.read_csv(path, sep="\t", header=None)
    frame.columns = columns
    return frame

# Replace the split names in every spec by the DataFrames loaded from
# <MAIN_PATH_DATASET><genre>_<split>.tsv.
datasets = [
    Dataset(
        name=spec.name,
        train=load_df(MAIN_PATH_DATASET + spec.name + '_' + spec.train + '.tsv'),
        test=load_df(MAIN_PATH_DATASET + spec.name + '_' + spec.test + '.tsv'),
    )
    for spec in datasets
]

## (1.1) Load Embedding Models

### (1.1.1) Load GloVe embeddings

In [2]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

MAIN_PATH = 'D:/workspace_python/CoWoReId/python/resources/word-embeddings/'

# GloVe files to load; `model` is None until the vectors have been read.
# Commented-out entries are alternative files that are currently disabled.
glove_defs = [
    #Model('glove', 'glove.42B.300d.txt', 300, 'cc42B', None),
    #Model('glove', 'glove.840B.300d.txt', 300, 'cc840B', None),
    Model('glove', 'glove.6B.50d.txt', 50, 'wikipedia+gigaword5', None),
    #Model('glove', 'glove.6B.100d.txt',100, 'wikipedia+gigaword5', None),
    #Model('glove', 'glove.6B.200d.txt', 200, 'wikipedia+gigaword5', None),
    #Model('glove', 'glove.6B.300d.txt', 300, 'wikipedia+gigaword5', None),
    Model('glove', 'glove.twitter.27B.25d.txt', 25, 'twitter', None),
    #Model('glove', 'glove.twitter.27B.50d.txt', 50, 'twitter', None),
    #Model('glove', 'glove.twitter.27B.100d.txt', 100, 'twitter', None),
    #Model('glove', 'glove.twitter.27B.200d.txt', 200, 'twitter', None),
]

glove_models = []
# The loop variable is deliberately NOT called `model`: a module-level loop
# variable stays bound after the loop, and later cells use the name `model`
# for the embedding vectors themselves.
for glove_def in glove_defs:
    # The path is passed as-is. gensim's `datapath` helper resolves names
    # relative to gensim's bundled test data, which is not where these
    # files live.
    glove_file = MAIN_PATH + glove_def.name
    tmp_file = get_tmpfile(glove_def.name + '-temp')
    # Convert the GloVe text file to word2vec text format so that
    # KeyedVectors.load_word2vec_format can read it.
    glove2word2vec(glove_file, tmp_file)
    vecs = KeyedVectors.load_word2vec_format(tmp_file)
    # Same description as the definition, now carrying the loaded vectors.
    glove_models.append(glove_def._replace(model=vecs))
    print('load model : {}'.format(glove_def.name))

print(glove_models)



load model : glove.6B.50d.txt
load model : glove.twitter.27B.25d.txt
[Model(type='glove', name='glove.6B.50d.txt', dimension=50, corpus='wikipedia+gigaword5', model=<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x00000039A4E52978>), Model(type='glove', name='glove.twitter.27B.25d.txt', dimension=25, corpus='twitter', model=<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x00000039B24ABDD8>)]


### (1.1.2) Load word2vec embeddings

In [3]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

MAIN_PATH = 'D:/workspace_python/CoWoReId/python/resources/word-embeddings/'
# Pre-trained GoogleNews word2vec vectors in binary format. The path is
# handed to gensim directly, the same way the FastText cell does it;
# gensim's `datapath` helper is meant for gensim's bundled test data, not
# for arbitrary files.
model_word2vec = Model(
    'word2vec', 'GoogleNews-vectors-negative300.bin', 300, 'GoogleNews',
    KeyedVectors.load_word2vec_format(
        MAIN_PATH + 'GoogleNews-vectors-negative300.bin', binary=True))

### (1.1.3) Load FastText embeddings

In [4]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText

MAIN_PATH = 'D:/workspace_python/CoWoReId/python/resources/word-embeddings/'
# FastText vectors stored as a word2vec-format text file (`wiki.en.vec`),
# wrapped in a Model record together with their description.
model_fastText = Model(
    type='FastText',
    name='wiki.en.vec',
    dimension=300,
    corpus='Wikipedia',
    model=KeyedVectors.load_word2vec_format(MAIN_PATH + 'wiki.en.vec'),
)

In [5]:
# All loaded embedding models in one list: the GloVe models first, followed
# by word2vec and FastText.
models = list(glove_models) + [model_word2vec, model_fastText]

## (2) Bootstrapped Lexicon Construction

In [None]:
def ngram_embedding_similarity(model, word_l, word_r, missing_strat, ngram_repr):
    """Return the cosine similarity between the embeddings of two n-grams.

    Both n-grams are split on whitespace. A token contained in
    ``model.vocab`` is looked up as ``model[token]``; every other token is
    replaced by ``missing_strat(token, model.vector_size)``. The token
    vectors of each n-gram are stacked into one array and reduced to a
    single vector by ``ngram_repr``.

    Args:
        model: embedding lookup offering ``vocab``, ``vector_size`` and
            item access by token.
        word_l: first n-gram (whitespace-separated tokens).
        word_r: second n-gram (whitespace-separated tokens).
        missing_strat: callable ``(token, vector_size) -> vector`` used for
            tokens that are not in ``model.vocab``.
        ngram_repr: callable reducing the stacked token vectors of one
            n-gram to one vector.

    Returns:
        The cosine of the angle between both n-gram vectors, or ``0.0``
        when at least one of them has zero norm (the cosine is undefined
        there; dividing anyway would yield nan and poison every score
        computed from it).
    """
    def token_vectors(ngram):
        # One vector per token, falling back to the missing-word strategy.
        return [model[token] if token in model.vocab
                else missing_strat(token, model.vector_size)
                for token in ngram.split()]

    # Build both token lists before reducing them, so a stateful (e.g.
    # random) missing_strat is called in the same order as before.
    vecs_l = token_vectors(word_l)
    vecs_r = token_vectors(word_r)
    vec_l = ngram_repr(np.array(vecs_l))
    vec_r = ngram_repr(np.array(vecs_r))
    norm_product = np.linalg.norm(vec_l) * np.linalg.norm(vec_r)
    if norm_product == 0:
        return 0.0
    return np.dot(vec_l, vec_r) / norm_product

In [None]:
def bootstrap_lexicon(model, vocab, seeds_l, seeds_r, embedding_sim, \
                      missing_strat, ref_term, ngram_repr, epochs=10, \
                      bound_l=-1, bound_r=1, thresh_l=-0.5,thresh_r=0.5):
    """Bootstrap a lexicon from two seed sets, normalised around a reference term.

    The left seeds start with score ``bound_l`` and the right seeds with
    ``bound_r``. In every epoch each vocabulary word that is not yet a seed
    is scored as the sum over all current seeds of
    ``weight * seed_score * embedding_sim(model, word, seed, missing_strat,
    ngram_repr)``. The left weight is ``|sum_r| / (|sum_r| + |sum_l|)`` and
    the right weight ``|sum_l| / (|sum_r| + |sum_l|)``, where ``sum_l`` and
    ``sum_r`` are the summed seed scores of each side. A word whose score is
    ``<= thresh_l`` (``>= thresh_r``) joins the left (right) seed set
    immediately, so it already influences the words scored after it in the
    same epoch, and it is not rescored afterwards.

    Finally the score of ``ref_term`` is subtracted from every score.
    Negative results are divided by the largest absolute negative result and
    positive results by the largest positive result, so ``ref_term`` maps to
    0 and the extremes map to -1 and 1.

    Progress and per-word scores are printed to stdout.

    Args:
        model: embedding lookup; ``model.vocab`` is used to report how many
            vocabulary words it does not contain.
        vocab: iterable of words to score; must contain all seeds.
        seeds_l: seed words of the left (``bound_l``) side.
        seeds_r: seed words of the right (``bound_r``) side.
        embedding_sim: callable ``(model, word, seed, missing_strat,
            ngram_repr) -> similarity``.
        missing_strat: passed through to ``embedding_sim``.
        ref_term: word whose score becomes the zero point.
        ngram_repr: passed through to ``embedding_sim``.
        epochs: number of passes over ``vocab``.
        bound_l: initial score of the left seeds.
        bound_r: initial score of the right seeds.
        thresh_l: scores at or below this value make a word a left seed.
        thresh_r: scores at or above this value make a word a right seed.

    Returns:
        dict mapping every scored word (seeds included) to its normalised
        score.

    Raises:
        ValueError: if a seed is missing from ``vocab`` or ``ref_term`` did
            not receive a score.
    """
    if not all(seed in vocab for seed in seeds_l):
        raise ValueError('Not all left seeds contained in vocabulary')
    if not all(seed in vocab for seed in seeds_r):
        raise ValueError('Not all right seeds contained in vocabulary')
    num_missing = np.sum([1 for word in vocab if word not in model.vocab])
    print('Missing vocab in model : {} / {}%'.format(num_missing, \
          (num_missing/len(vocab)*100)))
    # 1. Initialize the left and right seeds
    se_l = {seed : bound_l for seed in seeds_l}
    se_r = {seed : bound_r for seed in seeds_r}
    lexicon = se_l.copy()
    lexicon.update(se_r)
    for curr_epoch in range(1,epochs+1):
        # 2. Compute left and right weights. Each side is weighted by the
        # score mass of the OTHER side, so the heavier side counts less.
        sum_l = np.abs(np.sum(list(se_l.values())))
        sum_r = np.abs(np.sum(list(se_r.values())))
        weight_l = sum_r / (sum_r + sum_l)
        weight_r = sum_l / (sum_r + sum_l)
        print(sum_l)
        print(sum_r)
        print('Epoch {} : Se_l_size = {}, Se_r_size = {}, weight_l = {}, weight_r = {},'.format(\
                      curr_epoch, len(se_l), len(se_r), weight_l, weight_r))
        for curr_word in vocab:
            if curr_word in se_l or curr_word in se_r:
                continue
            # Compute the weighted left and right scores and sum them
            score_l = [(weight_l * score * \
                        embedding_sim(model, curr_word, seed, missing_strat, ngram_repr)) \
                        for seed, score in se_l.items()]
            score_r = [(weight_r * score * \
                        embedding_sim(model, curr_word, seed, missing_strat, ngram_repr)) \
                        for seed, score in se_r.items()]
            print('Word : {} = {}'.format(curr_word, score_l))
            print(score_r)
            score = np.sum(score_l) + np.sum(score_r)
            print('final score : {}'.format(score))
            lexicon[curr_word] = score
            # Add word to the seed set if the score is low or high enough
            if score <= thresh_l: se_l[curr_word] = score
            if score >= thresh_r: se_r[curr_word] = score
    # 3. Compute final scores and normalize them
    sim_ref = lexicon.get(ref_term)
    print('SIM_REF:{}'.format(sim_ref))
    # Only a missing entry is an error; a reference score of exactly 0 is
    # perfectly valid and must not be mistaken for "not found".
    if sim_ref is None:
        raise ValueError('Reference term {} not found in lexicon'.format(ref_term))
    shifted = {word : (score - sim_ref) for word, score in lexicon.items()}
    below = [score for score in shifted.values() if score < 0]
    above = [score for score in shifted.values() if score > 0]
    # A side is empty when the reference term is the lowest/highest entry;
    # np.max would raise on an empty list, so skip that side instead.
    max_l = np.max(np.abs(below)) if below else None
    max_r = np.max(np.abs(above)) if above else None
    for word, score in shifted.items():
        if score < 0:
            lexicon[word] = score / max_l
        elif score > 0:
            lexicon[word] = score / max_r
        else:
            # Tied with the reference term (or not comparable, e.g. nan):
            # store the shifted value so ties land on 0 as well.
            lexicon[word] = score
    return lexicon


def bootstrap_lexicon_simple_norm(model, vocab, seeds_l, seeds_r, embedding_sim, \
                      missing_strat, ngram_repr, epochs=10, \
                      bound_l=-1, bound_r=1, thresh_l=-0.5,thresh_r=0.5):
    """Bootstrap a lexicon from two seed sets and scale each side to unit range.

    Left seeds start at ``bound_l`` and right seeds at ``bound_r``. During
    each epoch every vocabulary word that is not a seed yet receives the sum
    of ``weight * seed_score * embedding_sim(...)`` over all current seeds,
    where the left weight is the right side's absolute score sum divided by
    both sides' sums (and vice versa). Words scoring ``<= thresh_l`` or
    ``>= thresh_r`` are added to the matching seed set right away.

    At the end negative scores are divided by the largest absolute negative
    score and positive scores by the largest positive score. Progress and
    per-word scores are printed to stdout.

    Returns the dict mapping each scored word (seeds included) to its final
    score; raises ValueError if a seed is not part of ``vocab``.
    """
    if any(seed not in vocab for seed in seeds_l):
        raise ValueError('Not all left seeds contained in vocabulary')
    if any(seed not in vocab for seed in seeds_r):
        raise ValueError('Not all right seeds contained in vocabulary')

    unknown = [1 for entry in vocab if entry not in model.vocab]
    num_missing = np.sum(unknown)
    missing_pct = num_missing/len(vocab)*100
    print('Missing vocab in model : {} / {}%'.format(num_missing, missing_pct))

    # Step 1: seed both poles and start the lexicon from them.
    left_seeds = dict.fromkeys(seeds_l, bound_l)
    right_seeds = dict.fromkeys(seeds_r, bound_r)
    lexicon = dict(left_seeds)
    lexicon.update(right_seeds)

    epoch = 1
    while epoch <= epochs:
        # Step 2: each pole is weighted by the score mass of the other one.
        mass_l = np.abs(np.sum(list(left_seeds.values())))
        mass_r = np.abs(np.sum(list(right_seeds.values())))
        weight_l = mass_r / (mass_r + mass_l)
        weight_r = mass_l / (mass_r + mass_l)
        print('Epoch {} : Se_l_size = {}, Se_r_size = {}, weight_l = {}, weight_r = {},'.format(
            epoch, len(left_seeds), len(right_seeds), weight_l, weight_r))

        for candidate in vocab:
            already_seed = candidate in left_seeds or candidate in right_seeds
            if not already_seed:
                # Weighted contribution of every left seed, then every right seed.
                contrib_l = []
                for seed, seed_score in left_seeds.items():
                    similarity = embedding_sim(model, candidate, seed,
                                               missing_strat, ngram_repr)
                    contrib_l.append(weight_l * seed_score * similarity)
                contrib_r = []
                for seed, seed_score in right_seeds.items():
                    similarity = embedding_sim(model, candidate, seed,
                                               missing_strat, ngram_repr)
                    contrib_r.append(weight_r * seed_score * similarity)
                print('Word : {} = {}'.format(candidate, contrib_l))
                print(contrib_r)
                total = np.sum(contrib_l) + np.sum(contrib_r)
                print('final score : {}'.format(total))
                lexicon[candidate] = total
                # Sufficiently extreme words become seeds themselves.
                if total <= thresh_l:
                    left_seeds[candidate] = total
                if total >= thresh_r:
                    right_seeds[candidate] = total
        epoch += 1

    # Step 3: rescale the negative and the positive side independently.
    negatives = {word: value for word, value in lexicon.items() if value < 0}
    positives = {word: value for word, value in lexicon.items() if value > 0}
    scale_l = np.max(np.abs(list(negatives.values())))
    scale_r = np.max(np.abs(list(positives.values())))
    lexicon.update((word, value / scale_l) for word, value in negatives.items())
    lexicon.update((word, value / scale_r) for word, value in positives.items())
    return lexicon

## (3) Toy-Example Lexicon

In [None]:
from bootstrapped_lexicon import bootstrap_lexicon_simple_norm
from bootstrapped_lexicon import missing_strat_random
from bootstrapped_lexicon import ngram_embedding_similarity
from bootstrapped_lexicon import ngram_repr_bag_of_words
import gensim

seeds_complex = ['aboriginal']
seeds_non_complex = ['bad']

# Build the vocabulary: both seed lists plus a few extra words to score
vocabulary = []
vocabulary.extend(seeds_complex)
vocabulary.extend(seeds_non_complex)
vocabulary.extend(['good', 'Inuit', 'and'])

print('---------Complex Seeds----------------')
print(seeds_complex)
print('---------Non-Complex Seeds----------------')
print(seeds_non_complex)
print('---------Vocabulary----------------')
print(vocabulary)

# The non-complex seeds are passed as the left seeds and the complex seeds
# as the right seeds.
# The embeddings are taken from `model_word2vec` (loaded in section 1.1.2):
# at this point of the notebook the bare name `model` is not bound to any
# embedding vectors - it is only assigned in section (4).
lexicon = bootstrap_lexicon_simple_norm(model_word2vec.model, vocabulary, \
                  seeds_non_complex, seeds_complex, \
                  ngram_embedding_similarity, missing_strat_random, \
                  ngram_repr_bag_of_words, epochs=3, thresh_l=-0.3, thresh_r=0.4)
print(lexicon)

## (4) Complexity Lexicon

In [None]:
import pandas as pd
import gensim
from bootstrapped_lexicon import bootstrap_lexicon
from bootstrapped_lexicon import bootstrap_lexicon_simple_norm
from bootstrapped_lexicon import missing_strat_random
from bootstrapped_lexicon import ngram_embedding_similarity
from bootstrapped_lexicon import ngram_repr_bag_of_words

model = gensim.models.KeyedVectors.load_word2vec_format('resources/' + \
            'word-embeddings/GoogleNews-vectors-negative300.bin', binary=True)

TRAIN_ENGLISH_WIKIPEDIA = "../cwishareddataset/traindevset/" + \
                           "english/Wikipedia_Train.tsv"
TEST_ENGLISH_WIKIPEDIA = "../cwishareddataset/testset/" + \
                           "english/Wikipedia_Test.tsv"
# Column names shared by the train and the test frame.
cwi_columns = ['id', 'sentence', "start", "end", "target",
               "nat", "non_nat", "nat_marked", "non_nat_marked", "binary", "prob"]
# header=None: the column names are assigned below, so the first line of
# each file has to be read as data. Without it pandas uses that line as the
# header and the first sample is silently lost once the names are replaced
# (load_df in section 1.0 already reads the files this way).
df = pd.read_csv(TRAIN_ENGLISH_WIKIPEDIA, sep = "\t", header=None)
df_test = pd.read_csv(TEST_ENGLISH_WIKIPEDIA, sep = '\t', header=None)
df.columns = cwi_columns
df_test.columns = cwi_columns

# Number of whitespace-separated tokens in each target phrase
df['num_tokens'] = df.target.apply(lambda target : len(target.split()))

# Get the basic seeds for complex and non_complex words

#seeds_complex = [word.lower().strip() for word in df.loc[df['binary'] == 1,'target'].tolist()]
#seen = set()
#seen_add = seen.add
#seeds_complex = [x for x in seeds_complex if not (x in seen or seen_add(x))]


#seeds_non_complex = [word.lower().strip() for word in df.loc[df['binary'] == 0,'target'].tolist()]
#seen = set()
#seen_add = seen.add
#seeds_non_complex = [x for x in seeds_non_complex if not (x in seen or seen_add(x))]

seeds_complex = ['aboriginal']
seeds_non_complex = ['bad']

# Build the vocabulary
vocabulary = []
vocabulary.extend(seeds_complex)
vocabulary.extend(seeds_non_complex)
vocabulary.extend(['good', 'Inuit'])
#vocabulary.extend([ngram.strip().lower() for target in df_test['target'].tolist() \
 #                   for ngram in target.split()])
#ref = 'and'
#vocabulary.append(ref)
print('---------Complex Seeds----------------')
print(len(seeds_complex))
print('---------Non-Complex Seeds----------------')
print(len(seeds_non_complex))
print('---------Vocabulary----------------')
print(len(vocabulary))

# The non-complex seeds are passed as the left seeds and the complex seeds
# as the right seeds.
lexicon = bootstrap_lexicon_simple_norm(model, vocabulary, seeds_non_complex, seeds_complex, \
                  ngram_embedding_similarity, missing_strat_random, \
                  ngram_repr_bag_of_words, epochs=3, thresh_l=-0.3, thresh_r=0.4)