In [1]:
import nltk
import pickle
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import import_ipynb
import wiktionary_parser
import preprocess_utils
import wikipedia
import pandas as pd
import numpy as np
from statistics import mean

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Phrases
import gensim.downloader
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-300')

import bert_semanitic_similarity_experiments

importing Jupyter notebook from wiktionary_parser.ipynb
importing Jupyter notebook from preprocess_utils.ipynb


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/jackragless/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


importing Jupyter notebook from bert_semanitic_similarity_experiments.ipynb


In [2]:
# accuracy scores:
# title to domain name (word2vec) = 75% (model1)
# text to def sentence (doc2vec) = 6.25% (model2)
# text to definition examples (bert semantic) = 78% (model3), 50%, 56%

In [3]:
def model_gen(curtext, vector_size=20, window=2, min_count=1, workers=4, epochs=100):
    """Train a Doc2Vec model in which every sentence of ``curtext`` is one document.

    The text is run through ``preprocess_utils.clean_text`` (all five boolean
    options set to True), split into sentences with ``nltk.sent_tokenize`` and
    word-tokenized. Sentence number ``i`` becomes a ``TaggedDocument`` tagged
    ``[i]``.

    Args:
        curtext: Text to train on.
        vector_size, window, min_count, workers, epochs: Forwarded unchanged to
            ``Doc2Vec``. The defaults are the values that used to be hard-coded
            here, so existing ``model_gen(text)`` calls behave as before.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    curtext = preprocess_utils.clean_text(curtext, True, True, True, True, True)

    sents = []
    for sent in nltk.sent_tokenize(curtext):
        # Drop the sentence terminator, but only when the last character really
        # is punctuation: the final sentence of a text may have none, and an
        # unconditional sent[:-1] would cut the last letter off its last word.
        if sent and not sent[-1].isalnum():
            sent = sent[:-1]
        sents.append(nltk.word_tokenize(sent))

    tagged_data = [TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(sents)]

    model = Doc2Vec(
        tagged_data,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        epochs=epochs,
    )

    return model

In [11]:
def model1(title, domain, pos):
    """Score every Wiktionary sense of a word by how close its label is to ``title``.

    For each definition returned by ``wiktionary_parser.define(domain)[pos]``
    the text before the first ``)`` is taken as the sense label (the sample
    output in this notebook shows labels such as ``(computing)``). The label is
    lower-cased, tokenized, and stripped of English stop words and commas.
    Every remaining label token is compared with every token of the
    lower-cased title via ``glove_vectors.similarity``. A sense's score is the
    mean over label tokens of the mean over title tokens.

    Args:
        title: Text naming the topic to match against, e.g. ``'computers'``.
        domain: Despite its name, the word that is looked up in Wiktionary.
        pos: Key selecting the part of speech in the parser result, e.g. ``'noun'``.

    Returns:
        ``(scores, definition_sents)``: one score per definition, in the same
        order as the list of raw definition strings. A sense with no usable
        label tokens (or an empty title) scores ``0.0``.
    """
    definition_sents = [obj['def'] for obj in wiktionary_parser.define(domain)[pos]]

    domains = []
    for sent in definition_sents:
        label_end = sent.find(')')
        # str.find returns -1 when there is no ')'. Slicing with -1 would treat
        # the whole definition (minus its last character) as the label, so an
        # unlabeled definition gets an empty label instead.
        label = sent[:label_end] if label_end != -1 else ''
        label_tokens = nltk.word_tokenize(label.replace('(', '').lower())
        domains.append([word for word in label_tokens if word not in stop_words and word != ','])

    print(domains)

    title_tokens = nltk.word_tokenize(title.lower())

    FINAL = []
    for label_tokens in domains:
        if not label_tokens or not title_tokens:
            # np.mean([]) is NaN (with a RuntimeWarning); a NaN score breaks
            # max/argmax-based selection by callers. 0 matches the value
            # already used below for "no evidence".
            FINAL.append(0.0)
            continue

        token_scores = []
        for label_token in label_tokens:
            similarities = []
            for title_token in title_tokens:
                try:
                    similarity = glove_vectors.similarity(label_token, title_token)
                except KeyError:
                    # Token is not in the GloVe vocabulary: count it as unrelated.
                    # Anything else (including Ctrl-C) is allowed to propagate.
                    similarity = 0
                similarities.append(similarity)
            token_scores.append(np.mean(similarities))
        FINAL.append(np.mean(token_scores))
    return FINAL, definition_sents

In [12]:
# Demo: score the noun senses of 'mouse' against the title 'computers'.
model1('computers', 'mouse', 'noun')

[['general'], ['informal'], ['general'], ['computing'], ['boxing'], ['nautical'], ['obsolete'], ['general'], ['set', 'theory'], ['historical']]


([0.10236509,
  0.1184538,
  0.10236509,
  0.59832084,
  0.0078193415,
  0.022874743,
  0.3741742,
  0.10236509,
  0.22859639,
  0.07124958],
 ['(general) Any small rodent of the genus Mus.',
  '(informal) A member of the many small rodent and marsupial species resembling such a rodent.',
  '(general) A quiet or shy person.',
  '(computing) (plural mice or, rarely, mouses) An input device that is moved over a pad or other flat surface to produce a corresponding movement of a pointer on a graphical display.',
  '(boxing) A facial hematoma or black eye.',
  '(nautical) A turn or lashing of spun yarn or small stuff, or a metallic clasp or fastening, uniting the point and shank of a hook to prevent its unhooking or straightening out.',
  '(obsolete) A familiar term of endearment.Let the bloat King tempt you again to bed, / Pinch wanton on your cheek, call you his mouse',
  '(general) A match used in firing guns or blasting.',
  '(set theory) A small model of (a fragment of) Zermelo-Fraenke

In [5]:
def model2(model, kw, pos):
    """Score each Wiktionary definition of ``kw`` against a trained Doc2Vec model.

    Every definition under ``wiktionary_parser.define(kw)[pos]`` is cleaned,
    has its last character dropped (presumably the trailing full stop),
    is tokenized and turned into an inferred vector. Its score is the mean
    similarity between that vector and the model's document vectors (up to the
    10000 nearest).

    Args:
        model: Doc2Vec model, e.g. one returned by ``model_gen``.
        kw: Word to look up.
        pos: Part-of-speech key into the parser result.

    Returns:
        ``(scores, definitions)``: the per-definition scores and the raw
        definition strings, in matching order.
    """
    term_to_define = [entry['def'] for entry in wiktionary_parser.define(kw)[pos]]

    def _mean_similarity(definition):
        # Clean -> strip final character -> tokenize -> infer -> average similarity.
        cleaned = preprocess_utils.clean_text(definition, False, True, True, True, True)
        tokens = list(nltk.word_tokenize(cleaned[:-1]))
        inferred = model.infer_vector(tokens)
        neighbours = model.docvecs.most_similar(positive=[inferred], topn=10000)
        return mean([pair[1] for pair in neighbours])

    final = [_mean_similarity(definition) for definition in term_to_define]

    return final, term_to_define

In [6]:
def model3(model, kw, pos):
    """Score the usage example of each Wiktionary sense of ``kw`` against a Doc2Vec model.

    For every entry of ``wiktionary_parser.define(kw)[pos]`` the ``'ex'`` text
    is cleaned, has its full stops replaced by spaces, is tokenized and turned
    into an inferred vector. The score is the mean similarity between that
    vector and the model's document vectors (up to the 10000 nearest). Each
    ``[example, score]`` pair is printed as it is computed.

    Args:
        model: Doc2Vec model, e.g. one returned by ``model_gen``.
        kw: Word to look up.
        pos: Part-of-speech key into the parser result.

    Returns:
        List of scores, one per sense, in parser order.
    """
    wikt_object = wiktionary_parser.define(kw)[pos]
    examples = [sense['ex'] for sense in wikt_object]

    final = []

    for ex in examples:
        ex = preprocess_utils.clean_text(ex, True, True, True, True, True).replace('.', ' ').strip()
        ex_sent_candidate = nltk.word_tokenize(ex)

        # Infer and score exactly once. Running the pipeline separately for the
        # print and for the stored value doubles the cost, and because vector
        # inference is not guaranteed to be repeatable the printed number could
        # differ from the one actually returned.
        inferred = model.infer_vector(ex_sent_candidate)
        neighbours = model.docvecs.most_similar(positive=[inferred], topn=10000)
        example_score = mean([pair[1] for pair in neighbours])

        print([' '.join(ex_sent_candidate), example_score])
        final.append(example_score)

    return final

In [None]:
# Demo: train a sentence-level Doc2Vec model on the Wikipedia 'finance' article
# and score the usage examples of the noun 'index' against it.
# NOTE(review): model_gen() applies clean_text with the same flags again, so this
# explicit cleaning pass looks redundant — confirm clean_text is idempotent
# before removing either call.
curtext = preprocess_utils.clean_text(wikipedia.page('finance').content,True,True,True,True,True)
model = model_gen(curtext)
model3(model, 'index', 'noun')

In [None]:
# temp_text = preprocess_utils.clean_text(wikipedia.page('Computers').content.lower())
# model = model_gen(temp_text)
# model_text_domain_plus_sent_predict(model, 'mouse', 'noun')

In [7]:
# Load the evaluation set used by the scoring cells below. Those cells treat it
# as a mapping subject -> entries, where entry[0] and entry[1] are passed on as
# word and part of speech and entry[2] is compared with the predicted definition.
# NOTE(review): the path is absolute and machine-specific, so the notebook will
# not run elsewhere without editing it.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# a pickle whose origin is trusted.
with open('/home/jackragless/projects/data/DAIC_GLOGEN/doc2vec_eval_data.pkl', 'rb') as f:
    eval_data = pickle.load(f)

In [None]:
# score = 0
# count = 0
# for subject in eval_data.keys():
#     temp_text = preprocess_utils.clean_text(wikipedia.page(subject).content)
#     model = model_gen(temp_text)
#     for i in eval_data[subject]:
#         count += 1
#         temp_prob_arr, def_sents = model_text_domain_plus_sent_predict(model, i[0], i[1])
#         temp_index = np.where(temp_prob_arr == np.max(temp_prob_arr))[0][0]
#         guess = def_sents[temp_index]
# #         print(i[0], '<--->', i[2], '<--->', guess)
#         if guess.strip() == i[2].strip():
#             score+=1
# print(100*(score/count))

In [None]:
# Evaluate model1 (title vs. sense label): for every (word, pos, gold) entry the
# highest-scoring definition is the guess; it counts as correct when it equals
# the first sentence of the gold definition. Prints the accuracy in percent.
score = 0
count = 0
for subject, entries in eval_data.items():
    for i in entries:
        count += 1
        temp_prob_arr, def_sents = model1(subject, i[0], i[1])
        print(temp_prob_arr,def_sents)
        # Pick the first best-scoring sense. NaN scores (e.g. from averaging an
        # empty list) are pushed to -inf: with the former
        # np.where(x == np.max(x))[0][0] a single NaN made np.max return NaN,
        # nothing compared equal to it, and the lookup raised IndexError.
        scores = np.asarray(temp_prob_arr, dtype=float)
        temp_index = int(np.argmax(np.where(np.isnan(scores), -np.inf, scores)))
        guess = def_sents[temp_index]
        print(i[0], '<--->', i[2], '<--->', guess)
        if guess.strip() == nltk.sent_tokenize(i[2].strip())[0]:
            print(1,'\n')
            score+=1
        else:
            print(0,'\n')
# An empty evaluation set reports 0.0 instead of raising ZeroDivisionError.
print(100*(score/count) if count else 0.0)

In [None]:
# score = 0
# count = 0
# for subject in eval_data.keys():
#     temp_text = preprocess_utils.clean_text(wikipedia.page(subject).content,True,True,True,True,True)
#     model = model_gen(temp_text)
#     for i in eval_data[subject]:
#         count += 1
#         temp_prob_arr, def_sents = model1(subject, i[0], i[1])
#         examples_check = model3(model, i[0], i[1])
#         if np.max(examples_check)>=0.98:
#             temp_index = np.where(examples_check == np.max(examples_check))[0][0]
#             guess = def_sents[temp_index]
#         else:       
#             temp_index = np.where(temp_prob_arr == np.max(temp_prob_arr))[0][0]
#             guess = def_sents[temp_index]
#         print(i[0], '<--->', i[2], '<--->', guess)
#         if guess.strip() == nltk.sent_tokenize(i[2].strip())[0]:
#             print(1,'\n')
#             score+=1
#         else:
#             print(0,'\n')
# print(100*(score/count))

In [None]:
# Evaluate semantic_predict_4: each subject's cleaned Wikipedia article is
# compared with the cleaned Wiktionary definitions of every evaluation word.
# The best-scoring definition (in its original, uncleaned form) is the guess,
# counted as correct when it equals the first sentence of the gold definition.
score = 0
count = 0
for subject, entries in eval_data.items():
    # The article is fetched and cleaned once per subject and reused for all its entries.
    curtext = preprocess_utils.clean_text(wikipedia.page(subject).content,True,True,True,True,True)
    for i in entries:
        orig_definition_sents = [sense['def'] for sense in wiktionary_parser.define(i[0])[i[1]]]
        definition_sents = [
            preprocess_utils.clean_text(raw_def,False,True,True,True,True)
            for raw_def in orig_definition_sents
        ]
        temp_prob_arr = bert_semanitic_similarity_experiments.semantic_predict_4(curtext, definition_sents)
        # Index of the first entry equal to the maximum score.
        temp_index = np.where(temp_prob_arr == np.max(temp_prob_arr))[0][0]
        guess = orig_definition_sents[temp_index]
        print(i[0], '<--->', i[2], '<--->', guess)
        # 1 for a correct guess, 0 otherwise; printed and added to the running score.
        hit = int(guess.strip() == nltk.sent_tokenize(i[2].strip())[0])
        print(hit,'\n')
        score += hit
        count += 1
print(100*(score/count))

In [8]:
# Timed evaluation of semantic_predict_1: same protocol as the other evaluation
# cells (best-scoring definition vs. first sentence of the gold definition),
# followed by the accuracy in percent and the wall-clock time of the whole run.
import time
start_time = time.time()
score = 0
count = 0
for subject, entries in eval_data.items():
    # The article is fetched and cleaned once per subject and reused for all its entries.
    curtext = preprocess_utils.clean_text(wikipedia.page(subject).content,True,True,True,True,True)
    for i in entries:
        orig_definition_sents = [sense['def'] for sense in wiktionary_parser.define(i[0])[i[1]]]
        definition_sents = [
            preprocess_utils.clean_text(raw_def,False,True,True,True,True)
            for raw_def in orig_definition_sents
        ]
        temp_prob_arr = bert_semanitic_similarity_experiments.semantic_predict_1(curtext, definition_sents)
        # Index of the first entry equal to the maximum score.
        temp_index = np.where(temp_prob_arr == np.max(temp_prob_arr))[0][0]
        guess = orig_definition_sents[temp_index]
        print(i[0], '<--->', i[2], '<--->', guess)
        # 1 for a correct guess, 0 otherwise; printed and added to the running score.
        hit = int(guess.strip() == nltk.sent_tokenize(i[2].strip())[0])
        print(hit,'\n')
        score += hit
        count += 1
print(100*(score/count))
elapsed = time.time() - start_time
print('time_taken = {}'.format(elapsed))

object <---> (object-oriented programming) An instantiation of a class or structure. <---> (object-oriented programming) An instantiation of a class or structure.
1 

class <---> (object-oriented programming, countable) A set of objects having the same behavior (but typically differing in state), or a template defining such a set. <---> (object-oriented programming, countable) A set of objects having the same behavior (but typically differing in state), or a template defining such a set.
1 

paradigm <---> An example serving as the model for such a pattern.  <---> (linguistics) A set of all forms which contain a common element, especially the set of all inflectional forms of a word or a particular grammatical category.
0 

loop <---> (programming) A programmed sequence of instructions that is repeated until or while a particular condition is satisfied. <---> (programming) A programmed sequence of instructions that is repeated until or while a particular condition is satisfied.
1 

shor

In [None]:
# wikt_def_gen.extract('forte')

In [None]:
# curtext = preprocess_utils.clean_text(wikipedia.page('Australia').content)
# curtext

In [None]:
#compare doc to definition domain
# for candidate in term_to_define:
#     final_candidate = []
#     final_candidate = candidate[candidate.find('(')+1:candidate.find(')')].replace(',','').lower().split()
#     print(' '.join(final_candidate), '<--->', mean([obj[1] for obj in model.docvecs.most_similar(positive=[model.infer_vector(final_candidate)], topn = len(tagged_data))]))

In [None]:
#compare doc to definition
# for candidate in term_to_define:
#     final_candidate = []
#     final_candidate = candidate[candidate.find(')')+1:].replace(',','').lower().split()
#     print(' '.join(final_candidate), '<--->', mean([obj[1] for obj in model.docvecs.most_similar(positive=[model.infer_vector(final_candidate)], topn = len(tagged_data))]))

In [None]:
# def model_text_domain_predict(model, kw, pos):
#     term_to_define = wikt_def_gen.extract(kw)[pos]
    
#     final = []
    
#     for candidate in term_to_define:
#         def_sent_candidate = []
#         candidate = candidate[:-1].lower()
#         candidate = candidate[candidate.find('(')+1:candidate.find(')')].replace(',','').lower().split()
        
#         final.append([candidate, mean([obj[1] for obj in model.docvecs.most_similar(positive=[model.infer_vector(candidate)], topn = 10000)])])
        
#     return final