In [1]:
from dataset import DatasetPreProcessor
import pickle
import numpy as np
from torch.utils import data
import torch

[nltk_data] Downloading package semcor to /Users/mefkov/nltk_data...
[nltk_data]   Package semcor is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mefkov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mefkov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Project-local corpus wrapper: later cells read documents, the word-to-index
# map and the embedding through this one instance.
# NOTE(review): first_time=False presumably reuses previously prepared
# artifacts instead of rebuilding them - confirm against dataset.py.
dpp = DatasetPreProcessor(corpus='semcor', first_time=False)

In [20]:
# Central configuration dictionary read by the cells below.
Params = {
    'split': 0.8,                                 # forwarded to dpp.get_train_validation_split
    'windowSize': 5,                              # passed as context_num to dpp.get_wsd_context
    'batchSize': 64,                              # DataLoader batch size
    'wordEmbedding': dpp.embedding,               # callable applied to a LongTensor of token ids
    'wordEmbeddingSize': 300,                     # width of the zero vectors in the sense vocabulary
    'sentenceEncoder': 'simple test',             # 'simple test' selects plain averaging over dim 1
    'filenameX': 'old/XBatches.pkl',              # pickled token-id contexts
    'filenameSenses': 'old/SBatches.pkl',         # pickled sense labels
    'filenameLength': 'old/lengthBatches.pkl',    # pickled non-zero token counts
}

In [6]:
# Split the corpus using the configured fraction.
# NOTE(review): prepareData later reads dpp.doc_ids['train'], which is
# presumably populated by this call - confirm against dataset.py.
dpp.get_train_validation_split(split=Params['split'])

In [16]:
def prepareData():
    """Collect the annotated words of the training documents and their inner keys.

    Reads the module-level ``dpp``: walks ``dpp.doc_ids['train']`` and, for each
    document, the mapping ``dpp.document[doc_id]['wsd']`` (word -> mapping).
    The keys of the inner mapping are gathered per word; createXSLpickle later
    treats them as senses.

    Returns
    -------
    tuple
        ``(WordsSent, WordsToDisambiguate)`` where ``WordsSent`` maps every
        word to the list of its distinct inner keys across all training
        documents, and ``WordsToDisambiguate`` is the list of distinct words.
        Both lists are produced from sets, so their order is arbitrary.
    """
    distinct_words = set()
    keys_by_word = dict()

    for doc_id in dpp.doc_ids['train']:
        annotations = dpp.document[doc_id]['wsd']
        for word in annotations:
            distinct_words.add(word)
            # A word seen in several documents keeps accumulating into one set.
            keys_by_word.setdefault(word, set()).update(annotations[word])

    WordsSent = {word: list(found) for word, found in keys_by_word.items()}
    WordsToDisambiguate = list(distinct_words)

    return WordsSent, WordsToDisambiguate

In [17]:
# Materialise the word -> senses map and the word list once; both are passed
# to createXSLpickle further down.
WordsSent, WordsToDisambiguate = prepareData()

In [63]:
def createXSLpickle(WordsSent, WordsToDisambiguate, Params = None, load_from_file = True, sensesList = None):
    """Create the zero-initialised sense vocabulary and optionally rebuild the cached batches.

    Parameters
    ----------
    WordsSent : dict
        Maps each word to the list of its senses (see prepareData). Only read
        when ``load_from_file`` is False.
    WordsToDisambiguate : list
        Words for which contexts are extracted. Only read when
        ``load_from_file`` is False.
    Params : dict, optional
        Configuration. 'windowSize' and 'wordEmbeddingSize' are always read;
        'filenameX', 'filenameSenses' and 'filenameLength' are read when
        ``load_from_file`` is False. Defaults to the notebook-level ``Params``,
        resolved when the function is called (not when it is defined), so a
        re-run of the configuration cell is picked up.
    load_from_file : bool
        True  - the batches are assumed to be on disk already; only the sense
                vocabulary is built (from ``sensesList``) and the three batch
                lists are returned empty. The pickles are NOT read back here.
        False - contexts are extracted through the module-level ``dpp``, the
                three lists are pickled to the configured paths, and the sense
                vocabulary is built from the values of ``WordsSent``.
    sensesList : iterable, optional
        Sense labels used to key the vocabulary when ``load_from_file`` is
        True. Defaults to ``training.senses``, resolved at call time.

    Returns
    -------
    tuple
        ``(SenseVocab, XBatches, SBatches, lengthBatches)``. ``SenseVocab``
        maps each sense to ``(zeros(1, wordEmbeddingSize), 0)``, i.e. a running
        mean and the number of samples folded into it so far.
    """
    if Params is None:
        # The parameter shadows the notebook-level dict, hence the explicit lookup.
        Params = globals()['Params']

    windowT = Params['windowSize']
    embed_len = Params['wordEmbeddingSize']
    data_type = 'train'

    XBatches = []
    SBatches = []
    lengthBatches = []

    if load_from_file:
        if sensesList is None:
            # Resolved lazily so the function can be defined before `training` exists.
            sensesList = training.senses
        keys = list(set(sensesList))
    else:
        for WordToDisambiguate in WordsToDisambiguate:
            for sense in WordsSent[WordToDisambiguate]:
                contexts = dpp.get_wsd_context(dataset_type = data_type, word = WordToDisambiguate, word_sense = sense, context_num = windowT, is_word = True, is_sentence = False, is_document = False)
                for context in contexts['masked'][0]:
                    token_ids = [dpp.w2i[word] for word in context]
                    XBatches.append(token_ids)
                    # Counts ids different from 0; presumably 0 is the padding id - TODO confirm.
                    lengthBatches.append(np.count_nonzero(token_ids))
                    SBatches.append(sense)

        keys = list(set(sense for senses in WordsSent.values() for sense in senses))

        with open(Params['filenameX'], 'wb') as f:
            pickle.dump(XBatches, f)

        with open(Params['filenameSenses'], 'wb') as f:
            pickle.dump(SBatches, f)

        with open(Params['filenameLength'], 'wb') as f:
            pickle.dump(lengthBatches, f)

    # Every key starts with the same (zero vector, 0) tuple object; consumers
    # must replace entries rather than mutate the shared tensor in place.
    SenseVocab = dict.fromkeys(keys, (torch.zeros(1, embed_len), 0))

    return SenseVocab, XBatches, SBatches, lengthBatches

In [13]:

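# NOTE: disabled legacy snippet, kept for reference only. It predates the current
# createXSLpickle signature (which takes no `windowT` / `data_type` keywords and
# returns four values), and it dumps `SBatches` although the call binds `SBathes`.
# Do not uncomment it as-is.
#XBatches, SBathes, lengthBatches = createXSLpickle(WordsSent, WordsToDisambiguate, windowT = Params['windowSize'], data_type = 'train')

#with open(Params['filenameX'], 'wb') as f:
#       pickle.dump(XBatches, f)
       
#with open(Params['filenameSenses'], 'wb') as f:
#       pickle.dump(SBatches, f)
       
#with open(Params['filenameLength'], 'wb') as f:
#       pickle.dump(lengthBatches, f)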

In [14]:
class Dataset(data.Dataset):
    """Map-style dataset over three parallel pickled sequences.

    ``filenameX`` holds the contexts (converted to a numpy array on load),
    ``filenameS`` the sense labels and ``filenameL`` the lengths. Sample ``i``
    is the triple ``(X[i], senses[i], length[i])``.

    The files are read with ``pickle.load``; only point this at pickles you
    produced yourself, since unpickling untrusted data can execute code.
    """

    def __init__(self, filenameX, filenameS, filenameL):
        """Load the three pickles (lengths, then senses, then contexts)."""
        self.length = self._unpickle(filenameL)
        self.senses = self._unpickle(filenameS)
        self.X = np.array(self._unpickle(filenameX))

    @staticmethod
    def _unpickle(path):
        """Return the object stored in the pickle file at ``path``."""
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def __len__(self):
        """Number of samples, taken from the sense-label sequence."""
        return len(self.senses)

    def __getitem__(self, index):
        """Return ``(context, sense, length)`` for position ``index``."""
        sample_sense = self.senses[index]
        sample_context = self.X[index]
        return (sample_context, sample_sense, self.length[index])

In [25]:
def get_context_vectors(data, senses, length, word_embedding = None, sentence_encoder = None):
    """Embed a batch of token-id contexts and reduce every context to one vector.

    Parameters
    ----------
    data : array-like or tensor of integer token ids
        Converted to an int64 tensor, so lists, numpy arrays and integer
        tensors of any width are accepted. (The name shadows the
        ``torch.utils.data`` module inside this function only.)
    senses, length :
        Passed through unchanged; ``length`` is also forwarded to a custom
        sentence encoder.
    word_embedding : callable, optional
        Applied to the id tensor. Defaults to ``Params['wordEmbedding']``,
        resolved at call time so a re-run of the configuration cell takes
        effect.
    sentence_encoder : str or callable, optional
        ``'simple test'`` averages the embeddings over dimension 1; anything
        else is called as ``sentence_encoder(embeddings, length)``. Defaults to
        ``Params['sentenceEncoder']``, resolved at call time.

    Returns
    -------
    tuple
        ``(context_vectors, senses, length)``. With the ``'simple test'``
        encoder, and assuming ``word_embedding`` maps ids of shape
        (batch, window) to (batch, window, dim), ``context_vectors`` has shape
        (batch, dim): dimension 1 is averaged away.
    """
    if word_embedding is None:
        word_embedding = Params['wordEmbedding']
    if sentence_encoder is None:
        sentence_encoder = Params['sentenceEncoder']

    # as_tensor casts integer inputs of other widths to int64 instead of
    # requiring the input to be int64 already.
    data_torch = torch.as_tensor(data, dtype = torch.long)
    data_embedding = word_embedding(data_torch)
    if sentence_encoder == 'simple test':
        context_vectors = data_embedding.mean(dim = 1)
    else:
        context_vectors = sentence_encoder(data_embedding, length)

    return (context_vectors, senses, length)

In [26]:
def get_sense(context_vectors, senses, SenseVocab):
    """Fold one batch of context vectors into the per-sense running means.

    ``SenseVocab`` maps a sense to ``(mean_vector, sample_count)`` and is
    updated in place: for every distinct sense in ``senses`` the rows of
    ``context_vectors`` at the matching positions are summed and combined with
    the stored mean, weighted by the stored count. Each sense in the batch
    must already be a key of ``SenseVocab``.

    Always returns True.
    """
    for label in np.unique(senses):
        rows = torch.LongTensor([pos for pos, candidate in enumerate(senses) if candidate == label])
        batch_count = len(rows)
        batch_sum = context_vectors[rows, :].sum(dim = 0)

        prev_mean, prev_count = SenseVocab[label]
        total_count = batch_count + prev_count
        # Weighted average: undo the old mean's division, add the new rows, re-divide.
        SenseVocab[label] = ((prev_mean * prev_count + batch_sum) / total_count, total_count)

    return True

In [64]:
# Wrap the cached pickles in a dataset and iterate over it in fixed-size batches.
training = Dataset(
    Params['filenameX'],
    Params['filenameSenses'],
    Params['filenameLength'],
)
training_generator = data.DataLoader(
    training,
    batch_size=Params['batchSize'],
    num_workers=4,
)

In [65]:
# With load_from_file=True only the zero-initialised sense table is built;
# the three batch lists come back empty and are discarded.
SenseVocab, _, _, _ = createXSLpickle(
    WordsSent,
    WordsToDisambiguate,
    Params=Params,
    load_from_file=True,
    sensesList=training.senses,
)

In [66]:
def createSenseVocabulary(Vocab = None, generator = None):
    """Accumulate per-sense average context vectors over every batch of ``generator``.

    Parameters
    ----------
    Vocab : dict, optional
        Sense table mapping sense -> (mean_vector, count); updated in place by
        get_sense. Defaults to the notebook-level ``SenseVocab``, resolved at
        call time.
    generator : iterable, optional
        Yields ``(batchesX, batchesSense, batchesLength)`` triples. Defaults to
        the notebook-level ``training_generator``, resolved at call time.

    Returns
    -------
    dict
        The same ``Vocab`` object that was passed in (or the resolved
        default), after all batches have been folded into it.
    """
    if Vocab is None:
        Vocab = SenseVocab
    if generator is None:
        generator = training_generator

    for batchesX, batchesSense, batchesLength in generator:
        context_vectors, senses, _ = get_context_vectors(batchesX, batchesSense, batchesLength)
        get_sense(context_vectors, senses, Vocab)

    # Return the table that was actually updated, not whatever the global
    # SenseVocab happens to be bound to.
    return Vocab

In [67]:
# Fold every training batch into the per-sense running averages.
# get_sense adds each batch's counts onto the counts already stored, so
# re-running this cell without rebuilding SenseVocab counts every sample again.
SenseVocab = createSenseVocabulary(
    Vocab=SenseVocab,
    generator=training_generator,
)