In [3]:
from torch import nn
from torch.autograd import Variable
import torch
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

# GloVe vectors file; load_glove_embeddings() parses each line as a word followed by its vector components.
glove_path = '../glove.840B.300d.txt'
# Corpus files; process_whole_corpuses() reads each line as tab-separated "<id>\t<title>\t<body>".
android_corpus_path = '../android_dataset/corpus.tsv'
ubuntu_corpus_path = '../ubuntu_dataset/text_tokenized.txt'
# Keys of the dict returned by process_whole_corpuses().
WORD_TO_ID = 'word_to_id'
U_ID2DATA = 'ubuntu_id_to_data'
A_ID2DATA = 'android_id_to_data'

# Processes all sentences in our datasets to give useful containers of data concerning the corpus:
# word2id vocab
# dict of question id to list of words in the question
def process_whole_corpuses():
    """Read the Ubuntu and Android corpora and build the shared vocabulary.

    Each non-blank line of a corpus file is expected to look like
    "<id>\\t<title>\\t<body>". The title and body are joined with a single
    space; that text is whitespace-split to give the question's word list and
    is also fed to a CountVectorizer to build one vocabulary over both corpora.

    Returns:
        dict with three entries:
            WORD_TO_ID -> the fitted vectorizer's ``vocabulary_`` (token -> integer id),
            U_ID2DATA  -> {ubuntu question id (int): list of words},
            A_ID2DATA  -> {android question id (int): list of words}.

    Raises:
        IndexError / ValueError: if a non-blank line has fewer than three
            tab-separated fields or a non-integer id.
    """
    all_txt = []
    ubuntu_id_to_data = {}
    android_id_to_data = {}

    corpora = (
        (ubuntu_corpus_path, ubuntu_id_to_data),
        (android_corpus_path, android_id_to_data),
    )
    for dataset_path, id_to_data in corpora:
        # `with` guarantees the file handle is closed; iterating the handle
        # streams the file instead of materialising every line at once.
        with open(dataset_path) as corpus_file:
            for line in corpus_file:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue

                id_title_body_list = line.split('\t')
                idx = int(id_title_body_list[0])
                # Strip only the line terminator: a blind [:-1] would eat a real
                # character when the last line of the file has no newline.
                body = id_title_body_list[2].rstrip('\n')
                title_plus_body = id_title_body_list[1] + ' ' + body

                all_txt.append(title_plus_body)
                id_to_data[idx] = title_plus_body.split()

    # Raw string so that `\s` reaches the regex engine as written, without
    # relying on Python tolerating an invalid string escape.
    vectorizer = CountVectorizer(binary=True, analyzer='word', token_pattern=r'[^\s]+[a-z]*[0-9]*')
    vectorizer.fit(all_txt)
    return {WORD_TO_ID: vectorizer.vocabulary_, U_ID2DATA: ubuntu_id_to_data, A_ID2DATA: android_id_to_data}

# Parse both corpora once, then expose each part of the result under its own module-level name.
processed_corpus = process_whole_corpuses()

word_to_id = processed_corpus[WORD_TO_ID]
ubuntu_id_to_data, android_id_to_data = processed_corpus[U_ID2DATA], processed_corpus[A_ID2DATA]

628439


In [4]:
PADDING_IDX = 0

# Get glove embeddings matrix only for words in our corpus (++Gain of gigabytes of memory)
# Matrix [num_words_in_vocab x word_dim] is fed to pytorch nn.Embedding module without gradient
# Function returns this nn.Embedding Object
def load_glove_embeddings(glove_path, word_to_id_vocab, embedding_dim=300):
    """Build a frozen nn.Embedding holding GloVe vectors for the corpus vocabulary.

    Args:
        glove_path: path to a GloVe text file, one "<word> <v1> ... <vN>" entry per line.
        word_to_id_vocab: mapping word -> row index in the embedding matrix.
        embedding_dim: number of components per vector (N above).

    Returns:
        nn.Embedding of shape [len(word_to_id_vocab) x embedding_dim] whose weight
        has requires_grad=False. Rows of words absent from the GloVe file stay
        all-zero, and so does row PADDING_IDX.
    """
    # Allocate directly in float32: the matrix is converted to a FloatTensor
    # anyway, so a float64 buffer would only double the peak memory.
    glove_matrix = np.zeros((len(word_to_id_vocab), embedding_dim), dtype=np.float32)

    with open(glove_path) as f:
        # Stream the file line by line; readlines() would hold the whole
        # multi-gigabyte GloVe file in memory at once.
        for line in f:
            values = line.split()
            if not values:
                continue  # blank line

            index = word_to_id_vocab.get(values[0])
            # Row PADDING_IDX is deliberately never filled so that padded
            # positions embed to the zero vector.
            if index is None or index == PADDING_IDX:
                continue
            # Some GloVe tokens contain spaces, which shifts the columns; a
            # wrong component count means the line was not parsed as intended.
            if len(values) - 1 != embedding_dim:
                continue
            try:
                glove_matrix[index] = np.array(values[1:], dtype='float32')
            except ValueError:
                # Non-numeric component: skip this entry, keep its row at zero.
                continue

    glove_matrix = torch.from_numpy(glove_matrix).float()
    torch_embedding = nn.Embedding(glove_matrix.size(0), glove_matrix.size(1), padding_idx=PADDING_IDX)
    torch_embedding.weight = nn.Parameter(glove_matrix)
    torch_embedding.weight.requires_grad = False

    return torch_embedding

# Build the frozen embedding layer once, for the vocabulary shared by both corpora.
torch_embedding = load_glove_embeddings(glove_path, word_to_id)

# Returns a matrix of [num_words x word_embedding_dim] when fed with a 1D tensor of the word ids of a sentence
# print(torch_embedding(Variable(torch.LongTensor([22, 3]))))

In [5]:
TRUNCATE_LENGTH = 100

# Takes a question id and the corresponding dict of question_id_to_words
# Builds a matrix of [1 x num_words x input_size] where first dim is for concatenation in future
# Use up to TRUNCATE_LENGTH number of words and pad if needed
def question_id_to_matrix(question_id, dict_qid_to_words, words_to_id_vocabulary, pytorch_embeddings,
                          truncate_length=None):
    """Embed one question as a fixed-length matrix of word vectors.

    Args:
        question_id: key into ``dict_qid_to_words``.
        dict_qid_to_words: mapping question id -> list of words.
        words_to_id_vocabulary: mapping lower-cased word -> integer id.
        pytorch_embeddings: callable (e.g. nn.Embedding) mapping a LongTensor of
            ids to their vectors.
        truncate_length: number of rows in the result; defaults to
            TRUNCATE_LENGTH when omitted.

    Returns:
        The embedded question with a leading singleton dimension, i.e.
        [1 x truncate_length x embedding_dim]. Single-character words are
        skipped; questions longer than ``truncate_length`` are cut and shorter
        ones are padded with PADDING_IDX.

    Raises:
        KeyError: if ``question_id`` is unknown or a word is missing from the vocabulary.
    """
    if truncate_length is None:
        truncate_length = TRUNCATE_LENGTH

    question_data = dict_qid_to_words[question_id]
    word_ids = []

    # Build list of ids of words in that question
    for word in question_data:
        if len(word) == 1:
            continue
        # Compare against the configured length rather than a literal 100, so
        # truncation and padding always agree on the output size.
        if len(word_ids) >= truncate_length:
            break
        word_ids.append(int(words_to_id_vocabulary[word.lower()]))

    # Pad if need more rows
    if len(word_ids) < truncate_length:
        word_ids += [PADDING_IDX] * (truncate_length - len(word_ids))

    question_in_embedded_form = pytorch_embeddings(torch.LongTensor(word_ids))
    return question_in_embedded_form.unsqueeze(0)

# Smoke test on one Android question: the result printed below has size 1 x 100 x 300,
# and its trailing all-zero rows are presumably the padded positions.
print(question_id_to_matrix(57211, android_id_to_data, word_to_id, torch_embedding))

36
Variable containing:
( 0 ,.,.) = 
 -0.0850  0.5020  0.0024  ...  -0.2151 -0.2630 -0.0060
  0.0014  0.3565 -0.0555  ...  -0.1124  0.0783  0.2240
 -0.2490  0.0878 -0.3940  ...  -0.4625  0.1552  0.3354
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
[torch.FloatTensor of size 1x100x300]

