In [1]:
import pandas as pd
import numpy as np
import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn import feature_extraction

# Seed NumPy as well as PyTorch: ImportDataset draws every sample through
# np.random, so seeding torch alone does not make the training batches reproducible.
# NumPy is seeded first so the cell still ends with (and displays) the torch generator.
np.random.seed(1)
torch.manual_seed(1)

<torch._C.Generator at 0x7fa790d11d70>

In [2]:
# Move the working directory one level up so the relative 'data/...' path used
# when loading the corpus resolves.
# NOTE(review): this is not idempotent - re-running the cell climbs one more level each time.
%cd ..

/home/kuba/Projects/github_search


In [3]:
# Load the import corpus and keep only the first CSV column (as a Series).
# presumably each entry is a string listing the modules imported by one file,
# judging by the .str.split() filtering applied further down - TODO confirm.
module_import_corpus = pd.read_csv('data/module_import_corpus.csv').iloc[:,0]

In [4]:
# Discard missing (NaN) entries so that later string operations only see real values.
module_import_corpus = module_import_corpus.dropna()

# Dataset

`ImportDataset` samples (`context`, `target`) pairs together with negative examples, as described in the Import2Vec paper.

In [5]:
class ImportDataset:
    """Samples (context, target, label) training triples from a corpus of import lists.

    Every corpus entry is a string describing the imports of one file.  The entries
    are tokenized with ``CountVectorizer`` (default tokenizer) into a binary
    file x import occurrence matrix.  Positive pairs are two different imports that
    occur in the same file; negative pairs combine an import of a file with an import
    that does *not* occur in that file.

    Attributes:
        corpus_size: number of files kept (those with more than one in-vocabulary import).
        vocabulary_size: number of distinct imports kept by the vectorizer.
    """

    def __init__(self, import_corpus, min_import_frequency=10):
        """Build the occurrence matrix.

        Args:
            import_corpus: iterable of strings, one per file.
            min_import_frequency: passed to ``CountVectorizer`` as ``min_df``; imports
                appearing in fewer files are dropped from the vocabulary.
        """
        self._vectorizer = feature_extraction.text.CountVectorizer(min_df=min_import_frequency, binary=True)
        occurrence_matrix = self._vectorizer.fit_transform(import_corpus)
        # A positive pair needs two distinct imports, so files with fewer are useless.
        imports_per_file = np.asarray(occurrence_matrix.sum(axis=1)).reshape(-1)
        valid_indices = np.where(imports_per_file > 1)[0]
        self._occurrence_matrix = occurrence_matrix[valid_indices, :]
        self.corpus_size = self._occurrence_matrix.shape[0]
        self.vocabulary_size = self._occurrence_matrix.shape[1]

    def sample_imports(self, n_positive_imports, n_negative_imports=None):
        """Draw a batch of positive and negative pairs.

        Args:
            n_positive_imports: number of positive (co-occurring) pairs.
            n_negative_imports: number of negative pairs; defaults to ``n_positive_imports``.

        Returns:
            ``(contexts, targets, predictions)``: three 1-D tensors of length
            ``n_positive_imports + n_negative_imports``.  ``contexts`` and ``targets``
            hold vocabulary indices (int64); ``predictions`` holds the float labels,
            1 for the leading positive pairs and 0 for the trailing negative pairs.
        """
        if n_negative_imports is None:
            n_negative_imports = n_positive_imports

        # Positives are drawn before negatives (keeps the RNG consumption order stable).
        positive_contexts, positive_targets = self._sample_positive_or_negative_imports(n_positive_imports, positive=True)
        negative_contexts, negative_targets = self._sample_positive_or_negative_imports(n_negative_imports, positive=False)

        contexts = torch.tensor(positive_contexts + negative_contexts, dtype=torch.long)
        targets = torch.tensor(positive_targets + negative_targets, dtype=torch.long)
        # The zero block must be sized by the number of *negative* pairs, otherwise the
        # labels do not line up with contexts/targets when the two counts differ.
        predictions = torch.cat(
            (
                torch.ones(n_positive_imports),
                torch.zeros(n_negative_imports),
            ),
            dim=0,
        )
        return contexts, targets, predictions

    def _sample_positive_or_negative_imports(self, n_imports, positive):
        """Sample ``n_imports`` (context, target) index pairs.

        For each pair a file is drawn uniformly (with replacement) and one of its
        imports becomes the context.  The target is another import of the same file
        when ``positive`` is true, otherwise an import absent from that file.

        Returns:
            Two lists of Python ints: context indices and target indices.

        Raises:
            ValueError: if a sampled file offers no candidate target (for example a
                negative draw from a file that contains the whole vocabulary).
        """
        # Passing the size directly avoids materialising range(corpus_size) on every call.
        file_indices_sample = np.random.choice(self.corpus_size, size=n_imports)
        context_indices = []
        target_indices = []
        for idx in file_indices_sample:
            # Fresh dense copy of the row, so zeroing the context below is local.
            sample_row = self._occurrence_matrix[idx].toarray()[0]
            import_indices = np.where(sample_row)[0]
            context_index = np.random.choice(import_indices, size=1)[0]
            if positive:
                # Exclude the context itself so the pair consists of two different imports.
                sample_row[context_index] = 0
                possible_target_indices = np.where(sample_row)[0]
            else:
                possible_target_indices = np.where(sample_row == 0)[0]
            if possible_target_indices.size == 0:
                raise ValueError(
                    "no candidate target for file index {} (positive={})".format(idx, positive)
                )
            target_index = np.random.choice(possible_target_indices)
            context_indices.append(int(context_index))
            target_indices.append(int(target_index))
        return context_indices, target_indices
    

In [6]:
# Toy corpus for sanity checks: five "files", each listing two of four
# distinct modules (pandas, numpy, seaborn, tensorflow).
small_import_corpus = [
    'pandas numpy',
    'numpy seaborn',
    'pandas tensorflow',
    'tensorflow seaborn',
    'pandas seaborn'
]

In [7]:
# min_import_frequency=1 keeps every module of the toy corpus in the vocabulary.
small_import_dataset = ImportDataset(small_import_corpus, min_import_frequency=1)
# All five files list two modules, so none is filtered out.
assert small_import_dataset.corpus_size == 5
# Four distinct modules appear across the toy corpus.
assert small_import_dataset.vocabulary_size == 4
# 5 files x 2 modules each = 10 non-zero entries in the binary occurrence matrix.
assert small_import_dataset._occurrence_matrix.sum() == 10

In [8]:
class Import2VecModeler(nn.Module):
    """Scores a (context, target) pair by the dot product of their embeddings."""

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # A single table serves both roles: contexts and targets share the embedding matrix.
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    def forward(self, context, target):
        """Return one similarity score per pair.

        ``context`` and ``target`` are index tensors of equal length; the result is
        the row-wise dot product of the looked-up embedding vectors.
        """
        context_vectors = self.embeddings(context)
        target_vectors = self.embeddings(target)
        elementwise_products = context_vectors * target_vectors
        return elementwise_products.sum(dim=1)

In [9]:
# Keep only entries that contain more than one whitespace-separated token.
module_import_corpus = module_import_corpus[module_import_corpus.str.split().apply(len) > 1]

In [10]:
# Keep only entries with more than one *distinct* token (a repeated single token does not count).
valid_module_import_corpus = module_import_corpus[module_import_corpus.str.split().apply(set).apply(len) > 1]

In [11]:
# Size of each import embedding vector.
embedding_dim = 100
# Uses ImportDataset's default min_import_frequency (10).
import_dataset = ImportDataset(list(valid_module_import_corpus))
# One embedding row per import kept in the dataset vocabulary.
import2vec = Import2VecModeler(import_dataset.vocabulary_size, embedding_dim)

In [12]:
# Number of optimizer steps (one sampled batch per step).
n_iterations = 5000 
# Positive pairs per batch; sample_imports adds the same number of negatives by default.
n_positive_imports = 32

In [13]:
# Total positive pairs drawn over the whole run, relative to the number of files in the dataset.
n_positive_imports * n_iterations / import_dataset.corpus_size  

0.23238283910828894

In [None]:
# Total positive pairs drawn over the whole run, relative to the vocabulary size.
n_positive_imports * n_iterations / import_dataset.vocabulary_size

In [None]:
# Per-iteration training loss, kept for plotting afterwards.
losses = []
# The model outputs raw dot products (logits); BCEWithLogitsLoss applies the sigmoid itself.
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(import2vec.parameters(), lr=0.001)

for iteration in tqdm.tqdm(range(n_iterations)):
    # Fresh batch of positive pairs plus (by default) as many negative pairs.
    context, target, pred = import_dataset.sample_imports(n_positive_imports=n_positive_imports)

    optimizer.zero_grad()
    logits = import2vec(context, target)
    loss = loss_function(logits, pred)
    loss.backward()
    optimizer.step()

    # One batch per iteration, so the batch loss is recorded directly.
    losses.append(loss.item())

 64%|██████▍   | 3204/5000 [08:09<04:36,  6.49it/s]

In [None]:
import matplotlib.pyplot as plt

# Explicit figure/axes with labels so the plot can be read on its own.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set(
    title="Import2Vec training loss",
    xlabel="iteration",
    ylabel="BCE-with-logits loss (per batch)",
);