In [1]:
#default_exp matching_zsl

In [2]:
#export
import os
import ast
import tqdm
import json
import attr
from operator import itemgetter

from scarce_learn import zero_shot
from mlutil.feature_extraction import embeddings
import itertools


import pandas as pd
import numpy as np
from sklearn import feature_extraction, metrics, model_selection

import matplotlib.pyplot as plt
import gensim

from github_search import paperswithcode_tasks

import mlutil
from functools import partial


from scarce_learn.zero_shot import devise_jax, devise_torch

In [3]:
%env XLA_PYTHON_CLIENT_PREALLOCATE=false

env: XLA_PYTHON_CLIENT_PREALLOCATE=false


In [4]:
# upstream

import_corpus_path = 'output/module_corpus.csv'
word_vectors_filename = 'output/import2vec_module_vectors.bin'

In [5]:
%cd ..

/home/kuba/Projects/github_search


%%time
import_corpus_df = pd.read_csv(import_corpus_path)
per_repo_imports = import_corpus_df.groupby('repo')['imports'].agg(sum).apply(set)
import_corpus_df['imports'] = import_corpus_df['imports'].apply(ast.literal_eval)

In [6]:
%%time
python_files_df = pd.read_csv('data/crawled_python_files.csv', encoding='latin-1')
repo_names = python_files_df['repo_name']
import_corpus_df = pd.read_csv(import_corpus_path)
per_repo_imports = import_corpus_df.groupby('repo')['imports'].agg(sum).apply(set)

CPU times: user 2min 51s, sys: 2.98 s, total: 2min 54s
Wall time: 2min 56s


In [7]:
python_files_df.shape

(1402272, 3)

In [8]:
import_corpus_df.shape

(1375818, 3)

In [9]:
python_files_df['repo_name']

0                      trangvu/ape-npi
1                      trangvu/ape-npi
2                      trangvu/ape-npi
3                      trangvu/ape-npi
4                      trangvu/ape-npi
                      ...             
1402267    wayne1204/NOAA-fish-finding
1402268    wayne1204/NOAA-fish-finding
1402269    wayne1204/NOAA-fish-finding
1402270    wayne1204/NOAA-fish-finding
1402271    wayne1204/NOAA-fish-finding
Name: repo_name, Length: 1402272, dtype: object

In [10]:
python_files_df['repo_name'].unique().shape

(18933,)

python_files_df['repo'] = python_files_df['repo_name'].str.split("/").apply(itemgetter(1))  + '/' + python_files_df['repo_name']
repo_names_tmp = python_files_df['repo_name']
repo_names = repo_names_tmp.unique()
python_files_df['repo_name'] = python_files_df['repo']
python_files_df['repo'] = repo_names_tmp

In [11]:
%%time
import2vec = gensim.models.KeyedVectors.load(word_vectors_filename)
import2vec_embedder = mlutil.feature_extraction.embeddings.AverageWordEmbeddingsVectorizer(import2vec)

CPU times: user 9.05 ms, sys: 43 µs, total: 9.09 ms
Wall time: 10.1 ms


In [12]:
paperswithcode_with_imports_df = pd.read_csv('output/papers_with_imports.csv')
paperswithcode_with_imports_df['tasks'] = paperswithcode_with_imports_df['tasks'].str.replace("2d ", "").str.replace("3d ", "").str.replace("4d ", "").str.replace("6d ", "").str.lower().apply(ast.literal_eval)
paperswithcode_with_imports_df['imports'] = paperswithcode_with_imports_df['imports'].str.replace("set\(\)", "{}").apply(ast.literal_eval)#str.replace("2d ", "").str.replace("3d ", "").str.replace("4d ", "").str.replace("6d ", "").str.lower().apply(ast.literal_eval)

In [13]:
paperswithcode_with_imports_df.shape

(12224, 23)

In [14]:
paperswithcode_with_imports_df['n_imports'] = paperswithcode_with_imports_df['imports'].apply(len) 

In [15]:
paperswithcode_with_imports_df['n_imports_with_embeddings'] = paperswithcode_with_imports_df['imports'].apply(lambda imps: len([imp in import2vec.vocab.keys() for imp in imps]))

In [16]:
%%time
word_embeddings = mlutil.feature_extraction.embeddings.load_gensim_embedding_model('glove-wiki-gigaword-300')

CPU times: user 33.6 s, sys: 250 ms, total: 33.8 s
Wall time: 34.4 s


In [17]:
python_word_embeddings = gensim.models.Word2Vec.load('output/abstract_w2v100.bin')

In [18]:
@attr.s
class RepoTaskData:
    
    tasks = attr.ib()
    repos = attr.ib()
    X = attr.ib()
    all_tasks = attr.ib()
    y = attr.ib()
    
    def split_tasks(area_grouped_tasks, test_size=0.2):
        tasks_train, tasks_test = model_selection.train_test_split(area_grouped_tasks['task'], stratify=area_grouped_tasks['area'], test_size=test_size, random_state=0)
        return tasks_train, tasks_test
    
    def create_split(tasks_train, all_tasks, paperswithcode_with_features_df, X_repr):
        train_indicator = paperswithcode_with_features_df['most_common_task'].isin(tasks_train)
        print(train_indicator.shape)
        repos_train = paperswithcode_with_features_df['repo'][train_indicator]
        repos_test = paperswithcode_with_features_df['repo'][~train_indicator]
        X_repr = X_repr.apply(lambda x: " ".join(x))
        X_train = X_repr[train_indicator]
        X_test = X_repr[~train_indicator]
        all_tasks_train = all_tasks[train_indicator]
        all_tasks_test = all_tasks[~train_indicator]
        y_train = paperswithcode_with_features_df[train_indicator]['most_common_task'].str.lower()
        y_test = paperswithcode_with_features_df[~train_indicator]['most_common_task'].str.lower()
        
        return (
            RepoTaskData(tasks_train, repos_train, X_train, all_tasks_train, y_train),
            RepoTaskData(tasks_test, repos_test, X_test, all_tasks_test, y_test)
        )

In [19]:
#export


def get_first_vocab_entry(vocab):
    return list(itertools.islice(vocab.items(), 1))[0][0] 


class PairedKeyedVectors:
    
    @attr.s
    class wv:
        vocab = attr.ib()
    
    def __init__(self, kv1, kv2):
        self.kv1 = kv1
        self.kv2 = kv2
        self.vocab = {**kv1.vocab, **kv2.vocab} 
        self.dim1 = len(kv1[get_first_vocab_entry(kv1.vocab)])
        self.dim2 = len(kv2[get_first_vocab_entry(kv2.vocab)])
        self.wv= PairedKeyedVectors.wv(self.vocab)
    
    def __getitem__(self, item):
        if not item in self.kv1.vocab.keys():
            return np.concatenate([np.zeros(self.dim1), self.kv2[item]])
        elif not item in self.kv2.vocab.keys():
            return np.concatenate([self.kv1[item], np.zeros(self.dim2)])
        else:
            return np.concatenate([self.kv1[item], self.kv2[item]])
    


@attr.s
class RetrieverLearner:
    
    zs_learner: zero_shot.ZeroShotClassifier = attr.ib()
    input_embedder: embeddings.EmbeddingVectorizer = attr.ib() 
    y_embedder: embeddings.EmbeddingVectorizer = attr.ib()
    input_embedder_kwargs = attr.ib(default=dict())
        
    @staticmethod
    def create(
        zs_learner: zero_shot.ZeroShotClassifier,
        input_embeddings: gensim.models.KeyedVectors,
        target_embeddings: gensim.models.KeyedVectors,
        input_embedding_method: embeddings.EmbeddingVectorizer,
        y_embedding_method: embeddings.EmbeddingVectorizer,
        input_embedder_kwargs=dict()
    ):
        input_embedder = input_embedding_method(input_embeddings, **input_embedder_kwargs) 
        y_embedder = y_embedding_method(target_embeddings)
        return RetrieverLearner(zs_learner, input_embedder, y_embedder)
    
    def get_target_embeddings(self, y):
        unique_y = pd.Series(y.unique())
        y_embeddings = self.y_embedder.transform(unique_y)
        return unique_y, y_embeddings
    
    def fit_learner(self, data, **kwargs):
        self.input_embedder.fit(data.X)
        X_embeddings = self.input_embedder.transform(data.X)
        self.y_embedder.fit(data.y)
        unique_y, y_embeddings = self.get_target_embeddings(data.y)
        input_y_idxs = data.y.apply(lambda t: unique_y[unique_y == t].index[0])
        self.zs_learner.fit(np.array(X_embeddings), np.array(input_y_idxs), np.array(y_embeddings), **kwargs)
        
    def predict_idxs(self, X, y_embeddings):
        X_embeddings = self.input_embedder.transform(X)
        return self.zs_learner.predict(X_embeddings, y_embeddings)
    
    def predict_topk(self, X, y_embeddings, target_names, k=5, similarity=metrics.pairwise.cosine_similarity):
        X_embeddings = self.input_embedder.transform(X)
        predictions = self.zs_learner.predict_raw(X_embeddings)
        target_similarities = similarity(predictions, y_embeddings)
        targets = [target_names[row[:k]] for row in (-target_similarities).argsort(axis=1)]
        return targets
        
    def evaluate(self, data, metric):
        unique_y, y_embeddings = self.get_target_embeddings(data.y)
        input_y_idxs = data.y.apply(lambda t: unique_y[unique_y == t].index[0])
        predicted_idxs = self.predict_idxs(data.X, y_embeddings)
        return metric(input_y_idxs, predicted_idxs)

In [20]:
#export

def get_accuracy(learner, X, y, y_names, k=10, similarity=metrics.pairwise.cosine_similarity):
    input_embeddings = learner.input_embedder.transform(X)
    y_embeddings = learner.y_embedder.transform(y_names)
    predictions = learner.zs_learner.predict_raw(input_embeddings)
    target_similarities = similarity(predictions, y_embeddings)
    target_idxs = (-target_similarities).argsort(axis=1)
    targets = [y_names.iloc[row[:k]] for row in target_idxs]

    accuracies = np.zeros(len(X))
    for i in range(len(X)):
        true_tasks = set(all_tasks_test.iloc[i])
        accuracies[i] = len(true_tasks.intersection(set(targets[i].values))) / min(len(true_tasks), k)
    return accuracies.mean()

In [21]:
import pickle

graph = pickle.load(open('output/call_igraph.pkl', 'rb'))

In [22]:
len(graph.get_vertex_dataframe().iloc[graph.neighborhood(vertices=["<ROOT>"])[0]])

18934

get repos that are in graph 

In [23]:
graph_nodes = graph.get_vertex_dataframe()['name'].unique()

In [24]:
%%time
paperswithcode_with_features_df = paperswithcode_with_imports_df[
    paperswithcode_with_imports_df['repo'].isin(graph.get_vertex_dataframe()['name']) |
    paperswithcode_with_imports_df['repo'].apply(lambda s: s.split("/")[1]).isin(graph.get_vertex_dataframe()['name'])
]

CPU times: user 555 ms, sys: 12 ms, total: 567 ms
Wall time: 572 ms


In [25]:
paperswithcode_with_imports_df = paperswithcode_with_imports_df[paperswithcode_with_imports_df['repo'].isin(paperswithcode_with_features_df['repo'])]

In [26]:
def clean_task_name(task_name):
    return task_name.replace("2d ", "").replace("3d ", "").replace("4d ", "").replace("6d ", "").lower()

paperswithcode_with_features_df['most_common_task'] = paperswithcode_with_features_df['most_common_task'].str.lower()
tasks = paperswithcode_with_features_df['most_common_task'].str.lower()
tasks = tasks.apply(clean_task_name)
all_tasks = paperswithcode_with_features_df['tasks'].apply(lambda s: [clean_task_name(t) for t in s])
paperswithcode_with_features_df.shape

(12224, 25)

In [27]:
all_tasks.explode().value_counts()[:100]

semantic segmentation             1066
object detection                  1050
image classification               946
language modelling                 494
representation learning            454
                                  ... 
cell segmentation                   70
nuclear segmentation                69
natural language understanding      69
multi-person pose estimation        69
feature engineering                 68
Name: tasks, Length: 100, dtype: int64

In [28]:
#export

def get_area_grouped_tasks(paperswithcode_tasks_path='data/paperswithcode_tasks.csv'):
    area_grouped_tasks = pd.read_csv('data/paperswithcode_tasks.csv')
    area_grouped_tasks['task'] = area_grouped_tasks['task'].str.replace("-", ' ')
    area_grouped_tasks = area_grouped_tasks[area_grouped_tasks['task'].isin(tasks)]
    area_counts = area_grouped_tasks['area'].value_counts()
    area_grouped_tasks = area_grouped_tasks[area_grouped_tasks['area'].isin(area_counts.index[area_counts > 1])]
    return area_grouped_tasks

In [29]:
area_grouped_tasks = get_area_grouped_tasks()

In [30]:
tasks_train, tasks_test = RepoTaskData.split_tasks(area_grouped_tasks)

In [31]:
len(tasks_train)

295

In [32]:
tasks_test

467                        object counting
1253    conversational response generation
854              probabilistic programming
525          facial expression recognition
1683                   montezuma's revenge
                       ...                
896                      data augmentation
926                                    eeg
1646            neural architecture search
592                       graph regression
1230               sentence classification
Name: task, Length: 74, dtype: object

In [33]:
len(tasks_test)

74

In [34]:
paperswithcode_with_features_df['most_common_task']

0               dictionary learning
1                   region proposal
2                  image generation
3        natural language inference
4        natural language inference
                    ...            
12219             anomaly detection
12220             anomaly detection
12221             anomaly detection
12222                style transfer
12223       representation learning
Name: most_common_task, Length: 12224, dtype: object

In [35]:
paperswithcode_with_features_df['most_common_task'].isin(tasks_test).sum()

2915

In [36]:
paperswithcode_with_features_df.shape

(12224, 25)

In [37]:
paperswithcode_with_features_df.shape

(12224, 25)

In [38]:
from github_search import github_readmes
import concurrent.futures

In [39]:
def try_decode(s, codec="utf-8"):
    try:
        return s.decode(codec)
    except:
        return None


def get_readme_summaries(upstream, product, keywords=True):
    pool = concurrent.futures.ProcessPoolExecutor(max_workers=10)
    raw_readmes = list(pool.map(github_readmes.get_readme, paperswithcode_with_features_df['repo']))
    readmes = pd.Series(raw_readmes).apply(try_decode)
    return readmes

In [40]:
from github_search import python_call_graph

In [41]:
def get_readme_summaries(df, keywords=True):
    pool = concurrent.futures.ProcessPoolExecutor(max_workers=10)
    raw_readmes = list(pool.map(github_readmes.get_readme, df['repo']))
    readmes = list(map(try_decode, raw_readmes))
    return readmes

In [42]:
paperswithcode_with_features_df.columns

Index(['Unnamed: 0', 'paper_url', 'arxiv_id', 'title', 'abstract', 'url_abs',
       'url_pdf', 'proceeding', 'authors', 'tasks', 'date', 'methods',
       'framework', 'mentioned_in_github', 'mentioned_in_paper',
       'paper_arxiv_id', 'paper_title', 'paper_url_abs', 'paper_url_pdf',
       'repo', 'repo_url', 'most_common_task', 'imports', 'n_imports',
       'n_imports_with_embeddings'],
      dtype='object')

In [None]:
%%time
readme_keywords = get_readme_summaries(paperswithcode_with_features_df)

In [None]:
l = list(readme_keywords)

In [None]:
len(l)

In [None]:
import gensim

def try_keywords(text):
    return python_call_graph.try_run(gensim.summarization.keywords)(text)

In [None]:
%%time
readme_keywords = pd.Series(pool.map(try_keywords, readmes)).str.replace("\n", " ")

In [None]:
readme_keywords

In [None]:
i = 0

for repo, readme in zip(paperswithcode_with_features_df['repo'], readmes):
    if not readme is None:
        try:
            readme.decode("utf-8")
        except:
            print(repo)
            print(readme)
            i += 1

In [None]:
i

In [None]:
dependency_records_df = pd.read_csv('output/processed_dependency_records.csv').dropna()#.iloc[:1000000]
non_root_dependency_records_df = dependency_records_df[
    (dependency_records_df['source'] != "<ROOT>") &
    (dependency_records_df['edge_type'] != 'repo-repo')
]
repo_descriptions = non_root_dependency_records_df[['source', 'repo_description']].groupby('source').apply(lambda df: df['repo_description'].iloc[0])

In [None]:
describable_paperswithcode_with_features_df = paperswithcode_with_features_df[paperswithcode_with_features_df['repo'].isin(repo_descriptions.index)]
describable_paperswithcode_with_imports_df = paperswithcode_with_imports_df[paperswithcode_with_imports_df['repo'].isin(repo_descriptions.index)]
describable_repo_tasks = all_tasks[paperswithcode_with_imports_df['repo'].isin(repo_descriptions.index)]

In [None]:
describable_paperswithcode_with_features_df.shape

In [None]:
describable_paperswithcode_with_imports_df.shape

In [None]:
all_tasks

In [None]:
import_data_train, import_data_test = RepoTaskData.create_split(tasks_train, describable_repo_tasks, describable_paperswithcode_with_features_df, describable_paperswithcode_with_imports_df['imports'])

In [None]:
import_data_train.X.shape, import_data_test.X.shape

In [None]:
import_data_train.repos.isin(repo_descriptions.index).mean()

In [None]:
import_data_test.repos.isin(repo_descriptions.index).mean()

In [None]:
task_embedder = mlutil.feature_extraction.embeddings.AverageWordEmbeddingsVectorizer(word_embeddings)

In [None]:
from scipy.stats import hmean

In [None]:
#export


def get_outgoing_edges(graph, node):
    #idx = pd.Index(graph.names).get_loc(node)
    #outgoing_edges_idx = np.where(graph.mat[idx].todense())[1]
    return graph.get_vertex_dataframe().iloc[graph.successors(node)]['name']
    #return graph.names[outgoing_edges_idx]


def get_repo_functions(graph, repo):
    return ' '.join(get_outgoing_edges(graph, repo).values)

In [None]:
graph_records = pd.read_csv('output/dependency_records.csv')

In [None]:
%%time
if os.path.exists("output/tmp_graph_data.pkl"):
    (graph_data_train, graph_data_test) = pickle.load(open("output/tmp_graph_data.pkl", "rb"))
else:
    graph_data_train, graph_data_test = RepoTaskData.create_split(tasks_train, all_tasks, paperswithcode_with_features_df, paperswithcode_with_imports_df['imports'])
    graph_data_train.X = graph_data_train.repos.apply(lambda x: get_repo_functions(graph, x))
    graph_data_test.X = graph_data_test.repos.apply(lambda x: get_repo_functions(graph, x))
    pickle.dump((graph_data_train, graph_data_test), open("output/tmp_graph_data.pkl", "wb"))

In [None]:
repo_descriptions.loc[graph_data_train.repos[6]]

In [None]:
graph_data_train, graph_data_test = RepoTaskData.create_split(tasks_train, describable_repo_tasks, describable_paperswithcode_with_features_df, describable_paperswithcode_with_imports_df['imports'])

In [None]:
graph_data_train.X = pd.Series(repo_descriptions.loc[graph_data_train.repos].values, index=graph_data_train.repos.index)

In [None]:
graph_data_test.X = pd.Series(repo_descriptions.loc[graph_data_test.repos].values, index=graph_data_test.repos.index)

In [None]:
graph_data_train.repos.iloc[0]

In [None]:
get_outgoing_edges(graph, get_outgoing_edges(graph, graph_data_train.repos.iloc[0]).iloc[0])

In [None]:
len(graph_data_train.X)

In [None]:
#export


def retrieve_query_results(learner, data, query, k=10, similarity=metrics.pairwise.cosine_similarity):
    input_embeddings = learner.input_embedder.transform(data.X)
    y_embeddings = learner.y_embedder.transform([query])
    predictions = learner.zs_learner.predict_raw(input_embeddings)
    input_target_similarities = similarity(predictions, y_embeddings)
    return data.X.iloc[np.argsort(-input_target_similarities[:,0])[:k]]

    
def get_retrieval_results(learner, data, k=10, similarity=metrics.pairwise.cosine_similarity):
    y_names, __ = learner.get_target_embeddings(data.y)
    input_embeddings = learner.input_embedder.transform(data.X)
    y_embeddings = learner.y_embedder.transform(y_names)
    predictions = learner.zs_learner.predict_raw(input_embeddings)
    input_target_similarities = similarity(predictions, y_embeddings)

    X_recalled = [
        np.argsort(-input_target_similarities[:,y_idx])[:k]
        for (y_idx, __) in enumerate(y_names)
    ]
    return X_recalled


def get_retrieval_accuracies(learner, data, k=10, similarity=metrics.pairwise.cosine_similarity):
    y_names, __ = learner.get_target_embeddings(data.y)
    recalled_X = get_retrieval_results(learner, data, k=k, similarity=similarity)
    recalled_X_actual_y = [data.y.iloc[idxs_recalled].explode() for idxs_recalled in recalled_X]
    accurately_recalled = [
        y_name in recalled_X_actual_y[y_idx].values 
        for (y_idx, y_name) in enumerate(y_names)
    ]
    return pd.Series(data=accurately_recalled, index=y_names)


def get_retrieval_accuracy(learner, data, k=10, similarity=metrics.pairwise.cosine_similarity):
    y_names, __ = learner.get_target_embeddings(data.y)
    return np.mean(get_retrieval_accuracies(learner, data, k, similarity))

## Abstracts

In [None]:
paperswithcode_with_imports_df['abstract']

In [None]:
has_abstract = ~paperswithcode_with_imports_df['abstract'].isna()

In [None]:
tasks_train[has_abstract]
paperswithcode_with_features_df[has_abstract]

In [None]:
abstract_data_train, abstract_data_test = RepoTaskData.create_split(tasks_train[has_abstract], all_tasks[has_abstract], paperswithcode_with_features_df[has_abstract], paperswithcode_with_features_df[has_abstract]['abstract'].str.split())

In [None]:
from scarce_learn.zero_shot import devise_jax

In [None]:
import fasttext
fasttext_model = fasttext.load_model("output/python_files_fasttext_dim200.bin")

In [None]:
abstract_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    python_word_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

abstract_learner.fit_learner(abstract_data_train)

In [None]:
abstract_learner.evaluate(abstract_data_train, metrics.accuracy_score)

In [None]:
abstract_learner.evaluate(abstract_data_test, metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(abstract_learner, abstract_data_train, k=10)

In [None]:
get_retrieval_accuracy(abstract_learner, abstract_data_test, k=10)

# Abstract model using fasttext trained on Python code

In [None]:
ezslearner = zero_shot.ESZSLearner()
abstract_fasttext_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    word_embeddings,
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer
)

abstract_fasttext_learner.fit_learner(abstract_data_train)

In [None]:
abstract_fasttext_learner.evaluate(abstract_data_train, metrics.accuracy_score)

In [None]:
abstract_learner.evaluate(abstract_data_test, metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(abstract_fasttext_learner, abstract_data_train, k=10)

In [None]:
get_retrieval_accuracy(abstract_fasttext_learner, abstract_data_test, k=10)

# Fasttext model on READMEs

In [None]:
paperswithcode_with_imports_df['readme'] = readmes
paperswithcode_with_features_df['readme'] = readmes

In [None]:
has_readme = ~paperswithcode_with_imports_df['readme'].isna()

readme_data_train, readme_data_test = RepoTaskData.create_split(tasks_train[has_readme], all_tasks[has_readme], paperswithcode_with_features_df[has_readme], paperswithcode_with_features_df[has_readme]['readme'].str.split())

In [None]:
readme_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    python_word_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

readme_learner.fit_learner(readme_data_train)

In [None]:
readme_learner.evaluate(readme_data_train, metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(readme_learner, readme_data_train, k=10)

In [None]:
get_retrieval_accuracy(readme_learner, readme_data_test, k=10)

## Fasttext on READMEs - worse than word2vec

In [None]:
ezslearner = zero_shot.ESZSLearner()
readme_fasttext_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    word_embeddings,
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer
)

readme_fasttext_learner.fit_learner(readme_data_train)

In [None]:
readme_fasttext_learner.evaluate(readme_data_train, metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(readme_fasttext_learner, readme_data_train, k=10)

In [None]:
get_retrieval_accuracy(readme_fasttext_learner, readme_data_test, k=10)

In [None]:
readme_keywords_data_train, readme_keywords_data_test = RepoTaskData.create_split(tasks_train[has_readme], all_tasks[has_readme], paperswithcode_with_features_df[has_readme], readme_keywords[has_readme].str.split())

# README keywords

In [None]:
ezslearner = zero_shot.ESZSLearner()
readme_keywords_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    word_embeddings,
    word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

readme_keywords_learner.fit_learner(readme_keywords_data_train)

In [None]:
readme_keywords_learner.evaluate(readme_keywords_data_train, metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(readme_keywords_learner, readme_keywords_data_train, k=10)

In [None]:
get_retrieval_accuracy(readme_keywords_learner, readme_keywords_data_test, k=10)

## Import2Vec

In [None]:
ezslearner = zero_shot.ESZSLearner()
import2vec_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(lmbda=100.0, gamma=10.0),
    import2vec,
    word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

import2vec_learner.fit_learner(import_data_train)#, n_epochs=100, batch_size=64)

In [None]:
import2vec_learner.evaluate(import_data_train, metrics.accuracy_score)

In [None]:
import2vec_learner.evaluate(import_data_test, metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(import2vec_learner, import_data_train, k=10)

In [None]:
get_retrieval_accuracy(import2vec_learner, import_data_test, k=10)

## PRoNe

In [None]:
prone_embeddings = gensim.models.KeyedVectors.load("data/prone_embeddings.bin")

Using repo embedding from node embeddings

In [None]:
prone_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100,10),
    prone_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

prone_learner.fit_learner(graph_data_train)#, n_epochs=10, batch_size=32)
prone_learner.evaluate(graph_data_train, metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(prone_learner, graph_data_train, k=10)

In [None]:
get_retrieval_accuracy(prone_learner, graph_data_test, k=10)

## GraphSage

## aggregating vertex embeddings 

In [None]:
!ls output/*graphsage*bin

In [None]:
graphsage_embeddings = gensim.models.KeyedVectors.load("output/graphsage_embeddings_fasttext_dim200_epochs50_dim200_layers2.bin")

In [None]:
list(graphsage_embeddings.vocab)[-1]

In [None]:
graph_data_train.X

In [None]:
ezslearner = zero_shot.ESZSLearner()
graphsage_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    graphsage_embeddings,
    graphsage_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

graphsage_learner.fit_learner(graph_data_train)

In [None]:
graphsage_learner.evaluate(graph_data_train, metric=metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(graphsage_learner, graph_data_train, k=10)

In [None]:
get_retrieval_accuracy(graphsage_learner, graph_data_test, k=10)

In [None]:
graphsage_devise_learner = RetrieverLearner.create(
    devise_jax.DEVISELearner(margin=0.5),
    graphsage_embeddings,
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer
)

graphsage_devise_learner.fit_learner(graph_data_train, batch_size=64, n_epochs=200)

In [None]:
graphsage_devise_learner.evaluate(graph_data_train, metric=metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(graphsage_devise_learner, graph_data_train, k=10)

In [None]:
get_retrieval_accuracy(graphsage_devise_learner, graph_data_test, k=10)

## using GraphSAGE model for embedding

In [None]:
graphsage_data_train.repos

In [None]:
class LambdaTransformer:
    
    def __init__(self, transform_fn):
        self.transform = transform_fn
        
    def fit(self, X, **kwargs):
        return self

In [None]:
from github_search.pytorch_geometric_data import PygGraphWrapper
import torch

In [None]:
fasttext_embedder = embeddings.FastTextVectorizer(fasttext_model)

In [None]:
repo_descriptions

In [None]:
%%time
dependency_graph_wrapper = PygGraphWrapper(fasttext_embedder.transform, non_root_dependency_records_df, "repo_description", "file_description")

In [None]:
graphsage_model = torch.load("output/graphsage_model_11_dim200_layers2.pth").cpu()#"output/graphsage_model_60_dim200_layers3.pth").cpu()
graphsage_model.training = False

In [None]:
non_root_dependency_records_df.merge(graph_data_train.repos, left_on="source")

In [212]:
graph_data_train.repos.isin(repo_descriptions.index).mean()

0.9205489346334417

In [213]:
graphsage_data_train, graphsage_data_test = RepoTaskData.create_split(tasks_train, all_tasks, paperswithcode_with_features_df, paperswithcode_with_imports_df['imports'])
graphsage_data_train.X = repo_descriptions.loc[graph_data_train.repos]
graphsage_data_test.X = repo_descriptions.loc[graph_data_test.repos]

(12224,)


KeyError: "['zzz1515151/self-supervised_learning_sketch', 'zhoubolei/moments_models', 'zacwellmer/WorldModels', 'yueqiw/gqn-world-model', 'zhang-huihui/git-repository', 'zapplea/bert', 'EthanWYB/bert-classification', 'svakulenk0/response_eval', 'zsweet/BERT_zsw', 'zhen-he/tracking-by-animation', 'xinge008/Cylinder3D', 'zhanghainan/RNN-encdec', 'zhanghainan/TailoredSeq2Seq2DifferentConversationScenarios', 'yichigo/Chest-X-Ray', 'zju-3dv/multiway', 'zqhl/Wide-Area-Crowd-Counting_CVPR2019', 'yeshaokai/Robustness-Aware-Pruning-ADMM', 'zalandoresearch/flair', 'zhaoyuzhi/Legacy-Photo-Editing-with-Learned-Noise-Prior', 'yeoedward/Robust-Fill', 'codchen/CRE', 'LLNL/FAST', 'zliucr/coach', 'songyadong106/111', 'myrtleSoftware/deepspeech', 'zhengziqiang/P2C', 'yikegami/openpose', 'yinzhiyan43/openpose-dev', 'lucidrains/siren-pytorch', 'zhangsilu17/Gini_distance_statistics', 'zhen-dong/hawq', 'yuekai146/NMT', 'yzhou359/MakeItTalk', 'zhougroup/BAM', 'carrenD/ummkd', 'ziangqin-stu/impl_data-effiient-hrl', 'yhw-yhw/PVAMVSNet', 'zphang/usc_dae', 'XiaoxiaoGuo/rcdqn', 'zhegan27/LXMERT-AdvTrain', 'yangdsh/VQA-BUTD-demo', 'brandontrabucco/up_down_cell', 'brandontrabucco/up_down_rnn_cell', 'zaeemzadeh/Active-Learning-UCF101-IPM', 'zhangzx-sjtu/LANTERN-NeurIPS-2019', 'zaccharieramzi/fastmri-reproducible-benchmark', 'drigoni/ComparisonsDGM', 'ceciliavision/perceptual-reflection-removal', 'yiskw713/ClassActivationMapping', 'hursung1/GradientEpisodicMemory', 'facebookresearch/GradientEpisodicMemory', 'yistLin/FragmentVC', 'j-min/VL-T5', 'j-duan/VS-Net', 'Kelym/FAST', 'akuzeee/AFLAC', 'uber-research/LaneGCN', 'zheyejs/3D-convolutional-speaker-recognition', 'yjhong89/Domain-Adaptation', 'yyunon/reproducibility-project-group-71', 'ztoString/CRNN_CTC_OCR_TensorFlow', 'zyasjtu/CNN-RNN-CTC', 'Gitikameher/Domain-Adaptive-Single-View-3D-Reconstruction', 'young-zonglin/bilm-tf-extended', 'yangrui123/Hidden', 'yuanjing-zhu/elmo', 'NaoyukiKanda/LibriSpeechMix', 'ywu94/Code-Notes', 'yaodongyu/TRADES', 'zjfheart/Friendly-Adversarial-Training', 'zju3dv/neuralbody', 'vinsis/ternary-quantization', 'youshyee/CEP', 'zhang2010hao/cw2vec-pytorch', 'mwydmuch/extremeText', 'Xianhang/EDSC-pytorch', 'yrbahn/Deep-AutoEncoders-for-Collaborative-Filtering', 'g-jozsef/sampling-framework', 'benedekrozemberczki/Splitter', 'nkmjm/qiML', 'yitong91/StoryGAN', 'SaeedNajafi/ac-tagger', 'yilundu/improved_contrastive_divergence', 'rnradon/gender_emotion_classification', 'zangobot/secml_malware', 'GillesVandewiele/WalkExperiments', 'Binbose/keras-layer-normalization-rnn', 'yotharit/image_style_transfer', 'MIMBCD-UI/dataset-uta4-rates', 'Stephenfang51/Grad_CAM', 'mzymzy/paper3-quantized_distillzation', 'dropoutlabs/encrypted-skin-cancer-detection', 'yytyvonne/DQN_agent_Chatbot', 'Kakoedlinnoeslovo/fairseq', 'overwindows/PALM', 'facebookresearch/GloRe', 'carlo-/sepconv-ios', 'zhusiling/UNets', 'zhongpeixiang/SemEval2019-Task3-EmotionDetection', 'euranova/CASS-dataset', 'yangliuy/NeuralResponseRanking', 'ybyangpku/CADGMs', 'KeqiangSun/FAB', 'yagyapandeya/CNN-with-Few-Data-VGGish-', 'RICE-EIC/FracTrain', 'zhengshou/AutoLoc', 'yzcjtr/GeoNet', 'ymcidence/Zero-Shot-Sketch-Image-Hashing', 'yule-BUAA/HGConv', 'yongbowin/pkuseg-python_annotation', 'sourabhdattawad/TabNet', 'ymirsky/KitNET-py', 'ymirsky/Kitsune-py', 'yumaloop/LSTMAutoEncoderOnMovingMNIST', 'overlapping-instances/MultiStar', 'yeefan1999/Explainable-Health-Prediction-with-Transfer-Learning', 'zy1998/inception_v3_flowerIdentify', 'zjZSTU/GoogLeNet', 'zzs1994/CVQN', 'yijie0710/GeoNet_pytorch', 'youngbin-ro/Multi2OIE', 'yky138495/awesome-matlab-rank-1000', 'mushfiqur11/SS-VideoCaptioning', 'yoomambo/BayesianOptimization_Tuned', 'code2k13/nlppipe', 'Carco-git/CW_Attack_on_MNIST', 'KangchengHou/gntk', 'dariush-salami/gcn-gesture-recognition', 'hszhao/PSPNet', 'zzxslp/CosRec', 'yongzx/SDEC-Keras', 'NanboLi/MulMON', 'zanyarz/NeuralTwinsTalk', 'yangsenius/TransPose', 'facebookresearch/EmpatheticDialogues', 'zhanglinfeng1997/Sentiment-Analysis-via-GCN', 'ReemHal/Browser-Based-Annotator', 'yogeshbalaji/robustOT', 'yuleiniu/vc', 'yoavnavon/GRU4REC-spotify', 'yxinjiang/Unet-for-foreground-segmentation', 'cocoxu/simplification', 'Stepphonwol/my_yowo', 'zgahhblhc/DialogueFairness', 'yaxingwang/Mix-and-match-networks', 'yamizi/FeatureNet', 'mgong2/DA_Infer', 'yurayli/image-caption-pytorch', 'cod3licious/conec', 'surafelml/improving-zeroshot-nmt', 'Sachin19/adversarial-classify', 'yurayli/stanford-cs224n-sol', 'yuji-roh/fr-train', 'NLPLearn/QANet', 'zhangjiong724/autoassist-exp', 'mohaseeb/wisture', 'yuzhimanhua/lm-lstm-crf', 'yuzhimanhua/Multi-BioNER', 'yshenaw/GNN-Resource-Management', 'yechengxi/deconvolution', 'shuohangwang/Cross-Thought', 'Nachwa/object_states', 'yinglunz/ROAI_ICML2020', 'darkreapyre/HaaS-dev', 'darkreapyre/HaaS-GitOps', 'yan-roo/SpineNet-Pytorch', 'darkreapyre/HaaS', 'yahoo/crow', 'ziqi92/Modof', 'zhangjy2008327/lane-detection-with-double-convgrus', 'wi-pi/GDPR', 'youngminPIL/rollback', 'zoj613/polya-gamma', 'yanfengliu/layered_embeddings', 'yfletberliac/adversarially-guided-actor-critic', 'KentonMurray/ProxGradPytorch', 'yahsieh37/Visual-Saliency-Prediction', 'yubowen-ph/JointER', 'zalkikar/BBOX_GradCAM', 'vinayprabhu/Kannada_MNIST', 'XinJCheng/CSPN', 'zyang-16/MCNS', 'joaoreis-feup/hyper_process_model', 'jpcreis/Hyper-Process-Model', 'yaxingwang/MineGAN', 'yaxingwang/DeepI2I', 'shubhamguptaiitd/GraphRNN', 'yihui-he/Estimated-Depth-Map-Helps-Image-Classification', 'SachinIchake/KALM', 'zygmuntz/hyperband', 'yueqiw/ncp-sort', 'drgriffis/Extrinsic-Evaluation-tasks', 'zhudanhao/g-gnn', 'zxok365/On-Demand-Ridesourcing-Project', 'liamcli/darts', 'yezhang-xiaofan/Rationale-CNN', 'zju-vipa/NetGraft', 'lianbin/VIOSLAM', 'CanCanZeng/LearnVIORB', 'ZuoJiaxing/Learn-ORB-VIO-Stereo-Mono', 'yuzhe630/adder-DSE', 'ychnlgy/DeepConsensus-experimental-FROZEN', 'carolinlawrence/nematus', 'zhengzx-nlp/past-and-future-nmt', 'zswang666/Stereo-LiDAR-CCVNorm', 'SuryanarayanaMK/PDE-STRIDE', 'zhangxiaoyu11/OmiEmbed', 'suvojit-0x55aa/A2S2K-ResNet', 'zhiyongc/Graph-Markov-Network', 'coastalcph/koepsala-parser', 'Nadavc220/DomainAdversarialTrainingOfNeuralNetworks', 'yujiapingyu/Deep-Hashing', 'SungjoonPark/KoreanWordVectors', 'xingyizhou/3DKeypoints-DA', 'vishal-burman/Neural-Machine-Translation', 'ykrmm/ICLR_2020', 'daphne12345/SummarizationRadiologyReports', 'belaalb/TI-DG', 'ianRDavies/LeMOL', 'yang-song/score_sde', 'yfreedomliTHU/mos-pytorch1.1', 'ybisk/charNMT-noise', 'StephanieWyt/RDGCN', 'zju-vipa/TransferbilityFromAttributionMaps', 'yehengchen/SmartCar-FaceRecognition', 'yehengchen/FaceRecognition-FaceNet', 'zhaolongkzz/human_motion', 'zth667/Diverse-Image-Synthesis-from-Semantic-Layout', 'yaohungt/GSTEG_CVPR_2019', 'yipersevere/text-sentiment-classification-with-deep-neural-networks', 'yipersevere/thesis', 'iamkucuk/DCGAN-Face-Generation', 'yzhu319/dlnd_face_generation_git', 'yujuezhao/AC-GAN', 'NadimKawwa/DCGAN_faces', 'yashyenugu/Anime-Face-GAN', 'virafpatrawala/DCGAN', 'suzana-ilic/DCGANs_pytorch', 'suzana-ilic/pytorch_DCGANs', 'toru34/li_emnlp_2017', 'ycccccccccc/Learning-unbiased-zero-shot-semantic-segmentation-networks-via-transductive-transfer', 'yunshengb/SimGNN', 'zhangzjn/DTVNet', 'yashkant/PNAS-Binarized-Neural-Networks', 'spikeeSakshu/CharacterRecognition', 'iamjanvijay/rnnt_decoder_cuda', 'zcyang/imageqa-san', 'zihangJiang/DR-Learning-for-3D-Face', 'zhengwang100/RECT', 'yqx7150/EASEL', 'yunzhusong/AAAI20-PORLHG', 'carlini/pixel-deflection', 'zion-king/Deep-Learning-for-Person-Re-identification', 'ayanc/rpcnn', 'XiaoxiaoGuo/fashion-retrieval', 'KamitaniLab/cnnpref', 'uber-research/FSDM', 'KelestZ/CondGen', 'ykiiiiii/CosmoVAE', 'mit-acl/clear', 'myagues/flax_nerf', 'yenchenlin/nerf-pytorch', 'yalharbi/StructuredNoiseInjection', 'zhangxiangxiao/glyph', 'songlab-cal/tape', 'xiangzhang1015/OATM', 'drimpossible/GDumb', 'yzjiao/Subg-Con', 'zhaofang0627/HPBTT', 'yuantiku/PoDA', 'zalanborsos/online-variance-reduction', 'yaxingwang/Transferring-GANs', 'MartinHahner88/FoggySynscapes', 'chuhang/SurfConv', 'StephenPauwels/edbn_ecmlpkdd', 'ds4dm/branch-search-trees', 'SSL92/hyperIQA', 'suyeecav/model-targeted-poisoning', 'XinGla/RCF', 'zengxianyu/jsws', 'zxlzr/RAN', 'zhengzx-nlp/dynamic-nmt', 'zhezh/adafuse-3d-human-pose', 'zhangboshen/A2J', 'zdou0830/DAFE', 'Xiangyi1996/PPNet-PyTorch', 'zoeyuchao/LFNet_modify', 'ypeleg/komplex', 'zenroad/modifypointnet', 'ycszen/TorchSeg', 'vinnik-dmitry07/PlaceRecognition', 'Xnsam/clothing_classification', 'mgonzalezrivero/reef_learning', 'carolgithubv1/convnets-keras', 'yuhuixu1993/Trained-Rank-Pruning', 'yahoo/object_relation_transformer', 'ymcui/Chinese-PreTrained-XLNet', 'zetayue/CPA', 'cruvadom/Logit_Separation', 'yogeshbalaji/Normalized-Wasserstein', 'carrenD/Med-CMDA', 'johanna-einsiedler/covid-19-air-pollution', 'benedekrozemberczki/APPNP', 'j96w/6-PACK', 'ziyin-dl/global-anchor-method', 'aaaasssddf/global-anchor-method', 'pbizopoulos/signal2image-modules-in-deep-neural-networks-for-eeg-classification', 'zhiweiuu/secs', 'coastalcph/Sequence_classification_with_human_attention', 'FuzhenZhuang/Transfer-Learning-Toolkit', 'zerohd4869/SLK-NER', 'yifan-h/CS-GNN', 'iamkissg/cpae-pytorch', 'yao8839836/text_gcn', 'selim-iitdu/STANCT', 'nithishkaviyan/Sentiment-Analysis-of-Yelp-Reviews', 'audqhsid/-Review-CNN-for-Sentence-Classification', 'yongjincho/cnn-text-classification-pytorch', 'sebastian-hofstaetter/neural-ranking-kd', 'yenchenlin/fid', 'yanx27/Pointnet', 'zgx0534/pointnet_win', 'y2kmz/pointnetv2', 'ytng001/sensemaking', 'ysenarath/hate-detection-icsc-2020', 'yitu-opensource/T2T-ViT', 'ysyushi/HyperMine', 'yatharthagarwal/x_ray', 'yongjie-lin/bert-opensesame', 'code-gen/cscg', 'code-gen/cgcs', 'TIXFeniks/neurips2019_intrus', 'yanx27/3DGNN_pytorch', 'gan3sh500/octaveconv-pytorch', 'yagyapandeya/Music_Source_Seperation_TF2', 'yoojungsun0/Psych239', 'jobdataexchange/competensor', 'zmd971202/IronyGeneration', 'zhaoyanpeng/vpcfg', 'iamgroot42/nelec', 'carljohanhoel/BayesianRLForAutonomousDriving', 'yyysbysb/al_obs_neurips19', 'ArashRahnama/Adversarial-Explanations-for-Artificial-Intelligence-Systems-AXAI', 'yixuan/cdtau', 'dhirajpatnaik16297/IMG-TXT-Generative-Adversarial-Network', 'htconquer/ddh', 'ayanc/edgeml.mdp', 'zekarias-tilahun/GAP', 'rochesterxugroup/HAM_dataset', 'WeijiaLau/MHCH-DAMI', 'yliu1021/HandGestureClassifierCNN', 'yujiali/ggnn', 'drsleep/nas-segm-pytorch', 'yiskw713/boundary_loss_for_remote_sensing', 'johanna-rock/imRICnn', 'dariopavllo/style-semantics', 'zhuoyang125/simple_classifier', 'benedekrozemberczki/AttentionWalk', 'yromano/fair_dummies', 'zalandoresearch/famos', 'vinojjayasundara/textcaps', 'violet-zct/DeMa-BWE', 'yangliuy/HybridNCM', 'rktamplayo/LeTraNets', 'yanrucheng/PINet-demo', 'zeyofu/EDL', 'zjunlp/DiagnoseRE', 'zhengdao-chen/GNN4CD', 'balbok0/bayes-nn-qsh', 'zphang/bert_on_stilts', 'ysharma1126/Split-Brain-Autoencoder', 'yanminglai/Malware-GAN', 'benedekrozemberczki/TENE', 'rktamplayo/DenoiseSum', 'yitianhoulai/ART', 'zhuchen03/FreeLB', 'sdyy6211/plant-segmentation', 'zhenxun-zhuang/SGD-Exponential-Stepsize', 'asprenger/keras_acgan', 'yongleex/AGT-ME', 'zhangtj1996/lottery-ticket-hypothesis-Mxnet', 'arnavdodiedo/DenseNet-MNIST', 'zhangweichen2006/iCAN', 'fsahli/MFclass', 'blablabananarama/ukiyoGAN', 'zhao-lab/kalidindi_dpgp_multi_vehicle_2019', 'zju3dv/pvnet', 'yumeng5/JoSH', 'zhaoxlpku/KnowledGPT', 'york2210/MedicalChatbot-HRL', 'cod3licious/simec', 'sourabhmadur/Neural-Style-Transfer', 'shizuo-kaji/StyleTransfer', 'kidach1/NeuralArtisticStyle', 'RyanWu2233/Style_Transfer', 'patconrey/ANN-Example', 'ialhashim/StyleGAN-Tensorflow2', 'tr1pzz/stylegan2-pytorch', 'xiangyue9607/BioNEV', 'zhyack/SCC', 'HongyuGong/Geometry-of-Compositionality', 'zychen423/KE-VIST', 'zomux/neuralcompressor', 'zhanxinrui/tracking_wo_bnw_fork', 'KhenAharon/Deep-Learning-SNLI-Residual-Stacked-Encoders', 'SharifAmit/OCT_Classification', 'zhawhjw/yolact-interpret', 'zhhchen4njit/yolact', 'ywang07/nmt_soft_prototype', 'j96w/DenseFusion', 'bayrameda/MrAP', 'XiangLiu0731/MFGNet', 'zxleong/GPRNet', 'belkakari/cellular-automata-pytorch', 'z-fabian/transfer_lowerbounds_arXiv', 'nithishkaviyan/Show-and-Tell-Neural-Network-Image-Caption-Generator-', 'zhaitongqing233/Backdoor-attack-against-speaker-verification', 'yuanyuanli85/Stacked_Hourglass_Network_Keras', 'zehuichen123/DSEBM', 'iamhankai/attribute-aware-attention', 'j-a-lin/DFANet_PyTorch', 'yongheng1991/qec_net', 'yfsong0709/RA-GCNv2', 'yzhan238/CGExpan', 'SaeedSharifiMa/AIF', 'yardstick17/AspectBasedSentimentAnalysis', 'XiaowanLi2018/TimeSeriesPrediction_BasedOnCNN', 'yuxi120407/DIB', 'Information-Fusion-Lab-Umass/causal_transfer_learning', 'yredwood/fewshot_blogpost', 'zuoxingdong/VIN_PyTorch_Visdom', 'yqian4/optuna', 'yuxi120407/semi-supervised_tensorflow2.0', 'yijiuzai/Matching-Networks-for-One-Shot-Learning', 'yumoh/speech-keras', 'yeeeqichen/Bert', 'yamad07/NeuralProcess', 'mireshghallah/shredder-v1', 'y0ast/Variational-Autoencoder', 'rmehta1987/CoZINB', 'yolu1055/conditional-glow', 'avinashsai/BERT-Aspect', 'yjparkLiCS/18-NIPS-APIAE', 'yechens/QiuZhao-ChongChongChong', 'yuanyu255/PCNN_C2SA', 'RElbers/region-mutual-information-pytorch', 'subhayanmukherjee/cnninsar', 'zekarias-tilahun/goat', 'pierreHmbt/Tensor_CDL', 'zh3nis/lstm-syl', 'nlpub/watset-java', 'yellowtownhz/STIGCN', 'ybayle/ReproducibleResearchCode', 'bmda-unibas/InverseLearningOfSymmetries', 'zhliping/Deep-Learning', 'yingtaomj/Iterative-Document-Representation-Learning-Towards-Summarization-with-Polishing', 'yikangli/video-rhythm', 'yougoforward/hlzhu_DANet_git', 'zhenxingsh/Pytorch_DANet', 'favae/favae_ijcai2019', 'jbarnesspain/blse', 'zetayue/MXMNet', 'nch08a/EDVizPhenotyping', 'yqx7150/IFR-Net-Code', 'yuzhimanhua/MetaCat', 'NLP-Discourse-SoochowU/t2d_discourseparser', 'umautobots/pixelwise-deblurring', 'youngryan1993/PrDA-Progressive-Domain-Adaptation-from-a-Source-Pre-trained-Model', 'youngryan1993/SFDA-Domain-Adaptation-without-Source-Data', 'IndustAI/risk-and-uncertainty', 'zhiyongc/Graph_Convolutional_LSTM', 'zhenpeiyang/RelativePose', 'zsef123/EfficientNets-PyTorch', 'zake7749/WSDM-Cup-2019', 'yftah89/TRL-PBLM', 'yumoxu/detnet', 'liernisestorain/zero-shot-dual-MT', 'yu20103983/FOTS', 'huangleiBuaa/OthogonalWN', 'mx54039q/cnn-visualizing', 'zidixiu/VIE', 'jmfacil/single-view-place-recognition', 'nikolamilosevic86/SerbianStemmer', 'pykao/ABCD-MICCAI2019', 'yasinyazici/Venn_GAN', 'tranc012/SMILE-Rep', 'bloodwass/mixout', 'zhangpur/SR-LSTM', 'zhangyu233/mvscode', 'zekunhao1995/PointFlowRenderer', 'yuriautsumi/PersonalizedGP', 'myaldiz/deep_violence_detection', 'yanqi1811/PWC-Net', 'zhengzhe97/yolactpaddle', 'Aoi-hosizora/FFDNet_pytorch', 'zhujiagang/gating-ConvNet-code', 'zhengwang100/RSDNE-python', 'zhenghuazx/BayesianLRPolicySearch', 'uber-research/D3G', 'vishalanand/MultiSeg', 'ynahshan/nn-quantization-pytorch', 'ovchinnikovdk/graph_clf', 'zchenry/ambiguity-comparison', 'belaalb/frameGAN', 'uber-research/DeepPruner', 'yli1/CGPS', 'SaeedNajafi/pytorch-ocd', 'SaeedNajafi/OCD-Learning', 'zhenngbolun/Learnbale_Bandpass_Filter', 'yzhangcs/crfpar', 'iamollas/Altruist', 'yaohungt/Capsules-Inverted-Attention-Routing', 'htanwar922/Language-Adversarial-Network', 'zhoujf620/Motif-based-inductive-GNN-training', 'amikael/ncdigraphs', 'yarotsky/voxelfeatures', 'fostiropoulos/dvq', 'yli1/CLCL', 'yahoo/maaf', 'codeRimoe/DL_for_RSIs', 'chuanraoCV/INQ-incremental-network-quantization-towards-lossless-CNNs-with-low-precision-weights', 'WeijiaZhang24/TEDVAE', 'yukang2017/RENAS', 'Nadav-Barak/AWP', 'mwray/Joint-Part-of-Speech-Embeddings', 'princetonvisualai/SPICE-U', 'jsgaobiao/superpoint_graph', 'ydecastro/lar_testing', 'rktamplayo/HCSC', 'ZurichNLP/ContraPro', 'Huangdebo/YOLOv4-MultiTask', 'ucals/cvae', 'violet-zct/pytorch-reorder-nmt', 'zaf11/xDeepFM-', 'carinanorre/Brain-Tumour-Segmentation-Dissertation', 'overshiki/unet-pytorch', 'zhugoldman/CNN-segmentation-for-Lung-cancer-OARs', 'Will-J-Gale/Self-Driving-Car-Vision', 'yassineAlouini/data-science-bowl-2018', 'SharifAmit/Fundus2Angio', 'black0017/3D-GAN-pytorch', 'ducha-aiki/LSUV-keras', 'yaoli/nade_k', 'tommoral/dicodile', 'yschroecker/universal_value_density_estimation', 'Ximilar-com/tf-image', 'zimengq/PyTorch-ReCode', 'zhezhaoa/neural_BOW_toolkit', 'benedekrozemberczki/SEAL-CI', 'zzd1992/Adversarial-Defense-by-Suppressing-High-Frequencies', 'zotrick/Pneumonia_classification_Xception', 'N0vel/weighted-hausdorff-distance-tensorflow-keras-loss', 'roccotrip/antisem', 'yaoqi-zd/SGAN', 'carpedm20/simulated-unsupervised-tensorflow', 'yeLer/fcn', 'zb12138/sph3d', 'yyyaoyuan/CWAN', 'yanivbl6/fixup', 'yifjiang/relative-depth-using-pytorch', 'yclavinas/ai_big_data_quantum_compution', 'zqwhu/SegDAwithBoundary', 'morningmoni/HiLAP', 'zerohd4869/HIN-SR', 'yaircarmon/semisup-adv', 'benedekrozemberczki/graph2vec', 'yinanzhu12/SegNet-keras', 'okn-yu/SegNet-A-Deep-Convolutional-Encoder-Decoder-Architecture-for-Image-Segmentation', 'yinanzhu12/SegNet-keras-implementation', 'georgeberry/role-action-embeddings', 'dariozanca/eymol', 'SSE-PT/SSE-PT', 'zju3dv/snake', 'yuvalpinter/m3gm', 'KanchiShimono/KGCN', 'benathi/word2gm', 'zhaobomin/pytorch-ocr', 'yunhai0920/company-name-id', 'tranbahien/CTPN-TensorFlow', 'zhenqifu/Twice-Mixing', 'Kanaderu/nlp_credibility', 'ziqizhang/semrerank', 'zengxianyu/crfill', 'Refefer/Dagger', 'ytsvetko/qvec', 'yaoshuwang/SelNet-Estimation', 'ucasir/NPRF', 'yueliukth/decoupling_breast_cancer_risk', 'corinadima/gWordcomp', 'benedekrozemberczki/BANE', 'yhchen12101/FGP-ICL', 'yang1fan2/Dota2-Prediction', 'ytsvetko/metaphor', 'xiangzhang1015/adversarial_seizure_detection', 'yuke93/RL-Restore', 'svakulenk0/semantic_coherence', 'aws-samples/amazon-sagemaker-visual-transformer', 'benathi/density-order-emb', 'yorkerlin/iBayesLRule', 'zhangmeishan/wordstructures', 'MIDA-group/sdt', 'zhd96/pi-vae', 'zihangdai/cegan_iclr2017', 'yuxiaochen1103/DG-STA', 'comicencyclo/TransferLearning_DiscriminativeFineTuning', 'yikeqicn/DeepErase', 'lcary/keras-program-induction', 'yjxiong/temporal-segment-networks', 'caroline171/content_based_recommendation', 'yccyenchicheng/p2pvg', 'pablovin/iCubUno', 'zhusiling/Pytorch-Encoding-boundary', 'ziyuan400/video_segmentation', 'swabhs/open-sesame', 'dssg/hiv-retention-public', 'zhanhaoliu09/auto_tv_denoise', 'zubair-irshad/imitation_learning', 'zhengjingwei/cluster_GCN', 'benedekrozemberczki/ClusterGCN', 'vinodkkurmi/PQG', 'yfsong0709/ResGCNv1', 'zbyte64/pytorch-dagsearch', 'yashkant/ENAS-Quantized-Neural-Networks', 'MINGUKKANG/PNU_Termproject_ENAS', 'www0wwwjs1/Matrix-Capsules-EM-Tensorflow', 'yashkalani/DRAW', 'rktamplayo/MCFA', 'yomnaa/AbsrtactiveApi', 'youngjie-cho/csci1470final', 'yougoforward/Fast_psaa', 'yougoforward/can', 'ywcmaike/TianchiVideoCharacterSegmentationPreliminary', 'zyasjtu/EAST', 'ydup/Anomaly-Detection-in-Time-Series-with-Triadic-Motif-Fields', 'yuhaozhang/tacred-relation', 'zhanjunlang/Span_OIE', 'zsef123/PGGAN-Pytorch', 'ANLGBOY/RealNVP-with-PyTorch', 'yanivbl6/quantized_meanfield', 'zi-lin/on-lstm-tensorflow', 'yikangshen/Ordered-Neurons', 'ben-ix/AdaptiveTPOT', 'zhling2020/RIS-GAN', 'mgermain/MADE', 'carpedm20/BEGAN-tensorflow', 'n-akram/SafeML', 'yuzhou-git/deep-casa', 'zsjdddhr/GraphRfi', 'yusuke0519/constrastive_predictive_coding', 'zjuym/chinese_cws_ner', 'DixinFan/st-gcn', 'yysijie/st-gcn', 'bpucla/latent-space-EBM-prior', 'laowang666888/ECSP1', 'zlai0/MAST', 'zh3nis/scrn', 'ranjanisubramanyan/Patient-data-representation', 'jsalsman/featex', 'ySalaun/LineSfM', 'zhanhuijing/ECC_PYCHARM'] not in index"

In [None]:
def make_records_df(sources, connected_vertices):
    return pd.DataFrame.from_records(
        [
            {"source": src, "destination": dst, "edge_type": "repo-file"}
            for (src, destinations) in zip(sources, connected_vertices)
            for dst in destinations 
        ]
    )

In [None]:
train_records_df = make_records_df(graphsage_data_train.repos, graph_data_train.X.fillna("").str.split()).drop_duplicates()

In [None]:
def get_vertex_embeddings(wrapper, vertex_subset, model):
    features = (
        model.full_forward(
            wrapper.dataset.x, wrapper.dataset.edge_index
        )
        .cpu()
        .detach()
        .numpy()
    )
    return features[wrapper.vertex_mapping.loc[vertex_subset]]


In [None]:
extended_dependency_graph_wrapper = PygGraphWrapper(embeddings.FastTextVectorizer(fasttext_model).transform, non_root_dependency_records_df + make_records_df(graphsage_data_train.repos, graph_data_train.X.dropna().str.split()))

In [None]:
dependency_graph_wrapper.get_vertex_embeddings(graphsage_data_train.X.iloc[0].split(), graphsage_model)

In [None]:
graphsage_learner = RetrieverLearner(
    zero_shot.ESZSLearner(100,10),
    LambdaTransformer(lambda x: dependency_graph_wrapper.get_vertex_embeddings(x, graphsage_model)),
    embeddings.FastTextVectorizer(fasttext_model)
)
graphsage_learner.fit_learner(graphsage_data_train)
graphsage_learner.evaluate(graphsage_data_train, metric=metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(graphsage_learner, graphsage_data_train, k=10)

In [None]:
get_retrieval_accuracy(graphsage_learner, graphsage_data_test, k=10)

# Concatenation of repo, import embeddings

In [None]:
paired_data_train, paired_data_test = RepoTaskData.create_split(tasks_train, all_tasks, paperswithcode_with_features_df, paperswithcode_with_imports_df['imports'])
paired_data_train.X = graph_data_train.X + " " + import_data_train.X
paired_data_test.X = graph_data_test.X + " " + import_data_test.X

In [None]:
paired_data_train.X

In [None]:
paired_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    PairedKeyedVectors(python_word_embeddings.wv, graphsage_embeddings),
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer
)

paired_learner.fit_learner(graph_data_train)

In [None]:
paired_learner.evaluate(graph_data_train, metric=metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(paired_learner, paired_data_train, k=10)

In [None]:
get_retrieval_accuracy(paired_learner, paired_data_test, k=10)

In [None]:
results = []
for (learner, learner_name, test) in zip(
    [import2vec_learner, prone_learner, paired_learner],
    ['import2vec', 'prone', 'both'],
    [X_test, repo_graph_terms_test, X_paired_test]
):
    accs = []
    for k in [1, 3, 5, 10, 20]:
        rec = get_retrieval_accuracy(learner, test, y_test, test_task_idxs, k=k)
        accs.append(rec)
    results.append(pd.Series(name=learner_name, data=accs))

In [None]:
results_df = pd.DataFrame(results)
results_df.columns = ["Accuracy@{}".format(i) for i in [1, 3, 5, 10, 20]]

In [None]:
results_df.round(3).to_markdown(open("metrics/zsl_results.md", "w"))

In [None]:
!cat metrics/zsl_results.md

In [None]:
import toolz

In [None]:
task_distances = metrics.pairwise.cosine_distances(task_embeddings, task_embeddings)

In [None]:
poincare_embeddings = gensim.models.KeyedVectors.load('data/poincare5.vec')

In [None]:
import gensim.models.wrappers.fasttext
from gensim.test.utils import datapath

In [None]:
from github_search import typical_file_parts
from mlutil import prototype_selection

In [None]:
selected_lines_df = typical_file_parts.get_selected_lines_and_repos(python_files_df['repo_name'], python_files_df['content'])

# Selecting prototypical lines

In [None]:
fasttext_selector = prototype_selection.PrototypeSelector(fasttext_avg_embedder)

In [None]:
try:
    fasttext_prototypes = json.load(open('data/fasttext_prototypes.json', 'r'))
except:
    fasttext_selector.fit_prototypes(selected_lines_df['line'], selected_lines_df['repo'])
    fasttext_prototypes = fasttext_selector.prototypes
    json.dump(fasttext_prototypes, open('data/fasttext_prototypes.json', 'w'))

In [None]:
codebert_vectorizer = embeddings.TransformerVectorizer('microsoft/codebert-base', batch_size=64)

In [None]:
codebert_selector = prototype_selection.PrototypeSelector(codebert_vectorizer)

In [None]:
try:
    codebert_prototypes = json.load(open('data/codebert_prototypes.json', 'r'))
except:
    codebert_selector.fit_prototypes(selected_lines_df['line'], selected_lines_df['repo'])
    codebert_prototypes = codebert_selector.prototypes
    json.dump(codebert_prototypes, open('data/codebert_prototypes.json', 'w'))

In [None]:
def vectorize_prototypes(vectorizer, prototypes):
    prototype_aggregated_embeddings = {}
    for key in prototypes.keys():
        prototype_aggregated_embeddings[key] = np.mean(vectorizer.transform(prototypes[key]), axis=0)
    return list(prototype_aggregated_embeddings.keys()), np.row_stack(prototype_aggregated_embeddings.values())

In [None]:
codebert_prototypes = {
    repo: v
    for (repo, v) in codebert_prototypes.items()
    if repo in paperswithcode_with_imports_df['repo_name'].values
}

In [None]:
codebert_prototypes.keys()

In [None]:
repos_train

In [None]:
fasttext_prototypes = {
    repo: v
    for (repo, v) in fasttext_prototypes.items()
    if repo in paperswithcode_with_imports_df['repo_name'].values
}

In [None]:
def get_prototypes(repo_name):
    return pd.DataFrame({"codebert": codebert_prototypes[repo_name], "fasttext": fasttext_prototypes[repo_name]})

In [None]:
fasttext_prototypes.keys()

In [None]:
get_prototypes("transformer")

In [None]:
get_prototypes("mmdetection")

In [None]:
get_prototypes("Recommenders-movielens")

In [None]:
get_prototypes("mmdetection")

In [None]:
fasttext_prototypes['mmdetection']

In [None]:
codebert_repos, codebert_prototype_embeddings = vectorize_prototypes(codebert_vectorizer, codebert_prototypes)

In [None]:
fasttext_repos, fasttext_prototype_embeddings = vectorize_prototypes(fasttext_avg_embedder, fasttext_prototypes)

In [None]:
len(fasttext_prototype_embeddings)

In [None]:
paperswithcode_tasks_series = paperswithcode_with_imports_df['most_common_task']
paperswithcode_tasks_series.index = paperswithcode_with_imports_df['repo_name']
#paperswithcode_tasks_series = paperswithcode_tasks_series[paperswithcode_tasks_series.index.isin(fasttext_repos)]

In [None]:
fasttext_tasks = paperswithcode_tasks_series.loc[fasttext_repos]
fasttext_tasks_embeddings = task_embedder.transform(fasttext_tasks)
codebert_tasks = paperswithcode_tasks_series.loc[codebert_repos]
codebert_tasks_embeddings = task_embedder.transform(codebert_tasks)

In [None]:
codebert_prototype_embeddings.shape

In [None]:
eszs_learner = zero_shot.ESZSLearner()

In [None]:
codebert_prototype_embeddings.shape

In [None]:
len(codebert_tasks)

In [None]:
eszs_learner.fit(codebert_prototype_embeddings, codebert_tasks, task_embeddings[:-1])
eszs_learner.score(codebert_prototype_embeddings, codebert_tasks, task_embeddings[:-1])

In [None]:
eszs_learner.fit(fasttext_prototype_embeddings, fasttext_tasks, task_embeddings[:-1])
eszs_learner.score(fasttext_prototype_embeddings, fasttext_tasks, task_embeddings[:-1])

In [None]:
list(set(selected_lines_df['repo']))[3007]

In [None]:
problematic_lines_df = selected_lines_df[selected_lines_df['repo'] == 'auto_ml']

In [None]:
del codebert_vectorizer

In [None]:
problematic_lines_df['lines']

In [None]:
codebert_selector.prototypes

In [None]:
y_embeddings = fasttext_avg_embedder.transform(tasks)

In [None]:
repo_names = 
repo_embeddings = 

In [None]:
y_embeddings.shape