In [1]:
#default_exp matching_zsl

In [2]:
#export
import os
import ast
import tqdm
import json
import attr
from operator import itemgetter

from scarce_learn import zero_shot
from mlutil.feature_extraction import embeddings
import itertools


import pandas as pd
import numpy as np
from sklearn import feature_extraction, metrics, model_selection

import matplotlib.pyplot as plt
import gensim

from github_search import paperswithcode_tasks

import mlutil
from functools import partial


from scarce_learn.zero_shot import devise_jax, devise_torch

from github_search import github_readmes
import concurrent.futures
from github_search import python_call_graph
import gensim
from scipy.stats import hmean

In [3]:
%env XLA_PYTHON_CLIENT_PREALLOCATE=false

env: XLA_PYTHON_CLIENT_PREALLOCATE=false


In [4]:
# upstream

import_corpus_path = 'output/module_corpus.csv'
word_vectors_filename = 'output/import2vec_module_vectors.bin'

In [5]:
%cd ..

/home/kuba/Projects/github_search


%%time
import_corpus_df = pd.read_csv(import_corpus_path)
# The CSV stores each import list as its string repr; parse it BEFORE aggregating,
# otherwise `sum` concatenates the raw strings and `set` yields single characters.
import_corpus_df['imports'] = import_corpus_df['imports'].apply(ast.literal_eval)
per_repo_imports = import_corpus_df.groupby('repo')['imports'].agg(sum).apply(set)

In [6]:
%%time
python_files_df = pd.read_csv('data/crawled_python_files.csv', encoding='latin-1')
repo_names = python_files_df['repo_name']
import_corpus_df = pd.read_csv(import_corpus_path)
# 'imports' is read back as the string repr of a list; parse it so the per-repo
# aggregation below unions import names rather than characters of the raw text.
import_corpus_df['imports'] = import_corpus_df['imports'].apply(ast.literal_eval)
per_repo_imports = import_corpus_df.groupby('repo')['imports'].agg(sum).apply(set)

CPU times: user 3min 33s, sys: 4.14 s, total: 3min 37s
Wall time: 3min 42s


In [7]:
python_files_df.shape

(1797972, 3)

In [8]:
import_corpus_df.shape

(1749175, 3)

In [9]:
python_files_df['repo_name']

0                   trangvu/ape-npi
1                   trangvu/ape-npi
2                   trangvu/ape-npi
3                   trangvu/ape-npi
4                   trangvu/ape-npi
                     ...           
1797967    vuanhtu1993/Keras-SRGANs
1797968    vuanhtu1993/Keras-SRGANs
1797969    vuanhtu1993/Keras-SRGANs
1797970    vuanhtu1993/Keras-SRGANs
1797971    vuanhtu1993/Keras-SRGANs
Name: repo_name, Length: 1797972, dtype: object

In [10]:
python_files_df['repo_name'].unique().shape

(26999,)

# Build a composite key "<second path component>/<full repo_name>" in a temporary 'repo' column.
# NOTE(review): the purpose of this composite name is not visible here — confirm against its consumers.
python_files_df['repo'] = python_files_df['repo_name'].str.split("/").apply(itemgetter(1))  + '/' + python_files_df['repo_name']
# Keep the original repo_name values so the two columns can be swapped below.
repo_names_tmp = python_files_df['repo_name']
repo_names = repo_names_tmp.unique()
# After these two lines 'repo_name' holds the composite key and 'repo' holds the original name.
# This cell is not idempotent: re-running it composes the already-composite names again.
python_files_df['repo_name'] = python_files_df['repo']
python_files_df['repo'] = repo_names_tmp

In [11]:
%%time
import2vec = gensim.models.KeyedVectors.load(word_vectors_filename)
import2vec_embedder = mlutil.feature_extraction.embeddings.AverageWordEmbeddingsVectorizer(import2vec)

CPU times: user 7.01 ms, sys: 116 µs, total: 7.13 ms
Wall time: 8.82 ms


In [12]:
paperswithcode_with_imports_df = pd.read_csv('output/papers_with_imports.csv')

# Task lists are stored as string reprs: strip the dimensionality markers,
# lowercase, then parse back into Python lists.
paperswithcode_with_imports_df['tasks'] = (
    paperswithcode_with_imports_df['tasks']
    .str.replace("2d ", "", regex=False)
    .str.replace("3d ", "", regex=False)
    .str.replace("4d ", "", regex=False)
    .str.replace("6d ", "", regex=False)
    .str.lower()
    .apply(ast.literal_eval)
)

# `set()` is not a literal, so ast.literal_eval rejects it; swap it for an empty
# container literal first. The match is done literally (regex=False) so it does not
# depend on the pandas default for `regex` or on an invalid "\(" escape sequence.
paperswithcode_with_imports_df['imports'] = (
    paperswithcode_with_imports_df['imports']
    .str.replace("set()", "{}", regex=False)
    .apply(ast.literal_eval)
)

In [13]:
paperswithcode_with_imports_df.shape

(17388, 23)

In [14]:
paperswithcode_with_imports_df['n_imports'] = paperswithcode_with_imports_df['imports'].apply(len) 

In [15]:
# Count only the imports that actually have an import2vec vector.
# (Taking len() of the list of membership booleans would just return the number of imports.)
paperswithcode_with_imports_df['n_imports_with_embeddings'] = paperswithcode_with_imports_df['imports'].apply(
    lambda imps: sum(imp in import2vec.vocab for imp in imps)
)

In [16]:
%%time
word_embeddings = mlutil.feature_extraction.embeddings.load_gensim_embedding_model('glove-wiki-gigaword-300')

CPU times: user 33.5 s, sys: 279 ms, total: 33.8 s
Wall time: 34.1 s


In [17]:
import fasttext
fasttext_model = fasttext.load_model("output/python_files_fasttext_dim200.bin")



In [18]:
from gensim.models.callbacks import CallbackAny2Vec

        
class LossCallback(CallbackAny2Vec):
    """
    Training callback that prints the loss at the end of every epoch.

    The value read from `model.get_latest_training_loss()` is printed as-is for the
    first epoch; for later epochs the difference to the previous reading is printed.
    """

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        current_loss = model.get_latest_training_loss()
        # `loss_previous_step` only exists from the second epoch onwards.
        if self.epoch == 0:
            reported_loss = current_loss
        else:
            reported_loss = current_loss - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, reported_loss))
        self.loss_previous_step = current_loss
        self.epoch += 1

In [19]:
python_word_embeddings = gensim.models.Word2Vec.load('output/abstract_readme_w2v200.bin')

INFO - 11:30:51: loading wv recursively from output/abstract_readme_w2v200.bin.wv.* with mmap=None
INFO - 11:30:51: loading vectors from output/abstract_readme_w2v200.bin.wv.vectors.npy with mmap=None
INFO - 11:30:51: setting ignored attribute vectors_norm to None
INFO - 11:30:51: loading vocabulary recursively from output/abstract_readme_w2v200.bin.vocabulary.* with mmap=None
INFO - 11:30:51: loading trainables recursively from output/abstract_readme_w2v200.bin.trainables.* with mmap=None
INFO - 11:30:51: loading syn1neg from output/abstract_readme_w2v200.bin.trainables.syn1neg.npy with mmap=None
INFO - 11:30:51: setting ignored attribute cum_table to None
INFO - 11:30:51: loaded output/abstract_readme_w2v200.bin


In [20]:
#export

@attr.s
class RepoTaskData:
    """Container for one side (train or test) of a task-based repository split."""

    # task names that define this side of the split
    tasks = attr.ib()
    # 'repo' column values of the rows on this side
    repos = attr.ib()
    # input representation per row (space-joined tokens when built by create_split)
    X = attr.ib()
    # per-row collection of all tasks, as passed to create_split
    all_tasks = attr.ib()
    # lowercased 'most_common_task' per row
    y = attr.ib()

    @staticmethod
    def split_tasks(area_grouped_tasks, test_size=0.2):
        """
        Split task names into train and test sets, stratified by task area.

        area_grouped_tasks: DataFrame with 'task' and 'area' columns.
        test_size: fraction of tasks assigned to the test side.
        Returns (tasks_train, tasks_test); the split is deterministic (random_state=0).
        """
        tasks_train, tasks_test = model_selection.train_test_split(area_grouped_tasks['task'], stratify=area_grouped_tasks['area'], test_size=test_size, random_state=0)
        return tasks_train, tasks_test

    @staticmethod
    def create_split(tasks_train, all_tasks, paperswithcode_with_features_df, X_repr):
        """
        Split repositories into train/test by whether their most common task is a train task.

        tasks_train: collection of task names forming the training side.
        all_tasks: per-row task collections, indexable by a boolean mask aligned with the frame.
        paperswithcode_with_features_df: frame with 'repo' and 'most_common_task' columns.
        X_repr: Series of token iterables per row; each is joined with spaces.
        Returns (train_data, test_data) as RepoTaskData instances.

        Note that every row whose task is not in `tasks_train` goes to the test side,
        and the test instance also stores `tasks_train` as its `tasks`... only when the
        caller passes it so; here the first field of each instance is filled from the
        module-level `tasks_train` / `tasks_test` names exactly as before.
        """
        train_indicator = paperswithcode_with_features_df['most_common_task'].isin(tasks_train)
        print(train_indicator.shape)
        repos_train = paperswithcode_with_features_df['repo'][train_indicator]
        repos_test = paperswithcode_with_features_df['repo'][~train_indicator]
        X_repr = X_repr.apply(lambda x: " ".join(x))
        X_train = X_repr[train_indicator]
        X_test = X_repr[~train_indicator]
        all_tasks_train = all_tasks[train_indicator]
        all_tasks_test = all_tasks[~train_indicator]
        y_train = paperswithcode_with_features_df[train_indicator]['most_common_task'].str.lower()
        y_test = paperswithcode_with_features_df[~train_indicator]['most_common_task'].str.lower()

        return (
            RepoTaskData(tasks_train, repos_train, X_train, all_tasks_train, y_train),
            # NOTE(review): `tasks_test` is not a parameter; it is resolved from module scope.
            RepoTaskData(tasks_test, repos_test, X_test, all_tasks_test, y_test)
        )

In [21]:
#export


def get_first_vocab_entry(vocab):
    """Return the first key of the mapping `vocab` (in its iteration order)."""
    leading_items = list(itertools.islice(vocab.items(), 1))
    first_key, _ = leading_items[0]
    return first_key


class PairedKeyedVectors:
    """
    Two keyed-vector models exposed as one: a lookup returns the concatenation of the
    vector from `kv1` and the vector from `kv2`, with zeros standing in for a model
    that does not contain the key.
    """

    @attr.s
    class wv:
        # merged vocabulary, exposed as `.wv.vocab`
        vocab = attr.ib()

    def __init__(self, kv1, kv2):
        self.kv1 = kv1
        self.kv2 = kv2
        # Entries of kv2 win on key collisions.
        merged_vocab = dict(kv1.vocab)
        merged_vocab.update(kv2.vocab)
        self.vocab = merged_vocab
        # Vector sizes are read off the first vocabulary entry of each model.
        self.dim1 = len(kv1[get_first_vocab_entry(kv1.vocab)])
        self.dim2 = len(kv2[get_first_vocab_entry(kv2.vocab)])
        self.wv = PairedKeyedVectors.wv(self.vocab)

    def __getitem__(self, item):
        known_to_first = item in self.kv1.vocab.keys()
        if known_to_first:
            first_part = self.kv1[item]
            if item in self.kv2.vocab.keys():
                second_part = self.kv2[item]
            else:
                second_part = np.zeros(self.dim2)
        else:
            # kv2 is queried unconditionally here, so a key unknown to both models
            # fails inside kv2's own lookup.
            first_part = np.zeros(self.dim1)
            second_part = self.kv2[item]
        return np.concatenate([first_part, second_part])
    


@attr.s
class RetrieverLearner:
    """
    Couples a zero-shot learner with an embedder for inputs and an embedder for labels.

    Labels are embedded once per unique value; each sample's target is the position
    of its label among those unique values.
    """

    zs_learner: zero_shot.ZeroShotClassifier = attr.ib()
    input_embedder: embeddings.EmbeddingVectorizer = attr.ib()
    y_embedder: embeddings.EmbeddingVectorizer = attr.ib()
    # Factory gives every instance its own dict instead of one shared default object.
    input_embedder_kwargs = attr.ib(default=attr.Factory(dict))

    @staticmethod
    def create(
        zs_learner: zero_shot.ZeroShotClassifier,
        input_embeddings: gensim.models.KeyedVectors,
        target_embeddings: gensim.models.KeyedVectors,
        input_embedding_method: embeddings.EmbeddingVectorizer,
        y_embedding_method: embeddings.EmbeddingVectorizer,
        input_embedder_kwargs=None
    ):
        """
        Build a RetrieverLearner from embedding models and vectorizer factories.

        input_embedding_method / y_embedding_method are called with the respective
        embedding model to construct the vectorizers; `input_embedder_kwargs` is
        forwarded to the input vectorizer only and recorded on the returned instance.
        """
        input_embedder_kwargs = dict(input_embedder_kwargs or {})
        input_embedder = input_embedding_method(input_embeddings, **input_embedder_kwargs)
        y_embedder = y_embedding_method(target_embeddings)
        return RetrieverLearner(zs_learner, input_embedder, y_embedder, input_embedder_kwargs)

    @staticmethod
    def _label_positions(y, unique_y):
        """Map every label in `y` to its position in `unique_y` (KeyError if absent)."""
        position_of = {label: position for position, label in enumerate(unique_y)}
        return y.apply(lambda label: position_of[label])

    def get_target_embeddings(self, y):
        """Return (unique labels as a Series, their embeddings from `y_embedder`)."""
        unique_y = pd.Series(y.unique())
        y_embeddings = self.y_embedder.transform(unique_y)
        return unique_y, y_embeddings

    def fit_learner(self, data, **kwargs):
        """Fit both embedders on `data`, then fit the zero-shot learner; kwargs go to its fit."""
        self.input_embedder.fit(data.X)
        X_embeddings = self.input_embedder.transform(data.X)
        self.y_embedder.fit(data.y)
        unique_y, y_embeddings = self.get_target_embeddings(data.y)
        input_y_idxs = self._label_positions(data.y, unique_y)
        self.zs_learner.fit(np.array(X_embeddings), np.array(input_y_idxs), np.array(y_embeddings), **kwargs)

    def predict_idxs(self, X, y_embeddings):
        """Embed `X` and return the learner's predictions against `y_embeddings`."""
        X_embeddings = self.input_embedder.transform(X)
        return self.zs_learner.predict(X_embeddings, y_embeddings)

    def predict_topk(self, X, y_embeddings, target_names, k=5, similarity=metrics.pairwise.cosine_similarity):
        """
        For each input return the `k` entries of `target_names` whose embeddings are most
        similar to the learner's raw prediction.

        `target_names` must support indexing with an integer array (e.g. a numpy array).
        """
        X_embeddings = self.input_embedder.transform(X)
        predictions = self.zs_learner.predict_raw(X_embeddings)
        target_similarities = similarity(predictions, y_embeddings)
        # Negate so that argsort orders from most to least similar.
        targets = [target_names[row[:k]] for row in (-target_similarities).argsort(axis=1)]
        return targets

    def evaluate(self, data, metric):
        """Return `metric(true_label_positions, predicted_positions)` for `data`."""
        unique_y, y_embeddings = self.get_target_embeddings(data.y)
        input_y_idxs = self._label_positions(data.y, unique_y)
        predicted_idxs = self.predict_idxs(data.X, y_embeddings)
        return metric(input_y_idxs, predicted_idxs)

In [22]:
#export

def get_accuracy(learner, X, y, y_names, k=10, similarity=metrics.pairwise.cosine_similarity):
    """
    Mean top-k hit ratio of the learner's predictions.

    learner: object exposing `input_embedder`, `y_embedder` and `zs_learner`.
    X: inputs accepted by `learner.input_embedder.transform`.
    y: Series with the true task(s) of every row of X; an entry may be a single task
       name or an iterable of task names.
    y_names: Series of candidate task names that predictions are ranked against.
    k: number of top-ranked candidates considered per row.
    similarity: function(predictions, y_embeddings) -> (n_inputs, n_candidates) scores.

    For each row the score is |true tasks ∩ top-k candidates| / min(|true tasks|, k).
    Raises ZeroDivisionError if a row has no true tasks.

    The true tasks are taken from the `y` argument; previously the function read a
    module-level `all_tasks_test` and ignored `y`.
    """
    input_embeddings = learner.input_embedder.transform(X)
    y_embeddings = learner.y_embedder.transform(y_names)
    predictions = learner.zs_learner.predict_raw(input_embeddings)
    target_similarities = similarity(predictions, y_embeddings)
    # Negate so that argsort ranks candidates from most to least similar.
    target_idxs = (-target_similarities).argsort(axis=1)
    targets = [y_names.iloc[row[:k]] for row in target_idxs]

    accuracies = np.zeros(len(X))
    for i in range(len(X)):
        row_tasks = y.iloc[i]
        # A bare string is one task, not an iterable of characters.
        true_tasks = {row_tasks} if isinstance(row_tasks, str) else set(row_tasks)
        accuracies[i] = len(true_tasks.intersection(set(targets[i].values))) / min(len(true_tasks), k)
    return accuracies.mean()

In [23]:
import pickle

# Load the pickled call graph; the context manager closes the file handle.
# NOTE: pickle executes arbitrary code on load — only open files produced by this project.
with open('output/call_igraph.pkl', 'rb') as graph_file:
    graph = pickle.load(graph_file)

In [24]:
len(graph.get_vertex_dataframe().iloc[graph.neighborhood(vertices=["<ROOT>"])[0]])

27000

Get the repos that are present in the graph.

In [25]:
graph_nodes = graph.get_vertex_dataframe()['name'].unique()

In [26]:
paperswithcode_with_imports_df.shape

(17388, 25)

In [27]:
%%time
# Build the vertex-name lookup once instead of materialising the vertex dataframe twice.
graph_vertex_names = set(graph.get_vertex_dataframe()['name'])
repo_full_names = paperswithcode_with_imports_df['repo']
# Keep papers whose repo appears in the graph either under its full name or under
# the part after the first "/".
# .copy() makes the result an independent frame, so later column assignments
# do not write into a slice of paperswithcode_with_imports_df.
paperswithcode_with_features_df = paperswithcode_with_imports_df[
    repo_full_names.isin(graph_vertex_names) |
    repo_full_names.apply(lambda s: s.split("/")[1]).isin(graph_vertex_names)
].copy()

CPU times: user 939 ms, sys: 55.9 ms, total: 995 ms
Wall time: 999 ms


In [28]:
paperswithcode_with_imports_df.shape

(17388, 25)

In [29]:
paperswithcode_with_imports_df = paperswithcode_with_imports_df[paperswithcode_with_imports_df['repo'].isin(paperswithcode_with_features_df['repo'])]

In [30]:
paperswithcode_with_imports_df.shape

(17388, 25)

In [31]:
def clean_task_name(task_name):
    """
    Normalise a task name: lowercase it and drop the "2d ", "3d ", "4d " and "6d " markers.

    Lowercasing happens first so that upper-case markers such as "3D " are removed too;
    previously they survived as "3d " because removal was case-sensitive and ran before
    lowercasing.
    """
    cleaned = task_name.lower()
    for dimension_marker in ("2d ", "3d ", "4d ", "6d "):
        cleaned = cleaned.replace(dimension_marker, "")
    return cleaned

# assign() returns a new frame, so the lowercased column is never written into a
# filtered view of paperswithcode_with_imports_df.
paperswithcode_with_features_df = paperswithcode_with_features_df.assign(
    most_common_task=paperswithcode_with_features_df['most_common_task'].str.lower()
)
# The column is already lowercase; only the dimensionality markers remain to be removed.
tasks = paperswithcode_with_features_df['most_common_task'].apply(clean_task_name)
all_tasks = paperswithcode_with_features_df['tasks'].apply(lambda s: [clean_task_name(t) for t in s])
paperswithcode_with_features_df.shape

(17388, 25)

In [32]:
all_tasks.explode().value_counts()[:100]

semantic segmentation                     1520
object detection                          1508
image classification                      1387
language modelling                         716
representation learning                    634
                                          ... 
scene text                                 100
electron microscopy image segmentation     100
cell segmentation                          100
nuclear segmentation                        98
scene understanding                         98
Name: tasks, Length: 100, dtype: int64

In [33]:
#export

def get_area_grouped_tasks(paperswithcode_tasks_path='data/paperswithcode_tasks.csv', known_tasks=None):
    """
    Load the task/area table and keep only tasks that can be stratified by area.

    paperswithcode_tasks_path: CSV with at least 'task' and 'area' columns. This path is
        now actually read; previously the argument was ignored in favour of a
        hard-coded path.
    known_tasks: task names to keep. When None, the module-level `tasks` is used,
        which is what the function always did before.

    Hyphens in task names are replaced by spaces, tasks not in `known_tasks` are
    dropped, and so are tasks whose area is left with a single task.
    """
    if known_tasks is None:
        known_tasks = tasks
    area_grouped_tasks = pd.read_csv(paperswithcode_tasks_path)
    area_grouped_tasks['task'] = area_grouped_tasks['task'].str.replace("-", ' ', regex=False)
    area_grouped_tasks = area_grouped_tasks[area_grouped_tasks['task'].isin(known_tasks)]
    area_counts = area_grouped_tasks['area'].value_counts()
    # Areas with one task cannot be split between train and test in a stratified way.
    splittable_areas = area_counts[area_counts > 1].index
    area_grouped_tasks = area_grouped_tasks[area_grouped_tasks['area'].isin(splittable_areas)]
    return area_grouped_tasks

In [34]:
area_grouped_tasks = get_area_grouped_tasks()

INFO - 11:30:58: Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO - 11:30:58: NumExpr defaulting to 8 threads.


In [35]:
tasks_train, tasks_test = RepoTaskData.split_tasks(area_grouped_tasks)

In [36]:
len(tasks_train)

304

In [37]:
tasks_test

1783             binarization
209      activity recognition
1182          active learning
1731          image denoising
493          video prediction
                ...          
1068      weather forecasting
985     trajectory prediction
344         image enhancement
249            face detection
1487                starcraft
Name: task, Length: 76, dtype: object

In [38]:
len(tasks_test)

76

In [39]:
paperswithcode_with_features_df['most_common_task']

0            dictionary learning
1             sentiment analysis
2                region proposal
3               image generation
4                       fairness
                  ...           
17383          anomaly detection
17384             style transfer
17385             style transfer
17386             style transfer
17387    representation learning
Name: most_common_task, Length: 17388, dtype: object

In [40]:
paperswithcode_with_features_df['most_common_task'].isin(tasks_test).sum()

2252

In [41]:
paperswithcode_with_features_df.shape

(17388, 25)

In [42]:
paperswithcode_with_features_df.shape

(17388, 25)

In [43]:


def get_readme_summaries(upstream, product, keywords=True):
    """
    Fetch and decode the README of every repo in `paperswithcode_with_features_df`.

    `upstream`, `product` and `keywords` are accepted but not used by the body.
    Returns a Series of decoded READMEs in the order of the frame's 'repo' column.
    """
    # The context manager shuts the worker processes down once all results are
    # collected; previously the pool was never released.
    with concurrent.futures.ProcessPoolExecutor(max_workers=10) as pool:
        raw_readmes = list(pool.map(github_readmes.get_readme, paperswithcode_with_features_df['repo']))
    readmes = pd.Series(raw_readmes).apply(github_readmes.try_decode)
    return readmes

In [44]:
def get_readmes(df, keywords=True):
    """
    Fetch and decode the README of every repo listed in `df['repo']`.

    `keywords` is accepted but not used by the body.
    Returns a list of decoded READMEs in the order of `df['repo']`.
    """
    # The context manager shuts the worker processes down once all results are
    # collected; previously the pool was never released.
    with concurrent.futures.ProcessPoolExecutor(max_workers=10) as pool:
        raw_readmes = list(pool.map(github_readmes.get_readme, df['repo']))
    readmes = list(map(github_readmes.try_decode, raw_readmes))
    return readmes

In [45]:
paperswithcode_with_features_df.columns

Index(['Unnamed: 0', 'paper_url', 'arxiv_id', 'title', 'abstract', 'url_abs',
       'url_pdf', 'proceeding', 'authors', 'tasks', 'date', 'methods',
       'framework', 'mentioned_in_github', 'mentioned_in_paper',
       'paper_arxiv_id', 'paper_title', 'paper_url_abs', 'paper_url_pdf',
       'repo', 'repo_url', 'most_common_task', 'imports', 'n_imports',
       'n_imports_with_embeddings'],
      dtype='object')

In [46]:
paperswithcode_with_features_df.shape

(17388, 25)

In [47]:
%%time
readmes = get_readmes(paperswithcode_with_features_df)

CPU times: user 6.05 s, sys: 1.55 s, total: 7.6 s
Wall time: 9min 19s


In [48]:
def try_keywords(text):
    """Run gensim's keyword extraction on `text` through `python_call_graph.try_run`."""
    guarded_keywords = python_call_graph.try_run(gensim.summarization.keywords)
    return guarded_keywords(text)

In [49]:
pool = concurrent.futures.ProcessPoolExecutor(max_workers=20)

In [50]:
%%time
readme_keywords = pd.Series(pool.map(try_keywords, readmes)).str.replace("\n", " ")

CPU times: user 3.56 s, sys: 1.9 s, total: 5.47 s
Wall time: 7min 48s


In [51]:
# Load the processed dependency records, dropping every row that has a missing value.
dependency_records_df = pd.read_csv('output/processed_dependency_records.csv').dropna()#.iloc[:1000000]
# Exclude edges that start at the artificial "<ROOT>" vertex and repo-to-repo edges.
non_root_dependency_records_df = dependency_records_df[
    (dependency_records_df['source'] != "<ROOT>") &
    (dependency_records_df['edge_type'] != 'repo-repo')
]
# One description per source vertex: the first 'repo_description' seen for it.
# NOTE(review): the recorded run of this cell fails with
# KeyError "['repo_description'] not in index", i.e. the CSV that was loaded has no such column.
repo_descriptions = non_root_dependency_records_df[['source', 'repo_description']].groupby('source').apply(lambda df: df['repo_description'].iloc[0])

# Restrict the paper frames and the task lists to repos that have a description.
describable_paperswithcode_with_features_df = paperswithcode_with_features_df[paperswithcode_with_features_df['repo'].isin(repo_descriptions.index)]
describable_paperswithcode_with_imports_df = paperswithcode_with_imports_df[paperswithcode_with_imports_df['repo'].isin(repo_descriptions.index)]
# NOTE(review): the mask comes from the imports frame while all_tasks was derived from the
# features frame; this is only correct while both frames share the same index — confirm.
describable_repo_tasks = all_tasks[paperswithcode_with_imports_df['repo'].isin(repo_descriptions.index)]


# Train/test split over the describable repos, using their import lists as the input representation.
import_data_train, import_data_test = RepoTaskData.create_split(tasks_train, describable_repo_tasks, describable_paperswithcode_with_features_df, describable_paperswithcode_with_imports_df['imports'])

KeyError: "['repo_description'] not in index"

In [None]:
# Restrict everything to repos that have a description. The mask is computed from the
# features frame, which is the frame all_tasks was derived from, so the boolean index
# is guaranteed to align with both.
has_description = paperswithcode_with_features_df['repo'].isin(repo_descriptions.index)
describable_paperswithcode_with_features_df = paperswithcode_with_features_df[has_description]
describable_paperswithcode_with_imports_df = paperswithcode_with_imports_df[paperswithcode_with_imports_df['repo'].isin(repo_descriptions.index)]
describable_repo_tasks = all_tasks[has_description]


# The 'imports' column is read from the same frame that supplies the labels, so the
# train/test mask built inside create_split lines up with the inputs.
import_data_train, import_data_test = RepoTaskData.create_split(
    tasks_train,
    describable_repo_tasks,
    describable_paperswithcode_with_features_df,
    describable_paperswithcode_with_features_df['imports'],
)

In [None]:
describable_paperswithcode_with_features_df.shape

In [None]:
describable_paperswithcode_with_imports_df.shape

In [None]:
all_tasks

In [None]:
task_embedder = mlutil.feature_extraction.embeddings.AverageWordEmbeddingsVectorizer(word_embeddings)

In [None]:
#export


def get_outgoing_edges(graph, node):
    """Return the 'name' values of the vertices that `graph.successors(node)` points to."""
    vertex_frame = graph.get_vertex_dataframe()
    successor_rows = vertex_frame.iloc[graph.successors(node)]
    return successor_rows['name']


def get_repo_functions(graph, repo):
    """Return the names of `repo`'s successor vertices joined into one space-separated string."""
    successor_names = get_outgoing_edges(graph, repo)
    return ' '.join(successor_names.values)

In [None]:
graph_records = pd.read_csv('output/dependency_records.csv')

In [None]:
!rm out

In [None]:
#export


def prepare_task_train_test_split(upstream, area_grouped_tasks_path, product):
    """
    Split the area-grouped tasks into train and test and write each side to CSV.

    `product` is indexed with 'train' and 'test' to obtain the output paths;
    `upstream` is not used by the body.
    """
    task_splits = RepoTaskData.split_tasks(get_area_grouped_tasks(area_grouped_tasks_path))
    for split_name, split_tasks in zip(('train', 'test'), task_splits):
        split_tasks.to_csv(product[split_name], index=None)


def prepare_graph_repo_task_data(upstream, product):
    """
    Build the graph-based train/test data and pickle the pair to `str(product)`.

    The split is created from module-level `tasks_train`, `all_tasks` and the paper
    frames; X of each side is then replaced by the space-joined successor names of
    the repo in the module-level `graph`. `upstream` is not used by the body.
    """
    graph_data_train, graph_data_test = RepoTaskData.create_split(tasks_train, all_tasks, paperswithcode_with_features_df, paperswithcode_with_imports_df['imports'])
    graph_data_train.X = graph_data_train.repos.apply(lambda x: get_repo_functions(graph, x))
    graph_data_test.X = graph_data_test.repos.apply(lambda x: get_repo_functions(graph, x))
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(str(product), "wb") as out_file:
        pickle.dump((graph_data_train, graph_data_test), out_file)

In [None]:
%%time
# Reuse the cached graph-based split when present; otherwise build it and cache it.
# Files are opened through context managers so the handles are closed (and the
# written pickle flushed) deterministically.
graph_data_cache_path = "output/tmp_graph_data.pkl"
if os.path.exists(graph_data_cache_path):
    with open(graph_data_cache_path, "rb") as cache_file:
        (graph_data_train, graph_data_test) = pickle.load(cache_file)
else:
    graph_data_train, graph_data_test = RepoTaskData.create_split(tasks_train, all_tasks, paperswithcode_with_features_df, paperswithcode_with_imports_df['imports'])
    graph_data_train.X = graph_data_train.repos.apply(lambda x: get_repo_functions(graph, x))
    graph_data_test.X = graph_data_test.repos.apply(lambda x: get_repo_functions(graph, x))
    with open(graph_data_cache_path, "wb") as cache_file:
        pickle.dump((graph_data_train, graph_data_test), cache_file)

In [None]:
graph_data_train, graph_data_test = RepoTaskData.create_split(tasks_train, describable_repo_tasks, describable_paperswithcode_with_features_df, describable_paperswithcode_with_imports_df['imports'])

In [None]:
graph_data_train.X = pd.Series(repo_descriptions.loc[graph_data_train.repos].values, index=graph_data_train.repos.index)

In [None]:
graph_data_test.X = pd.Series(repo_descriptions.loc[graph_data_test.repos].values, index=graph_data_test.repos.index)

In [None]:
graph_data_train.repos.iloc[0]

In [None]:
get_outgoing_edges(graph, get_outgoing_edges(graph, graph_data_train.repos.iloc[0]).iloc[0])

In [None]:
len(graph_data_train.X)

In [None]:
# Peek at the first input (`.ilpc` was a typo and raised AttributeError).
graph_data_train.X.iloc[0]

In [None]:
# Remove every occurrence of the repo's own name from its input text. Building the
# new Series in one pass avoids one `.iloc[i] = ...` assignment per row.
graph_data_train.X = pd.Series(
    [text.replace(repo, "") for text, repo in zip(graph_data_train.X, graph_data_train.repos)],
    index=graph_data_train.X.index,
)
graph_data_test.X = pd.Series(
    [text.replace(repo, "") for text, repo in zip(graph_data_test.X, graph_data_test.repos)],
    index=graph_data_test.X.index,
)

In [None]:
# Replace the ":" separators and the "<ROOT>" marker with spaces on both sides of the split.
for split_data in (graph_data_train, graph_data_test):
    split_data.X = split_data.X.str.replace(":", " ").str.replace("<ROOT>", " ")

In [None]:
graph_data_train.X

In [None]:
#export


    
def get_retrieval_results(learner, data, k=10, similarity=metrics.pairwise.cosine_similarity):
    """
    For every unique label of `data.y`, retrieve the `k` inputs most similar to it.

    learner: object exposing `get_target_embeddings`, `input_embedder` and `zs_learner`.
    data: object with `X` (inputs) and `y` (labels).
    similarity: function(predictions, y_embeddings) -> (n_inputs, n_labels) scores.

    Returns a list with one integer array per unique label (in the order returned by
    `learner.get_target_embeddings`), holding the positions in `data.X` of the
    top-`k` inputs, best first.
    """
    # get_target_embeddings already embeds the unique labels; reuse that result
    # instead of running the label embedder a second time.
    y_names, y_embeddings = learner.get_target_embeddings(data.y)
    input_embeddings = learner.input_embedder.transform(data.X)
    predictions = learner.zs_learner.predict_raw(input_embeddings)
    input_target_similarities = similarity(predictions, y_embeddings)

    # Negate so that argsort ranks inputs from most to least similar to the label.
    X_recalled = [
        np.argsort(-input_target_similarities[:, y_idx])[:k]
        for y_idx in range(len(y_names))
    ]
    return X_recalled


def get_retrieval_accuracies(learner, data, k=10, similarity=metrics.pairwise.cosine_similarity):
    """
    Per-label retrieval hit indicator.

    For every unique label, checks whether at least one of the `k` inputs retrieved
    for it actually carries that label in `data.y`. Returns a boolean Series indexed
    by the unique labels.
    """
    y_names, __ = learner.get_target_embeddings(data.y)
    recalled_X = get_retrieval_results(learner, data, k=k, similarity=similarity)

    accurately_recalled = []
    for y_idx, y_name in enumerate(y_names):
        # Labels of the retrieved inputs; explode flattens list-valued labels.
        actual_labels = data.y.iloc[recalled_X[y_idx]].explode()
        accurately_recalled.append(y_name in actual_labels.values)
    return pd.Series(data=accurately_recalled, index=y_names)


def get_retrieval_accuracy(learner, data, k=10, similarity=metrics.pairwise.cosine_similarity):
    """
    Fraction of unique labels for which top-`k` retrieval returns at least one input
    carrying that label (mean of `get_retrieval_accuracies`).
    """
    # The label embeddings are computed inside get_retrieval_accuracies; the extra
    # get_target_embeddings call that used to sit here was unused.
    return np.mean(get_retrieval_accuracies(learner, data, k, similarity))

In [None]:
#export


def run_learner_experiment(
    retriever_learner,
    data_train, data_test
):
    """
    Fit `retriever_learner` on `data_train` and score it on both splits.

    Returns a dict with classification accuracy ('accuracy_train', 'accuracy_test')
    and top-10 retrieval accuracy ('top10_accuracy_train', 'top10_accuracy_test').
    """
    retriever_learner.fit_learner(data_train)

    # Evaluations are listed in the order they are run and reported.
    evaluations = [
        ("accuracy_train", lambda: retriever_learner.evaluate(data_train, metrics.accuracy_score)),
        ("accuracy_test", lambda: retriever_learner.evaluate(data_test, metrics.accuracy_score)),
        ("top10_accuracy_train", lambda: get_retrieval_accuracy(retriever_learner, data_train, k=10)),
        ("top10_accuracy_test", lambda: get_retrieval_accuracy(retriever_learner, data_test, k=10)),
    ]
    return {result_name: compute() for result_name, compute in evaluations}

## Abstracts

In [None]:
paperswithcode_with_imports_df['abstract']

In [None]:
has_abstract = ~paperswithcode_with_imports_df['abstract'].isna()

In [None]:
tasks_train[has_abstract]
paperswithcode_with_features_df[has_abstract]

In [None]:
# tasks_train is a Series of task names with its own index, so it must not be filtered
# with the per-paper `has_abstract` mask (that would drop arbitrary training tasks).
# Only the per-paper inputs are restricted to rows that have an abstract.
abstract_data_train, abstract_data_test = RepoTaskData.create_split(
    tasks_train,
    all_tasks[has_abstract],
    paperswithcode_with_features_df[has_abstract],
    paperswithcode_with_features_df[has_abstract]['abstract'].str.split(),
)

In [None]:
from scarce_learn.zero_shot import devise_jax

In [None]:
abstract_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    python_word_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

abstract_learner.fit_learner(abstract_data_train)

In [None]:
run_learner_experiment(abstract_learner, abstract_data_train, abstract_data_test)

# Abstract model using fasttext trained on Python code

In [None]:
ezslearner = zero_shot.ESZSLearner()
abstract_fasttext_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    word_embeddings,
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer
)

In [None]:
run_learner_experiment(abstract_fasttext_learner, abstract_data_train, abstract_data_test)

# word2vec model on READMEs

In [None]:
paperswithcode_with_readmes_df = pd.read_csv("output/papers_with_readmes.csv")

In [None]:
paperswithcode_with_readmes_df['readme']

In [None]:
paperswithcode_with_imports_df['readme'] = paperswithcode_with_readmes_df['readme'] 
paperswithcode_with_features_df['readme'] = paperswithcode_with_readmes_df['readme'] 

In [None]:
has_readme = ~paperswithcode_with_imports_df['readme'].isna()

# tasks_train is a Series of task names with its own index, so it must not be filtered
# with the per-paper `has_readme` mask (that would drop arbitrary training tasks).
# Only the per-paper inputs are restricted to rows that have a README.
readme_data_train, readme_data_test = RepoTaskData.create_split(
    tasks_train,
    all_tasks[has_readme],
    paperswithcode_with_features_df[has_readme],
    paperswithcode_with_features_df[has_readme]['readme'].str.split(),
)

In [None]:
readme_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 100),
    python_word_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

readme_learner.fit_learner(readme_data_train)

In [None]:
run_learner_experiment(readme_learner, readme_data_train, readme_data_test)

## Fasttext on READMEs - worse than word2vec

In [None]:
ezslearner = zero_shot.ESZSLearner()
readme_fasttext_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    word_embeddings,
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer
)

In [None]:
run_learner_experiment(readme_fasttext_learner, readme_data_train, readme_data_test)

# README keywords

In [None]:
# tasks_train is a Series of task names with its own index, so it must not be filtered
# with the per-paper `has_readme` mask (that would drop arbitrary training tasks).
# NOTE(review): readme_keywords has a fresh 0..n-1 index; this assumes the paper frame
# does too — confirm.
readme_keywords_data_train, readme_keywords_data_test = RepoTaskData.create_split(
    tasks_train,
    all_tasks[has_readme],
    paperswithcode_with_features_df[has_readme],
    readme_keywords[has_readme].str.split(),
)

In [None]:
ezslearner = zero_shot.ESZSLearner()
readme_keywords_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(10, 10),
    word_embeddings,
    word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

run_learner_experiment(readme_keywords_learner, readme_keywords_data_train, readme_keywords_data_test)

## Import2Vec

In [None]:
ezslearner = zero_shot.ESZSLearner()
import2vec_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(lmbda=100.0, gamma=10.0),
    import2vec,
    word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

run_learner_experiment(import2vec_learner, import_data_train, import_data_test)

## ProNE

In [None]:
prone_embeddings = gensim.models.KeyedVectors.load("data/prone_embeddings.bin")

Using repo embeddings computed from the node embeddings.

In [None]:
prone_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100,10),
    prone_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

run_learner_experiment(prone_learner, graph_data_train, graph_data_test)

## GraphSage

## Aggregating vertex embeddings

In [None]:
graphsage_kv_file ="output/graphsage_embeddings_fasttext_dim200_epochs20_dim200_layers2.bin"

In [None]:
graphsage_embeddings = gensim.models.KeyedVectors.load(graphsage_kv_file)

In [None]:
list(graphsage_embeddings.vocab)[-1]

In [None]:
graph

In [None]:
vocab_list = list(graphsage_embeddings.vocab.keys())

In [None]:
tokens = graph_data_train.X[150].strip().split()

In [None]:
[token in vocab_list for token in tokens]

In [None]:
ezslearner = zero_shot.ESZSLearner()
graphsage_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    graphsage_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

run_learner_experiment(graphsage_learner, graph_data_train, graph_data_test)

In [None]:
abstract_results = run_learner_experiment(abstract_learner, abstract_data_train, abstract_data_test)
readme_results = run_learner_experiment(readme_learner, readme_data_train, readme_data_test)
graphsage_results = run_learner_experiment(graphsage_learner, graph_data_train, graph_data_test)

In [None]:
results_df = pd.DataFrame.from_records([abstract_results, readme_results, graphsage_results])
results_df['method'] = ['abstract', 'readme', 'graphsage']

In [None]:
results_df

## Using the GraphSAGE model for embedding

In [None]:
#export


class LambdaTransformer:
    """
    Adapter that lets a plain function stand in for a fitted vectorizer.

    The wrapped function is exposed as the `transform` attribute; `fit` does nothing.
    """

    def __init__(self, transform_fn):
        # Stored as an instance attribute, so it is called without `self`.
        self.transform = transform_fn

    def fit(self, X, **kwargs):
        """No-op kept for interface compatibility; returns the transformer itself."""
        return self

In [None]:
from github_search.pytorch_geometric_data import PygGraphWrapper
import torch

In [None]:
fasttext_embedder = embeddings.FastTextVectorizer(fasttext_model)

In [None]:
%%time
dependency_graph_wrapper = PygGraphWrapper(fasttext_embedder.transform, non_root_dependency_records_df, "source", "destination")

In [None]:
!ls -ltr output/*graphsage*pth

In [None]:
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
graphsage_model = torch.load("output/graphsage_model_20_dim200_layers2.pth", map_location="cpu").cpu()
# eval() switches the module AND all of its submodules to inference mode; setting
# `.training = False` on the top-level module alone leaves the children untouched.
graphsage_model.eval();

In [None]:
graphsage_data_train, graphsage_data_test = RepoTaskData.create_split(tasks_train, all_tasks, paperswithcode_with_features_df, paperswithcode_with_imports_df.repo.apply(lambda s: [s]))

In [None]:
graphsage_data_train.X

In [None]:
#export


def make_records_df(sources, connected_vertices):
    """
    Build an edge table with one "repo-file" row per (source, destination) pair.

    `sources` and `connected_vertices` are iterated in parallel; every element of
    `connected_vertices` is the iterable of destinations of the matching source.
    """
    edge_records = []
    for src, destinations in zip(sources, connected_vertices):
        for dst in destinations:
            edge_records.append({"source": src, "destination": dst, "edge_type": "repo-file"})
    return pd.DataFrame.from_records(edge_records)

In [None]:
train_records_df = make_records_df(graphsage_data_train.repos, graph_data_train.X.fillna("").str.split()).drop_duplicates()

In [None]:
#export

def get_vertex_embeddings(wrapper, vertex_subset, model):
    """
    Run `model.full_forward` over the wrapper's whole graph and return the rows that
    belong to `vertex_subset`.

    Rows are selected through `wrapper.vertex_mapping.loc[vertex_subset]`; the result
    is a numpy array.
    """
    graph_data = wrapper.dataset
    all_vertex_outputs = model.full_forward(graph_data.x, graph_data.edge_index)
    # detach() drops the autograd graph before the conversion to numpy.
    features = all_vertex_outputs.detach().numpy()
    subset_rows = wrapper.vertex_mapping.loc[vertex_subset]
    return features[subset_rows]

In [None]:
# fillna("") keeps one entry per repo, so the zip inside make_records_df pairs each repo
# with its own tokens; dropna() would shorten X and shift every later pairing.
other_records_df = make_records_df(graphsage_data_train.repos, graph_data_train.X.fillna("").str.split())

In [None]:
# fillna("") keeps one entry per repo, so the zip inside make_records_df pairs each repo
# with its own tokens; dropna() would shorten X and shift every later pairing.
dep_graph_df = pd.concat([
    non_root_dependency_records_df,
    make_records_df(graphsage_data_train.repos, graph_data_train.X.fillna("").str.split()),
])

In [None]:
dep_graph_df.isna().sum()

In [None]:
%%time
extended_dependency_graph_wrapper = PygGraphWrapper(embeddings.FastTextVectorizer(fasttext_model).transform, dep_graph_df)

In [None]:
extended_dependency_graph_wrapper.dataset.x.device

In [None]:
%%time
extended_dependency_graph_wrapper.get_vertex_embeddings(graphsage_data_train.X.iloc[0].split(), graphsage_model)

In [None]:
graphsage_learner = RetrieverLearner(
    zero_shot.ESZSLearner(100,10),
    LambdaTransformer(lambda x: extended_dependency_graph_wrapper.get_vertex_embeddings(x, graphsage_model)),
    embeddings.FastTextVectorizer(fasttext_model)
)
graphsage_learner.fit_learner(graphsage_data_train)
graphsage_learner.evaluate(graphsage_data_train, metric=metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(graphsage_learner, graphsage_data_train, k=10)

In [None]:
get_retrieval_accuracy(graphsage_learner, graphsage_data_test, k=10)

# Demo

In [None]:
@attr.s
class Retriever:
    """Rank stored items against a free-text query using a zero-shot learner.

    Attributes
    ----------
    input_embedder
        Object with `transform(X)` used to embed the stored items.
    query_embedder
        Object with `transform([query])` used to embed a query string.
    zs_learner
        Learner exposing `predict_raw(embeddings)`; its output is compared
        against the query embedding.
    embeddings_calculated : bool
        Set to True once `set_embeddings` has been called.
    """

    input_embedder = attr.ib()
    query_embedder = attr.ib()
    zs_learner = attr.ib()
    embeddings_calculated = attr.ib(default=False)

    def set_embeddings(self, X_names, X):
        """Embed `X` with the input embedder and remember it with its names.

        `X_names` must support `.iloc[...]` indexing (it is indexed that way in
        `retrieve_query_results`).
        """
        self.X_embeddings = self.input_embedder.transform(X)
        self.X = X
        self.X_names = X_names
        self.embeddings_calculated = True

    def retrieve_query_results(self, query, k=10, similarity=metrics.pairwise.cosine_similarity):
        """Return the names of the `k` stored items most similar to `query`.

        Parameters
        ----------
        query
            Single query passed (wrapped in a list) to the query embedder.
        k : int
            Number of results to return.
        similarity : callable
            Called as `similarity(predictions, query_embeddings)`; larger values
            rank higher.

        Raises
        ------
        RuntimeError
            If `set_embeddings` has not been called yet.
        """
        if not self.embeddings_calculated:
            raise RuntimeError("embeddings not calculated")
        query_embeddings = self.query_embedder.transform([query])
        # Note: the learner's predictions are recomputed on every call.
        predictions = self.zs_learner.predict_raw(self.X_embeddings)
        input_target_similarities = similarity(predictions, query_embeddings)
        # Only one query is embedded, so column 0 holds all the scores;
        # negating them makes argsort return the highest similarity first.
        top_indices = np.argsort(-input_target_similarities[:, 0])[:k]
        return self.X_names.iloc[top_indices]

    @classmethod
    def from_retriever_learner(cls, learner):
        """Build a retriever from a learner's `input_embedder`, `y_embedder` and `zs_learner`."""
        return cls(learner.input_embedder, learner.y_embedder, learner.zs_learner)

In [None]:
import pickle

In [None]:
# Use a context manager so the file is flushed and closed even if pickling fails.
with open("output/readme_data_test.pkl", "wb") as pkl_file:
    pickle.dump(readme_data_test, pkl_file)

In [None]:
# Use a context manager so the file is flushed and closed even if pickling fails.
with open("output/readme_learner.pkl", "wb") as pkl_file:
    pickle.dump(readme_learner, pkl_file)

In [None]:
readme_retriever = Retriever.from_retriever_learner(readme_learner)
readme_retriever.set_embeddings(readme_data_train.repos, readme_data_train.X)

In [None]:
graphsage_retriever = Retriever.from_retriever_learner(graphsage_learner)
graphsage_retriever.set_embeddings(graphsage_data_train.repos, graphsage_data_train.X)

In [None]:
%%time
metric_learning_results = readme_retriever.retrieve_query_results("metric learning", k=5).values

In [None]:
%%time
metric_learning_results_graphsage = graphsage_retriever.retrieve_query_results("metric learning", k=5).values

In [None]:
distance_learning_results = readme_retriever.retrieve_query_results("distance learning", k=5).values

In [None]:
distance_learning_results_graphsage = graphsage_retriever.retrieve_query_results("distance learning", k=5).values

In [None]:
distance_learning_results

In [None]:
distance_learning_results_graphsage

In [None]:
readme_retriever.retrieve_query_results("video text detection", k=10).values

In [None]:
graphsage_retriever.retrieve_query_results("video text detection", k=10).values

In [None]:
readme_retriever.retrieve_query_results("context recommender systems", k=10).values

In [None]:
graphsage_retriever.retrieve_query_results("context recommender systems", k=10).values

In [None]:
readme_retriever.retrieve_query_results("bayesian optimization", k=10).values

In [None]:
graphsage_retriever.retrieve_query_results("bayesian optimization", k=10).values

In [None]:
readme_retriever.retrieve_query_results("evolutionary methods", k=10).values

In [None]:
graphsage_retriever.retrieve_query_results("evolutionary methods", k=10).values

In [None]:
readme_retriever.retrieve_query_results("painting", k=10).values

In [None]:
graphsage_retriever.retrieve_query_results("painting", k=10).values

In [None]:
%%time
retrieve_query_results(graphsage_learner, graphsage_data_train.repos, graphsage_data_train.X, "distance learning").values

In [None]:
retrieve_query_results(graphsage_learner, graphsage_data_train.X, "distance learning")

# Concatenation of repo and import embeddings

In [None]:
paired_data_train, paired_data_test = RepoTaskData.create_split(tasks_train, all_tasks, paperswithcode_with_features_df, paperswithcode_with_imports_df['imports'])
paired_data_train.X = graph_data_train.X + " " + import_data_train.X
paired_data_test.X = graph_data_test.X + " " + import_data_test.X

In [None]:
paired_data_train.X

In [None]:
# Create a RetrieverLearner from an ESZSLearner(100, 10), keyed vectors pairing
# `python_word_embeddings.wv` with `graphsage_embeddings`, the fastText model and
# two vectorizer classes. The roles of the positional arguments are defined by
# RetrieverLearner.create, which is not shown in this cell.
paired_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    PairedKeyedVectors(python_word_embeddings.wv, graphsage_embeddings),
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer
)

# NOTE(review): the learner is fitted on `graph_data_train`, while the cells that
# follow compute retrieval accuracy on `paired_data_train` / `paired_data_test`.
# Confirm whether this should be `paired_data_train`.
paired_learner.fit_learner(graph_data_train)

In [None]:
paired_learner.evaluate(graph_data_train, metric=metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(paired_learner, paired_data_train, k=10)

In [None]:
get_retrieval_accuracy(paired_learner, paired_data_test, k=10)

In [None]:
# Collect retrieval accuracy at several cut-offs k for each learner.
# Each learner is paired positionally with a display name and a test input.
# NOTE(review): get_retrieval_accuracy is called here as
# (learner, test, y_test, test_task_idxs, k=k), whereas the cells above call it as
# (learner, data, k=...) — confirm which signature is current before running.
results = []
for (learner, learner_name, test) in zip(
    [import2vec_learner, prone_learner, paired_learner],
    ['import2vec', 'prone', 'both'],
    [X_test, repo_graph_terms_test, X_paired_test]
):
    # One accuracy value per k, in the same order as the k list below.
    accs = []
    for k in [1, 3, 5, 10, 20]:
        rec = get_retrieval_accuracy(learner, test, y_test, test_task_idxs, k=k)
        accs.append(rec)
    # One Series per learner, named after it.
    results.append(pd.Series(name=learner_name, data=accs))

In [None]:
results_df = pd.DataFrame(results)
results_df.columns = ["Accuracy@{}".format(i) for i in [1, 3, 5, 10, 20]]

In [None]:
# Write the rounded results table as markdown; the context manager guarantees the
# file is flushed and closed before the next cell reads it back.
with open("metrics/zsl_results.md", "w") as results_file:
    results_df.round(3).to_markdown(results_file)

In [None]:
!cat metrics/zsl_results.md

In [None]:
import toolz

In [None]:
task_distances = metrics.pairwise.cosine_distances(task_embeddings, task_embeddings)

In [None]:
poincare_embeddings = gensim.models.KeyedVectors.load('data/poincare5.vec')

In [None]:
import gensim.models.wrappers.fasttext
from gensim.test.utils import datapath

In [None]:
from github_search import typical_file_parts
from mlutil import prototype_selection

In [None]:
selected_lines_df = typical_file_parts.get_selected_lines_and_repos(python_files_df['repo_name'], python_files_df['content'])

# Selecting prototypical lines

In [None]:
fasttext_selector = prototype_selection.PrototypeSelector(fasttext_avg_embedder)

In [None]:
# Load cached fastText prototypes, or compute and cache them when the cache file
# is missing or unreadable.
try:
    with open('data/fasttext_prototypes.json', 'r') as prototypes_file:
        fasttext_prototypes = json.load(prototypes_file)
except (OSError, ValueError):
    # OSError covers a missing or inaccessible file; ValueError covers invalid JSON
    # (json.JSONDecodeError is a ValueError). A bare `except:` would also swallow
    # KeyboardInterrupt and hide unrelated bugs.
    fasttext_selector.fit_prototypes(selected_lines_df['line'], selected_lines_df['repo'])
    fasttext_prototypes = fasttext_selector.prototypes
    with open('data/fasttext_prototypes.json', 'w') as prototypes_file:
        json.dump(fasttext_prototypes, prototypes_file)

In [None]:
codebert_vectorizer = embeddings.TransformerVectorizer('microsoft/codebert-base', batch_size=64)

In [None]:
codebert_selector = prototype_selection.PrototypeSelector(codebert_vectorizer)

In [None]:
# Load cached CodeBERT prototypes, or compute and cache them when the cache file
# is missing or unreadable.
try:
    with open('data/codebert_prototypes.json', 'r') as prototypes_file:
        codebert_prototypes = json.load(prototypes_file)
except (OSError, ValueError):
    # OSError covers a missing or inaccessible file; ValueError covers invalid JSON
    # (json.JSONDecodeError is a ValueError). A bare `except:` would also swallow
    # KeyboardInterrupt and hide unrelated bugs.
    codebert_selector.fit_prototypes(selected_lines_df['line'], selected_lines_df['repo'])
    codebert_prototypes = codebert_selector.prototypes
    with open('data/codebert_prototypes.json', 'w') as prototypes_file:
        json.dump(codebert_prototypes, prototypes_file)

In [None]:
def vectorize_prototypes(vectorizer, prototypes):
    """Embed each key's prototypes and average them into one vector per key.

    Parameters
    ----------
    vectorizer
        Object with `transform(items)` returning one embedding per item.
    prototypes : dict
        Maps a key to the collection of prototypes passed to `transform`.

    Returns
    -------
    (list, numpy.ndarray)
        The keys in dictionary order, and a matrix whose i-th row is the mean
        (over axis 0) of the embeddings of the i-th key's prototypes.
    """
    keys = list(prototypes.keys())
    mean_embeddings = [
        np.mean(vectorizer.transform(prototypes[key]), axis=0) for key in keys
    ]
    # np.vstack needs a real sequence: a dict view is rejected by recent NumPy
    # releases, and np.row_stack is a deprecated alias of vstack.
    return keys, np.vstack(mean_embeddings)

In [None]:
# Keep only prototypes for repos present in the paperswithcode table. The repo
# names are put in a set once, instead of rebuilding `.values` and scanning it
# linearly for every key.
paperswithcode_repo_names = set(paperswithcode_with_imports_df['repo_name'])
codebert_prototypes = {
    repo: v
    for (repo, v) in codebert_prototypes.items()
    if repo in paperswithcode_repo_names
}

In [None]:
codebert_prototypes.keys()

In [None]:
repos_train

In [None]:
# Keep only prototypes for repos present in the paperswithcode table. The repo
# names are put in a set once, instead of rebuilding `.values` and scanning it
# linearly for every key.
paperswithcode_repo_names = set(paperswithcode_with_imports_df['repo_name'])
fasttext_prototypes = {
    repo: v
    for (repo, v) in fasttext_prototypes.items()
    if repo in paperswithcode_repo_names
}

In [None]:
def get_prototypes(repo_name):
    """Return a DataFrame of one repo's prototypes, with one column per selector.

    Looks `repo_name` up in the module-level `codebert_prototypes` and
    `fasttext_prototypes` dictionaries; a missing repo raises KeyError.
    """
    columns = {
        "codebert": codebert_prototypes[repo_name],
        "fasttext": fasttext_prototypes[repo_name],
    }
    return pd.DataFrame(columns)

In [None]:
fasttext_prototypes.keys()

In [None]:
get_prototypes("transformer")

In [None]:
get_prototypes("mmdetection")

In [None]:
get_prototypes("Recommenders-movielens")

In [None]:
get_prototypes("mmdetection")

In [None]:
fasttext_prototypes['mmdetection']

In [None]:
codebert_repos, codebert_prototype_embeddings = vectorize_prototypes(codebert_vectorizer, codebert_prototypes)

In [None]:
fasttext_repos, fasttext_prototype_embeddings = vectorize_prototypes(fasttext_avg_embedder, fasttext_prototypes)

In [None]:
len(fasttext_prototype_embeddings)

In [None]:
# Map repo name -> most common task. set_index builds a new frame, so the column
# object held by `paperswithcode_with_imports_df` is not mutated; assigning
# `.index` on the selected column would alter it in place.
paperswithcode_tasks_series = paperswithcode_with_imports_df.set_index('repo_name')['most_common_task']
#paperswithcode_tasks_series = paperswithcode_tasks_series[paperswithcode_tasks_series.index.isin(fasttext_repos)]

In [None]:
fasttext_tasks = paperswithcode_tasks_series.loc[fasttext_repos]
fasttext_tasks_embeddings = task_embedder.transform(fasttext_tasks)
codebert_tasks = paperswithcode_tasks_series.loc[codebert_repos]
codebert_tasks_embeddings = task_embedder.transform(codebert_tasks)

In [None]:
codebert_prototype_embeddings.shape

In [None]:
eszs_learner = zero_shot.ESZSLearner()

In [None]:
codebert_prototype_embeddings.shape

In [None]:
len(codebert_tasks)

In [None]:
eszs_learner.fit(codebert_prototype_embeddings, codebert_tasks, task_embeddings[:-1])
eszs_learner.score(codebert_prototype_embeddings, codebert_tasks, task_embeddings[:-1])

In [None]:
eszs_learner.fit(fasttext_prototype_embeddings, fasttext_tasks, task_embeddings[:-1])
eszs_learner.score(fasttext_prototype_embeddings, fasttext_tasks, task_embeddings[:-1])

In [None]:
list(set(selected_lines_df['repo']))[3007]

In [None]:
problematic_lines_df = selected_lines_df[selected_lines_df['repo'] == 'auto_ml']

In [None]:
del codebert_vectorizer

In [None]:
# The text column of `selected_lines_df` is referenced as 'line' everywhere else in
# this notebook; 'lines' looks like a typo — confirm against the frame's columns.
problematic_lines_df['line']

In [None]:
codebert_selector.prototypes

In [None]:
y_embeddings = fasttext_avg_embedder.transform(tasks)

In [None]:
# TODO: unfinished cell — the right-hand sides were never filled in, which made
# these lines a SyntaxError. They are kept as comments until the intended values
# are known. Note that `repo_names` is already defined near the top of the notebook.
# repo_names =
# repo_embeddings =

In [None]:
y_embeddings.shape