In [1]:
# default_exp matching_zsl

In [2]:
# export
import os
import ast
import tqdm
import json
import attr
from operator import itemgetter
from scipy.stats import hmean
import logging

import concurrent.futures

import itertools


import pandas as pd
import numpy as np
from sklearn import feature_extraction, metrics, model_selection

import matplotlib.pyplot as plt
import gensim


from functools import partial

from mlutil.feature_extraction import embeddings
import mlutil
from scarce_learn import zero_shot
from scarce_learn.zero_shot import devise_jax, devise_torch
from github_search import (
    paperswithcode_tasks,
    github_readmes,
    python_call_graph,
    data_utils,
)

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

In [3]:
from github_search.pytorch_geometric_data import PygGraphWrapper
import torch

In [4]:
import re
from github_search.paperswithcode_tasks import clean_task_name
import fasttext
import pickle
import gensim

In [5]:
%env XLA_PYTHON_CLIENT_PREALLOCATE=false

env: XLA_PYTHON_CLIENT_PREALLOCATE=false


In [6]:
# upstream

import_corpus_path = "output/module_corpus.csv"
word_vectors_filename = "output/import2vec_module_vectors.bin"

In [7]:
%cd ..

/home/kuba/Projects/github_search


%%time
import_corpus_df = pd.read_csv(import_corpus_path)
per_repo_imports = import_corpus_df.groupby('repo')['imports'].agg(sum).apply(set)
import_corpus_df['imports'] = import_corpus_df['imports'].apply(ast.literal_eval)

%%time
#python_files_df = pd.read_csv('data/crawled_python_files.csv', encoding='latin-1')
#repo_names = python_files_df['repo_name']
import_corpus_df = pd.read_csv(import_corpus_path)
per_repo_imports = import_corpus_df.groupby('repo')['imports'].agg(sum).apply(set)

python_files_df['repo'] = python_files_df['repo_name'].str.split("/").apply(itemgetter(1))  + '/' + python_files_df['repo_name']
repo_names_tmp = python_files_df['repo_name']
repo_names = repo_names_tmp.unique()
python_files_df['repo_name'] = python_files_df['repo']
python_files_df['repo'] = repo_names_tmp

In [8]:
%%time
import2vec = gensim.models.KeyedVectors.load(word_vectors_filename)
import2vec_embedder = (
    mlutil.feature_extraction.embeddings.AverageWordEmbeddingsVectorizer(import2vec)
)

2022-03-14 17:24:12,219 - gensim.utils - INFO - loading Word2VecKeyedVectors object from output/import2vec_module_vectors.bin
2022-03-14 17:24:12,234 - gensim.utils - INFO - setting ignored attribute vectors_norm to None
2022-03-14 17:24:12,235 - gensim.utils - INFO - loaded output/import2vec_module_vectors.bin


CPU times: user 12.8 ms, sys: 811 µs, total: 13.6 ms
Wall time: 16.3 ms


In [9]:
task_name = "3d reconstruction"

In [10]:
paperswithcode_with_imports_df = pd.read_csv("output/papers_with_imports.csv")
paperswithcode_with_imports_df["tasks"] = (
    paperswithcode_with_imports_df["tasks"]
    .apply(clean_task_name)
    .apply(ast.literal_eval)
)
paperswithcode_with_imports_df["imports"] = (
    paperswithcode_with_imports_df["imports"]
    .str.replace("set\(\)", "{}")
    .apply(ast.literal_eval)
)

  paperswithcode_with_imports_df = pd.read_csv('output/papers_with_imports.csv')
  paperswithcode_with_imports_df['imports'] = paperswithcode_with_imports_df['imports'].str.replace("set\(\)", "{}").apply(ast.literal_eval)


In [11]:
paperswithcode_with_imports_df.shape

(36209, 23)

In [12]:
paperswithcode_with_imports_df["n_imports"] = paperswithcode_with_imports_df[
    "imports"
].apply(len)

In [13]:
paperswithcode_with_imports_df[
    "n_imports_with_embeddings"
] = paperswithcode_with_imports_df["imports"].apply(
    lambda imps: len([imp in import2vec.vocab.keys() for imp in imps])
)

In [14]:
paperswithcode_with_imports_df["repo"].unique().size

36209

In [15]:
%%time
word_embeddings = mlutil.feature_extraction.embeddings.load_gensim_embedding_model(
    "glove-wiki-gigaword-300"
)

2022-03-14 17:24:15,327 - gensim.models.utils_any2vec - INFO - loading projection weights from /home/kuba/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
2022-03-14 17:24:50,418 - gensim.models.utils_any2vec - INFO - loaded (400000, 300) matrix from /home/kuba/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz


CPU times: user 34.6 s, sys: 381 ms, total: 34.9 s
Wall time: 35.3 s


In [16]:
fasttext_model = fasttext.load_model("output/python_files_fasttext_dim200.bin")



In [17]:
from gensim.models.callbacks import CallbackAny2Vec


class LossCallback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch
    """

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print("Loss after epoch {}: {}".format(self.epoch, loss))
        else:
            print(
                "Loss after epoch {}: {}".format(
                    self.epoch, loss - self.loss_previous_step
                )
            )
        self.epoch += 1
        self.loss_previous_step = loss

In [18]:
python_word_embeddings = gensim.models.Word2Vec.load(
    "output/abstract_readme_w2v200.bin"
)

2022-03-14 17:25:14,363 - gensim.utils - INFO - loading Word2Vec object from output/abstract_readme_w2v200.bin
2022-03-14 17:25:14,758 - gensim.utils - INFO - loading wv recursively from output/abstract_readme_w2v200.bin.wv.* with mmap=None
2022-03-14 17:25:14,759 - gensim.utils - INFO - loading vectors from output/abstract_readme_w2v200.bin.wv.vectors.npy with mmap=None
2022-03-14 17:25:14,787 - gensim.utils - INFO - setting ignored attribute vectors_norm to None
2022-03-14 17:25:14,788 - gensim.utils - INFO - loading vocabulary recursively from output/abstract_readme_w2v200.bin.vocabulary.* with mmap=None
2022-03-14 17:25:14,788 - gensim.utils - INFO - loading trainables recursively from output/abstract_readme_w2v200.bin.trainables.* with mmap=None
2022-03-14 17:25:14,788 - gensim.utils - INFO - loading syn1neg from output/abstract_readme_w2v200.bin.trainables.syn1neg.npy with mmap=None
2022-03-14 17:25:14,811 - gensim.utils - INFO - setting ignored attribute cum_table to None
2022-0

In [19]:
# export


@attr.s
class RepoTaskData:

    tasks = attr.ib()
    repos = attr.ib()
    X = attr.ib()
    all_tasks = attr.ib()
    y = attr.ib()

    def split_tasks(area_grouped_tasks, test_size=0.2):
        tasks_train, tasks_test = model_selection.train_test_split(
            area_grouped_tasks["task"],
            stratify=area_grouped_tasks["area"],
            test_size=test_size,
            random_state=0,
        )
        return tasks_train, tasks_test

    def create_split(
        tasks_test,
        all_tasks,
        paperswithcode_with_features_df,
        X_repr,
        y_col="least_common_task",
    ):
        train_indicator = paperswithcode_with_features_df["tasks"].apply(
            lambda ts: not (any([t in list(tasks_test) for t in ts]))
        )
        repos_train = paperswithcode_with_features_df["repo"][train_indicator]
        repos_test = paperswithcode_with_features_df["repo"][~train_indicator]
        X_repr = X_repr.apply(lambda x: " ".join(x))
        X_train = X_repr[train_indicator]
        X_test = X_repr[~train_indicator]
        all_tasks_train = all_tasks[train_indicator]
        all_tasks_test = all_tasks[~train_indicator]
        y_train = (
            paperswithcode_with_features_df[train_indicator][y_col]
            .str.lower()
            .apply(clean_task_name)
        )
        y_test = (
            paperswithcode_with_features_df[~train_indicator][y_col]
            .str.lower()
            .apply(clean_task_name)
        )

        return (
            RepoTaskData(tasks_train, repos_train, X_train, all_tasks_train, y_train),
            RepoTaskData(tasks_test, repos_test, X_test, all_tasks_test, y_test),
        )

In [21]:
# export


def get_first_vocab_entry(vocab):
    return list(itertools.islice(vocab.items(), 1))[0][0]


class PairedKeyedVectors:
    @attr.s
    class wv:
        vocab = attr.ib()

    def __init__(self, kv1, kv2):
        self.kv1 = kv1
        self.kv2 = kv2
        self.vocab = {**kv1.vocab, **kv2.vocab}
        self.dim1 = len(kv1[get_first_vocab_entry(kv1.vocab)])
        self.dim2 = len(kv2[get_first_vocab_entry(kv2.vocab)])
        self.wv = PairedKeyedVectors.wv(self.vocab)

    def __getitem__(self, item):
        if not item in self.kv1.vocab.keys():
            return np.concatenate([np.zeros(self.dim1), self.kv2[item]])
        elif not item in self.kv2.vocab.keys():
            return np.concatenate([self.kv1[item], np.zeros(self.dim2)])
        else:
            return np.concatenate([self.kv1[item], self.kv2[item]])


@attr.s
class RetrieverLearner:

    zs_learner: zero_shot.ZeroShotClassifier = attr.ib()
    input_embedder: embeddings.EmbeddingVectorizer = attr.ib()
    y_embedder: embeddings.EmbeddingVectorizer = attr.ib()
    input_embedder_kwargs = attr.ib(default=dict())

    @staticmethod
    def create(
        zs_learner: zero_shot.ZeroShotClassifier,
        input_embeddings: gensim.models.KeyedVectors,
        target_embeddings: gensim.models.KeyedVectors,
        input_embedding_method: embeddings.EmbeddingVectorizer,
        y_embedding_method: embeddings.EmbeddingVectorizer,
        input_embedder_kwargs=dict(),
    ):
        input_embedder = input_embedding_method(
            input_embeddings, **input_embedder_kwargs
        )
        y_embedder = y_embedding_method(target_embeddings)
        return RetrieverLearner(zs_learner, input_embedder, y_embedder)

    def get_target_embeddings(self, y):
        unique_y = pd.Series(y.unique())
        y_embeddings = self.y_embedder.transform(unique_y)
        return unique_y, y_embeddings

    def fit_learner(self, data, **kwargs):
        self.input_embedder.fit(data.X)
        X_embeddings = self.input_embedder.transform(data.X)
        self.y_embedder.fit(data.y)
        unique_y, y_embeddings = self.get_target_embeddings(data.y)
        input_y_idxs = data.y.apply(lambda t: unique_y[unique_y == t].index[0])
        self.zs_learner.fit(
            np.array(X_embeddings),
            np.array(input_y_idxs),
            np.array(y_embeddings),
            **kwargs
        )

    def predict_idxs(self, X, y_embeddings):
        X_embeddings = self.input_embedder.transform(X)
        return self.zs_learner.predict(X_embeddings, y_embeddings)

    def predict_topk(
        self,
        X,
        y_embeddings,
        target_names,
        k=5,
        similarity=metrics.pairwise.cosine_similarity,
    ):
        X_embeddings = self.input_embedder.transform(X)
        predictions = self.zs_learner.predict_raw(X_embeddings)
        target_similarities = similarity(predictions, y_embeddings)
        targets = [
            target_names[row[:k]] for row in (-target_similarities).argsort(axis=1)
        ]
        return targets

    def evaluate(self, data, metric):
        unique_y, y_embeddings = self.get_target_embeddings(data.y)
        input_y_idxs = data.y.apply(lambda t: unique_y[unique_y == t].index[0])
        predicted_idxs = self.predict_idxs(data.X, y_embeddings)
        return metric(input_y_idxs, predicted_idxs)

In [22]:
graph = pickle.load(open("output/call_igraph.pkl", "rb"))

In [23]:
len(graph.get_vertex_dataframe().iloc[graph.neighborhood(vertices=["<ROOT>"])[0]])

53700

get repos that are in graph 

In [24]:
paperswithcode_with_imports_df["repo"].isin(graph.get_vertex_dataframe()["name"]).sum()

36208

In [25]:
graph_nodes = graph.get_vertex_dataframe()["name"].unique()

In [26]:
len(graph_nodes)

3600597

In [27]:
paperswithcode_with_tasks_df = (
    pd.read_csv("output/papers_with_readmes.csv")
    .dropna(subset=["least_common_task"])
    .dropna(subset=["readme", "abstract"])
)
paperswithcode_with_tasks_df["tasks"] = paperswithcode_with_tasks_df["tasks"].apply(
    ast.literal_eval
)

In [28]:
paperswithcode_with_tasks_df["readme"].shape

(39848,)

In [29]:
dependency_records_df = pd.read_csv("output/processed_dependency_records.csv").dropna()

In [30]:
# export


def filter_smaller_tasks(paperswithcode_with_tasks_df, min_task_count=10):
    task_counts = paperswithcode_with_tasks_df["least_common_task"].value_counts()
    return paperswithcode_with_tasks_df[
        paperswithcode_with_tasks_df["least_common_task"].isin(
            task_counts[task_counts >= min_task_count].index
        )
    ]


def prepare_paperswithcode_with_features_df(
    paperswithcode_with_tasks_df, dependency_records_df, min_task_count
):
    paperswithcode_with_features_df = paperswithcode_with_tasks_df[
        paperswithcode_with_tasks_df["repo"].isin(graph.get_vertex_dataframe()["name"])
        | paperswithcode_with_tasks_df["repo"]
        .apply(lambda s: s.split("/")[1])
        .isin(graph.get_vertex_dataframe()["name"])
    ]
    paperswithcode_with_features_df = paperswithcode_with_features_df.dropna(
        subset=["readme", "abstract"]
    )
    tasks = paperswithcode_with_features_df["least_common_task"].str.lower()

    per_repo_dependency_records = data_utils.get_repo_records(
        paperswithcode_with_features_df["repo"], dependency_records_df
    )
    per_repo_dependency_records = per_repo_dependency_records.reset_index()
    per_repo_dependency_records.columns = ["source", "dependency_records"]
    paperswithcode_with_features_df = paperswithcode_with_features_df.merge(
        per_repo_dependency_records.reset_index(), left_on="repo", right_on="source"
    )
    all_tasks = paperswithcode_with_features_df["tasks"]
    is_valid_record = all_tasks.apply(len) > 0
    paperswithcode_with_features_df = filter_smaller_tasks(
        paperswithcode_with_features_df[is_valid_record], min_task_count
    )
    all_tasks = paperswithcode_with_features_df["tasks"]
    all_task_counts = all_tasks.explode().value_counts()
    valid_tasks = all_task_counts[all_task_counts >= min_task_count].index
    paperswithcode_with_features_df["tasks"] = paperswithcode_with_features_df[
        "tasks"
    ].apply(lambda ts: [t for t in ts if t in valid_tasks])
    return paperswithcode_with_features_df

In [31]:
paperswithcode_with_features_df = prepare_paperswithcode_with_features_df(
    paperswithcode_with_tasks_df, dependency_records_df, min_task_count=10
)

In [32]:
paperswithcode_with_imports_df = paperswithcode_with_imports_df[
    paperswithcode_with_imports_df["repo"].isin(paperswithcode_with_features_df["repo"])
]

In [33]:
all_tasks = paperswithcode_with_features_df[
    "tasks"
]  # .apply(lambda tasks: [t for t in tasks if t in valid_tasks.index])

In [34]:
tasks = all_tasks.explode().drop_duplicates()

In [35]:
paperswithcode_with_features_df["least_common_task"].value_counts()

image classification             1765
object detection                 1085
language modelling                800
domain adaptation                 766
data augmentation                 696
                                 ... 
code summarization                 10
synthetic data generation          10
handwritten digit recognition      10
discourse parsing                  10
automatic post editing             10
Name: least_common_task, Length: 429, dtype: int64

In [36]:
paperswithcode_with_features_df.shape

(32815, 20)

In [38]:
area_grouped_tasks = get_area_grouped_tasks()
area_grouped_tasks["task"] = area_grouped_tasks["task"].apply(clean_task_name)

In [39]:
all_area_grouped_tasks = pd.read_csv("data/paperswithcode_tasks.csv").dropna()

In [40]:
all_area_grouped_tasks["task"] = all_area_grouped_tasks["task"].str.replace("-", " ")

In [47]:
tasks_train, tasks_test = RepoTaskData.split_tasks(all_area_grouped_tasks)

In [48]:
def get_readme_summaries(upstream, product, keywords=True):
    pool = concurrent.futures.ProcessPoolExecutor(max_workers=10)
    raw_readmes = list(
        pool.map(github_readmes.get_readme, paperswithcode_with_features_df["repo"])
    )
    readmes = pd.Series(raw_readmes).apply(github_readmes.try_decode)
    return readmes

In [49]:
def get_readmes(df, keywords=True):
    pool = concurrent.futures.ProcessPoolExecutor(max_workers=10)
    raw_readmes = list(pool.map(github_readmes.get_readme, df["repo"]))
    readmes = list(map(github_readmes.try_decode, raw_readmes))
    return readmes

In [50]:
paperswithcode_with_features_df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'paper_url', 'paper_title',
       'paper_arxiv_id', 'paper_url_abs', 'paper_url_pdf', 'repo_url',
       'mentioned_in_paper', 'mentioned_in_github', 'framework', 'repo',
       'title', 'abstract', 'tasks', 'least_common_task', 'readme', 'index',
       'source', 'dependency_records'],
      dtype='object')

In [51]:
paperswithcode_with_features_df.shape

(32815, 20)

%%time
readmes = get_readmes(paperswithcode_with_features_df)

In [52]:
def try_keywords(text):
    return python_call_graph.try_run(gensim.summarization.keywords)(text)

In [53]:
task_embedder = mlutil.feature_extraction.embeddings.AverageWordEmbeddingsVectorizer(
    word_embeddings
)

In [54]:
# export


def get_outgoing_edges(graph, node):
    # idx = pd.Index(graph.names).get_loc(node)
    # outgoing_edges_idx = np.where(graph.mat[idx].todense())[1]
    return graph.get_vertex_dataframe().iloc[graph.successors(node)]["name"]
    # return graph.names[outgoing_edges_idx]


def get_repo_functions(graph_records, repo):
    return " ".join(set(get_outgoing_edges(graph, repo).values)).replace(repo + ":", "")

In [55]:
# export


def prepare_task_train_test_split(upstream, product):
    area_grouped_tasks = pd.read_csv(str(upstream["prepare_area_grouped_tasks"]))
    tasks_train, tasks_test = RepoTaskData.split_tasks(area_grouped_tasks)
    tasks_train.to_csv(product["train"], index=None)
    tasks_test.to_csv(product["test"], index=None)


def prepare_graph_repo_task_data(upstream, product):
    graph_data_train, graph_data_test = RepoTaskData.create_split(
        tasks_train,
        all_tasks,
        paperswithcode_with_features_df,
        paperswithcode_with_imports_df["imports"],
    )
    graph_data_train.X = graph_data_train.repos.apply(
        lambda x: get_repo_functions(graph, x)
    )
    graph_data_test.X = graph_data_test.repos.apply(
        lambda x: get_repo_functions(graph, x)
    )
    pickle.dump((graph_data_train, graph_data_test), open(str(product), "wb"))

In [56]:
readme_data_train, readme_data_test = RepoTaskData.create_split(
    tasks_test,
    paperswithcode_with_features_df["tasks"],
    paperswithcode_with_features_df,
    paperswithcode_with_features_df["readme"].str.split(),
)

In [57]:
tasks_test.to_csv("output/test_tasks.csv", index=False)

In [58]:
readme_data_train.X.shape[0] / paperswithcode_with_features_df.shape[0]

0.718665244552796

In [59]:
readme_data_train.X.shape[0] / paperswithcode_with_features_df.shape[0]

0.718665244552796

In [60]:
graph_data_train, graph_data_test = RepoTaskData.create_split(
    tasks_test,
    paperswithcode_with_features_df["tasks"],
    paperswithcode_with_features_df,
    paperswithcode_with_features_df["repo"].apply(lambda t: [t]),
)

In [61]:
%%time
if os.path.exists("output/tmp_graph_data.pkl"):
    (graph_data_train, graph_data_test) = pickle.load(
        open("output/tmp_graph_data.pkl", "rb")
    )
else:
    graph_data_train, graph_data_test = RepoTaskData.create_split(
        tasks_test,
        all_tasks[is_valid_record],
        paperswithcode_with_features_df[is_valid_record],
        paperswithcode_with_features_df[is_valid_record]["readme"].str.split(),
    )

    graph_records_train_X = pd.Series(
        [
            get_repo_functions(graph, x)
            for x in tqdm.notebook.tqdm(graph_data_train.repos)
        ]
    )

    graph_records_test_X = pd.Series(
        [
            get_repo_functions(graph, x)
            for x in tqdm.notebook.tqdm(graph_data_test.repos)
        ]
    )
    graph_data_train.X = graph_records_train_X
    graph_data_test.X = graph_records_test_X
    pickle.dump(
        (graph_data_train, graph_data_test), open("output/tmp_graph_data.pkl", "wb")
    )

NameError: name 'is_valid_record' is not defined

In [62]:
for i in range(len(graph_data_train.X)):
    graph_data_train.X.iloc[i] = graph_data_train.X.iloc[i].replace(
        graph_data_train.repos.iloc[i], ""
    )
for i in range(len(graph_data_test.X)):
    graph_data_test.X.iloc[i] = graph_data_test.X.iloc[i].replace(
        graph_data_test.repos.iloc[i], ""
    )

In [None]:
graph_data_train.X = graph_data_train.X.str.replace(":", " ")
graph_data_train.X = graph_data_train.X.str.replace("<ROOT>", " ")
graph_data_test.X = graph_data_test.X.str.replace(":", " ")
graph_data_test.X = graph_data_test.X.str.replace("<ROOT>", " ")

In [63]:
# export
def maybe_get_ndarray_elem(arr, idx, default=-1):
    if len(arr) <= idx:
        return default
    else:
        return arr[idx]


def get_retrieval_results(
    learner, data, queried_tasks, k=10, similarity=metrics.pairwise.cosine_similarity
):
    if queried_tasks == "all":
        tasks = data.all_tasks.explode().drop_duplicates()
    elif queried_tasks == "target":
        tasks = data.y.drop_duplicates()
    else:
        tasks = queried_tasks
    y_names, __ = learner.get_target_embeddings(tasks)
    input_embeddings = learner.input_embedder.transform(data.X)
    y_embeddings = learner.y_embedder.transform(y_names)
    predictions = learner.zs_learner.predict_raw(input_embeddings)
    input_target_similarities = similarity(predictions, y_embeddings)

    X_recalled = [
        np.argsort(-input_target_similarities[:, y_idx])[:k]
        for (y_idx, __) in enumerate(y_names)
    ]
    return y_names, X_recalled


def get_retrieval_metrics(
    learner,
    data,
    k=10,
    similarity=metrics.pairwise.cosine_similarity,
    queried_tasks="all",
):
    y_names, retrieved_X = get_retrieval_results(
        learner, data, k=k, similarity=similarity, queried_tasks=queried_tasks
    )
    retrieved_X_actual_labels = [
        data.all_tasks.iloc[idxs_recalled].explode().values
        for idxs_recalled in retrieved_X
    ]
    retrieved_idxs = [
        np.where(retrieved_X_actual_labels[y_idx] == y_name)[0]
        for (y_idx, y_name) in enumerate(y_names)
    ]
    num_recalled = [len(r) for r in retrieved_idxs]
    pos_recalled = [maybe_get_ndarray_elem(r, 0) for r in retrieved_idxs]
    accurately_recalled = [r > -1 for r in pos_recalled]
    return pd.DataFrame(
        {
            "retrieved_labels": retrieved_X_actual_labels,
            "num_recalled": num_recalled,
            "recalled": accurately_recalled,
            "position": pos_recalled,
        },
        index=y_names,
    )


def get_retrieval_accuracy(
    learner,
    data,
    k=10,
    similarity=metrics.pairwise.cosine_similarity,
    queried_tasks=None,
):
    return np.mean(
        get_retrieval_metrics(learner, data, k, similarity, queried_tasks)["recalled"]
    )

In [64]:
# export


def run_learner_experiment(
    retriever_learner, data_train, data_test, queried_tasks="all", fit_learner=True
):
    if fit_learner:
        retriever_learner.fit_learner(data_train)

    accuracy_train = retriever_learner.evaluate(data_train, metrics.accuracy_score)
    accuracy_test = retriever_learner.evaluate(data_test, metrics.accuracy_score)
    top10_accuracy_train = get_retrieval_accuracy(
        retriever_learner, data_train, queried_tasks=queried_tasks, k=10
    )
    top10_accuracy_test = get_retrieval_accuracy(
        retriever_learner, data_test, queried_tasks=queried_tasks, k=10
    )

    return dict(
        accuracy_train=accuracy_train,
        accuracy_test=accuracy_test,
        top10_accuracy_train=top10_accuracy_train,
        top10_accuracy_test=top10_accuracy_test,
    )

In [65]:
from sentence_transformers import models as sbert_models
from sentence_transformers import SentenceTransformer

model_name = "microsoft/codebert-base"
word_embedding_model = sbert_models.Transformer(
    model_name, max_seq_length=64, do_lower_case=True
)
pooling_model = sbert_models.Pooling(
    word_embedding_model.get_word_embedding_dimension(), "cls"
)

2022-03-14 20:47:25,999 - faiss.loader - INFO - Loading faiss with AVX2 support.
2022-03-14 20:47:26,000 - faiss.loader - INFO - Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
2022-03-14 20:47:26,000 - faiss.loader - INFO - Loading faiss.
2022-03-14 20:47:26,014 - faiss.loader - INFO - Successfully loaded faiss.


In [72]:
sbert_model = SentenceTransformer("output/sbert/lstm_sts_512x2/")

2022-03-14 20:48:01,925 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: output/sbert/lstm_sts_512x2/
2022-03-14 20:48:02,231 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cuda


## Abstracts

In [73]:
abstract_data_train, abstract_data_test = RepoTaskData.create_split(
    tasks_test,
    paperswithcode_with_features_df["tasks"],
    paperswithcode_with_features_df,
    paperswithcode_with_features_df["abstract"].str.split(),
)

In [74]:
from scarce_learn.zero_shot import devise_jax

In [75]:
def retriever_objective_impl(zslearner_kwargs, embedding_kwargs, data_train, data_test):
    zs_learner = zero_shot.ESZSLearner(**zslearner_kwargs)
    learner = RetrieverLearner.create(zs_learner, **embedding_kwargs)
    exp_results = run_learner_experiment(
        learner, abstract_data_train, abstract_data_test
    )
    print(exp_results)
    return exp_results["top10_accuracy_test"]


def retrieval_objective(trial):
    lmbda = trial.suggest_float("lmbda", 1, 1000, log=True)
    gamma = trial.suggest_float("gamma", 1, 1000, log=True)
    zslearner_kwargs = dict(lmbda=lmbda, gamma=gamma)
    embedding_kwargs = dict(
        input_embeddings=python_word_embeddings,
        target_embeddings=python_word_embeddings,
        input_embedding_method=embeddings.AverageWordEmbeddingsVectorizer,
        y_embedding_method=embeddings.AverageWordEmbeddingsVectorizer,
    )
    return retriever_learner_trial_impl(
        zslearner_kwargs, embedding_kwargs, abstract_data_train, abstract_data_test
    )

In [76]:
%%time
abstract_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    python_word_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer,
)

abstract_learner.fit_learner(abstract_data_train)

CPU times: user 14.9 s, sys: 913 ms, total: 15.8 s
Wall time: 14.2 s


In [77]:
run_learner_experiment(
    abstract_learner,
    abstract_data_train,
    abstract_data_test,
    fit_learner=False,
    queried_tasks="target",
)

{'accuracy_train': 0.4836958826273163,
 'accuracy_test': 0.13572357019064124,
 'top10_accuracy_train': 0.9285714285714286,
 'top10_accuracy_test': 0.8142414860681114}

In [83]:
run_learner_experiment(
    abstract_learner,
    abstract_data_train,
    abstract_data_test,
    fit_learner=False,
    queried_tasks="all",
)

{'accuracy_train': 0.4836958826273163,
 'accuracy_test': 0.13572357019064124,
 'top10_accuracy_train': 0.8232558139534883,
 'top10_accuracy_test': 0.7375478927203065}

In [84]:
@attr.s
class SBertModelWrapper:

    model = attr.ib()

    def fit(self, *args, **kwargs):
        pass

    def transform(self, X):
        return self.model.encode(X.values)


sbert_wrapper = SBertModelWrapper(sbert_model)
abstract_sbert_learner = RetrieverLearner(
    zero_shot.ESZSLearner(100, 100),
    embeddings.AverageWordEmbeddingsVectorizer(python_word_embeddings),
    sbert_wrapper,
)

In [None]:
run_learner_experiment(
    abstract_sbert_learner, abstract_data_train, abstract_data_test, queried_tasks="all"
)

# Abstract model using fasttext trained on Python code

In [None]:
ezslearner = zero_shot.ESZSLearner()
abstract_fasttext_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 100),
    word_embeddings,
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer,
)

In [None]:
run_learner_experiment(
    abstract_fasttext_learner, abstract_data_train, abstract_data_test
)

# word2vec model on READMEs

paperswithcode_with_readmes_df = pd.read_csv("output/papers_with_readmes.csv")
paperswithcode_with_imports_df['readme'] = paperswithcode_with_readmes_df['readme'] 
paperswithcode_with_features_df['readme'] = readmes

In [None]:
readme_data_train, readme_data_test = RepoTaskData.create_split(
    tasks_test,
    all_tasks,
    paperswithcode_with_features_df,
    paperswithcode_with_features_df["readme"].str.split(),
)

In [None]:
readme_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    python_word_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer,
)

readme_learner.fit_learner(readme_data_train)

In [None]:
%%time
run_learner_experiment(
    readme_learner, readme_data_train, readme_data_test, fit_learner=False
)

In [None]:
%%time
run_learner_experiment(
    readme_learner,
    readme_data_train,
    readme_data_test,
    fit_learner=False,
    queried_tasks="target",
)

In [None]:
readme_sbert_learner = RetrieverLearner(
    zero_shot.ESZSLearner(100, 100), sbert_wrapper, sbert_wrapper
)

In [292]:
run_learner_experiment(readme_sbert_learner, readme_data_train, readme_data_test)

Batches:   0%|          | 0/911 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 6.50 GiB (GPU 0; 23.70 GiB total capacity; 3.12 GiB already allocated; 1.43 GiB free; 3.51 GiB reserved in total by PyTorch)

## Fasttext on READMEs - worse than word2vec

In [None]:
ezslearner = zero_shot.ESZSLearner()
readme_fasttext_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    word_embeddings,
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer,
)

In [None]:
%%time
run_learner_experiment(readme_fasttext_learner, readme_data_train, readme_data_test)

# README keywords

readme_keywords_data_train, readme_keywords_data_test = RepoTaskData.create_split(tasks_train[has_readme], all_tasks[has_readme], paperswithcode_with_features_df[has_readme], readme_keywords[has_readme].str.split())

ezslearner = zero_shot.ESZSLearner()
readme_keywords_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(10, 10),
    word_embeddings,
    word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

run_learner_experiment(readme_keywords_learner, readme_keywords_data_train, readme_keywords_data_test)

## Import2Vec

In [110]:
import_data_train, import_data_test = RepoTaskData.create_split(
    tasks_test[is_valid_record],
    all_tasks[is_valid_record],
    paperswithcode_with_features_df[is_valid_record],
    paperswithcode_with_features_df[is_valid_record]["imports"],
)

IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [None]:
ezslearner = zero_shot.ESZSLearner()
import2vec_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(lmbda=100.0, gamma=10.0),
    import2vec,
    word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer,
)

run_learner_experiment(import2vec_learner, import_data_train, import_data_test)

## PRoNe

In [None]:
prone_embeddings = gensim.models.KeyedVectors.load("data/prone_embeddings.bin")

Using repo embedding from node embeddings

In [None]:
prone_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    prone_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer,
)

run_learner_experiment(prone_learner, graph_data_train, graph_data_test)

## GraphSage

## aggregating vertex embeddings 

In [None]:
graphsage_kv_file = (
    "output/graphsage_embeddings_fasttext_dim200_epochs20_dim200_layers2.bin"
)

In [125]:
graphsage_embeddings = gensim.models.KeyedVectors.load(graphsage_kv_file)

NameError: name 'graphsage_kv_file' is not defined

## using GraphSAGE model for embedding

In [87]:
# export


class LambdaTransformer:
    def __init__(self, transform_fn):
        self.transform = transform_fn

    def fit(self, X, **kwargs):
        return self


class PyGGraphModelTransformer:
    def __init__(self, model, dependency_graph_wrapper):
        self.model = model
        self.dependency_graph_wrapper = dependency_graph_wrapper

    def transform(self, x):
        return self.dependency_graph_wrapper.get_vertex_embeddings(x, self.model)

    def fit(self, X, **kwargs):
        return self

In [88]:
from github_search.pytorch_geometric_data import PygGraphWrapper
import torch

In [89]:
graphsage_model = torch.load("output/graphsage_model_100_dim200_layers2.pth").cpu()
graphsage_model.eval()  # = False

SAGE(
  (convs): ModuleList(
    (0): SAGEConv(200, 200)
    (1): SAGEConv(200, 200)
  )
)

In [90]:
from github_search import data_utils

In [91]:
dependency_graph_wrapper = data_utils.make_extended_dependency_wrapper(
    repos=pd.concat([readme_data_train.repos, readme_data_test.repos]),
    dependency_records_df=dependency_records_df[
        dependency_records_df["edge_type"] == "repo-file"
    ],
    fasttext_model=fasttext_model,
)

2022-03-08 23:34:32,014 - root - INFO - creating dependency nodes
2022-03-08 23:34:34,796 - root - INFO - loading dependency records
2022-03-08 23:34:35,088 - root - INFO - creating dependency dataframe
2022-03-08 23:34:36,217 - root - INFO - creating dependency graph wrapper
  self.dataset = Data(torch.tensor(features), torch.tensor(edge_index))


In [92]:
from github_search import data_utils

In [93]:
# export


def get_vertex_embeddings(wrapper, vertex_subset, model):
    features = (
        model.full_forward(wrapper.dataset.x, wrapper.dataset.edge_index)
        .cpu()
        .detach()
        .numpy()
    )
    return features[wrapper.vertex_mapping.loc[vertex_subset]]

In [94]:
readme_data_test.repos.isin(dependency_graph_wrapper.records_df["source"]).mean()

1.0

In [95]:
# export
import pathlib


def make_pyggraph_retriever_learner(
    zs_learner, dependency_graph_wrapper, model, y_embedder
):

    lambda_transformer = PyGGraphModelTransformer(model, dependency_graph_wrapper)
    return RetrieverLearner(zs_learner, lambda_transformer, y_embedder)


def save_pyggraph_retriever_learner(pyggraph_retriever_learner, directory):
    p = pathlib.Path(directory)
    p.mkdir(exist_ok=True)

In [96]:
graphsage_data_train, graphsage_data_test = RepoTaskData.create_split(
    tasks_test,
    all_tasks,
    paperswithcode_with_features_df,
    paperswithcode_with_features_df["repo"].apply(lambda s: [s]),
)

In [97]:
graphsage_learner = make_pyggraph_retriever_learner(
    zero_shot.ESZSLearner(100, 10),
    dependency_graph_wrapper,
    graphsage_model,
    embeddings.AverageWordEmbeddingsVectorizer(python_word_embeddings),
)

In [98]:
%%time
graphsage_results = run_learner_experiment(
    graphsage_learner, graphsage_data_train, graphsage_data_test, fit_learner=True
)

  self.dataset = Data(torch.tensor(features), torch.tensor(edge_index))
  self.dataset = Data(torch.tensor(features), torch.tensor(edge_index))
  self.dataset = Data(torch.tensor(features), torch.tensor(edge_index))
  self.dataset = Data(torch.tensor(features), torch.tensor(edge_index))
  self.dataset = Data(torch.tensor(features), torch.tensor(edge_index))


CPU times: user 2min 53s, sys: 54.8 s, total: 3min 47s
Wall time: 2min


In [99]:
graphsage_results

{'accuracy_train': 0.11401542715761898,
 'accuracy_test': 0.035801312089971886,
 'top10_accuracy_train': 0.34991119005328597,
 'top10_accuracy_test': 0.21584699453551912}

In [102]:
%%time
abstract_results = run_learner_experiment(
    abstract_learner, abstract_data_train, abstract_data_test, fit_learner=False
)
readme_results = run_learner_experiment(
    readme_learner, readme_data_train, readme_data_test, fit_learner=False
)

CPU times: user 4min 30s, sys: 22.5 s, total: 4min 52s
Wall time: 4min 16s


In [None]:
%%time
import2vec_results = run_learner_experiment(
    import2vec_learner, import_data_train, import_data_test, fit_learner=False
)
prone_results = run_learner_experiment(
    prone_learner, graph_data_train, graph_data_test, fit_learner=False
)

# Results 

In [104]:
results_df = pd.DataFrame.from_records(
    [abstract_results, readme_results, graphsage_results]
)
results_df["method"] = ["abstract", "readme", "graphsage"]

In [105]:
results_df.round(3)

Unnamed: 0,accuracy_train,accuracy_test,top10_accuracy_train,top10_accuracy_test,method
0,0.502,0.248,0.75,0.658,abstract
1,0.303,0.167,0.7,0.59,readme
2,0.114,0.036,0.35,0.216,graphsage


In [106]:
from IPython import display

In [107]:
results_df.round(3).to_csv("output/retrieval_results.csv")

In [108]:
results_df[["method", "top10_accuracy_train", "top10_accuracy_test"]].round(3)

Unnamed: 0,method,top10_accuracy_train,top10_accuracy_test
0,abstract,0.75,0.658
1,readme,0.7,0.59
2,graphsage,0.35,0.216


In [178]:
# export


def get_query_level_results(retriever_learner, data_test, k=10):

    accuracy_test = retriever_learner.evaluate(data_test, metrics.accuracy_score)
    results = get_retrieval_metrics(retriever_learner, data_test, k=k)
    results["position"] = results["position"].replace(-1, np.inf)

    return accuracy_test, results

In [91]:
# export
def get_idx_or_inf(xs, a):
    idxs = np.where(xs == a)[0].astype(int)
    if len(idxs) == 0:
        return np.inf
    else:
        return idxs[0]


def get_areas(area_grouped_tasks, tasks):
    return tasks.apply(
        lambda ts: area_grouped_tasks["area"][
            area_grouped_tasks["task"].isin(ts)
        ].unique()
    )


erroneous_area_tasks = []


def analyze_query_level_results(
    query_level_results, area_grouped_tasks, erroneous_area_tasks=erroneous_area_tasks
):
    retrieval_results_with_area_test = area_grouped_tasks.merge(
        query_level_results, left_on="task", right_index=True
    )
    for tasks in retrieval_results_with_area_test["retrieved_labels"].values:
        for task in tasks:
            try:
                partial(get_areas, area_grouped_tasks)(pd.Series([[task]]))
            except:
                erroneous_area_tasks.append(task)
    retrieved_areas = get_areas(
        area_grouped_tasks, retrieval_results_with_area_test["retrieved_labels"]
    )  # apply(partial(get_areas, area_grouped_tasks))
    retrieval_results_with_area_test["retrieved_areas"] = retrieved_areas
    is_area_retrieved = retrieval_results_with_area_test.apply(
        lambda row: row["area"] in row["retrieved_areas"][:10], axis=1
    )
    num_area_retrieved = retrieval_results_with_area_test.apply(
        lambda row: len(
            np.where(row["area"] == np.array(row["retrieved_areas"])[:10])[0]
        ),
        axis=1,
    )
    area_idx = retrieval_results_with_area_test.apply(
        lambda row: get_idx_or_inf(np.array(row["retrieved_areas"]), row["area"]),
        axis=1,
    )
    retrieval_results_with_area_test["area_recalled"] = is_area_retrieved
    retrieval_results_with_area_test["area_recalled_position"] = area_idx
    retrieval_results_with_area_test["num_area_recalled"] = num_area_retrieved
    query_level_results = retrieval_results_with_area_test.groupby("area").agg(
        {
            "recalled": "mean",
            "num_recalled": "mean",
            # "position": ["median", "mean"],
            "area_recalled": "mean",
            "num_area_recalled": "mean",
            "area_recalled_position": ["median"],
        }
    )
    query_level_results["count"] = retrieval_results_with_area_test[
        "area"
    ].value_counts()  # groupby('area').agg('count')
    return query_level_results


def get_analyzed_query_level_results(retriever_learner, data_test, area_grouped_tasks):
    detailed_results_all = get_query_level_results(retriever_learner, data_test)
    retrieval_results_with_area_test = analyze_query_level_results(
        query_level_results, area_grouped_tasks
    )
    return retrieval_results_with_area_test

In [92]:
%%time
readme_accuracy, raw_readme_area_results = get_query_level_results(
    readme_learner, readme_data_test
)

CPU times: user 23.1 s, sys: 4.03 s, total: 27.2 s
Wall time: 21.8 s


In [93]:
graphsage_accuracy, raw_graphsage_area_results = get_query_level_results(
    graphsage_learner, graphsage_data_test
)

  self.dataset = Data(torch.tensor(features), torch.tensor(edge_index))
  self.dataset = Data(torch.tensor(features), torch.tensor(edge_index))


In [94]:
readme_accuracy

0.1670103092783505

  return array(a, dtype, copy=False, order=order)


ValueError: could not broadcast input array from shape (87,200) into shape (87,)

In [95]:
graph_data_test.y.value_counts()[:-20]

representation learning    390
continuous control         378
image retrieval            213
keypoint detection         205
meta learning              189
                          ... 
video retrieval             17
neural rendering            17
decipherment                17
argument mining             17
foveation                   16
Name: least_common_task, Length: 67, dtype: int64

In [96]:
readme_area_results = analyze_query_level_results(
    raw_readme_area_results, area_grouped_tasks
)

In [97]:
readme_area_results.round(2).sort_values("count", ascending=False)

Unnamed: 0_level_0,recalled,num_recalled,area_recalled,num_area_recalled,area_recalled_position,count
Unnamed: 0_level_1,mean,mean,mean,mean,median,Unnamed: 6_level_1
area,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
computer-vision,0.54,1.93,0.99,0.99,0.0,114
miscellaneous,0.51,1.88,0.96,0.96,3.0,113
methodology,0.65,2.35,1.0,1.0,1.0,66
natural-language-processing,0.48,1.95,0.98,0.98,2.0,60
graphs,0.7,2.45,0.95,0.95,0.0,20
time-series,0.47,2.47,0.73,0.73,2.0,15
medical,0.5,1.79,0.79,0.79,1.5,14
speech,0.69,1.54,1.0,1.0,3.0,13
playing-games,0.55,2.09,1.0,1.0,2.0,11
robots,0.57,1.0,0.86,0.86,4.0,7


In [98]:
graphsage_tasks = graphsage_data_train.all_tasks.explode()

In [99]:
readme_area_results.round(2).sort_values("count", ascending=False).to_latex(
    open("output/readme_area_results.tex", "w")
)

  readme_area_results.round(2).sort_values('count', ascending=False).to_latex(open("output/readme_area_results.tex", "w"))


In [100]:
graphsage_area_results = analyze_query_level_results(
    raw_graphsage_area_results, area_grouped_tasks
)

In [101]:
graphsage_area_results.round(2).sort_values("count", ascending=False)

Unnamed: 0_level_0,recalled,num_recalled,area_recalled,num_area_recalled,area_recalled_position,count
Unnamed: 0_level_1,mean,mean,mean,mean,median,Unnamed: 6_level_1
area,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
computer-vision,0.23,0.6,0.99,0.99,0.0,114
miscellaneous,0.17,0.58,0.96,0.96,4.0,113
methodology,0.3,0.59,1.0,1.0,2.0,66
natural-language-processing,0.17,0.32,0.92,0.92,3.5,60
graphs,0.25,0.45,0.85,0.85,1.0,20
time-series,0.0,0.0,0.2,0.2,inf,15
medical,0.29,0.57,0.86,0.86,2.0,14
speech,0.23,0.38,0.77,0.77,5.0,13
playing-games,0.0,0.0,0.91,0.91,3.0,11
robots,0.14,0.14,0.29,0.29,inf,7


In [102]:
graphsage_area_results.round(2).sort_values("count", ascending=False).to_latex(
    open("output/graphsage_area_results.tex", "w")
)

  graphsage_area_results.round(2).sort_values('count', ascending=False).to_latex(open("output/graphsage_area_results.tex", "w"))


In [103]:
graphsage_area_results.sort_values("count", ascending=False)

Unnamed: 0_level_0,recalled,num_recalled,area_recalled,num_area_recalled,area_recalled_position,count
Unnamed: 0_level_1,mean,mean,mean,mean,median,Unnamed: 6_level_1
area,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
computer-vision,0.22807,0.596491,0.991228,0.991228,0.0,114
miscellaneous,0.168142,0.575221,0.964602,0.964602,4.0,113
methodology,0.30303,0.590909,1.0,1.0,2.0,66
natural-language-processing,0.166667,0.316667,0.916667,0.916667,3.5,60
graphs,0.25,0.45,0.85,0.85,1.0,20
time-series,0.0,0.0,0.2,0.2,inf,15
medical,0.285714,0.571429,0.857143,0.857143,2.0,14
speech,0.230769,0.384615,0.769231,0.769231,5.0,13
playing-games,0.0,0.0,0.909091,0.909091,3.0,11
robots,0.142857,0.142857,0.285714,0.285714,inf,7


In [109]:
train_tasks_all = graph_data_train.all_tasks.explode().unique()

In [119]:
(graph_data_train.all_tasks.explode().value_counts() < 10).mean()

0.07815275310834814

In [123]:
(graphsage_data_test.all_tasks.explode().value_counts() < 10).mean()

0.47540983606557374

In [105]:
len(graph_data_train.y.unique())

364

In [106]:
len(graph_data_test.y.unique())

87

In [107]:
len(set(graph_data_test.y.unique()).intersection(train_tasks_all))

74

In [108]:
len(graph_data_test.y.unique())

87

In [109]:
!cat output/graphsage_area_results.tex

# Exporting models 

In [110]:
pickle.dump(readme_data_test, open("output/readme_data_test.pkl", "wb"))

In [111]:
pickle.dump(graphsage_data_test, open("output/graphsage_data_test.pkl", "wb"))

In [112]:
pickle.dump(graphsage_learner, open("output/graphsage_learner.pkl", "wb"))

TypeError: cannot pickle 'fasttext_pybind.fasttext' object

In [113]:
pickle.dump(readme_learner, open("output/readme_learner.pkl", "wb"))

# Concatenation of repo, import embeddings

In [None]:
paired_data_train, paired_data_test = RepoTaskData.create_split(
    tasks_test,
    all_tasks,
    paperswithcode_with_features_df,
    paperswithcode_with_imports_df["imports"],
)
paired_data_train.X = graph_data_train.X + " " + import_data_train.X
paired_data_test.X = graph_data_test.X + " " + import_data_test.X

In [None]:
paired_data_train.X

In [None]:
paired_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    PairedKeyedVectors(python_word_embeddings.wv, graphsage_embeddings),
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer,
)

paired_learner.fit_learner(graph_data_train)

In [None]:
paired_learner.evaluate(graph_data_train, metric=metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(paired_learner, paired_data_train, k=10)

In [None]:
get_retrieval_accuracy(paired_learner, paired_data_test, k=10)

In [None]:
results = []
for (learner, learner_name, test) in zip(
    [import2vec_learner, prone_learner, paired_learner],
    ["import2vec", "prone", "both"],
    [X_test, repo_graph_terms_test, X_paired_test],
):
    accs = []
    for k in [1, 3, 5, 10, 20]:
        rec = get_retrieval_accuracy(learner, test, y_test, test_task_idxs, k=k)
        accs.append(rec)
    results.append(pd.Series(name=learner_name, data=accs))

In [None]:
results_df = pd.DataFrame(results)
results_df.columns = ["Accuracy@{}".format(i) for i in [1, 3, 5, 10, 20]]

In [None]:
results_df.round(3).to_markdown(open("metrics/zsl_results.md", "w"))

In [None]:
!cat metrics/zsl_results.md

In [None]:
import toolz

In [None]:
task_distances = metrics.pairwise.cosine_distances(task_embeddings, task_embeddings)

In [None]:
poincare_embeddings = gensim.models.KeyedVectors.load("data/poincare5.vec")

In [None]:
import gensim.models.wrappers.fasttext
from gensim.test.utils import datapath

In [None]:
from github_search import typical_file_parts
from mlutil import prototype_selection

In [None]:
selected_lines_df = typical_file_parts.get_selected_lines_and_repos(
    python_files_df["repo_name"], python_files_df["content"]
)