In [1]:
#default_exp matching_zsl

In [2]:
#export
import os
import ast
import tqdm
import json
import attr
from operator import itemgetter

from scarce_learn import zero_shot
from mlutil.feature_extraction import embeddings
import itertools


import pandas as pd
import numpy as np
from sklearn import feature_extraction, metrics, model_selection

import matplotlib.pyplot as plt
import gensim

from github_search import paperswithcode_tasks

import mlutil
from functools import partial


from scarce_learn.zero_shot import devise_jax, devise_torch

In [3]:
%env XLA_PYTHON_CLIENT_PREALLOCATE=false

env: XLA_PYTHON_CLIENT_PREALLOCATE=false


In [4]:
# upstream

import_corpus_path = 'output/module_corpus.csv'
word_vectors_filename = 'output/import2vec_module_vectors.bin'

In [5]:
%cd ..

/home/kuba/Projects/github_search


%%time
import_corpus_df = pd.read_csv(import_corpus_path)
per_repo_imports = import_corpus_df.groupby('repo')['imports'].agg(sum).apply(set)
import_corpus_df['imports'] = import_corpus_df['imports'].apply(ast.literal_eval)

In [6]:
%%time
python_files_df = pd.read_csv('data/crawled_python_files.csv', encoding='latin-1')
repo_names = python_files_df['repo_name']
import_corpus_df = pd.read_csv(import_corpus_path)
per_repo_imports = import_corpus_df.groupby('repo')['imports'].agg(sum).apply(set)

CPU times: user 2min 50s, sys: 3.2 s, total: 2min 53s
Wall time: 2min 55s


In [7]:
python_files_df.shape

(1402272, 3)

In [8]:
import_corpus_df.shape

(1375818, 3)

In [9]:
python_files_df['repo_name']

0                      trangvu/ape-npi
1                      trangvu/ape-npi
2                      trangvu/ape-npi
3                      trangvu/ape-npi
4                      trangvu/ape-npi
                      ...             
1402267    wayne1204/NOAA-fish-finding
1402268    wayne1204/NOAA-fish-finding
1402269    wayne1204/NOAA-fish-finding
1402270    wayne1204/NOAA-fish-finding
1402271    wayne1204/NOAA-fish-finding
Name: repo_name, Length: 1402272, dtype: object

In [10]:
python_files_df['repo_name'].unique().shape

(18933,)

python_files_df['repo'] = python_files_df['repo_name'].str.split("/").apply(itemgetter(1))  + '/' + python_files_df['repo_name']
repo_names_tmp = python_files_df['repo_name']
repo_names = repo_names_tmp.unique()
python_files_df['repo_name'] = python_files_df['repo']
python_files_df['repo'] = repo_names_tmp

In [11]:
%%time
import2vec = gensim.models.KeyedVectors.load(word_vectors_filename)
import2vec_embedder = mlutil.feature_extraction.embeddings.AverageWordEmbeddingsVectorizer(import2vec)

CPU times: user 7.9 ms, sys: 0 ns, total: 7.9 ms
Wall time: 9.11 ms


In [12]:
paperswithcode_with_imports_df = pd.read_csv('output/papers_with_imports.csv')
paperswithcode_with_imports_df['tasks'] = paperswithcode_with_imports_df['tasks'].str.replace("2d ", "").str.replace("3d ", "").str.replace("4d ", "").str.replace("6d ", "").str.lower().apply(ast.literal_eval)
paperswithcode_with_imports_df['imports'] = paperswithcode_with_imports_df['imports'].str.replace("set\(\)", "{}").apply(ast.literal_eval)#str.replace("2d ", "").str.replace("3d ", "").str.replace("4d ", "").str.replace("6d ", "").str.lower().apply(ast.literal_eval)

In [13]:
paperswithcode_with_imports_df.shape

(12224, 23)

In [14]:
paperswithcode_with_imports_df['n_imports'] = paperswithcode_with_imports_df['imports'].apply(len) 

In [15]:
paperswithcode_with_imports_df['n_imports_with_embeddings'] = paperswithcode_with_imports_df['imports'].apply(lambda imps: len([imp in import2vec.vocab.keys() for imp in imps]))

In [16]:
%%time
word_embeddings = mlutil.feature_extraction.embeddings.load_gensim_embedding_model('glove-wiki-gigaword-300')

CPU times: user 32.7 s, sys: 243 ms, total: 33 s
Wall time: 33.2 s


In [17]:
python_word_embeddings = gensim.models.Word2Vec.load('output/abstract_w2v100.bin')

In [18]:
@attr.s
class RepoTaskData:
    
    tasks = attr.ib()
    repos = attr.ib()
    X = attr.ib()
    all_tasks = attr.ib()
    y = attr.ib()
    
    def split_tasks(area_grouped_tasks, test_size=0.2):
        tasks_train, tasks_test = model_selection.train_test_split(area_grouped_tasks['task'], stratify=area_grouped_tasks['area'], test_size=test_size, random_state=0)
        return tasks_train, tasks_test
    
    def create_split(tasks_train, all_tasks, paperswithcode_with_features_df, X_repr):
        train_indicator = paperswithcode_with_features_df['most_common_task'].isin(tasks_train)
        print(train_indicator.shape)
        repos_train = paperswithcode_with_features_df['repo'][train_indicator]
        repos_test = paperswithcode_with_features_df['repo'][~train_indicator]
        X_repr = X_repr.apply(lambda x: " ".join(x))
        X_train = X_repr[train_indicator]
        X_test = X_repr[~train_indicator]
        all_tasks_train = all_tasks[train_indicator]
        all_tasks_test = all_tasks[~train_indicator]
        y_train = paperswithcode_with_features_df[train_indicator]['most_common_task'].str.lower()
        y_test = paperswithcode_with_features_df[~train_indicator]['most_common_task'].str.lower()
        
        return (
            RepoTaskData(tasks_train, repos_train, X_train, all_tasks_train, y_train),
            RepoTaskData(tasks_test, repos_test, X_test, all_tasks_test, y_test)
        )

In [19]:
#export


def get_first_vocab_entry(vocab):
    return list(itertools.islice(vocab.items(), 1))[0][0] 


class PairedKeyedVectors:
    
    @attr.s
    class wv:
        vocab = attr.ib()
    
    def __init__(self, kv1, kv2):
        self.kv1 = kv1
        self.kv2 = kv2
        self.vocab = {**kv1.vocab, **kv2.vocab} 
        self.dim1 = len(kv1[get_first_vocab_entry(kv1.vocab)])
        self.dim2 = len(kv2[get_first_vocab_entry(kv2.vocab)])
        self.wv= PairedKeyedVectors.wv(self.vocab)
    
    def __getitem__(self, item):
        if not item in self.kv1.vocab.keys():
            return np.concatenate([np.zeros(self.dim1), self.kv2[item]])
        elif not item in self.kv2.vocab.keys():
            return np.concatenate([self.kv1[item], np.zeros(self.dim2)])
        else:
            return np.concatenate([self.kv1[item], self.kv2[item]])
    


@attr.s
class RetrieverLearner:
    
    zs_learner: zero_shot.ZeroShotClassifier = attr.ib()
    input_embedder: embeddings.EmbeddingVectorizer = attr.ib() 
    y_embedder: embeddings.EmbeddingVectorizer = attr.ib()
    input_embedder_kwargs = attr.ib(default=dict())
        
    @staticmethod
    def create(
        zs_learner: zero_shot.ZeroShotClassifier,
        input_embeddings: gensim.models.KeyedVectors,
        target_embeddings: gensim.models.KeyedVectors,
        input_embedding_method: embeddings.EmbeddingVectorizer,
        y_embedding_method: embeddings.EmbeddingVectorizer,
        input_embedder_kwargs=dict()
    ):
        input_embedder = input_embedding_method(input_embeddings, **input_embedder_kwargs) 
        y_embedder = y_embedding_method(target_embeddings)
        return RetrieverLearner(zs_learner, input_embedder, y_embedder)
    
    def get_target_embeddings(self, y):
        unique_y = pd.Series(y.unique())
        y_embeddings = self.y_embedder.transform(unique_y)
        return unique_y, y_embeddings
    
    def fit_learner(self, data, **kwargs):
        self.input_embedder.fit(data.X)
        X_embeddings = self.input_embedder.transform(data.X)
        self.y_embedder.fit(data.y)
        unique_y, y_embeddings = self.get_target_embeddings(data.y)
        input_y_idxs = data.y.apply(lambda t: unique_y[unique_y == t].index[0])
        self.zs_learner.fit(np.array(X_embeddings), np.array(input_y_idxs), np.array(y_embeddings), **kwargs)
        
    def predict_idxs(self, X, y_embeddings):
        X_embeddings = self.input_embedder.transform(X)
        return self.zs_learner.predict(X_embeddings, y_embeddings)
    
    def predict_topk(self, X, y_embeddings, target_names, k=5, similarity=metrics.pairwise.cosine_similarity):
        X_embeddings = self.input_embedder.transform(X)
        predictions = self.zs_learner.predict_raw(X_embeddings)
        target_similarities = similarity(predictions, y_embeddings)
        targets = [target_names[row[:k]] for row in (-target_similarities).argsort(axis=1)]
        return targets
        
    def evaluate(self, data, metric):
        unique_y, y_embeddings = self.get_target_embeddings(data.y)
        input_y_idxs = data.y.apply(lambda t: unique_y[unique_y == t].index[0])
        predicted_idxs = self.predict_idxs(data.X, y_embeddings)
        return metric(input_y_idxs, predicted_idxs)

In [20]:
#export

def get_accuracy(learner, X, y, y_names, k=10, similarity=metrics.pairwise.cosine_similarity):
    input_embeddings = learner.input_embedder.transform(X)
    y_embeddings = learner.y_embedder.transform(y_names)
    predictions = learner.zs_learner.predict_raw(input_embeddings)
    target_similarities = similarity(predictions, y_embeddings)
    target_idxs = (-target_similarities).argsort(axis=1)
    targets = [y_names.iloc[row[:k]] for row in target_idxs]

    accuracies = np.zeros(len(X))
    for i in range(len(X)):
        true_tasks = set(all_tasks_test.iloc[i])
        accuracies[i] = len(true_tasks.intersection(set(targets[i].values))) / min(len(true_tasks), k)
    return accuracies.mean()

In [21]:
import pickle

graph = pickle.load(open('output/call_igraph.pkl', 'rb'))

SyntaxError: unexpected EOF while parsing (<ipython-input-163-42a59506436d>, line 2)

In [None]:
ezslearner = zero_shot.ESZSLearner()
abstract_fasttext_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    word_embeddings,fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer
)

abstract_fasttext_learner.fit_learner(abstract_data_train)

In [22]:
len(graph.get_vertex_dataframe().iloc[graph.neighborhood(vertices=["<ROOT>"])[0]])

18934

get repos that are in graph 

In [23]:
graph_nodes = graph.get_vertex_dataframe()['name'].unique()

In [24]:
%%time
paperswithcode_with_features_df = paperswithcode_with_imports_df[
    paperswithcode_with_imports_df['repo'].isin(graph.get_vertex_dataframe()['name']) |
    paperswithcode_with_imports_df['repo'].apply(lambda s: s.split("/")[1]).isin(graph.get_vertex_dataframe()['name'])
]

CPU times: user 527 ms, sys: 27.8 ms, total: 555 ms
Wall time: 559 ms


In [25]:
paperswithcode_with_imports_df = paperswithcode_with_imports_df[paperswithcode_with_imports_df['repo'].isin(paperswithcode_with_features_df['repo'])]

In [26]:
def clean_task_name(task_name):
    return task_name.replace("2d ", "").replace("3d ", "").replace("4d ", "").replace("6d ", "").lower()

paperswithcode_with_features_df['most_common_task'] = paperswithcode_with_features_df['most_common_task'].str.lower()
tasks = paperswithcode_with_features_df['most_common_task'].str.lower()
tasks = tasks.apply(clean_task_name)
all_tasks = paperswithcode_with_features_df['tasks'].apply(lambda s: [clean_task_name(t) for t in s])
paperswithcode_with_features_df.shape

(12224, 25)

In [27]:
all_tasks.explode().value_counts()[:100]

semantic segmentation             1066
object detection                  1050
image classification               946
language modelling                 494
representation learning            454
                                  ... 
cell segmentation                   70
nuclear segmentation                69
multi-person pose estimation        69
natural language understanding      69
semantic parsing                    68
Name: tasks, Length: 100, dtype: int64

In [28]:
#export

def get_area_grouped_tasks(paperswithcode_tasks_path='data/paperswithcode_tasks.csv'):
    area_grouped_tasks = pd.read_csv('data/paperswithcode_tasks.csv')
    area_grouped_tasks['task'] = area_grouped_tasks['task'].str.replace("-", ' ')
    area_grouped_tasks = area_grouped_tasks[area_grouped_tasks['task'].isin(tasks)]
    area_counts = area_grouped_tasks['area'].value_counts()
    area_grouped_tasks = area_grouped_tasks[area_grouped_tasks['area'].isin(area_counts.index[area_counts > 1])]
    return area_grouped_tasks

In [29]:
area_grouped_tasks = get_area_grouped_tasks()

In [30]:
tasks_train, tasks_test = RepoTaskData.split_tasks(area_grouped_tasks)

In [31]:
len(tasks_train)

295

In [32]:
tasks_test

467                        object counting
1253    conversational response generation
854              probabilistic programming
525          facial expression recognition
1683                   montezuma's revenge
                       ...                
896                      data augmentation
926                                    eeg
1646            neural architecture search
592                       graph regression
1230               sentence classification
Name: task, Length: 74, dtype: object

In [33]:
len(tasks_test)

74

In [34]:
paperswithcode_with_features_df['most_common_task']

0               dictionary learning
1                   region proposal
2                  image generation
3        natural language inference
4        natural language inference
                    ...            
12219             anomaly detection
12220             anomaly detection
12221             anomaly detection
12222                style transfer
12223       representation learning
Name: most_common_task, Length: 12224, dtype: object

In [35]:
paperswithcode_with_features_df['most_common_task'].isin(tasks_test).sum()

2915

In [36]:
paperswithcode_with_features_df.shape

(12224, 25)

In [37]:
paperswithcode_with_features_df.shape

(12224, 25)

In [38]:
from github_search import github_readmes
import concurrent.futures

In [39]:
def try_decode(s, codec="utf-8"):
    try:
        return s.decode(codec)
    except:
        return None


def get_readme_summaries(upstream, product, keywords=True):
    pool = concurrent.futures.ProcessPoolExecutor(max_workers=10)
    raw_readmes = list(pool.map(github_readmes.get_readme, paperswithcode_with_features_df['repo']))
    readmes = pd.Series(raw_readmes).apply(try_decode)
    return readmes

In [40]:
from github_search import python_call_graph

In [41]:
def get_readmes(df, keywords=True):
    pool = concurrent.futures.ProcessPoolExecutor(max_workers=10)
    raw_readmes = list(pool.map(github_readmes.get_readme, df['repo']))
    readmes = list(map(try_decode, raw_readmes))
    return readmes

In [42]:
paperswithcode_with_features_df.columns

Index(['Unnamed: 0', 'paper_url', 'arxiv_id', 'title', 'abstract', 'url_abs',
       'url_pdf', 'proceeding', 'authors', 'tasks', 'date', 'methods',
       'framework', 'mentioned_in_github', 'mentioned_in_paper',
       'paper_arxiv_id', 'paper_title', 'paper_url_abs', 'paper_url_pdf',
       'repo', 'repo_url', 'most_common_task', 'imports', 'n_imports',
       'n_imports_with_embeddings'],
      dtype='object')

In [47]:
%%time
readmes = get_readmes(paperswithcode_with_features_df)

KeyboardInterrupt: 

In [103]:
import gensim

def try_keywords(text):
    return python_call_graph.try_run(gensim.summarization.keywords)(text)

In [104]:
pool = concurrent.futures.ProcessPoolExecutor(max_workers=20)

In [105]:
%%time
readme_keywords = pd.Series(pool.map(try_keywords, readmes)).str.replace("\n", " ")

CPU times: user 2.91 s, sys: 1.52 s, total: 4.43 s
Wall time: 5min 28s


In [55]:
dependency_records_df = pd.read_csv('output/processed_dependency_records.csv').dropna()#.iloc[:1000000]
non_root_dependency_records_df = dependency_records_df[
    (dependency_records_df['source'] != "<ROOT>") &
    (dependency_records_df['edge_type'] != 'repo-repo')
]
repo_descriptions = non_root_dependency_records_df[['source', 'repo_description']].groupby('source').apply(lambda df: df['repo_description'].iloc[0])

describable_paperswithcode_with_features_df = paperswithcode_with_features_df[paperswithcode_with_features_df['repo'].isin(repo_descriptions.index)]
describable_paperswithcode_with_imports_df = paperswithcode_with_imports_df[paperswithcode_with_imports_df['repo'].isin(repo_descriptions.index)]
describable_repo_tasks = all_tasks[paperswithcode_with_imports_df['repo'].isin(repo_descriptions.index)]


import_data_train, import_data_test = RepoTaskData.create_split(tasks_train, describable_repo_tasks, describable_paperswithcode_with_features_df, describable_paperswithcode_with_imports_df['imports'])

In [56]:
describable_paperswithcode_with_features_df = paperswithcode_with_features_df[paperswithcode_with_features_df['repo'].isin(repo_descriptions.index)]
describable_paperswithcode_with_imports_df = paperswithcode_with_imports_df[paperswithcode_with_imports_df['repo'].isin(repo_descriptions.index)]
describable_repo_tasks = all_tasks[paperswithcode_with_imports_df['repo'].isin(repo_descriptions.index)]


import_data_train, import_data_test = RepoTaskData.create_split(tasks_train, describable_repo_tasks, describable_paperswithcode_with_features_df, describable_paperswithcode_with_imports_df['imports'])

In [57]:
describable_paperswithcode_with_features_df.shape

(11245, 25)

In [58]:
describable_paperswithcode_with_imports_df.shape

(11245, 25)

In [59]:
all_tasks

0                                    [dictionary learning]
1        [region proposal, user constrained thumbnail g...
2         [conditional image generation, image generation]
3        [answer selection, natural language inference,...
4        [answer selection, natural language inference,...
                               ...                        
12219    [anomaly detection, unsupervised anomaly detec...
12220    [anomaly detection, unsupervised anomaly detec...
12221    [anomaly detection, unsupervised anomaly detec...
12222                                     [style transfer]
12223    [denoising, domain adaptation, representation ...
Name: tasks, Length: 12224, dtype: object

(11245,)


In [61]:
import_data_train.X.shape, import_data_test.X.shape

((7647,), (3598,))

In [66]:
task_embedder = mlutil.feature_extraction.embeddings.AverageWordEmbeddingsVectorizer(word_embeddings)

In [67]:
from scipy.stats import hmean

In [68]:
#export


def get_outgoing_edges(graph, node):
    #idx = pd.Index(graph.names).get_loc(node)
    #outgoing_edges_idx = np.where(graph.mat[idx].todense())[1]
    return graph.get_vertex_dataframe().iloc[graph.successors(node)]['name']
    #return graph.names[outgoing_edges_idx]


def get_repo_functions(graph, repo):
    return ' '.join(get_outgoing_edges(graph, repo).values)

In [69]:
graph_records = pd.read_csv('output/dependency_records.csv')

In [70]:
%%time
if os.path.exists("output/tmp_graph_data.pkl"):
    (graph_data_train, graph_data_test) = pickle.load(open("output/tmp_graph_data.pkl", "rb"))
else:
    graph_data_train, graph_data_test = RepoTaskData.create_split(tasks_train, all_tasks, paperswithcode_with_features_df, paperswithcode_with_imports_df['imports'])
    graph_data_train.X = graph_data_train.repos.apply(lambda x: get_repo_functions(graph, x))
    graph_data_test.X = graph_data_test.repos.apply(lambda x: get_repo_functions(graph, x))
    pickle.dump((graph_data_train, graph_data_test), open("output/tmp_graph_data.pkl", "wb"))

CPU times: user 16.8 ms, sys: 37 µs, total: 16.9 ms
Wall time: 92.7 ms


In [71]:
repo_descriptions.loc[graph_data_train.repos[6]]

'kinimod23/ATS_Project read_one_stop_english fasttext_to_file preprocess_dump preprocess BCNN utils train test'

In [72]:
graph_data_train, graph_data_test = RepoTaskData.create_split(tasks_train, describable_repo_tasks, describable_paperswithcode_with_features_df, describable_paperswithcode_with_imports_df['imports'])

(11245,)


In [73]:
graph_data_train.X = pd.Series(repo_descriptions.loc[graph_data_train.repos].values, index=graph_data_train.repos.index)

In [74]:
graph_data_test.X = pd.Series(repo_descriptions.loc[graph_data_test.repos].values, index=graph_data_test.repos.index)

In [75]:
graph_data_train.repos.iloc[0]

'canbakiskan/neuro-inspired-defense'

In [76]:
get_outgoing_edges(graph, get_outgoing_edges(graph, graph_data_train.repos.iloc[0]).iloc[0])

vertex ID
3203          get_accuracy_score
4890                         hvp
16936      test_bitmap_mask_init
17720                     matmul
20276                        fix
                   ...          
1333883                write_wav
1337476                   repeat
1338246                    vgg19
1341862                      max
1344759              get_dataset
Name: name, Length: 406, dtype: object

In [77]:
len(graph_data_train.X)

7647

In [78]:
#export


def retrieve_query_results(learner, data, query, k=10, similarity=metrics.pairwise.cosine_similarity):
    input_embeddings = learner.input_embedder.transform(data.X)
    y_embeddings = learner.y_embedder.transform([query])
    predictions = learner.zs_learner.predict_raw(input_embeddings)
    input_target_similarities = similarity(predictions, y_embeddings)
    return data.X.iloc[np.argsort(-input_target_similarities[:,0])[:k]]

    
def get_retrieval_results(learner, data, k=10, similarity=metrics.pairwise.cosine_similarity):
    y_names, __ = learner.get_target_embeddings(data.y)
    input_embeddings = learner.input_embedder.transform(data.X)
    y_embeddings = learner.y_embedder.transform(y_names)
    predictions = learner.zs_learner.predict_raw(input_embeddings)
    input_target_similarities = similarity(predictions, y_embeddings)

    X_recalled = [
        np.argsort(-input_target_similarities[:,y_idx])[:k]
        for (y_idx, __) in enumerate(y_names)
    ]
    return X_recalled


def get_retrieval_accuracies(learner, data, k=10, similarity=metrics.pairwise.cosine_similarity):
    y_names, __ = learner.get_target_embeddings(data.y)
    recalled_X = get_retrieval_results(learner, data, k=k, similarity=similarity)
    recalled_X_actual_y = [data.y.iloc[idxs_recalled].explode() for idxs_recalled in recalled_X]
    accurately_recalled = [
        y_name in recalled_X_actual_y[y_idx].values 
        for (y_idx, y_name) in enumerate(y_names)
    ]
    return pd.Series(data=accurately_recalled, index=y_names)


def get_retrieval_accuracy(learner, data, k=10, similarity=metrics.pairwise.cosine_similarity):
    y_names, __ = learner.get_target_embeddings(data.y)
    return np.mean(get_retrieval_accuracies(learner, data, k, similarity))

In [167]:
#export


def run_learner_experiment(
    retriever_learner,
    data_train, data_test
):
    retriever_learner.fit_learner(data_train)
    
    accuracy_train = retriever_learner.evaluate(data_train, metrics.accuracy_score)
    accuracy_test = retriever_learner.evaluate(data_test, metrics.accuracy_score)
    top10_accuracy_train = get_retrieval_accuracy(retriever_learner, data_train, k=10)
    top10_accuracy_test = get_retrieval_accuracy(retriever_learner, data_test, k=10)
    
    return dict(
        accuracy_train=accuracy_train,
        accuracy_test=accuracy_test,
        top10_accuracy_train=top10_accuracy_train,
        top10_accuracy_test=top10_accuracy_test
    )

## Abstracts

In [164]:
paperswithcode_with_imports_df['abstract']

0        Deep Neural Networks (DNNs) are vulnerable to ...
1        Thumbnails are widely used all over the world ...
2        Despite the recent success of face image gener...
3        How to model a pair of sentences is a critical...
4        How to model a pair of sentences is a critical...
                               ...                        
12219    Obtaining models that capture imaging markers ...
12220    Obtaining models that capture imaging markers ...
12221    Obtaining models that capture imaging markers ...
12222    Gatys et al. recently introduced a neural algo...
12223    We introduce a new representation learning alg...
Name: abstract, Length: 12224, dtype: object

In [80]:
has_abstract = ~paperswithcode_with_imports_df['abstract'].isna()

In [81]:
tasks_train[has_abstract]
paperswithcode_with_features_df[has_abstract]

Unnamed: 0.1,Unnamed: 0,paper_url,arxiv_id,title,abstract,url_abs,url_pdf,proceeding,authors,tasks,...,paper_arxiv_id,paper_title,paper_url_abs,paper_url_pdf,repo,repo_url,most_common_task,imports,n_imports,n_imports_with_embeddings
0,0,https://paperswithcode.com/paper/a-neuro-inspi...,2011.10867,A Neuro-Inspired Autoencoding Defense Against ...,Deep Neural Networks (DNNs) are vulnerable to ...,https://arxiv.org/abs/2011.10867v2,https://arxiv.org/pdf/2011.10867v2.pdf,,"['Can Bakiskan', 'Metehan Cekic', 'Ahmet Dunda...",[dictionary learning],...,2011.10867,A Neuro-Inspired Autoencoding Defense Against ...,https://arxiv.org/abs/2011.10867v2,https://arxiv.org/pdf/2011.10867v2.pdf,canbakiskan/neuro-inspired-defense,https://github.com/canbakiskan/neuro-inspired-...,dictionary learning,"{os, train_test_functions, matplotlib, namers,...",30,30
1,3,https://paperswithcode.com/paper/user-constrai...,1810.13054,User Constrained Thumbnail Generation using Ad...,Thumbnails are widely used all over the world ...,http://arxiv.org/abs/1810.13054v3,http://arxiv.org/pdf/1810.13054v3.pdf,,"['Perla Sai Raj Kishore', 'Ayan Kumar Bhunia',...","[region proposal, user constrained thumbnail g...",...,1810.13054,User Constrained Thumbnail Generation using Ad...,http://arxiv.org/abs/1810.13054v3,http://arxiv.org/pdf/1810.13054v3.pdf,Aiyoj/Thumbnail-Generation,https://github.com/Aiyoj/Thumbnail-Generation,region proposal,"{re, custom_vgg19, os, pandas, sys, cv2, ops, ...",15,15
2,4,https://paperswithcode.com/paper/michigan-mult...,2010.16417,MichiGAN: Multi-Input-Conditioned Hair Image G...,Despite the recent success of face image gener...,https://arxiv.org/abs/2010.16417v1,https://arxiv.org/pdf/2010.16417v1.pdf,,"['Zhentao Tan', 'Menglei Chai', 'Dongdong Chen...","[conditional image generation, image generation]",...,2010.16417,MichiGAN: Multi-Input-Conditioned Hair Image G...,https://arxiv.org/abs/2010.16417v1,https://arxiv.org/pdf/2010.16417v1.pdf,tzt101/MichiGAN,https://github.com/tzt101/MichiGAN,image generation,"{data, os, ui, matplotlib, dominate, scipy, ar...",40,40
3,7,https://paperswithcode.com/paper/abcnn-attenti...,1512.05193,ABCNN: Attention-Based Convolutional Neural Ne...,How to model a pair of sentences is a critical...,http://arxiv.org/abs/1512.05193v4,http://arxiv.org/pdf/1512.05193v4.pdf,TACL 2016 1,"['Wenpeng Yin', 'Hinrich Schütze', 'Bing Xiang...","[answer selection, natural language inference,...",...,1512.05193,ABCNN: Attention-Based Convolutional Neural Ne...,http://arxiv.org/abs/1512.05193v4,http://arxiv.org/pdf/1512.05193v4.pdf,shamalwinchurkar/question-classification,https://github.com/shamalwinchurkar/question-c...,natural language inference,"{os, matplotlib, qc_plot, qc_emb, argparse, nu...",29,29
4,8,https://paperswithcode.com/paper/abcnn-attenti...,1512.05193,ABCNN: Attention-Based Convolutional Neural Ne...,How to model a pair of sentences is a critical...,http://arxiv.org/abs/1512.05193v4,http://arxiv.org/pdf/1512.05193v4.pdf,TACL 2016 1,"['Wenpeng Yin', 'Hinrich Schütze', 'Bing Xiang...","[answer selection, natural language inference,...",...,1512.05193,ABCNN: Attention-Based Convolutional Neural Ne...,http://arxiv.org/abs/1512.05193v4,http://arxiv.org/pdf/1512.05193v4.pdf,jastfkjg/semantic-matching,https://github.com/jastfkjg/semantic-matching,natural language inference,"{os, random, sklearn, matchpyramid, mvlstm, ma...",9,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12219,21737,https://paperswithcode.com/paper/unsupervised-...,1703.05921,Unsupervised Anomaly Detection with Generative...,Obtaining models that capture imaging markers ...,http://arxiv.org/abs/1703.05921v1,http://arxiv.org/pdf/1703.05921v1.pdf,,"['Thomas Schlegl', 'Philipp Seeböck', 'Sebasti...","[anomaly detection, unsupervised anomaly detec...",...,1703.05921,Unsupervised Anomaly Detection with Generative...,http://arxiv.org/abs/1703.05921v1,http://arxiv.org/pdf/1703.05921v1.pdf,YeongHyeon/f-AnoGAN-TF,https://github.com/YeongHyeon/f-AnoGAN-TF,anomaly detection,"{os, matplotlib, sklearn, math, argparse, nump...",9,9
12220,21738,https://paperswithcode.com/paper/unsupervised-...,1703.05921,Unsupervised Anomaly Detection with Generative...,Obtaining models that capture imaging markers ...,http://arxiv.org/abs/1703.05921v1,http://arxiv.org/pdf/1703.05921v1.pdf,,"['Thomas Schlegl', 'Philipp Seeböck', 'Sebasti...","[anomaly detection, unsupervised anomaly detec...",...,1703.05921,Unsupervised Anomaly Detection with Generative...,http://arxiv.org/abs/1703.05921v1,http://arxiv.org/pdf/1703.05921v1.pdf,LeeDoYup/AnoGAN,https://github.com/LeeDoYup/AnoGAN,anomaly detection,"{os, tqdm, zipfile, sys, subprocess, requests,...",16,16
12221,21739,https://paperswithcode.com/paper/unsupervised-...,1703.05921,Unsupervised Anomaly Detection with Generative...,Obtaining models that capture imaging markers ...,http://arxiv.org/abs/1703.05921v1,http://arxiv.org/pdf/1703.05921v1.pdf,,"['Thomas Schlegl', 'Philipp Seeböck', 'Sebasti...","[anomaly detection, unsupervised anomaly detec...",...,1703.05921,Unsupervised Anomaly Detection with Generative...,http://arxiv.org/abs/1703.05921v1,http://arxiv.org/pdf/1703.05921v1.pdf,tkwoo/anogan-keras,https://github.com/tkwoo/anogan-keras,anomaly detection,"{anogan, os, matplotlib, tqdm, sklearn, cv2, m...",12,12
12222,21740,https://paperswithcode.com/paper/arbitrary-sty...,1703.06868,Arbitrary Style Transfer in Real-time with Ada...,Gatys et al. recently introduced a neural algo...,http://arxiv.org/abs/1703.06868v2,http://arxiv.org/pdf/1703.06868v2.pdf,ICCV 2017 10,"['Xun Huang', 'Serge Belongie']",[style transfer],...,1703.06868,Arbitrary Style Transfer in Real-time with Ada...,http://arxiv.org/abs/1703.06868v2,http://arxiv.org/pdf/1703.06868v2.pdf,ptran1203/style_transfer,https://github.com/ptran1203/style_transfer,style transfer,"{os, matplotlib, sklearn, cv2, keras, utils, p...",12,12


In [82]:
abstract_data_train, abstract_data_test = RepoTaskData.create_split(tasks_train[has_abstract], all_tasks[has_abstract], paperswithcode_with_features_df[has_abstract], paperswithcode_with_features_df[has_abstract]['abstract'].str.split())

(12179,)


In [83]:
from scarce_learn.zero_shot import devise_jax

In [84]:
import fasttext
fasttext_model = fasttext.load_model("output/python_files_fasttext_dim200.bin")



In [85]:
abstract_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    python_word_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

abstract_learner.fit_learner(abstract_data_train)

In [168]:
run_learner_experiment(abstract_learner, abstract_data_train, abstract_data_test)

{'accuracy_train': 0.4405755047757224,
 'accuracy_test': 0.12461617195496417,
 'top10_accuracy_train': 0.6903914590747331,
 'top10_accuracy_test': 0.6147540983606558}

In [86]:
abstract_learner.evaluate(abstract_data_train, metrics.accuracy_score)

0.4405755047757224

In [87]:
abstract_learner.evaluate(abstract_data_test, metrics.accuracy_score)

0.12461617195496417

In [88]:
get_retrieval_accuracy(abstract_learner, abstract_data_train, k=10)

0.6903914590747331

In [89]:
get_retrieval_accuracy(abstract_learner, abstract_data_test, k=10)

0.6147540983606558

# Abstract model using fasttext trained on Python code

In [170]:
ezslearner = zero_shot.ESZSLearner()
abstract_fasttext_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    word_embeddings,
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer
)

In [169]:
run_learner_experiment(abstract_fasttext_learner, abstract_data_train, abstract_data_test)

{'accuracy_train': 0.4176036754926853,
 'accuracy_test': 0.06704196519959059,
 'top10_accuracy_train': 0.498220640569395,
 'top10_accuracy_test': 0.38524590163934425}

# Fasttext model on READMEs

In [110]:
paperswithcode_with_imports_df['readme'] = readmes
paperswithcode_with_features_df['readme'] = readmes

In [111]:
has_readme = ~paperswithcode_with_imports_df['readme'].isna()

readme_data_train, readme_data_test = RepoTaskData.create_split(tasks_train[has_readme], all_tasks[has_readme], paperswithcode_with_features_df[has_readme], paperswithcode_with_features_df[has_readme]['readme'].str.split())

(11605,)


In [171]:
readme_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 100),
    python_word_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

readme_learner.fit_learner(readme_data_train)

In [177]:
run_learner_experiment(readme_learner, readme_data_train, readme_data_test)

{'accuracy_train': 0.32076953278366704,
 'accuracy_test': 0.04994954591321897,
 'top10_accuracy_train': 0.6401515151515151,
 'top10_accuracy_test': 0.45985401459854014}

## Fasttext on READMEs - worse than word2vec

In [174]:
ezslearner = zero_shot.ESZSLearner()
readme_fasttext_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    word_embeddings,
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer
)

In [175]:
run_learner_experiment(readme_fasttext_learner, readme_data_train, readme_data_test)

{'accuracy_train': 0.23216856432404134,
 'accuracy_test': 0.020686175580221997,
 'top10_accuracy_train': 0.4621212121212121,
 'top10_accuracy_test': 0.291970802919708}

# README keywords

In [178]:
readme_keywords_data_train, readme_keywords_data_test = RepoTaskData.create_split(tasks_train[has_readme], all_tasks[has_readme], paperswithcode_with_features_df[has_readme], readme_keywords[has_readme].str.split())

(11605,)


In [179]:
ezslearner = zero_shot.ESZSLearner()
readme_keywords_keywords_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(10, 10),
    word_embeddings,
    word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

run_learner_experiment(readme_keywords_learner, readme_keywords_data_train, readme_keywords_data_test)

{'accuracy_train': 0.292893600314095,
 'accuracy_test': 0.03279515640766902,
 'top10_accuracy_train': 0.7916666666666666,
 'top10_accuracy_test': 0.24817518248175183}

## Import2Vec

In [180]:
ezslearner = zero_shot.ESZSLearner()
import2vec_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(lmbda=100.0, gamma=10.0),
    import2vec,
    word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

run_learner_experiment(import2vec_learner, import_data_train, import_data_test)

{'accuracy_train': 0.16856283509873152,
 'accuracy_test': 0.008337965536409116,
 'top10_accuracy_train': 0.34296028880866425,
 'top10_accuracy_test': 0.1557377049180328}

## PRoNe

In [182]:
prone_embeddings = gensim.models.KeyedVectors.load("data/prone_embeddings.bin")

Using repo embedding from node embeddings

In [183]:
prone_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100,10),
    prone_embeddings,
    python_word_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

run_learner_experiment(prone_learner, graph_data_train, graph_data_test)

{'accuracy_train': 0.10932391787629135,
 'accuracy_test': 0.0019455252918287938,
 'top10_accuracy_train': 0.27075812274368233,
 'top10_accuracy_test': 0.040983606557377046}

## GraphSage

## aggregating vertex embeddings 

In [185]:
graphsage_embeddings = gensim.models.KeyedVectors.load("output/graphsage_embeddings_fasttext_dim200_epochs50_dim200_layers2.bin")

In [186]:
list(graphsage_embeddings.vocab)[-1]

'coincidence'

In [187]:
graph_data_train.X

0        canbakiskan/neuro-inspired-defense ensemble dr...
1        Aiyoj/Thumbnail-Generation custom_vgg19 data_l...
2        tzt101/MichiGAN ui_palette coco generator repl...
3        shamalwinchurkar/question-classification qc_tf...
4        jastfkjg/semantic-matching matcramid utils mvl...
                               ...                        
12219    YeongHyeon/f-AnoGAN-TF neuralnet tf_process la...
12220        LeeDoYup/AnoGAN utils download ops main model
12221                       tkwoo/anogan-keras main anogan
12222      ptran1203/style_transfer utils dataloader model
12223    Kano-Wu/Domain-Adversarial-Neural-Networks uti...
Length: 7647, dtype: object

In [188]:
ezslearner = zero_shot.ESZSLearner()
graphsage_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    graphsage_embeddings,
    graphsage_embeddings,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.AverageWordEmbeddingsVectorizer
)

run_learner_experiment(graphsage_learner, graph_data_train, graph_data_test)

{'accuracy_train': 0.006276971361318164,
 'accuracy_test': 0.02584769316286826,
 'top10_accuracy_train': 0.18050541516245489,
 'top10_accuracy_test': 0.1557377049180328}

## using GraphSAGE model for embedding

In [190]:
graphsage_data_train.repos

0                canbakiskan/neuro-inspired-defense
1                        Aiyoj/Thumbnail-Generation
2                                   tzt101/MichiGAN
3          shamalwinchurkar/question-classification
4                        jastfkjg/semantic-matching
                            ...                    
12219                        YeongHyeon/f-AnoGAN-TF
12220                               LeeDoYup/AnoGAN
12221                            tkwoo/anogan-keras
12222                      ptran1203/style_transfer
12223    Kano-Wu/Domain-Adversarial-Neural-Networks
Name: repo, Length: 8307, dtype: object

In [191]:
class LambdaTransformer:
    
    def __init__(self, transform_fn):
        self.transform = transform_fn
        
    def fit(self, X, **kwargs):
        return self

In [192]:
from github_search.pytorch_geometric_data import PygGraphWrapper
import torch

In [193]:
fasttext_embedder = embeddings.FastTextVectorizer(fasttext_model)

In [194]:
repo_descriptions

source
0492wzl/tensorflow_slim_densenet                  0492wzl/tensorflow_slim_densenet inception_v2 ...
08173021/FCOS                                     08173021/FCOS deform_pool_func scale test_box_...
09jvilla/CS234_gym                                09jvilla/CS234_gym walker2d spec_list play bip...
0h-n0/cdn_molecule_pytorch                        0h-n0/cdn_molecule_pytorch __init__ data util ...
0h-n0/tfdbonas                                    0h-n0/tfdbonas layers setup deep_surrogate_mod...
                                                                        ...                        
xzgz/faster-rcnn                                  xzgz/faster-rcnn compress_net pyloss resize_an...
xzhangfox/ANALYSE-AND-CLASSIFY-MUSIC-BY-LYRICS    xzhangfox/ANALYSE-AND-CLASSIFY-MUSIC-BY-LYRICS...
xzr12/PredCNN                                     xzr12/PredCNN ResidualMultiplicativeBlock pred...
xzwlx/Difficulty-SR                               xzwlx/Difficulty-SR rcan vdsr general100 dd

In [195]:
%%time
dependency_graph_wrapper = PygGraphWrapper(fasttext_embedder.transform, non_root_dependency_records_df, "repo_description", "file_description")

CPU times: user 31.7 s, sys: 955 ms, total: 32.7 s
Wall time: 31.6 s


  self.dataset = Data(torch.tensor(features), torch.tensor(edge_index))


In [196]:
graphsage_model = torch.load("output/graphsage_model_11_dim200_layers2.pth").cpu()#"output/graphsage_model_60_dim200_layers3.pth").cpu()
graphsage_model.training = False

In [197]:
graph_data_train.repos.isin(repo_descriptions.index).mean()

1.0

In [198]:
graphsage_data_train, graphsage_data_test = RepoTaskData.create_split(tasks_train, all_tasks, paperswithcode_with_features_df, paperswithcode_with_imports_df['imports'])
graphsage_data_train.X = repo_descriptions.loc[graph_data_train.repos]
graphsage_data_test.X = repo_descriptions.loc[graph_data_test.repos]

(12224,)


In [199]:
def make_records_df(sources, connected_vertices):
    return pd.DataFrame.from_records(
        [
            {"source": src, "destination": dst, "edge_type": "repo-file"}
            for (src, destinations) in zip(sources, connected_vertices)
            for dst in destinations 
        ]
    )

In [200]:
train_records_df = make_records_df(graphsage_data_train.repos, graph_data_train.X.fillna("").str.split()).drop_duplicates()

In [201]:
def get_vertex_embeddings(wrapper, vertex_subset, model):
    features = (
        model.full_forward(
            wrapper.dataset.x, wrapper.dataset.edge_index
        )
        .cpu()
        .detach()
        .numpy()
    )
    return features[wrapper.vertex_mapping.loc[vertex_subset]]

In [203]:
extended_dependency_graph_wrapper = PygGraphWrapper(embeddings.FastTextVectorizer(fasttext_model).transform, non_root_dependency_records_df + make_records_df(graphsage_data_train.repos, graph_data_train.X.dropna().str.split()))

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [None]:
dependency_graph_wrapper.get_vertex_embeddings(graphsage_data_train.X.iloc[0].split(), graphsage_model)

In [156]:
graphsage_learner = RetrieverLearner(
    zero_shot.ESZSLearner(100,10),
    LambdaTransformer(lambda x: dependency_graph_wrapper.get_vertex_embeddings(x, graphsage_model)),
    embeddings.FastTextVectorizer(fasttext_model)
)
graphsage_learner.fit_learner(graphsage_data_train)
graphsage_learner.evaluate(graphsage_data_train, metric=metrics.accuracy_score)

  self.dataset = Data(torch.tensor(features), torch.tensor(edge_index))


AttributeError: 'SAGEConv' object has no attribute 'decomposed_layers'

In [None]:
get_retrieval_accuracy(graphsage_learner, graphsage_data_train, k=10)

In [None]:
get_retrieval_accuracy(graphsage_learner, graphsage_data_test, k=10)

# Concatenation of repo, import embeddings

In [None]:
paired_data_train, paired_data_test = RepoTaskData.create_split(tasks_train, all_tasks, paperswithcode_with_features_df, paperswithcode_with_imports_df['imports'])
paired_data_train.X = graph_data_train.X + " " + import_data_train.X
paired_data_test.X = graph_data_test.X + " " + import_data_test.X

In [None]:
paired_data_train.X

In [None]:
paired_learner = RetrieverLearner.create(
    zero_shot.ESZSLearner(100, 10),
    PairedKeyedVectors(python_word_embeddings.wv, graphsage_embeddings),
    fasttext_model,
    embeddings.AverageWordEmbeddingsVectorizer,
    embeddings.FastTextVectorizer
)

paired_learner.fit_learner(graph_data_train)

In [None]:
paired_learner.evaluate(graph_data_train, metric=metrics.accuracy_score)

In [None]:
get_retrieval_accuracy(paired_learner, paired_data_train, k=10)

In [None]:
get_retrieval_accuracy(paired_learner, paired_data_test, k=10)

In [None]:
results = []
for (learner, learner_name, test) in zip(
    [import2vec_learner, prone_learner, paired_learner],
    ['import2vec', 'prone', 'both'],
    [X_test, repo_graph_terms_test, X_paired_test]
):
    accs = []
    for k in [1, 3, 5, 10, 20]:
        rec = get_retrieval_accuracy(learner, test, y_test, test_task_idxs, k=k)
        accs.append(rec)
    results.append(pd.Series(name=learner_name, data=accs))

In [None]:
results_df = pd.DataFrame(results)
results_df.columns = ["Accuracy@{}".format(i) for i in [1, 3, 5, 10, 20]]

In [None]:
results_df.round(3).to_markdown(open("metrics/zsl_results.md", "w"))

In [None]:
!cat metrics/zsl_results.md

In [None]:
import toolz

In [None]:
task_distances = metrics.pairwise.cosine_distances(task_embeddings, task_embeddings)

In [None]:
poincare_embeddings = gensim.models.KeyedVectors.load('data/poincare5.vec')

In [None]:
import gensim.models.wrappers.fasttext
from gensim.test.utils import datapath

In [None]:
from github_search import typical_file_parts
from mlutil import prototype_selection

In [None]:
selected_lines_df = typical_file_parts.get_selected_lines_and_repos(python_files_df['repo_name'], python_files_df['content'])

# Selecting prototypical lines

In [None]:
fasttext_selector = prototype_selection.PrototypeSelector(fasttext_avg_embedder)

In [None]:
try:
    fasttext_prototypes = json.load(open('data/fasttext_prototypes.json', 'r'))
except:
    fasttext_selector.fit_prototypes(selected_lines_df['line'], selected_lines_df['repo'])
    fasttext_prototypes = fasttext_selector.prototypes
    json.dump(fasttext_prototypes, open('data/fasttext_prototypes.json', 'w'))

In [None]:
codebert_vectorizer = embeddings.TransformerVectorizer('microsoft/codebert-base', batch_size=64)

In [None]:
codebert_selector = prototype_selection.PrototypeSelector(codebert_vectorizer)

In [None]:
try:
    codebert_prototypes = json.load(open('data/codebert_prototypes.json', 'r'))
except:
    codebert_selector.fit_prototypes(selected_lines_df['line'], selected_lines_df['repo'])
    codebert_prototypes = codebert_selector.prototypes
    json.dump(codebert_prototypes, open('data/codebert_prototypes.json', 'w'))

In [None]:
def vectorize_prototypes(vectorizer, prototypes):
    prototype_aggregated_embeddings = {}
    for key in prototypes.keys():
        prototype_aggregated_embeddings[key] = np.mean(vectorizer.transform(prototypes[key]), axis=0)
    return list(prototype_aggregated_embeddings.keys()), np.row_stack(prototype_aggregated_embeddings.values())

In [None]:
codebert_prototypes = {
    repo: v
    for (repo, v) in codebert_prototypes.items()
    if repo in paperswithcode_with_imports_df['repo_name'].values
}

In [None]:
codebert_prototypes.keys()

In [None]:
repos_train

In [None]:
fasttext_prototypes = {
    repo: v
    for (repo, v) in fasttext_prototypes.items()
    if repo in paperswithcode_with_imports_df['repo_name'].values
}

In [None]:
def get_prototypes(repo_name):
    return pd.DataFrame({"codebert": codebert_prototypes[repo_name], "fasttext": fasttext_prototypes[repo_name]})

In [None]:
fasttext_prototypes.keys()

In [None]:
get_prototypes("transformer")

In [None]:
get_prototypes("mmdetection")

In [None]:
get_prototypes("Recommenders-movielens")

In [None]:
get_prototypes("mmdetection")

In [None]:
fasttext_prototypes['mmdetection']

In [None]:
codebert_repos, codebert_prototype_embeddings = vectorize_prototypes(codebert_vectorizer, codebert_prototypes)

In [None]:
fasttext_repos, fasttext_prototype_embeddings = vectorize_prototypes(fasttext_avg_embedder, fasttext_prototypes)

In [None]:
len(fasttext_prototype_embeddings)

In [None]:
paperswithcode_tasks_series = paperswithcode_with_imports_df['most_common_task']
paperswithcode_tasks_series.index = paperswithcode_with_imports_df['repo_name']
#paperswithcode_tasks_series = paperswithcode_tasks_series[paperswithcode_tasks_series.index.isin(fasttext_repos)]

In [None]:
fasttext_tasks = paperswithcode_tasks_series.loc[fasttext_repos]
fasttext_tasks_embeddings = task_embedder.transform(fasttext_tasks)
codebert_tasks = paperswithcode_tasks_series.loc[codebert_repos]
codebert_tasks_embeddings = task_embedder.transform(codebert_tasks)

In [None]:
codebert_prototype_embeddings.shape

In [None]:
eszs_learner = zero_shot.ESZSLearner()

In [None]:
codebert_prototype_embeddings.shape

In [None]:
len(codebert_tasks)

In [None]:
eszs_learner.fit(codebert_prototype_embeddings, codebert_tasks, task_embeddings[:-1])
eszs_learner.score(codebert_prototype_embeddings, codebert_tasks, task_embeddings[:-1])

In [None]:
eszs_learner.fit(fasttext_prototype_embeddings, fasttext_tasks, task_embeddings[:-1])
eszs_learner.score(fasttext_prototype_embeddings, fasttext_tasks, task_embeddings[:-1])

In [None]:
list(set(selected_lines_df['repo']))[3007]

In [None]:
problematic_lines_df = selected_lines_df[selected_lines_df['repo'] == 'auto_ml']

In [None]:
del codebert_vectorizer

In [None]:
problematic_lines_df['lines']

In [None]:
codebert_selector.prototypes

In [None]:
y_embeddings = fasttext_avg_embedder.transform(tasks)

In [None]:
repo_names = 
repo_embeddings = 

In [None]:
y_embeddings.shape