In [1]:
#default_exp retrieval_results

In [2]:
#export
import numpy as np
import pandas as pd

import attr
import pickle

from sklearn import metrics

from github_search.matching_zsl import *

In [3]:
%cd ..

/home/kuba/Projects/github_search


In [4]:
#export


@attr.s
class Retriever:
    
    input_embedder = attr.ib()
    query_embedder = attr.ib()
    zs_learner = attr.ib()
    embeddings_calculated = attr.ib(default=False)
    
    def set_embeddings(self, X_names, X, X_descriptions=None):
        self.X_embeddings = self.input_embedder.transform(X)
        self.X_df = pd.DataFrame({"input": X})
        if not X_descriptions is None:
            self.X_df['description'] = X_descriptions
        self.X_df.index = X_names
        self.embeddings_calculated = True

    def retrieve_query_results(self, query, k=25, similarity=metrics.pairwise.cosine_similarity):
        if not self.embeddings_calculated:
            raise Exception("embeddings not calculated")
        input_embeddings = self.X_embeddings
        y_embeddings = self.query_embedder.transform([query])
        predictions = self.zs_learner.predict_raw(input_embeddings)
        input_target_similarities = similarity(predictions, y_embeddings)
        top_idxs = np.argsort(-input_target_similarities[:,0])[:k]
        top_similarities = input_target_similarities[top_idxs, 0]
        results_df = self.X_df.iloc[top_idxs]
        results_df['similarity'] = top_similarities
        return results_df.drop(columns=['input']) 

    def from_retriever_learner(learner):
        return Retriever(learner.input_embedder, learner.y_embedder, learner.zs_learner)
    

In [5]:
readme_data_test = pickle.load(open("output/readme_data_test.pkl", "rb"))

In [6]:
readme_learner = pickle.load(open("output/readme_learner.pkl", "rb"))

In [7]:
readme_retriever = Retriever.from_retriever_learner(readme_learner)
readme_retriever.set_embeddings(readme_data_test.repos, readme_data_test.X, readme_data_test.X)

In [8]:
distance_learning_results = readme_retriever.retrieve_query_results("similarity learning")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['similarity'] = top_similarities


In [9]:
metric_learning_results = readme_retriever.retrieve_query_results("metric learning")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['similarity'] = top_similarities


In [10]:
metric_learning_results

Unnamed: 0_level_0,description,similarity
repo,Unnamed: 1_level_1,Unnamed: 2_level_1
RingBDStack/ELCO,# ELCO a Heuristic Semi-supervised Learning Fr...,0.242032
yuruntian/HyNet,# HyNet: Learning Local Descriptor with Hybrid...,0.237193
Confusezius/metric-learning-mining-interclass-characteristics,# MIC: Mining Interclass Characteristics for I...,0.231129
smilesun/reinbo,# Migration Please visit our new repository ht...,0.228726
dibyaghosh/gcsl,# Goal-Conditioned Supervised Learning (GCSL) ...,0.222977
notdibya/gcsl,# Goal-Conditioned Supervised Learning (GCSL) ...,0.222977
gdahia/meta_occ,# Meta Learning for Few-Shot One-class Classif...,0.21705
sfujim/BCQ,# Batch-Constrained Deep Q-Learning (BCQ) Batc...,0.21283
seovchinnikov/cosine_softmax_keras,# cosine_softmax_keras Quick implementation of...,0.20915
PeiqinZhuang/API-Net,# Learning Attentive Pairwise Interaction for ...,0.205377


In [11]:
word_embedding_results = readme_retriever.retrieve_query_results("word embeddings")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['similarity'] = top_similarities


In [12]:
word_embedding_results

Unnamed: 0_level_0,description,similarity
repo,Unnamed: 1_level_1,Unnamed: 2_level_1
yrbahn/deep_match_ctr_prediction,# deep_match_ctr_prediction https://arxiv.org/...,0.363135
Ganeshpadmanaban/Neural-Attention-Model,# Abstractive Text Summarization ## The Algori...,0.330384
jabalazs/gating,# Gating Mechanisms Code accompanying the pape...,0.326283
nlpub/watasense,# Watasense Watasense is a framework for unsup...,0.319607
armandvilalta/Full-network-multimodal-embeddings,# Full-network-multimodal-embeddings Code used...,0.318392
Lambda-3/pyindra,# The Python Indra Client ##### The official p...,0.314428
tsourolampis/Adagio,# Adagio: Fast Data-Aware Near-Isometric Linea...,0.30396
cambridgeltl/adversarial-postspec,# adversarial-postspec The implementation of a...,0.298076
passalis/sef,# PySEF: A Python Library for Similarity-based...,0.293374
pl8787/MatchPyramid-TensorFlow,# MatchPyramid-TensorFlow A simple version of ...,0.292611


In [13]:
image_generation_results = readme_retriever.retrieve_query_results("image generation")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['similarity'] = top_similarities


In [14]:
image_generation_results

Unnamed: 0_level_0,description,similarity
repo,Unnamed: 1_level_1,Unnamed: 2_level_1
uw-cmg/GAN-STEM-Conv2MultiSlice,# GAN-STEM-Conv2MultiSlice GAN method to help ...,0.353889
MandyZChen/srez,# srez Image super-resolution through deep lea...,0.352582
andreas128/SRFlow,# SRFlow #### Official SRFlow training code: S...,0.352293
cosmic119/StarGAN,"<p align=""center""><img width=""40%"" src=""jpg/lo...",0.343281
BradyFU/DVG,# Dual Variational Generation for Low Shot HFR...,0.342335
SummerHuiZhang/StarGAN_test,"<p align=""center""><img width=""40%"" src=""png/lo...",0.338064
VIGNESHinZONE/Face-Super-Resolution-Through-Wasserstein-GANs,# Face-Super-Resolution-Through-Wasserstein-GA...,0.336254
yhlleo/DWC-GAN,![Python 3.5](https://img.shields.io/badge/pyt...,0.334087
shikhadahiya/Image-to-image-translation-using-C-GAN,# Image Translation using Conditional Adversar...,0.330354
BCV-Uniandes/SMIT,[![Build Status](https://dev.azure.com/bcv-uni...,0.330151


In [15]:
from github_search import github_readmes

In [16]:
readmes_df = pd.DataFrame({"repo" : readme_data_test.repos, "readme": readme_data_test.X.copy()})

In [17]:
image_generation_results.merge(readmes_df, left_on='matched_record', right_on="repo").drop(columns=['matched_record']).iloc[:,[1,2,0]]

KeyError: 'matched_record'