In [1]:
!cd ..

In [2]:
import pandas as pd
import requests
import bs4
import mlutil.parallel
from io import StringIO
import sys
import pypi_cli
import time
import tqdm



In [3]:
!wc -l ../data/python/train.jsonl

251820 ../data/python/train.jsonl


In [4]:
!head  -2 ../data/python/train.jsonl

{"repo": "smdabdoub/phylotoast", "path": "phylotoast/util.py", "func_name": "split_phylogeny", "original_string": "def split_phylogeny(p, level=\"s\"):\n    \"\"\"\n    Return either the full or truncated version of a QIIME-formatted taxonomy string.\n\n    :type p: str\n    :param p: A QIIME-formatted taxonomy string: k__Foo; p__Bar; ...\n\n    :type level: str\n    :param level: The different level of identification are kingdom (k), phylum (p),\n                  class (c),order (o), family (f), genus (g) and species (s). If level is\n                  not provided, the default level of identification is species.\n\n    :rtype: str\n    :return: A QIIME-formatted taxonomy string up to the classification given\n            by param level.\n    \"\"\"\n    level = level+\"__\"\n    result = p.split(level)\n    return result[0]+level+result[1].split(\";\")[0]", "language": "python", "code": "def split_phylogeny(p, level=\"s\"):\n    \"\"\"\n    Return either the full or truncated versio

In [5]:
all_codesearch_df = pd.concat([
    pd.read_json('../data/python/train.jsonl', lines=True),
    pd.read_json('../data/python/valid.jsonl', lines=True),
    pd.read_json('../data/python/test.jsonl', lines=True),
])

In [6]:
all_codesearch_df['repo'].value_counts().index[:20]

Index(['saltstack/salt', 'mitsei/dlkit', 'google/grr', 'bcbio/bcbio-nextgen',
       'materialsproject/pymatgen', 'tensorflow/tensor2tensor',
       'iotile/coretools', 'pandas-dev/pandas', 'cloud9ers/gurumate',
       'spyder-ide/spyder', 'pypa/pipenv', 'apple/turicreate', 'gem/oq-engine',
       'pantsbuild/pants', 'log2timeline/plaso',
       'googleapis/google-cloud-python', 'inasafe/inasafe', 'gwastro/pycbc',
       'apache/incubator-mxnet', 'senaite/senaite.core'],
      dtype='object')

## Github page project descriptions

Most repositories have easily accesible descriptions on github.

Github page HTML has description in 'title' tag.

The problem with this approach is github's rate limit (we're not using API for this)

In [7]:
def get_html(url):
    return requests.get(url).text


def get_short_description(repo):
    url = 'http://www.github.com/{}'.format(repo)
    html = get_html(url)
    parsed_html = bs4.BeautifulSoup(html)
    return parsed_html.find('title').get_text()

In [8]:
repo = 'allenai/allennlp'

In [9]:
url = 'http://www.github.com/{}?'.format(repo)

In [10]:
assert get_short_description('allenai/allennlp') == 'GitHub - allenai/allennlp: An open-source NLP research library, built on PyTorch.'

In [11]:
repos = pd.Series(all_codesearch_df['repo'].unique())

In [12]:
repos.shape

(12361,)

In [13]:
descriptions = []
for repo in tqdm.tqdm(repos[:50]):
    descriptions.append(get_short_description(repo))

100%|██████████| 50/50 [00:50<00:00,  1.00s/it]


In [14]:
%%time
descriptions_p = list(mlutil.parallel.mapp(get_short_description, repos[:1000]))

KeyboardInterrupt: 

In [15]:
descriptions_p.index('Rate limit · GitHub')

NameError: name 'descriptions_p' is not defined

## PyPI project descriptions with pypi_cli

Most of dataset repositories are registered in PyPI.

In [13]:
def get_pypi_package_description(package_name, part=2):
    temp_out = StringIO()
    sys.stdout = temp_out
    try:
        pypi_cli.info([package_name])

    except:
        pass
    stdout = sys.stdout.getvalue().split('\n')
    if len(stdout) > part:
        description = stdout[part]
    else:
        description = None
    sys.stdout = sys.__stdout__
    return description

In [14]:
get_pypi_package_description('torch')

'Tensors and Dynamic neural networks in Python with strong GPU acceleration'

In [15]:
def get_pypi_repo_description(repo):
    print(repo.split('/'))
    return get_pypi_package_description(repo.split('/')[1])

In [16]:
get_pypi_repo_description('allenai/allennlp')

'An open-source NLP research library, built on PyTorch.'

In [31]:
%%capture
pypi_descriptions = []
for repo in tqdm.tqdm(repos[:100]):
    pypi_descriptions.append(get_pypi_package_description(repo.split('/')[1]))

In [17]:
import mlutil.parallel

In [348]:
%%capture
t_start = time.time()
pypi_descriptions_p = list(mlutil.parallel.mapp(get_pypi_repo_description, repos))
t_end = time.time()

### How long did it take to retrieve PyPI descriptions (minutes)

In [349]:
round((t_end - t_start) / 60, 2)

2.13

In [350]:
repos_with_descriptions = [repo for (repo, n) in zip(repos, pypi_descriptions_p) if not n is None]
descriptions = [desc for (repo, desc) in zip(repos, pypi_descriptions_p) if not (desc is None or desc == '')]

In [351]:
n_repos_with_no_pypi_description = len(repos) - len(repos_with_descriptions)

### Repositories without pypi description

In [352]:
str(round(100 * n_repos_with_no_pypi_description / len(pypi_descriptions_p), 2)) + '%'

'14.04%'

In [353]:
pypi_descriptions_p[:10]

['Tools for phylogenetic data analysis including visualization and cluster-computing support.',
 None,
 'Open-source algorithms for data-driven building analysis and control',
 'Bootstrap Python package',
 'Extremely fast and easy feature based HTML generator.',
 None,
 'Connection utilities',
 'Python library to work with Steam',
 'Distributed Network Packet Analysis Pipeline for Layer 2, 3 and 4 Frames',
 'Django Simple Multilingual Support for Models.']

In [24]:
all_codesearch_df.head()

Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition
0,smdabdoub/phylotoast,phylotoast/util.py,split_phylogeny,"def split_phylogeny(p, level=""s""):\n """"""\n ...",python,"def split_phylogeny(p, level=""s""):\n """"""\n ...","[def, split_phylogeny, (, p, ,, level, =, ""s"",...",Return either the full or truncated version of...,"[Return, either, the, full, or, truncated, ver...",0b74ef171e6a84761710548501dfac71285a58a3,https://github.com/smdabdoub/phylotoast/blob/0...,train
1,smdabdoub/phylotoast,phylotoast/util.py,ensure_dir,"def ensure_dir(d):\n """"""\n Check to make...",python,"def ensure_dir(d):\n """"""\n Check to make...","[def, ensure_dir, (, d, ), :, if, not, os, ., ...",Check to make sure the supplied directory path...,"[Check, to, make, sure, the, supplied, directo...",0b74ef171e6a84761710548501dfac71285a58a3,https://github.com/smdabdoub/phylotoast/blob/0...,train
2,smdabdoub/phylotoast,phylotoast/util.py,file_handle,"def file_handle(fnh, mode=""rU""):\n """"""\n ...",python,"def file_handle(fnh, mode=""rU""):\n """"""\n ...","[def, file_handle, (, fnh, ,, mode, =, ""rU"", )...",Takes either a file path or an open file handl...,"[Takes, either, a, file, path, or, an, open, f...",0b74ef171e6a84761710548501dfac71285a58a3,https://github.com/smdabdoub/phylotoast/blob/0...,train
3,smdabdoub/phylotoast,phylotoast/util.py,gather_categories,"def gather_categories(imap, header, categories...",python,"def gather_categories(imap, header, categories...","[def, gather_categories, (, imap, ,, header, ,...",Find the user specified categories in the map ...,"[Find, the, user, specified, categories, in, t...",0b74ef171e6a84761710548501dfac71285a58a3,https://github.com/smdabdoub/phylotoast/blob/0...,train
4,smdabdoub/phylotoast,phylotoast/util.py,parse_unifrac,"def parse_unifrac(unifracFN):\n """"""\n Pa...",python,"def parse_unifrac(unifracFN):\n """"""\n Pa...","[def, parse_unifrac, (, unifracFN, ), :, with,...",Parses the unifrac results file into a diction...,"[Parses, the, unifrac, results, file, into, a,...",0b74ef171e6a84761710548501dfac71285a58a3,https://github.com/smdabdoub/phylotoast/blob/0...,train


## Baseline - retrieval by bag of words from descriptions

Fit bag of words model to descriptions, then match 'descriptions' obtained from concatenating comments from all functions from repository

The assumptions here are very optimistic, since we're only matching features from known repositories 

In [427]:
len(repos_with_descriptions)

10625

In [428]:
codesearch_df = all_codesearch_df[all_codesearch_df['repo'].isin(repos_with_descriptions)]

In [429]:
train_codesearch_df = codesearch_df[codesearch_df['partition'] == 'train']
val_codesearch_df = codesearch_df[codesearch_df['partition'] == 'val']

In [430]:
set(all_codesearch_df['partition'].values)

{'test', 'train', 'valid'}

In [431]:
from sklearn import feature_extraction, metrics
import numpy as np

In [432]:
tfidf_vectorizer = feature_extraction.text.TfidfVectorizer()

In [433]:
train_codesearch_df.columns

Index(['repo', 'path', 'func_name', 'original_string', 'language', 'code',
       'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url',
       'partition'],
      dtype='object')

In [434]:
repo_descriptions = codesearch_df.groupby('repo')['docstring'].agg(' '.join)

In [435]:
description_bow_vectors = tfidf_vectorizer.fit_transform(descriptions).astype('float32')

In [436]:
bow_vectors = tfidf_vectorizer.transform(repo_descriptions)

In [437]:
description_docstring_distances = metrics.pairwise.cosine_distances(description_bow_vectors, bow_vectors)

In [438]:
%%time
description_closest_docstring_indices = pd.DataFrame(np.argsort(description_docstring_distances, axis=1)[:,:100])

In [439]:
val_codesearch_df

Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition


In [440]:
description_closest_docstrings = description_closest_docstring_indices.apply(lambda s: codesearch_df['repo'].iloc[s].values, axis=1)

In [441]:
val_description_closest_docstring_indices.values[0]

array([12315,  5357,  5496,  4823, 11440,  8370, 13226,  6336, 12316,
       13100,  5350,  4103,    34, 12463,  4410, 12314, 13227,  2647,
        2646,  4959,  5358, 12997,  9230,  1915, 11506,  4264,  2969,
        1121,  2011,  7498,  9228,  2661,  4822,  9519,  4429, 11504,
        4746,  1764, 11505,  1126,  1125,  8736, 10521,  9999,  9229,
         862,  2854,  1765, 11446,  3451])

In [442]:
bow_vectors.shape

(10625, 11178)

## Recall at 100

In [443]:
recall_at100= [
    repo in potential_repos
    for (repo, potential_repos) in zip(repos, description_closest_docstrings.values)
]

In [444]:
sum(recall_at50) / len(repos)

0.006795566701723162

## Pomysły

- jako baseline wyszukiwanie zanurzeń komentarzy
- wyszukiwanie po cechach korzystających z CodeBERTA
- wytrenowanie modelu do rankingu (?) 

### Ogólne pomysły na podejścia

Problem wydaje się dobrze opisywać podejście 'pairwise': mamy cechy jednego typu i drugiego typu, uczymy się dopasowywać jedne do drugich

- agregowanie cech kodu z repozytoriów
- agregowanie cech pochodzących od importów (Import2Vec) - **pokaż mi co importujesz, a powiem na jaki temat jest repo**
- ogólniej cechy dla specjalnych tokenów: nazwy funkcji, nazwy argumentów
- sprytniejsza agregacja - np hierarchiczna (np zrobić hiperboliczną wersję zanurzeń)