In [1]:
# NOTE(review): `!cd` runs in a temporary subshell, so the kernel's working
# directory is not changed by this line -- the cells below still address the
# data as `../data/...`. `%cd ..` would be the persistent form.
!cd ..

In [2]:
import pandas as pd
import requests
import bs4
import mlutil.parallel
from io import StringIO
import sys
import pypi_cli
import time
import tqdm



In [3]:
# Count the lines of the training split (JSON Lines file).
!wc -l ../data/python/train.jsonl

251820 ../data/python/train.jsonl


In [4]:
# Show the first two lines of the training split to see which fields a record carries.
!head  -2 ../data/python/train.jsonl

{"repo": "smdabdoub/phylotoast", "path": "phylotoast/util.py", "func_name": "split_phylogeny", "original_string": "def split_phylogeny(p, level=\"s\"):\n    \"\"\"\n    Return either the full or truncated version of a QIIME-formatted taxonomy string.\n\n    :type p: str\n    :param p: A QIIME-formatted taxonomy string: k__Foo; p__Bar; ...\n\n    :type level: str\n    :param level: The different level of identification are kingdom (k), phylum (p),\n                  class (c),order (o), family (f), genus (g) and species (s). If level is\n                  not provided, the default level of identification is species.\n\n    :rtype: str\n    :return: A QIIME-formatted taxonomy string up to the classification given\n            by param level.\n    \"\"\"\n    level = level+\"__\"\n    result = p.split(level)\n    return result[0]+level+result[1].split(\";\")[0]", "language": "python", "code": "def split_phylogeny(p, level=\"s\"):\n    \"\"\"\n    Return either the full or truncated versio

In [5]:
# Read the train / valid / test splits (JSON Lines) and stack them into one frame.
codesearch_split_paths = (
    '../data/python/train.jsonl',
    '../data/python/valid.jsonl',
    '../data/python/test.jsonl',
)
all_codesearch_df = pd.concat(
    [pd.read_json(split_path, lines=True) for split_path in codesearch_split_paths]
)

In [6]:
# The 20 repositories that contribute the most rows.
all_codesearch_df['repo'].value_counts().head(20).index

Index(['saltstack/salt', 'mitsei/dlkit', 'google/grr', 'bcbio/bcbio-nextgen',
       'materialsproject/pymatgen', 'tensorflow/tensor2tensor',
       'iotile/coretools', 'pandas-dev/pandas', 'cloud9ers/gurumate',
       'spyder-ide/spyder', 'pypa/pipenv', 'apple/turicreate', 'gem/oq-engine',
       'pantsbuild/pants', 'log2timeline/plaso',
       'googleapis/google-cloud-python', 'inasafe/inasafe', 'gwastro/pycbc',
       'apache/incubator-mxnet', 'senaite/senaite.core'],
      dtype='object')

## Github page project descriptions

Most repositories have easily accessible descriptions on GitHub.

The HTML of a repository's GitHub page contains the description in its `title` tag.

The problem with this approach is GitHub's rate limit (we are scraping the HTML pages rather than using the API).

In [7]:


def get_html(url, timeout=10):
    """Fetch `url` with an HTTP GET and return the response body as text.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled connection
        would block the caller indefinitely.

    Returns
    -------
    str
        The decoded response body. The HTTP status code is deliberately not
        checked, so error pages are returned like any other page.
    """
    return requests.get(url, timeout=timeout).text


def get_short_description(repo):
    """Return the text of the `title` tag of a repository's GitHub page.

    Parameters
    ----------
    repo : str
        Repository in ``owner/name`` form.

    Returns
    -------
    str or None
        The page title, e.g. ``'GitHub - owner/name: description'``. When
        GitHub throttles the request, the title of the rate-limit page is
        returned instead. None if the page has no `title` tag.
    """
    # Use the canonical HTTPS address directly instead of a plain-HTTP `www`
    # address that has to be redirected.
    url = 'https://github.com/{}'.format(repo)
    html = get_html(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    parsed_html = bs4.BeautifulSoup(html, 'html.parser')
    title_tag = parsed_html.find('title')
    if title_tag is None:
        return None
    return title_tag.get_text()

In [8]:
# Example repository in `owner/name` form.
# NOTE(review): the name `repo` is rebound by the `for repo in ...` loops further down.
repo = 'allenai/allennlp'

In [9]:
# NOTE(review): `url` is not read by any later cell in this view;
# get_short_description builds its own URL from the repository name.
url = 'http://www.github.com/{}?'.format(repo)

In [10]:
# Smoke test against the live page: the title has the form 'GitHub - owner/name: description'.
expected_allennlp_title = 'GitHub - allenai/allennlp: An open-source NLP research library, built on PyTorch.'
assert get_short_description('allenai/allennlp') == expected_allennlp_title

In [11]:
# Distinct repository names, in order of first appearance, as a Series.
unique_repo_names = all_codesearch_df['repo'].unique()
repos = pd.Series(unique_repo_names)

In [12]:
# Number of distinct repositories in the combined dataset.
repos.shape

(12361,)

In [13]:
# Sequential baseline: scrape the page title of the first 50 repositories.
descriptions = []
first_fifty_repos = repos[:50]
for repo in tqdm.tqdm(first_fifty_repos):
    page_title = get_short_description(repo)
    descriptions.append(page_title)

100%|██████████| 50/50 [00:38<00:00,  1.28it/s]


In [14]:
%%time
descriptions_p = list(mlutil.parallel.mapp(get_short_description, repos[:1000]))

CPU times: user 274 ms, sys: 562 ms, total: 836 ms
Wall time: 14.7 s


In [15]:
# Position of the first result whose title is GitHub's rate-limit page
# (list.index raises ValueError if no request was throttled).
descriptions_p.index('Rate limit · GitHub')

82

## PyPI project descriptions with pypi_cli

Most of the dataset's repositories are registered on PyPI.

In [16]:
def get_pypi_package_description(package_name):
    """Return the one-line description that `pypi_cli.info` prints for a package.

    `pypi_cli.info` writes its report to standard output, so the output is
    captured and the third line (index 2) is taken as the description.

    Parameters
    ----------
    package_name : str
        Name to look up on PyPI.

    Returns
    -------
    str or None
        The third line of the captured output, or None when fewer than three
        lines were printed (e.g. the lookup failed).
    """
    captured = StringIO()
    # Remember whatever stream is active right now (inside Jupyter this is not
    # sys.__stdout__) so that exactly this stream is put back afterwards.
    previous_stdout = sys.stdout
    sys.stdout = captured
    try:
        pypi_cli.info([package_name])
    except (Exception, SystemExit):
        # Best effort: a failed lookup simply yields no description.
        # SystemExit is included because `info` is invoked like a command-line
        # entry point -- presumably a click command that finishes by exiting;
        # TODO confirm. KeyboardInterrupt is deliberately not swallowed.
        pass
    finally:
        sys.stdout = previous_stdout
    # NOTE(review): sys.stdout is process-wide, so concurrent calls from
    # several threads could mix their captured output -- confirm how
    # mlutil.parallel.mapp runs its workers.
    output_lines = captured.getvalue().split('\n')
    if len(output_lines) > 2:
        return output_lines[2]
    return None

In [17]:
# Sanity check on a well-known package.
get_pypi_package_description('matplotlib')

'Python plotting package'

In [18]:
def get_pypi_repo_description(repo):
    """Look up the PyPI description for a GitHub repository.

    The part after the first ``/`` of `repo` (``owner/name``) is used as the
    PyPI package name.

    Parameters
    ----------
    repo : str
        Repository in ``owner/name`` form.

    Returns
    -------
    str or None
        Whatever `get_pypi_package_description` returns for that name.

    Raises
    ------
    IndexError
        If `repo` contains no ``/``.
    """
    package_name = repo.split('/')[1]
    return get_pypi_package_description(package_name)

In [19]:
# Same lookup, starting from an `owner/name` repository string.
get_pypi_repo_description('allenai/allennlp')

'An open-source NLP research library, built on PyTorch.'

In [20]:
%%capture
pypi_descriptions = []
for repo in tqdm.tqdm(repos[:100]):
    pypi_descriptions.append(get_pypi_package_description(repo.split('/')[1]))

In [21]:
import mlutil.parallel

In [22]:
%%capture
t_start = time.time()
pypi_descriptions_p = list(mlutil.parallel.mapp(get_pypi_repo_description, repos))
t_end = time.time()

### How long did it take to retrieve PyPI descriptions (minutes)

In [23]:
# Elapsed wall-clock time of the parallel run, converted from seconds to minutes.
elapsed_seconds = t_end - t_start
round(elapsed_seconds / 60, 2)

1.83

In [24]:
# Count the repositories for which the lookup produced no description (None).
n_repos_with_no_pypi_description = sum(
    1 for description in pypi_descriptions_p if description is None
)

### Repositories without pypi description

In [25]:
# Share of repositories without a PyPI description, as a percentage string.
missing_share_percent = round(100 * n_repos_with_no_pypi_description / len(pypi_descriptions_p), 2)
'{}%'.format(missing_share_percent)

'14.03%'

In [26]:
# Preview the first ten results; None marks a repository without a PyPI description.
pypi_descriptions_p[:10]

['Tools for phylogenetic data analysis including visualization and cluster-computing support.',
 None,
 'Open-source algorithms for data-driven building analysis and control',
 'Bootstrap Python package',
 'Extremely fast and easy feature based HTML generator.',
 None,
 'Connection utilities',
 'Python library to work with Steam',
 'Distributed Network Packet Analysis Pipeline for Layer 2, 3 and 4 Frames',
 'Django Simple Multilingual Support for Models.']