In [1]:
# default_exp repository_descriptions

In [2]:
# export
import os
import requests
from io import StringIO
import sys
import time
import tqdm

import pypi_cli
from sklearn import feature_extraction, metrics
import numpy as np
import pandas as pd
import bs4

import mlutil.parallel

import haystack.document_store.memory
import haystack.document_store.elasticsearch
from haystack import document_store

import haystack.retriever.sparse
from haystack import retriever

03/05/2021 20:45:36 - INFO - faiss -   Loading faiss with AVX2 support.
03/05/2021 20:45:36 - INFO - faiss -   Loading faiss.
03/05/2021 20:45:36 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [3]:
%cd ..

/home/kuba/Projects/github_search


In [4]:
!wc -l ../data/python/train.jsonl

wc: ../data/python/train.jsonl: No such file or directory


In [5]:
!head  -1 ../data/python/train.jsonl

head: cannot open '../data/python/train.jsonl' for reading: No such file or directory


In [6]:
# export


def get_all_codesearch_df(data_dir):
    """Read the train, valid and test JSON-lines splits in `data_dir` as one frame.

    Parameters
    ----------
    data_dir : str
        Directory containing `train.jsonl`, `valid.jsonl` and `test.jsonl`.

    Returns
    -------
    pandas.DataFrame
        The three splits concatenated in train, valid, test order. Each split
        keeps its own row labels, so index values repeat across splits.
    """
    split_frames = []
    for file_name in ("train.jsonl", "valid.jsonl", "test.jsonl"):
        split_path = os.path.join(data_dir, file_name)
        split_frames.append(pd.read_json(split_path, lines=True))
    return pd.concat(split_frames)

In [7]:
all_codesearch_df = get_all_codesearch_df("data/python")

In [8]:
all_codesearch_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 280652 entries, 0 to 14917
Data columns (total 12 columns):
repo                280652 non-null object
path                280652 non-null object
func_name           280652 non-null object
original_string     280652 non-null object
language            280652 non-null object
code                280652 non-null object
code_tokens         280652 non-null object
docstring           280652 non-null object
docstring_tokens    280652 non-null object
sha                 280652 non-null object
url                 280652 non-null object
partition           280652 non-null object
dtypes: object(12)
memory usage: 27.8+ MB


In [7]:
all_codesearch_df["repo"].value_counts().index[:20]

Index(['saltstack/salt', 'mitsei/dlkit', 'google/grr', 'bcbio/bcbio-nextgen',
       'materialsproject/pymatgen', 'tensorflow/tensor2tensor',
       'iotile/coretools', 'pandas-dev/pandas', 'cloud9ers/gurumate',
       'spyder-ide/spyder', 'pypa/pipenv', 'apple/turicreate', 'gem/oq-engine',
       'pantsbuild/pants', 'log2timeline/plaso',
       'googleapis/google-cloud-python', 'inasafe/inasafe', 'gwastro/pycbc',
       'apache/incubator-mxnet', 'senaite/senaite.core'],
      dtype='object')

## GitHub page project descriptions

Most repositories have easily accessible descriptions on GitHub.

The HTML of a repository's GitHub page carries the description in its `<title>` tag.

The problem with this approach is GitHub's rate limit (we are scraping the pages here, not using the API).

In [8]:
# export


def get_html(url, timeout=30):
    """Fetch `url` with an HTTP GET and return the response body as text.

    The HTTP status code is not checked, so the body of an error page is
    returned like any other page.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Passed to `requests.get`; bounds how long the request may wait so a
        single stalled connection cannot block a long crawl indefinitely.

    Returns
    -------
    str
        Decoded response body.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failures, including `Timeout` when `timeout` elapses.
    """
    return requests.get(url, timeout=timeout).text


def get_short_description(repo):
    """Return the `<title>` text of the GitHub page for `repo`.

    Parameters
    ----------
    repo : str
        Repository identifier that is appended to the GitHub base URL,
        e.g. "allenai/allennlp".

    Returns
    -------
    str or None
        Text of the page's `<title>` tag, or None when the fetched document
        has no `<title>` tag.
    """
    url = "http://www.github.com/{}".format(repo)
    html = get_html(url)
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
    # happens to be installed and warns about the guess.
    parsed_html = bs4.BeautifulSoup(html, "html.parser")
    title_tag = parsed_html.find("title")
    if title_tag is None:
        # `find` returns None when nothing matches; avoid an AttributeError here.
        return None
    return title_tag.get_text()

In [9]:
repo = "allenai/allennlp"

In [10]:
url = "http://www.github.com/{}?".format(repo)

In [11]:
# Smoke check of get_short_description against a known repository.
# NOTE(review): this performs a live request and compares against the page's
# current title, so it fails if GitHub changes the title text or answers with
# its rate-limit page.
assert (
    get_short_description("allenai/allennlp")
    == "GitHub - allenai/allennlp: An open-source NLP research library, built on PyTorch."
)

In [12]:
repos = pd.Series(all_codesearch_df["repo"].unique())

In [13]:
repos.shape

(12361,)

# Sequential baseline: fetch the page title for the first 50 repositories, one
# request at a time, with a tqdm progress bar.
descriptions = []
for repo in tqdm.tqdm(repos[:50]):
    descriptions.append(get_short_description(repo))

%%time
# Same lookup for the first 1000 repositories, dispatched through mlutil.parallel.mapp.
descriptions_p = list(mlutil.parallel.mapp(get_short_description, repos[:1000]))

# Position of the first result whose title is GitHub's rate-limit page;
# list.index raises ValueError when no such title is present.
descriptions_p.index('Rate limit · GitHub')

## PyPI project descriptions with pypi_cli

Most of the dataset's repositories are registered on PyPI.

In [14]:
# export


def get_pypi_package_description(package_name, part=2):
    """Return one line of what `pypi_cli.info` writes to stdout for a package.

    `sys.stdout` is temporarily replaced by an in-memory buffer while
    `pypi_cli.info([package_name])` runs; the captured text is split on
    newlines and the line at position `part` is returned.

    Parameters
    ----------
    package_name : str
        Name passed to `pypi_cli.info`.
    part : int, optional
        Index of the captured output line to return (default 2).

    Returns
    -------
    str or None
        The selected output line, or None when fewer than `part + 1` lines
        were captured.

    Notes
    -----
    NOTE(review): replacing `sys.stdout` affects the whole process; if
    `mlutil.parallel.mapp` runs callers in threads, captured output of
    concurrent calls may interleave — confirm.
    """
    # Remember the stream that is active right now (a notebook kernel or an
    # outer redirect installs its own object) instead of assuming sys.__stdout__.
    previous_stdout = sys.stdout
    captured = StringIO()
    sys.stdout = captured
    try:
        pypi_cli.info([package_name])
    except (Exception, SystemExit):
        # Best-effort lookup: a failed query simply yields fewer output lines.
        # SystemExit stays swallowed as under the original bare `except`
        # (CLI entry points commonly exit this way — TODO confirm for pypi_cli.info),
        # but KeyboardInterrupt now propagates so a long crawl can be stopped.
        pass
    finally:
        # Always hand stdout back, even if something unexpected is raised.
        sys.stdout = previous_stdout

    output_lines = captured.getvalue().split("\n")
    if len(output_lines) > part:
        return output_lines[part]
    return None

In [15]:
get_pypi_package_description("torch")

'Tensors and Dynamic neural networks in Python with strong GPU acceleration'

In [16]:
# export


def get_pypi_repo_description(repo):
    """Look up the PyPI description for a repository given as "owner/name".

    The second "/"-separated component of `repo` is used as the package name
    and passed to `get_pypi_package_description`. A value without any "/" is
    used as the package name as-is instead of raising IndexError.

    Parameters
    ----------
    repo : str
        Repository identifier, e.g. "allenai/allennlp".

    Returns
    -------
    str or None
        Whatever `get_pypi_package_description` returns for the package name.
    """
    parts = repo.split("/")
    package_name = parts[1] if len(parts) > 1 else parts[0]
    return get_pypi_package_description(package_name)

In [17]:
get_pypi_repo_description("allenai/allennlp")

'An open-source NLP research library, built on PyTorch.'

In [18]:
import mlutil.parallel
import os

In [19]:
# export


def load_pypi_repo_descriptions(
    repos_descriptions_path="data/repo_pypi_descriptions.csv",
    repo_names=None,
):
    """Load PyPI descriptions for repositories, building the CSV cache if needed.

    When `repos_descriptions_path` exists it is read and returned. Otherwise
    `get_pypi_repo_description` is mapped over the repositories with
    `mlutil.parallel.mapp`, repositories whose description is None or empty
    are dropped, the result is written to `repos_descriptions_path`, and the
    elapsed time in minutes is printed.

    Parameters
    ----------
    repos_descriptions_path : str, optional
        Location of the CSV cache.
    repo_names : collection of str, optional
        Repositories to look up when the cache has to be built. It is iterated
        twice, so it must be re-iterable. Defaults to the module-level `repos`.

    Returns
    -------
    pandas.DataFrame
        Columns `repo` and `pypi_description`.
    """
    if os.path.exists(repos_descriptions_path):
        return pd.read_csv(repos_descriptions_path, index_col=0)

    if repo_names is None:
        # Fall back to the notebook-level `repos`. No local variable in this
        # function may be called `repos`, or this read raises UnboundLocalError.
        repo_names = repos

    t_start = time.time()
    pypi_descriptions = list(
        mlutil.parallel.mapp(get_pypi_repo_description, repo_names)
    )
    t_end = time.time()

    described = [
        (repo, desc)
        for (repo, desc) in zip(repo_names, pypi_descriptions)
        if desc is not None and desc != ""
    ]
    # Building from (repo, description) rows also handles the case where no
    # repository has a description, which `zip(*rows)` cannot unpack.
    repos_descriptions_df = pd.DataFrame(
        described, columns=["repo", "pypi_description"]
    )
    repos_descriptions_df.to_csv(repos_descriptions_path)
    print("loaded descriptions in", round((t_end - t_start) / 60, 2), "minutes")
    return repos_descriptions_df

In [22]:
repos_descriptions_df = load_pypi_repo_descriptions()

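### How long did it take to retrieve PyPI descriptions (minutes)

The elapsed time is printed by `load_pypi_repo_descriptions` in the cell above, and only when the cache file has to be rebuilt.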

In [23]:
n_repos_with_no_pypi_description = len(repos) - len(repos_descriptions_df)

### Percentage of repositories without a PyPI description

In [24]:
str(round(100 * n_repos_with_no_pypi_description / len(repos), 2)) + "%"

'27.68%'

In [25]:
repos_descriptions_df.head()

Unnamed: 0_level_0,repo,pypi_description
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,smdabdoub/phylotoast,Tools for phylogenetic data analysis including visualization and cluster-com...
1,mkouhei/bootstrap-py,Open-source algorithms for data-driven building analysis and control
2,elbow-jason/Uno-deprecated,Bootstrap Python package
3,disqus/nydus,Extremely fast and easy feature based HTML generator.
4,jay-johnson/network-pipeline,Connection utilities


In [31]:
repos_descriptions_df.shape

(8940, 2)