In [1]:
# default_exp github_crawling

In [2]:
# export
import pandas as pd
import requests
import json
from operator import itemgetter
import tqdm

import os
import tqdm
import glob
from git.repo.base import Repo
import pathlib
from github_search import paperswithcode_tasks

import itertools
import codecs
import concurrent.futures

In [3]:
%cd ..

/home/kuba/Projects/github_search


# Crawling github

Getting Python files from github repos

Before running that code put your github auth token in data/token.txt. Doing this will increase rate limit from 60 to 5000 calls per hour.

`_get_tree` gets file tree from github repository

`_get_python_files` retrieves Python files from a given repository: it returns tuples consisting of

owner, repo_name, file path, file content, hash of file content


`get_python_files_df` puts information described above into dataframe with appropriate column names 

In [4]:
# export

token = open("data/token.txt", "r").read().strip()


def _get_tree(owner, repo_name):
    url_template = "https://api.github.com/repos/{}/{}/git/trees/master?recursive=1"
    headers = {"Authorization": "token " + token}
    url = url_template.format(owner, repo_name)
    return requests.get(url, headers=headers)


def _get_python_files(owner, repo_name):
    try:
        files = json.loads(_get_tree(owner, repo_name).text)["tree"]
        for maybe_file in files:
            path = maybe_file["path"]
            extension = path.split(".")[-1]
            is_valid_file = (
                extension == "py"
            )  # or os.path.basename(path) == 'README.md'
            if maybe_file["type"] == "blob" and is_valid_file:
                path = maybe_file["path"]
                raw_file_url_template = (
                    "https://raw.githubusercontent.com/{}/{}/master/{}"
                )
                raw_file_url = raw_file_url_template.format(owner, repo_name, path)
                yield owner, repo_name, path, requests.get(
                    raw_file_url
                ).text, maybe_file["sha"]

    except (KeyError, json.JSONDecodeError) as e:
        print("failed for {}/{}".format(owner, repo_name))
        print(type(e))
        print(e)
        return
        yield

In [5]:
token

'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'

In [6]:
example_owner = "lambdaofgod"
example_repo = "mlutil"

Getting python files should return empty list when ran on nonexisting repository

Getting python files should return correct number of files for an existing repository 

In [7]:
python_file_tuples = list(_get_python_files("lambdaofgod", "findkit"))

In [8]:
assert len(python_file_tuples) == 26

In [9]:
# export


def _make_python_files_df(file_tuples):
    if len(file_tuples) > 0:
        df = pd.DataFrame.from_records(file_tuples)
        df.columns = ["owner", "repo_name", "file_path", "content", "sha"]
        return df
    else:
        return pd.DataFrame({})


def get_python_files_df(owner, repo_name):
    file_tuples = list(_get_python_files(owner, repo_name))
    return _make_python_files_df(file_tuples)

In [10]:
df_cols = list(_make_python_files_df(python_file_tuples).columns)
assert df_cols == ["owner", "repo_name", "file_path", "content", "sha"]

## Brute force - clone repos

In [11]:
!mkdir -p repos

In [12]:
import subprocess

In [13]:
# export
DST_PREFIX = "/media/kuba/167A50387A5016B9/repos/repos"


def clone_repo(destination_prefix, repo_name):
    dest_path = os.path.join(destination_prefix, repo_name)
    if not os.path.exists(dest_path):
        proc = subprocess.Popen(
            [
                "git",
                "clone",
                "https://{}:x-oauth-basic@github.com/{}".format(token, repo_name),
                dest_path,
            ]
        )
        for p in pathlib.Path(dest_path).rglob("*"):
            if os.path.isfile(p) and not (
                str(p).endswith(".py") or str(p).endswith(".md")
            ):
                os.remove(p)
        proc.wait()


def clone_repos(repos, max_workers=2, destination_prefix=DST_PREFIX):
    def _clone_repo(repo_name):
        try:
            clone_repo(destination_prefix, repo_name)
        except Exception as e:
            print(e)

    results = list(tqdm.tqdm(map(_clone_repo, repos), total=len(repos)))

In [14]:
def get_python_files_generator_from_directory(dir_path):
    for p in pathlib.Path(dir_path).rglob("*.py"):
        try:
            with codecs.open(p, "r", encoding="utf-8") as f:
                path = str(p)
                contents = f.read().strip().encode("utf-8").decode("utf-8")
                if len(contents) > 0:
                    yield path, contents
        except (FileNotFoundError, IsADirectoryError, UnicodeDecodeError):
            pass


def make_python_files_df_from_directory(dir_path):
    try:
        repo_name = "/".join(dir_path.split("/")[-2:])
        paths_with_contents = list(
            zip(*list(get_python_files_generator_from_directory(dir_path)))
        )
        if len(paths_with_contents) == 2:
            paths, contents = zip(
                *list(get_python_files_generator_from_directory(dir_path))
            )
            df = pd.DataFrame({"content": contents, "path": paths})
            df["path"] = df["path"].str.replace(dir_path + "/", "")
            df["repo_name"] = repo_name
            return df
    except:
        return None

In [15]:
def check_if_python_files_exist(dir_prefix, repo):
    maybe_python_files = itertools.islice(
        (pathlib.Path(dir_prefix) / repo).rglob("*.py"), 1
    )
    return len(list(maybe_python_files)) > 0

In [16]:
paperswithcode_df, all_papers_df = paperswithcode_tasks.get_paperswithcode_dfs()

  paperswithcode_df['repo'] = paperswithcode_df['repo_url'].str.replace('https://github.com/', '')


In [17]:
paperswithcode_df

Unnamed: 0,paper_url,paper_title,paper_arxiv_id,paper_url_abs,paper_url_pdf,repo_url,mentioned_in_paper,mentioned_in_github,framework,repo
0,https://paperswithcode.com/paper/automatic-pos...,Automatic Post-Editing of Machine Translation:...,,https://www.aclweb.org/anthology/D18-1341/,https://www.aclweb.org/anthology/D18-1341,https://github.com/trangvu/ape-npi,False,False,tf,trangvu/ape-npi
1,https://paperswithcode.com/paper/deep-transfer...,Deep Transferring Quantization,,https://www.ecva.net/papers/eccv_2020/papers_E...,https://www.ecva.net/papers/eccv_2020/papers_E...,https://github.com/xiezheng-cs/DTQ,True,False,pytorch,xiezheng-cs/DTQ
2,https://paperswithcode.com/paper/batch-bayesia...,Batch Bayesian Optimization via Multi-objectiv...,,https://icml.cc/Conferences/2018/Schedule?show...,http://proceedings.mlr.press/v80/lyu18a/lyu18a...,https://github.com/Alaya-in-Matrix/MACE,True,False,none,Alaya-in-Matrix/MACE
3,https://paperswithcode.com/paper/semantic-inst...,Semantic Instance Segmentation with a Discrimi...,1708.02551,http://arxiv.org/abs/1708.02551v1,http://arxiv.org/pdf/1708.02551v1.pdf,https://github.com/harryhan618/LaneNet,False,True,pytorch,harryhan618/LaneNet
4,https://paperswithcode.com/paper/misbehaviour-...,Misbehaviour Prediction for Autonomous Driving...,1910.04443,https://arxiv.org/abs/1910.04443v1,https://arxiv.org/pdf/1910.04443v1.pdf,https://github.com/testingautomated-usi/selfor...,False,True,tf,testingautomated-usi/selforacle
...,...,...,...,...,...,...,...,...,...,...
92580,https://paperswithcode.com/paper/on-matrix-mod...,On matrix-model approach to simplified Khovano...,1506.07516,http://arxiv.org/abs/1506.07516v2,http://arxiv.org/pdf/1506.07516v2.pdf,https://github.com/mabragor/cl-vknots,True,True,none,mabragor/cl-vknots
92581,https://paperswithcode.com/paper/next-to-minim...,Next-to-Minimal SOFTSUSY,1311.7659,http://arxiv.org/abs/1311.7659v5,http://arxiv.org/pdf/1311.7659v5.pdf,https://github.com/Expander/FlexibleSUSY,True,True,none,Expander/FlexibleSUSY
92582,https://paperswithcode.com/paper/hierarchical-...,Hierarchical Question-Image Co-Attention for V...,1606.00061,http://arxiv.org/abs/1606.00061v5,http://arxiv.org/pdf/1606.00061v5.pdf,https://github.com/jiasenlu/HieCoAttenVQA,True,True,torch,jiasenlu/HieCoAttenVQA
92583,https://paperswithcode.com/paper/ctc-based-com...,CTC-based Compression for Direct Speech Transl...,2102.01578,https://arxiv.org/abs/2102.01578v1,https://arxiv.org/pdf/2102.01578v1.pdf,https://github.com/mgaido91/FBK-fairseq-ST,True,False,pytorch,mgaido91/FBK-fairseq-ST


In [18]:
paperswithcode_df["repo"].unique().size

72111

In [19]:
paperswithcode_df["repo_url"].str.contains("bitbucket").sum()

311

In [20]:
paperswithcode_df["repo_url"].str.contains("github").mean()

0.9901495922665658

In [21]:
clone_repos(paperswithcode_df["repo"])

  0%|          | 411/92585 [00:10<38:04, 40.34it/s]  


KeyboardInterrupt: 

###### 

In [None]:
!pwd

In [22]:
def prepare_python_files_df(repos_dir, repos):
    paperswithcode_df, all_papers_df = paperswithcode_tasks.get_paperswithcode_dfs()
    dfs = [
        make_python_files_df_from_directory(os.path.join(repos_dir, repo))
        for repo in tqdm.tqdm_notebook(repos)
        if check_if_python_files_exist(repos_dir, repo)
    ]
    return dfs

In [23]:
_tst_df = make_python_files_df_from_directory(
    os.path.join(DST_PREFIX, paperswithcode_df["repo"].iloc[0])
)

In [24]:
import glob

In [25]:
repos = glob.glob(os.path.join(DST_PREFIX, "*/*"))

In [26]:
python_files_dfs = prepare_python_files_df(DST_PREFIX, paperswithcode_df["repo"])

  paperswithcode_df['repo'] = paperswithcode_df['repo_url'].str.replace('https://github.com/', '')
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for repo in tqdm.tqdm_notebook(repos)


  0%|          | 0/92585 [00:00<?, ?it/s]

  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] = df['path'].str.replace(dir_path + "/", "")
  df['path'] =

In [29]:
len([x for x in python_files_dfs if x is not None])

70415

In [32]:
len(python_files_dfs)  # ['repo'].unique().shape

70459

In [33]:
paperswithcode_df["repo"].drop_duplicates()[
    ~paperswithcode_df["repo"].drop_duplicates().isin(python_files_df["repo_name"])
]

NameError: name 'python_files_df' is not defined

In [34]:
python_files_df["repo_name"].unique().size

NameError: name 'python_files_df' is not defined

In [35]:
for df in python_files_dfs:
    if df is None:
        pass
    else:
        df["content"] = df["content"].str.replace("\r", "\n")

In [None]:
len(python_files_dfs)

In [None]:
dfs

In [37]:
for df in tqdm.tqdm(python_files_dfs):
    if df is None:
        pass
    else:
        df.to_csv(
            "data/all_crawled_python_files.csv", index=False, encoding="utf-8", mode="a"
        )

100%|██████████| 70459/70459 [04:39<00:00, 252.06it/s]


In [1]:
import pandas as pd

%cd ..

/home/kuba/Projects/github_search


In [2]:
%%time
python_files_df = pd.read_csv("data/all_crawled_python_files.csv")

CPU times: user 3min 56s, sys: 18 s, total: 4min 14s
Wall time: 4min 22s




In [None]:
file_lengths = python_files_df["content"].str.split().apply(len)

In [5]:
python_files_df.dropna(inplace=True)

In [9]:
python_files_df.index = pd.RangeIndex(0, len(python_files_df))

In [2]:
import pandas as pd

python_files_df = pd.read_feather("../data/all_crawled_python_files.feather")

In [5]:
python_files_df["repo_name"].unique().size

53690

In [6]:
python_files_df.shape

(3169680, 3)

In [10]:
%%time
python_files_df[["content", "path", "repo_name"]].to_feather(
    "data/all_crawled_python_files.feather"
)  # .to_hdf("data/all_crawled_python_files.h5", "data")

CPU times: user 35.4 s, sys: 18.5 s, total: 53.9 s
Wall time: 52.2 s


In [None]:
%%time
python_files_df.to_csv(
    "data/all_crawled_python_files.csv"
)  # .to_hdf("data/all_crawled_python_files.h5", "data")

In [None]:
%%time
python_files_df = pd.read_csv(
    "data/all_crawled_python_files.csv"
)  # .to_hdf("data/all_crawled_python_files.h5", "data")

In [None]:
python_files_df["repo_name"].unique().size

In [None]:
python_files_df

In [None]:
len(python_files_df["repo_name"])

In [None]:
(python_files_df["repo_name"].value_counts().cumsum() / len(python_files_df)).iloc[:25]

In [None]:
python_files_df["repo_name"].value_counts()[:20].sum() / len(python_files_df)

In [None]:
python_files_df.to_csv(
    "data/all_crawled_python_files.csv", index=False, encoding="utf-8"
)

In [None]:
%%time
python_files_df[file_lengths < 1e5].to_csv(
    "data/crawled_python_files.csv", index=False, encoding="utf-8"
)

In [None]:
python_files_df[file_lengths < 1e5].shape

In [None]:
(file_lengths < 1e5).sum()

In [None]:
len(python_files_df.iloc[1281222]['content']

In [None]:
(file_lengths > 1e6).mean()

In [None]:
len(python_files_df)

In [None]:
repo = "CaitinZhao/cvpr2019_Pyramid-Feature-Attention-Network-for-Saliency-detection"

In [None]:
python_files_df[python_files_df["content"].str.contains("~")].sum()

In [None]:
python_files_df[python_files_df["repo_name"] == repo].iloc[3]["content"]

In [None]:
python_files_df.head()

In [None]:
repo_files_df = pd.read_csv("data/scraped_python_files.csv")

In [None]:
repo_files_df["repo_name"].unique()

In [None]:
repo_files.to_csv("data/scraped_python_files.csv", index=False)

In [None]:
repo_files_df["repo_name"]

In [None]:
import_corpus_df = pd.read_csv("output/module_corpus.csv")  # ['repo'].unique().shape

In [None]:
python_files_df = repo_files_df

In [None]:
import ast

In [None]:
repo_names = python_files_df["repo_name"]
paperswithcode_df, all_papers_df = paperswithcode_tasks.get_paperswithcode_dfs()
papers_with_repo_df = paperswithcode_tasks.get_papers_with_repo_df(
    all_papers_df, paperswithcode_df, repo_names
)
papers_with_repo_df = paperswithcode_tasks.get_papers_with_biggest_tasks(
    papers_with_repo_df, 500
)

In [None]:
import_corpus_df["imports"] = import_corpus_df["imports"].apply(ast.literal_eval)
per_repo_imports = import_corpus_df.groupby("repo")["imports"].agg(sum).apply(set)

In [None]:
per_repo_imports.shape

In [None]:
paperswithcode_with_imports_df = get_paperswithcode_with_imports_df(
    papers_with_repo_df, per_repo_imports
)

In [None]:
ps = list(pathlib.Path("repos/lambdaofgod/examples-counterexamples").rglob("*"))

In [None]:
%cd ..