In [1]:
#default_exp github_crawling

In [2]:
#export
import json
from operator import itemgetter
from urllib.parse import quote

import pandas as pd
import requests
import tqdm

# Crawling github

Getting Python files from github repos

Before running this code, put your GitHub auth token in `data/token.txt`. Doing so raises the API rate limit from 60 to 5000 calls per hour.

`_get_tree` gets the file tree of a GitHub repository

`_get_python_files` retrieves Python files from a given repository: it yields tuples consisting of

owner, repo_name, file path, file content, SHA of the file (the `sha` field of its entry in the repository tree)


`get_python_files_df` puts the information described above into a DataFrame with appropriate column names

In [3]:
# Scratch check of where the exception class lives: `json.JSONDecodeError` is the class defined in `json.decoder`.
json.JSONDecodeError

json.decoder.JSONDecodeError

In [4]:
#export

# GitHub auth token, read once when this cell runs; surrounding whitespace
# (e.g. a trailing newline) is stripped. The context manager makes sure the
# file handle is closed instead of being left to the garbage collector.
with open('../data/token.txt', 'r') as token_file:
    token = token_file.read().strip()


def _get_tree(owner, repo_name):
    """Request the recursive git tree of a repository's ``master`` branch.

    The call goes to the GitHub API and is authenticated with the module-level
    ``token``. The ``requests`` response object is returned as-is; its status
    code is not inspected here.
    """
    tree_url = 'https://api.github.com/repos/{}/{}/git/trees/master?recursive=1'.format(
        owner, repo_name
    )
    auth_headers = {'Authorization': 'token ' + token}
    return requests.get(tree_url, headers=auth_headers)


def _get_python_files(owner, repo_name):
    """Yield the Python files found on the ``master`` branch of a repository.

    The repository tree is fetched with ``_get_tree``; every ``blob`` entry
    whose path ends in ``.py`` is then downloaded from
    raw.githubusercontent.com.

    Yields:
        Tuples ``(owner, repo_name, path, content, sha)`` where ``content`` is
        the text of the raw-file response and ``sha`` is the ``sha`` field of
        the tree entry.

    If the tree response is not valid JSON, or an expected key is missing
    (e.g. there is no ``'tree'`` key because the repository does not exist),
    a message is printed and the generator ends without raising.
    """
    raw_file_url_template = 'https://raw.githubusercontent.com/{}/{}/master/{}'
    try:
        files = json.loads(_get_tree(owner, repo_name).text)['tree']
        for maybe_file in files:
            path = maybe_file['path']
            if maybe_file['type'] == 'blob' and path.endswith('.py'):
                # Percent-encode the path so characters such as '#' or '?' stay
                # part of the path instead of being parsed as a URL fragment
                # or query string ('/' is left untouched by quote()).
                raw_file_url = raw_file_url_template.format(owner, repo_name, quote(path))
                yield owner, repo_name, path, requests.get(raw_file_url).text, maybe_file['sha']
    except (KeyError, json.JSONDecodeError) as e:
        print('failed for {}/{}'.format(owner, repo_name), e)

In [5]:
# Example repository coordinates (owner and repository name).
# NOTE(review): the cells visible here pass literal strings instead of these names — confirm they are used elsewhere.
example_owner = 'lambdaofgod'
example_repo = 'mlutil'

Getting Python files should return an empty list when run on a nonexistent repository

In [6]:
# For this repository the tree response has no 'tree' key (see the printed output), so the
# failure is reported via print rather than raised and the generator yields nothing.
assert list(_get_python_files('lambdaofgod', 'foo')) == []

failed for lambdaofgod/foo 'tree'


Getting Python files should return the correct number of files for an existing repository

In [7]:
# Performs live HTTP requests; the resulting list is reused by a later cell.
# NOTE(review): 26 presumably matches the repository's contents when this was written — it will drift if the repo changes.
python_file_tuples = list(_get_python_files('lambdaofgod', 'findkit'))
assert len(python_file_tuples) == 26 

In [8]:
#export


def _make_python_files_df(file_tuples):
    if len(file_tuples) > 0:
        df = pd.DataFrame.from_records(file_tuples)
        df.columns = ['owner', 'repo_name', 'file_path', 'content', 'sha']
        return df
    else:
        return pd.DataFrame({})


def get_python_files_df(owner, repo_name):
    """Collect the Python files of ``owner/repo_name`` into a DataFrame.

    The generator returned by ``_get_python_files`` is drained into a list and
    handed to ``_make_python_files_df``, whose result is returned unchanged.
    """
    return _make_python_files_df(list(_get_python_files(owner, repo_name)))

In [9]:
# The frame built from the fetched tuples must expose exactly these column names, in this order.
df_cols = list(_make_python_files_df(python_file_tuples).columns)
assert df_cols == ['owner', 'repo_name', 'file_path', 'content', 'sha']