In [1]:
import json
import os
import random
import time

import pandas as pd
import requests
from pymongo import MongoClient

In [5]:
def github_request(params, headers, per_page=30, page=1, show_head=False, show_body=False, is_write=False, timeout=30):
    """ Description: perform one paginated GET request against the GitHub API
        - params: a list of URL path segments, joined with '/' after https://api.github.com/
        - headers: a dict of request headers
        - per_page: default 30 records per page (sent as the `per_page` query parameter)
        - page: default starts from page 1 (sent as the `page` query parameter)
        - show_head: default False, print the response headers when True
        - show_body: default False, pretty-print the response body when True
        - is_write: default False; when True the body is written to
          '<params joined with ".">.json' in the current directory
        - timeout: seconds passed to requests.get so a stalled connection cannot
          block the notebook forever (default 30)
        - return: a tuple (status, body) where status is a string such as '200 OK'
          and body is the decoded JSON response
    """
    url = "https://api.github.com/" + '/'.join(params) + '?per_page={}&page={}'.format(per_page, page)
    print('requesting: ' + url)
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode the payload once and reuse it; each response.json() call re-parses the body.
    body = response.json()
    if show_body:
        print(json.dumps(body, indent=1))
        print('------------------------------------')
    if show_head:
        for (k, v) in response.headers.items():
            print('%s => %s' % (k, v))
        print('------------------------------------')
    if is_write:
        # The file name is only needed when something is actually written.
        file_name = '.'.join(params) + '.json'
        with open(file_name, 'w') as jsonfile:
            json.dump(body, jsonfile)
    print('total records of this request: {}'.format(len(body)))
    # The 'Status' header is not guaranteed to be present (NOTE(review): GitHub appears
    # to have stopped sending it -- confirm). Rebuild the same "<code> <reason>" form
    # from the response itself so callers comparing against e.g. '200 OK' keep working.
    status = response.headers.get('Status')
    if status is None:
        status = '{} {}'.format(response.status_code, response.reason)
    return status, body

In [6]:
def dump_mongo(db_url, db_name, params, headers):
    """ Description: page through a GitHub API listing and store every page in mongodb
        - db_url: mongodb url
        - db_name: mongodb database name
        - params: github api params (path segments); also used, joined with '.',
          as the name of the target collection
        - headers: github api headers

        Pages of 100 records are requested one after another, with a one second
        pause between requests. Paging stops at the first response that is not a
        non-empty JSON list with a 200 status. The client connection is closed
        even when a request or insert raises.
    """
    mongocli = MongoClient(db_url)  # init mongodb client
    start_time = time.time()
    try:
        mongodb = mongocli[db_name]  # select the target database
        collection_name = '.'.join(params)
        page = 1
        while True:
            status, json_body = github_request(params, headers, per_page=100, page=page)
            print('page {} : status: {}\n'.format(page, status))
            # Error responses (401/403/404/422 ...) carry a JSON *object* such as
            # {"message": ...}. Its len() is non-zero, so checking only for 422 or an
            # empty body would insert the error document and keep paging forever.
            is_success = status.startswith('200')
            if not is_success or not isinstance(json_body, list) or len(json_body) == 0:
                break
            # NOTE(review): Collection.insert is deprecated in PyMongo 3 in favour of
            # insert_many; kept as-is so the required driver version does not change.
            mongodb[collection_name].insert(json_body)
            page += 1
            time.sleep(1)  # pause between consecutive API requests
    finally:
        mongocli.close()  # close connection, also on failure
    elapsed_time = time.time() - start_time
    print('------------------------------------')
    print('completed: {}'.format(elapsed_time))
    

In [7]:
# SECURITY: a GitHub access token used to be hard-coded in this cell. Anything
# committed with the notebook must be treated as leaked -- revoke that token and
# supply a fresh one through the environment instead.
ACCESS_TOKEN = os.environ.get('GITHUB_ACCESS_TOKEN')
if not ACCESS_TOKEN:
    # Fail here, with a clear message, rather than sending requests with no usable token.
    raise RuntimeError('Set the GITHUB_ACCESS_TOKEN environment variable to a GitHub access token')

In [8]:
# Target MongoDB instance and database that receive the dumped API pages.
db_name = 'activities'
db_url = 'ec2-54-67-97-244.us-west-1.compute.amazonaws.com:27017'

# Headers sent with every GitHub request: token authentication plus the
# 'star+json' media type.
# NOTE(review): per the GitHub stargazers docs this media type adds the starring
# timestamp to each record -- confirm.
headers = {
    'Accept': 'application/vnd.github.v3.star+json',
    'Authorization': 'token {0}'.format(ACCESS_TOKEN),
}

In [9]:
# Load the repository table from the CSV file shipped under ./data.
repos = pd.read_csv(filepath_or_buffer='./data/repos.csv')

In [12]:
# Keep only the two columns needed to address a repository in the API path.
portfolio_columns = ['org_id', 'name']
repo_portfolio = repos[portfolio_columns]

In [15]:
# One API path per repository, as a list of segments:
# ['repos', <org_id>, <name>, 'stargazers'].
starring_list = [
    ['repos', record['org_id'], record['name'], 'stargazers']
    for _idx, record in repo_portfolio.iterrows()
]

In [17]:
# Trial run: only the first stargazers endpoint in the list is dumped.
for starring in starring_list[:1]:
    print(starring)
    dump_mongo(db_url, db_name, starring, headers)
    print('')

['repos', 'google', 'upb', 'stargazers']
requesting: https://api.github.com/repos/google/upb/stargazers?per_page=100&page=1
total records of this request: 100
page 1 : status: 200 OK

requesting: https://api.github.com/repos/google/upb/stargazers?per_page=100&page=2
total records of this request: 100
page 2 : status: 200 OK

requesting: https://api.github.com/repos/google/upb/stargazers?per_page=100&page=3
total records of this request: 100
page 3 : status: 200 OK

requesting: https://api.github.com/repos/google/upb/stargazers?per_page=100&page=4
total records of this request: 100
page 4 : status: 200 OK

requesting: https://api.github.com/repos/google/upb/stargazers?per_page=100&page=5
total records of this request: 100
page 5 : status: 200 OK

requesting: https://api.github.com/repos/google/upb/stargazers?per_page=100&page=6
total records of this request: 100
page 6 : status: 200 OK

requesting: https://api.github.com/repos/google/upb/stargazers?per_page=100&page=7
total records of t