# Compute GitHub Stats

In [7]:
# NOTE: The RuntimeWarnings (if any) are harmless. See ContinuumIO/anaconda-issues#6678.
from pandas.io import gbq
import pandas as pd

In [3]:
import getpass
import subprocess
# Configuration variables. Modify as desired.

# Default PROJECT to the project the local gcloud CLI is currently configured with.
_gcloud_project_cmd = ["gcloud", "config", "get-value", "project"]
_gcloud_project_raw = subprocess.check_output(_gcloud_project_cmd)
PROJECT = _gcloud_project_raw.strip().decode()

## Setup Authorization

If you are using a service account, run:
%%bash

# Activate the service account provided by Kubeflow.
# The variable is quoted so that a key-file path containing spaces or glob
# characters is passed to gcloud as a single argument.
gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"

If you are running with user credentials, run:

gcloud auth application-default login

In [4]:
# Quoted YYYYMM strings: every month of 2018 followed by January-April 2019.
# Each pair is (year, exclusive upper bound of the month range for that year).
_year_month_stops = [("2018", 13), ("2019", 5)]
months = [
    "\"{0}{1:02}\"".format(yr, mo)
    for yr, stop in _year_month_stops
    for mo in range(1, stop)
]
months

['"201801"',
 '"201802"',
 '"201803"',
 '"201804"',
 '"201805"',
 '"201806"',
 '"201807"',
 '"201808"',
 '"201809"',
 '"201810"',
 '"201811"',
 '"201812"',
 '"201901"',
 '"201902"',
 '"201903"',
 '"201904"']

# Read in user affiliations

* github_users.json is produced using CNCF scripts
* There can be multiple entries for a user showing their company & affiliation during different time periods

In [30]:
import json
import os
import requests
# Keep a local copy of the affiliations file so re-running the notebook does not
# download it again.
os.makedirs(".cache", exist_ok=True)

users_file = os.path.join(".cache", "github_users.json")

if not os.path.exists(users_file):
    url = "https://github.com/kubeflow/community/blob/master/devstats/data/github_users.json?raw=true"

    # A timeout prevents the cell from hanging forever on a stalled connection.
    r = requests.get(url, allow_redirects=True, timeout=60)
    # Fail here on an HTTP error; otherwise the error page would be written to the
    # cache and every later run would fail in json.load until the file is deleted.
    r.raise_for_status()

    with open(users_file, "wb") as hf:
        hf.write(r.content)

# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(users_file, encoding="utf-8") as hf:
    data = json.load(hf)

# Keep only the columns used for affiliation lookups. The explicit copy makes
# `users` an independent frame, so assigning to its columns later does not raise
# SettingWithCopyWarning.
users = pd.DataFrame(data)[["login", "company"]].copy()

In [126]:
# Dedupe companies
c = ["cisco", "datawire", "google", "ibm", "intel", "teradata", "red hat"]
# Maps a substring to look for in a company name to the canonical company name.
known_companies = dict(zip(c,c))
known_companies["redhat"] = "red hat"
def normalize_company(name):
    """Return a canonical, lower-case company name for a raw affiliation value.

    Args:
        name: Raw company value. Anything that is not a string (None, NaN, ...)
            is treated as "no company".

    Returns:
        The string "None" when there is no usable company: a non-string value,
        a blank string, or the "None" placeholder itself (so applying this
        function twice gives the same result as applying it once). Otherwise
        the canonical name of the first entry of known_companies whose key
        occurs as a substring of the cleaned name, or the cleaned name itself
        when no known company matches.
    """
    if not isinstance(name, str):
        return "None"
    name = name.strip().lower().strip("!").strip("@")

    # A blank value, or a previously produced placeholder, carries no company.
    if not name or name == "none":
        return "None"

    # NOTE: substring match, so e.g. "google cloud" -> "google"; a name that
    # merely contains a known key is also mapped to that company.
    for k, v in known_companies.items():
        if k in name:
            return v
    return name

# Replace every raw company value with its normalized form.
company_normalized = users["company"].apply(normalize_company)
users["company"] = company_normalized

* Users can have multiple entries
* We pick the first entry whose company is not None
* TODO(jlewi) We should find a better way to combine multiple entries

In [127]:
def combine_company(names):
    """Return the first item of `names` that is not the string "None".

    Returns None when `names` is empty or contains only "None" placeholders.
    """
    return next((candidate for candidate in names if candidate != "None"), None)

# Collapse the company rows of each login into a single value per login.
companies_by_login = users.groupby("login")["company"]
user_map = companies_by_login.apply(combine_company)

# user_map is indexed by login, so user_map[actor] gives that actor's company.
user_map["jlewi"]

'google'

## Unique PR Creators

In [5]:
# Select date, actor id and actor login for every PullRequestEvent with action
# "opened" in the kubeflow org, restricted to the monthly tables listed in `months`.
table_suffixes = ",".join(months)
query = """
SELECT
    DATE(created_at) AS pr_date,
    actor.id,
    actor.login
  FROM `githubarchive.month.*`
  WHERE
    _TABLE_SUFFIX IN ({0})
    AND type = 'PullRequestEvent'
    AND org.login = 'kubeflow'
    AND JSON_EXTRACT(payload, '$.action') IN ('"opened"')
""".format(table_suffixes)

# Run the query with standard SQL against the configured project.
prs = gbq.read_gbq(query, dialect='standard', project_id=PROJECT)



In [8]:
# Actor ids of the opened PRs, indexed by PR date and sorted by that date.
p = pd.Series(data=prs["id"].values, index=prs["pr_date"]).sort_index()

In [10]:
# Display the PR events returned by the BigQuery query.
prs

Unnamed: 0,pr_date,id,login
0,2018-09-02,608782,everpeace
1,2018-09-02,10945651,Akado2009
2,2018-09-02,5247283,xyhuang
3,2018-09-01,777219,jlewi
4,2018-09-02,5247283,xyhuang
5,2018-09-03,5100735,gaocegege
6,2018-09-03,59893,holdenk
7,2018-09-03,3724388,cheyang
8,2018-09-10,10945651,Akado2009
9,2018-09-12,37601826,kunmingg


In [128]:
# Attach each PR author's company by looking the login up in user_map.
# Series.map yields NaN for logins that are absent from user_map, whereas indexing
# user_map with the whole login column raises KeyError for missing labels.
# Authors without a known company are labelled "unknown" so that their PRs are
# still counted when the data is grouped by company.
prs["company"] = prs["login"].map(user_map).fillna("unknown")

In [129]:
# One row per PR with a constant count of 1, ready to be summed per date and company.
# assign() returns a new frame, so nothing is written into a slice of `prs`
# (which is what raises SettingWithCopyWarning).
d = prs[["pr_date", "company"]].assign(count=1)

In [130]:
# Sum the per-PR counts into one row per PR date and one column per company;
# date/company combinations with no PRs are filled with 0.
pr_counts = pd.pivot_table(
    d,
    values="count",
    index="pr_date",
    columns="company",
    aggfunc="sum",
    fill_value=0,
)

In [131]:
# Trailing 28-day sum of the per-company PR counts, using a time-based window.
# This sums PRs; a rolling count of *unique* PR creators is not computed here.
# Possible approaches for that:
# https://stackoverflow.com/questions/46470743/how-to-efficiently-compute-a-rolling-unique-count-in-a-pandas-time-series

rolling_window = pr_counts.rolling('28d')
counts = rolling_window.sum()

In [132]:
# Use plotly cufflinks to plot data frames
# https://plot.ly/ipython-notebooks/cufflinks/
# instructions for offline plotting
# https://plot.ly/python/getting-started/#initialization-for-offline-plotting
#
# Follow the instructions for online plotting:
# https://plot.ly/python/getting-started/
# You will need to setup an account
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf
#from importlib import reload
import itertools

In [134]:
# Pie chart of the last row of `counts`, i.e. the most recent 28-day totals per company.
latest_counts = counts.iloc[-1]
trace = go.Pie(labels=counts.columns, values=latest_counts, title="PRs Created Last 28 days")
py.iplot([trace], filename='basic_pie_chart')

In [139]:
# Split the most recent 28-day totals into Google's PRs and everyone else's.
d = counts.iloc[-1]
google_prs = d.loc["google"]
total = d.sum()
other = total - google_prs

In [140]:

# Pie chart of Google's PRs versus everyone else's for the most recent 28-day window.
# The filename is distinct from the per-company pie ('basic_pie_chart') so that
# saving this chart does not overwrite that one in the plotly account.
trace = go.Pie(labels=["google", "other"], values=[google_prs, other], title="PRs Created Last 28 days")
py.iplot([trace], filename='google_vs_other_pie_chart')


Consider using IPython.display.IFrame instead

