# Compute GitHub Stats

In [1]:
# NOTE: The RuntimeWarnings (if any) are harmless. See ContinuumIO/anaconda-issues#6678.
from pandas.io import gbq
import pandas as pd

In [2]:
import getpass
import subprocess
# Configuration Variables. Modify as desired.

PROJECT = subprocess.check_output(["gcloud", "config", "get-value", "project"]).strip().decode()

## Setup Authorization

If you are using a service account run
%%bash

# Activate Service Account provided by Kubeflow.
gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}

If you are running using user credentials

gcloud auth application-default login

In [25]:
datetime.datetime.year()

NameError: name 'datetime' is not defined

In [3]:
months = []
for year in ["2018"]:
    for month in range(1, 13):
        months.append("\"{0}{1:02}\"".format(year, month))

for year in ["2019"]:
    for month in range(1, 5):
        months.append("\"{0}{1:02}\"".format(year, month))
months

['"201801"',
 '"201802"',
 '"201803"',
 '"201804"',
 '"201805"',
 '"201806"',
 '"201807"',
 '"201808"',
 '"201809"',
 '"201810"',
 '"201811"',
 '"201812"',
 '"201901"',
 '"201902"',
 '"201903"',
 '"201904"']

# Read in user affiliations

* github_users.json is produced using CNCF scripts
* There can be multiple entries for a user showing their company & affiliation during different time periods

In [4]:
import json
import os
import requests
if not os.path.exists(".cache"):
    os.makedirs(".cache")
    

users_file = os.path.join(".cache", "github_users.json")

if not os.path.exists(users_file):
    url = "https://github.com/kubeflow/community/blob/master/devstats/data/github_users.json?raw=true"

    r = requests.get(url, allow_redirects=True)
    
    with open(users_file, "wb") as hf:
        hf.write(r.content)

with open(users_file) as hf:    
    data = json.load(hf)
users=pd.DataFrame(data)
users = users[["login", "company"]]

In [5]:
# Dedupe companies
c = ["cisco", "datawire", "google", "ibm", "intel", "teradata", "red hat"]
known_companies = dict(zip(c,c))
known_companies["redhat"] = "red hat"
def normalize_company(name):
    if name is None:
        return "None"
    name = name.strip().lower().strip("!").strip("@")
        
     
    for k, v in known_companies.items():
        if k in name:
            return v
    return name

users["company"] = users["company"].apply(normalize_company)

* Users can have multiple entries
* We pick the first non None entry
* TODO(jlewi) We should find a better way to combine multiple entries

In [6]:
def combine_company(names):
    for i in names:
        if i != "None":
            return i
    return None

user_map= users.groupby("login")["company"].apply(combine_company)

# You can now look up users as user_map[actor]
user_map["jlewi"]

'google'

## Unique PR Creators

In [7]:
query = """
SELECT
    DATE(created_at) AS pr_date,
    actor.id,
    actor.login
  FROM `githubarchive.month.*`
  WHERE
    _TABLE_SUFFIX IN ({0})
    AND type = 'PullRequestEvent'
    AND org.login = 'kubeflow'
    AND JSON_EXTRACT(payload, '$.action') IN ('"opened"')
""".format(",".join(months))

prs=gbq.read_gbq(str(query), dialect='standard', project_id=PROJECT)

  progress_bar_type=progress_bar_type,


In [8]:
p=pd.Series(data=prs["id"].values,index=prs["pr_date"])
p=p.sort_index()

In [9]:
prs

Unnamed: 0,pr_date,id,login
0,2018-04-01,777219,jlewi
1,2018-04-02,697528,inc0
2,2018-04-02,777219,jlewi
3,2018-04-03,4869572,jose5918
4,2018-04-03,7599217,mhbuehler
...,...,...,...
4341,2019-04-29,1829149,Ark-kun
4342,2019-04-29,1829149,Ark-kun
4343,2019-04-29,34456002,rileyjbauer
4344,2019-03-15,777219,jlewi


In [10]:
prs["company"] = user_map[prs["login"]].values

In [11]:
d=prs[["pr_date", "company"]]
d["count"]=1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
pr_counts = d.pivot_table("count", columns="company", index="pr_date", aggfunc="sum", fill_value=0)

In [13]:
# Some solutions here: https://stackoverflow.com/questions/46470743/how-to-efficiently-compute-a-rolling-unique-count-in-a-pandas-time-series
# Need to figure out how to do a time based window

counts = pr_counts.rolling('28d').sum()

In [17]:
counts.columns

Index(['agile stacks', 'agilestacks', 'ai lab, cs, pu', 'airbnb', 'alauda.io',
       'alibaba', 'alibaba cloud', 'alibaba.com', 'alipay', 'ant financial',
       'arrikto', 'aws', 'blackduck software', 'bolcom', 'caicloud',
       'canonical', 'cern', 'cisco', 'connected-io', 'cornell',
       'crowdflower.com', 'daocloud', 'datatonic', 'datawire', 'doc-ai',
       'docker, inc', 'elementai', 'generalassembly @maishelf', 'github',
       'gojekindonesia', 'google', 'grab', 'huawei', 'huawei-paas', 'ibm',
       'intel', 'inwinstack', 'jd.com', 'kesci', 'kredoai',
       'kubernetes @meituan-dianping', 'kumulus technologies',
       'linkedin, findsimilarmovies.com', 'lsa lab, cs, nthu', 'maxkelsen',
       'mercari inc.', 'microsoft', 'momenta', 'muxinc', 'nanjing university',
       'nearmap', 'ntt laboratories', 'nvidia', 'ocelot uproar',
       'one convergence', 'openshift', 'pachyderm', 'pinterest',
       'preferred networks, inc.', 'publicis worldwide', 'red hat',
       'sada 

In [23]:
counts_df = pd.DataFrame({"day": counts["google"].index, "google": counts["google"].values})
import altair as alt
chart = alt.Chart(counts_df, title= "PRs")
line = chart.mark_line().encode(
  x= alt.X('day', title = "Day"),
  y=alt.Y("google", title="# PRs"),    
)

point = line + line.mark_point()
point.interactive()

In [14]:
# Use plotly cufflinks to plot data frames
# https://plot.ly/ipython-notebooks/cufflinks/
# instructions for offline plotting
# https://plot.ly/python/getting-started/#initialization-for-offline-plotting
#
# Follow the instructions for online plotting:
# https://plot.ly/python/getting-started/
# You will need to setup an account
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf
#from importlib import reload
import itertools

ModuleNotFoundError: No module named 'plotly'

In [None]:
trace = go.Pie(labels=counts.columns, values=counts.iloc[-1], title="PRs Created Last 28 days")
py.iplot([trace], filename='basic_pie_chart')

In [None]:
d=counts.iloc[-1]
total = d.sum()
google_prs= d["google"]
other = total - google_prs

In [None]:

trace = go.Pie(labels=["google", "other"], values=[google_prs, other], title="PRs Created Last 28 days")
py.iplot([trace], filename='basic_pie_chart')