# Compute GitHub Stats

In [19]:
# NOTE: The RuntimeWarnings (if any) are harmless. See ContinuumIO/anaconda-issues#6678.
from pandas.io import gbq

In [20]:
import getpass
import subprocess
# Configuration variables. Modify as desired.

# PROJECT is read from the active gcloud configuration; the command output is
# stripped of surrounding whitespace and decoded from bytes to str.
gcloud_project_cmd = ["gcloud", "config", "get-value", "project"]
PROJECT = subprocess.check_output(gcloud_project_cmd).strip().decode()

## Setup Authorization

If you are using a service account, run:
%%bash

# Activate the service account provided by Kubeflow.
# The variable is quoted so that a key-file path containing spaces or glob
# characters is passed to gcloud as a single argument.
gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"

If you are using user credentials, run:

gcloud auth application-default login

In [34]:
# Quoted "YYYYMM" strings, one per month to query: all twelve months of 2018
# followed by January through April of 2019.
month_spans = [("2018", range(1, 13)), ("2019", range(1, 5))]
months = [
    "\"{0}{1:02}\"".format(year, month)
    for year, month_range in month_spans
    for month in month_range
]
months

['"201801"',
 '"201802"',
 '"201803"',
 '"201804"',
 '"201805"',
 '"201806"',
 '"201807"',
 '"201808"',
 '"201809"',
 '"201810"',
 '"201811"',
 '"201812"',
 '"201901"',
 '"201902"',
 '"201903"',
 '"201904"']

In [100]:

# Count kubeflow pull requests opened per day across the selected monthly
# GitHub Archive tables, newest day first.
#
# The aggregation is done in a single SELECT: the previous version wrapped the
# rows in a subquery with its own ORDER BY, which was redundant because the
# outer query re-sorts the grouped result anyway.
#
# To query only the current month instead, replace the suffix filter with:
#   _TABLE_SUFFIX IN (Format_Date("%Y%m", CURRENT_DATE()))
query = """
SELECT
  DATE(created_at) AS pr_date,
  COUNT(*) AS prs
FROM `githubarchive.month.*`
WHERE
  _TABLE_SUFFIX IN ({0})
  AND type = 'PullRequestEvent'
  AND org.login = 'kubeflow'
  AND JSON_EXTRACT(payload, '$.action') IN ('"opened"')
GROUP BY pr_date
ORDER BY pr_date DESC
""".format(",".join(months))

# `query` is already a str, so it is passed to read_gbq directly.
data = gbq.read_gbq(query, dialect='standard', project_id=PROJECT)
data

Unnamed: 0,pr_date,prs
0,2019-04-30,29
1,2019-04-29,17
2,2019-04-28,3
3,2019-04-27,10
4,2019-04-26,27
5,2019-04-25,28
6,2019-04-24,19
7,2019-04-23,29
8,2019-04-22,26
9,2019-04-21,4


In [101]:
import pandas as pd
# Per-day PR counts as a Series indexed by pr_date, sorted by index
# (the query returns the rows newest-first).
counts = pd.Series(data["prs"].values, index=data["pr_date"]).sort_index()

In [103]:
# Rolling 28-day total of opened PRs.
#
# rolling('28d') only emits a value for dates that are present in the index,
# so a day on which no PR was opened would have no entry in the result at all.
# asfreq('D', fill_value=0) first inserts every missing calendar day with a
# count of 0, so the rolling sum yields one value per day.
# Note: the first 27 days of the range are sums over a partial window.
d = counts.asfreq('D', fill_value=0).rolling('28d').sum()

In [122]:
# Use plotly cufflinks to plot data frames
# https://plot.ly/ipython-notebooks/cufflinks/
# instructions for offline plotting
# https://plot.ly/python/getting-started/#initialization-for-offline-plotting
#
# Follow the instructions for online plotting:
# https://plot.ly/python/getting-started/
# You will need to setup an account
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf
#from importlib import reload
import itertools

# Chart styling for the rolling PR count: a title and a large monospace font.
pr_chart_font = {'family': 'Courier New, monospace', 'size': 18, 'color': 'black'}
layout = go.Layout(title='New PRs Last 28 Days', font=pr_chart_font)

# Draw the rolling series as a scatter (line) chart with a line width of 5.
d.iplot(kind='scatter', width=5, filename='project-stats', layout=layout)


## Unique PR Creators

In [106]:
# One row per opened kubeflow pull request: the day it was created plus the
# id and login of the actor who opened it.
opened_prs_sql = """
SELECT
    DATE(created_at) AS pr_date,
    actor.id,
    actor.login
  FROM `githubarchive.month.*`
  WHERE
    _TABLE_SUFFIX IN ({0})
    AND type = 'PullRequestEvent'
    AND org.login = 'kubeflow'
    AND JSON_EXTRACT(payload, '$.action') IN ('"opened"')
"""
# Comma-separated, already-quoted month suffixes for the IN (...) filter.
suffix_list = ",".join(months)
query = opened_prs_sql.format(suffix_list)

prs = gbq.read_gbq(query, dialect='standard', project_id=PROJECT)

In [107]:
# Actor ids indexed by the date each PR was opened, sorted by that date.
author_ids = prs["id"].values
opened_on = prs["pr_date"]
p = pd.Series(data=author_ids, index=opened_on).sort_index()


In [108]:
# Rolling count of distinct PR authors over a time-based 28-day window.
# Background on rolling unique counts:
# https://stackoverflow.com/questions/46470743/how-to-efficiently-compute-a-rolling-unique-count-in-a-pandas-time-series

def _distinct_count(window):
    """Return the number of distinct values in one rolling window."""
    return pd.Series(window).nunique()

creators = p.rolling('28d').apply(_distinct_count)





In [120]:
# Chart styling for the unique-author series: a title and a large monospace font.
author_chart_font = {'family': 'Courier New, monospace', 'size': 18, 'color': 'black'}
layout = go.Layout(title='Unique PR Authors Last 28 Days', font=author_chart_font)

# Draw the rolling distinct-author count as a scatter (line) chart, line width 5.
creators.iplot(kind='scatter', layout=layout, width=5, filename='creator-stats')


In [116]:
# Exploratory: print the signature and docs of the cufflinks `iplot` method
# to see which plotting options it accepts. Not needed to produce the charts.
help(creators.iplot)

Help on method _iplot in module cufflinks.plotlytools:

_iplot(kind='scatter', data=None, layout=None, filename='', sharing=None, title='', xTitle='', yTitle='', zTitle='', theme=None, colors=None, colorscale=None, fill=False, width=None, dash='solid', mode='', interpolation='linear', symbol='circle', size=12, barmode='', sortbars=False, bargap=None, bargroupgap=None, bins=None, histnorm='', histfunc='count', orientation='v', boxpoints=False, annotations=None, keys=False, bestfit=False, bestfit_colors=None, mean=False, mean_colors=None, categories='', x='', y='', z='', text='', gridcolor=None, zerolinecolor=None, margin=None, labels=None, values=None, secondary_y='', secondary_y_title='', subplots=False, shape=None, error_x=None, error_y=None, error_type='data', locations=None, lon=None, lat=None, asFrame=False, asDates=False, asFigure=False, asImage=False, dimensions=None, asPlot=False, asUrl=False, online=None, **kwargs) method of pandas.core.series.Series instance
           Returns