# Clean and summarize mybinder.org traffic

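This notebook cleans up a Google Analytics export of mybinder.org traffic, totals page views per repository and per page, and drops repositories and pages with fewer than 5 page views.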

In [1]:
from urllib.parse import urlparse

import pandas as pd

In [2]:
# Google Analytics "Pages" export to process. The last space-separated token
# of the name (the date range) is reused to name the output CSV files.
fname = 'Analytics All Web Site Data Pages 20180820-20180918.csv'

In [3]:
# Date range tag: final whitespace-delimited token of the file name, without '.csv'
date_range = fname.rsplit(maxsplit=1)[-1].replace('.csv', '')

There is junk at the start and the end of the Google Analytics export.

In [4]:
# Show the first lines of the raw export; IPython expands $fname to the Python variable
!head "$fname"

# ----------------------------------------
# All Web Site Data
# Pages
# 20180820-20180918
# ----------------------------------------

Page,Page Views,Unique Page Views,Avg. Time on Page,Entrances,Bounce Rate,% Exit,Page Value
/v2/gh/ipython/ipython-in-depth/master?filepath=binder/Index.ipynb,"141,510","135,235",00:00:03,"135,234",99.93%,95.56%,US$0.00
/v2/gh/jupyterlab/jupyterlab-demo/master?urlpath=lab/tree/demo/Lorenz.ipynb,"76,870","76,851",00:00:22,"76,850",99.98%,99.97%,US$0.00
/,"19,199","19,168",00:01:13,"19,168",99.84%,99.84%,US$0.00


In [5]:
# Copy only the column header and the "/v2/" page rows into data.csv,
# leaving out the comment banner and every other line of the export.
# The input is opened first so that a missing export does not leave an
# empty data.csv behind.
with open(fname) as raw_export, open('data.csv', 'w') as output:
    for line in raw_export:
        line = line.strip()
        # "Page..." is the header row; "/v2/..." rows are the page hits we keep
        if line.startswith(("Page", "/v2/")):
            output.write(line + '\n')

In [6]:
# check file ends with sensible lines of page visits
# (i.e. no leftover non-page lines from the end of the export)
!tail data.csv

/v2/git/https://github.uconn.edu/rcc02007/ME3263_Lab-0.git/f25072f2e708c231ea05040cab6aae2699a7be6f?urlpath=https://github.uconn.edu/rcc02007/ME3263_Lab-0/blob/master/ME3263_lab-00.ipynb,1,1,00:00:00,1,100.00%,100.00%,US$0.00
/v2/git/https://gitlab.oceantrack.org/otndc/resonate-glatos-workshop/0acb1b3ca3e97a87bccd826ed43712d493964cc4?urlpath=rstudio,1,1,00:00:00,1,100.00%,100.00%,US$0.00
/v2/git/https://gitlab.oceantrack.org/otndc/resonate-glatos-workshop/7ca5da70cc102ed14709f379e30b70c10e5b4fa3,1,1,00:00:00,1,100.00%,100.00%,US$0.00
/v2/git/Mybinder@wanderer0927.synology.me:/volume1/github/demo02-Qt4+pyserial:asdfghjkl/49743082ad8c5dcff8bac57a71d0fe5147e3e196,1,1,00:00:00,1,100.00%,100.00%,US$0.00
/v2/gl/https://git.ecdf.ed.ac.uk/jmorton5/data_science_intro/874bb80ab45f051126ff48176ac24afa3bf415ff,1,1,00:00:00,1,100.00%,100.00%,US$0.00
/v2/gl/jumson/pinode_notes/master?urlpath=lab,1,1,00:00:00,1,100.00%,100.00%,US$0.00
/v2/gl/ktiwari9/gaussian-process/master?filepath=GP_1D.ipynb

In [7]:
# Load the filtered rows written to data.csv into a DataFrame
df = pd.read_csv("data.csv")

In [8]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0,Page,Page Views,Unique Page Views,Avg. Time on Page,Entrances,Bounce Rate,% Exit,Page Value
0,/v2/gh/ipython/ipython-in-depth/master?filepat...,141510,135235,00:00:03,135234,99.93%,95.56%,US$0.00
1,/v2/gh/jupyterlab/jupyterlab-demo/master?urlpa...,76870,76851,00:00:22,76850,99.98%,99.97%,US$0.00
2,/v2/gh/bokeh/bokeh-notebooks/master?filepath=t...,7252,7251,00:00:05,7251,99.99%,99.99%,US$0.00
3,/v2/gh/binder-examples/r/master?filepath=index...,5571,5567,00:00:02,5567,99.93%,99.93%,US$0.00
4,/v2/gh/QuantStack/xeus-cling/stable?filepath=n...,4182,4182,00:00:00,4182,100.00%,100.00%,US$0.00


In [9]:
# Path is the page URL without its query string (e.g. without "?filepath=...").
df['Path'] = df['Page'].map(lambda page: urlparse(page).path, na_action='ignore')
# Repo is the path with its last "/"-separated component removed
# (the ref in URLs such as ".../master").
df['Repo'] = df['Path'].map(lambda path: path.rsplit("/", maxsplit=1)[0], na_action='ignore')

# Strip thousands separators ("141,510" -> 141510). Going through str() keeps
# this cell re-runnable: values that are already integers pass through
# unchanged instead of raising AttributeError on .replace().
df['Page Views'] = df['Page Views'].map(lambda views: str(views).replace(',', '')).astype(int)

In [10]:
# Check the new Path and Repo columns on the first rows
df.head()

Unnamed: 0,Page,Page Views,Unique Page Views,Avg. Time on Page,Entrances,Bounce Rate,% Exit,Page Value,Path,Repo
0,/v2/gh/ipython/ipython-in-depth/master?filepat...,141510,135235,00:00:03,135234,99.93%,95.56%,US$0.00,/v2/gh/ipython/ipython-in-depth/master,/v2/gh/ipython/ipython-in-depth
1,/v2/gh/jupyterlab/jupyterlab-demo/master?urlpa...,76870,76851,00:00:22,76850,99.98%,99.97%,US$0.00,/v2/gh/jupyterlab/jupyterlab-demo/master,/v2/gh/jupyterlab/jupyterlab-demo
2,/v2/gh/bokeh/bokeh-notebooks/master?filepath=t...,7252,7251,00:00:05,7251,99.99%,99.99%,US$0.00,/v2/gh/bokeh/bokeh-notebooks/master,/v2/gh/bokeh/bokeh-notebooks
3,/v2/gh/binder-examples/r/master?filepath=index...,5571,5567,00:00:02,5567,99.93%,99.93%,US$0.00,/v2/gh/binder-examples/r/master,/v2/gh/binder-examples/r
4,/v2/gh/QuantStack/xeus-cling/stable?filepath=n...,4182,4182,00:00:00,4182,100.00%,100.00%,US$0.00,/v2/gh/QuantStack/xeus-cling/stable,/v2/gh/QuantStack/xeus-cling


In [11]:
# Check the new Path and Repo columns on the last rows
df.tail()

Unnamed: 0,Page,Page Views,Unique Page Views,Avg. Time on Page,Entrances,Bounce Rate,% Exit,Page Value,Path,Repo
4121,/v2/gl/jumson/pinode_notes/master?urlpath=lab,1,1,00:00:00,1,100.00%,100.00%,US$0.00,/v2/gl/jumson/pinode_notes/master,/v2/gl/jumson/pinode_notes
4122,/v2/gl/ktiwari9/gaussian-process/master?filepa...,1,1,00:00:00,1,100.00%,100.00%,US$0.00,/v2/gl/ktiwari9/gaussian-process/master,/v2/gl/ktiwari9/gaussian-process
4123,/v2/gl/qianrumegrelate/meg,1,1,00:00:00,1,100.00%,100.00%,US$0.00,/v2/gl/qianrumegrelate/meg,/v2/gl/qianrumegrelate
4124,/v2/gl/SantiagoSantana/tiempo_real/binder?file...,1,1,00:00:00,1,100.00%,100.00%,US$0.00,/v2/gl/SantiagoSantana/tiempo_real/binder,/v2/gl/SantiagoSantana/tiempo_real
4125,/v2/gl/SantiagoSantana/tiempo_real/master,1,1,00:00:00,1,100.00%,100.00%,US$0.00,/v2/gl/SantiagoSantana/tiempo_real/master,/v2/gl/SantiagoSantana/tiempo_real


In [12]:
# Number of distinct repositories launched this period
# (dropna=False so a missing Repo is counted once, as with len(unique()))
df['Repo'].nunique(dropna=False)

2503

In [13]:
# (rows, columns) of the frame after adding Path and Repo
df.shape

(4126, 10)

## Per Repo Stats

In [14]:
# Total page views per repository. The column is selected before summing so
# that the non-numeric columns are not aggregated only to be thrown away.
repo_stats = df.groupby('Repo')['Page Views'].sum().reset_index()

In [15]:
# Preview the per-repository totals before any filtering
repo_stats.head()

Unnamed: 0,Repo,Page Views
0,/v2/gh/ 2gcpeixoto/lecture-ipynb,2
1,/v2/gh/2018-Arizona-Opportunity-Hack/npo-selec...,2
2,/v2/gh/3ptscience/steno3d-notebooks,1
3,/v2/gh/4QuantOSS/Augmentor,2
4,/v2/gh/4QuantOSS/DashIntro,292


In [16]:
# Keep only rows whose Repo path begins with "/v2/"; drop any other stray hits
repo_stats = repo_stats.loc[lambda stats: stats['Repo'].str.startswith("/v2/")]

In [17]:
# Drop repositories with fewer than 5 page views: rarely launched
# repositories are left out to protect their identity
launched_often = repo_stats['Page Views'].ge(5)
repo_stats = repo_stats[launched_often]

In [18]:
# Preview the totals that remain after filtering
repo_stats.head()

Unnamed: 0,Repo,Page Views
4,/v2/gh/4QuantOSS/DashIntro,292
5,/v2/gh/4QuantOSS/scijava-jupyter-kernel,59
9,/v2/gh/9735ccj/ipad-python3.git,22
12,/v2/gh/AM207/2018spring,7
13,/v2/gh/ASamarkRoth/gammalab_analysis,13


In [19]:
# Five most-viewed repositories
(
    repo_stats
    .sort_values('Page Views', ascending=False)
    .head(5)
)

Unnamed: 0,Repo,Page Views
1303,/v2/gh/ipython/ipython-in-depth,143174
1432,/v2/gh/jupyterlab/jupyterlab-demo,79239
722,/v2/gh/bokeh/bokeh-notebooks,7299
689,/v2/gh/binder-examples/r,5854
361,/v2/gh/QuantStack/xeus-cling,4267


In [20]:
# Save the per-repository totals, tagging the file name with the date range
repo_csv_name = 'repo-{}.csv'.format(date_range)
repo_stats.to_csv(repo_csv_name, index=False)

## Per Page Stats

In [21]:
# Per-page view counts: keep only the page URL and its number of views
page_stats = df[['Page', 'Page Views']]
# remove weird pages: drop rows with a missing value. Indexing with the
# boolean frame page_stats.notna() only masks cells and never removes rows,
# which would leave NaN in the startswith() mask below.
page_stats = page_stats.dropna()

# keep only pages under "/v2/"
page_stats = page_stats[page_stats['Page'].str.startswith("/v2/")]

# drop pages with fewer than 5 page views
page_stats = page_stats[page_stats['Page Views'] >= 5]

page_stats.sort_values(by='Page Views', ascending=False).head()

Unnamed: 0,Page,Page Views
0,/v2/gh/ipython/ipython-in-depth/master?filepat...,141510
1,/v2/gh/jupyterlab/jupyterlab-demo/master?urlpa...,76870
2,/v2/gh/bokeh/bokeh-notebooks/master?filepath=t...,7252
3,/v2/gh/binder-examples/r/master?filepath=index...,5571
4,/v2/gh/QuantStack/xeus-cling/stable?filepath=n...,4182


In [22]:
# Check the last rows of the filtered per-page stats
page_stats.tail()

Unnamed: 0,Page,Page Views
1352,/v2/gh/zhzhzh/python-notebooks/master?urlpath=...,5
1353,/v2/gist/darribas/4121857/master?filepath=guar...,5
1354,/v2/gist/mdhk/ad0725cf494385d699aef6d6c40131be...,5
1355,/v2/gl/jumson/pinode_notes/master,5
1356,/v2/gl/wichit2s/programmingfundamentals/master,5


In [23]:
# Save the per-page view counts, tagging the file name with the date range
page_csv_name = 'page-{}.csv'.format(date_range)
page_stats.to_csv(page_csv_name, index=False)