# Analysing mybinder.org launches

The first few cells download and clean up the data. The later cells answer questions such as which repositories are the most popular.

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import datetime
import pandas as pd

In [2]:
# Load the archive index (JSON Lines: one record per line). Each row is
# iterated as one `day` below and its `name` field is used to build the URL
# of that day's events file.
index = pd.read_json("https://archive.analytics.mybinder.org/index.jsonl",
                     lines=True)

In [None]:
# get all days since start of 2019
now = datetime.datetime.now()
n = (now - datetime.datetime(2019, 1, 1)).days

# small range for previewing
#n = 30

# Walk the index from its last row backwards and download one events file
# per row. Up to n + 1 files are fetched (the stop condition has always been
# "more than n frames collected"), and fewer if the index is shorter.
frames = []
for _, day in index.sort_index(ascending=False).head(n + 1).iterrows():
    frames.append(
        pd.read_json("https://archive.analytics.mybinder.org/{}".format(day['name']),
                     lines=True)
    )

# Report what was actually downloaded rather than the requested n.
print(f"Fetched data for {len(frames)} days.")

In [None]:
# Stack the per-day frames into one table. Every daily frame carries its own
# 0..k row labels, so renumber the rows to avoid duplicate index labels.
df = pd.concat(frames, ignore_index=True)

In [None]:
# make it easier to grab the ref
def get_repo(spec):
    """Return the repository part of a launch spec.

    Everything after the last "/" is discarded, and a trailing ".git" on
    what remains is stripped. A spec without any "/" is returned as-is
    (apart from the ".git" stripping).
    """
    repo, *_ = spec.rsplit("/", 1)
    return repo[:-4] if repo.endswith('.git') else repo
# Derive three helper columns from the raw spec string:
#   repo - the spec without its last "/"-separated part (and without ".git")
#   org  - everything before the first "/"
#   ref  - the last "/"-separated part
df['repo'] = df['spec'].apply(get_repo)
df['org'] = df['spec'].apply(lambda spec: spec.partition("/")[0])
df['ref'] = df['spec'].apply(lambda spec: spec.rsplit("/", 1)[1])

In [None]:
# take a look at the data, does it look sensible?
# (sample(10) draws 10 random rows, so the output changes from run to run)
df.sample(10)

In [None]:
# Drop the columns that none of the cells below read; `spec` has already been
# split into the repo/org/ref columns.
df = df.drop(columns=['schema', 'version', 'spec', 'status'])

In [None]:
# preview the first rows of the slimmed-down frame
df.head()

## Monthly, weekly, daily active repos

In [None]:
# Index by timestamp and sort chronologically. The daily files were fetched
# newest-first, so the raw index is not monotonic, and label slicing such as
# df_.loc[start:end] needs a monotonic DatetimeIndex when the bounds are not
# exact index values.
df_ = df.set_index("timestamp").sort_index()

In [None]:
def n_active_repos(period=30, frame=None):
    """Unique active repos over a sliding window of `period` days

    With period=30 this gives monthly active repos
    With period=1 you get daily active repos

    The first window starts on 2019-01-01 and each following window starts
    one day later. Windows that would end after the current time are not
    evaluated. Both window bounds are inclusive, because pandas label
    slicing includes both endpoints.

    Parameters
    ----------
    period : int
        Window length in days.
    frame : pandas.DataFrame, optional
        Launches indexed by timestamp, with a `repo` column. Defaults to the
        notebook-level `df_`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `timestamp` (the end of each window) with a single
        `repos` column holding the number of distinct repos in that window.
        Empty when not even one full window fits between 2019-01-01 and now.
    """
    if frame is None:
        # Fall back to the notebook global, as this function always did.
        frame = df_
    if not frame.index.is_monotonic_increasing:
        # Label slicing with bounds that are not exact index values only
        # works on a sorted index (recent pandas raises KeyError otherwise).
        frame = frame.sort_index()

    now = datetime.datetime.now()
    start = datetime.datetime(2019, 1, 1)
    days_since_start = (now - start).days

    data = []

    for n in range(days_since_start):
        s = start + datetime.timedelta(days=n)
        e = start + datetime.timedelta(days=n + period)
        if e > now:
            break

        # all launches that fall inside the current window
        window = frame.loc[s.isoformat():e.isoformat()]
        data.append(dict(timestamp=e,
                         repos=window["repo"].nunique(dropna=False)))

    # Naming the columns explicitly keeps set_index working when `data` is
    # empty (from_records([]) has no "timestamp" column to index by).
    return pd.DataFrame(data, columns=["timestamp", "repos"]).set_index("timestamp")

In [None]:
# unique active repos over sliding 30-, 7- and 1-day windows
monthly_active = n_active_repos(30)
weekly_active = n_active_repos(7)
daily_active = n_active_repos(1)

In [None]:
# Line the three series up on their shared timestamps (inner merges on the
# index). The clashing `repos` columns of the first merge get suffixes; the
# weekly series then joins under its plain name.
activity = (
    monthly_active
    .merge(daily_active, left_index=True, right_index=True,
           suffixes=("_monthly", "_daily"))
    .merge(weekly_active, left_index=True, right_index=True)
)

# column order after the merges is: monthly, daily, weekly
activity.columns = ['30day active', '1d active', '7d active']
activity.plot();

## Total launches

In [None]:
# Sneak peek: total launches!
# shape is (rows, columns); the row count is the figure of interest here
df.shape

## Launches per day

In [None]:
# Bucket the rows into calendar days; count() gives, per day, the number of
# non-null entries in each remaining column.
daily = df.set_index("timestamp").resample('D').count()

In [None]:
# plot the per-day count of the `repo` column, i.e. launches per day
daily['repo'].plot()

## Estimate number of unique repositories

Expect the raw number of launches to be larger than the number of unique repositories. Repositories launched only once might have been accidents, so we also count the repositories that were launched more than three times.

In [None]:
# number of distinct repositories that were launched at least once
df['repo'].nunique(dropna=False)

In [None]:
from collections import Counter

# number of repositories with more than three launches
sum(1 for launches in Counter(df.repo).values() if launches > 3)

## Popular repositories and their branches

Twenty most popular repos:

In [None]:
# Per-repo counts (count() tallies the non-null entries of every column),
# ordered by the `timestamp` count, largest first, keeping the top twenty.
top20 = (
    df.groupby("repo")
    .count()
    .sort_values("timestamp", ascending=False)
    .head(20)
)
top20

In [None]:
# the `ref` count column of top20 holds the number of launches per repo
print("Cumulative top20 launches:", top20['ref'].sum())
print(
    "The top20 repos are {:.1f}% of all "
    "launches.".format(100 * top20['ref'].sum() / len(df))
)

## Per org stats

Note: this section currently needs more RAM than mybinder.org provides.

In [None]:
# lookup table with the total number of launches per repo
totals_per_repo = df.groupby("repo").size().reset_index(name='repo_counts')

In [None]:
# lookup table with the total number of launches per org
totals_per_org = df.groupby("org").size().reset_index(name='org_counts')

In [None]:
# Attach the per-repo and per-org totals to every launch row.
# NOTE: this rebinds df_, which until here was the timestamp-indexed frame
# that n_active_repos reads by default.
df_ = (
    df.merge(totals_per_repo, on='repo')
      .merge(totals_per_org, on='org')
)

In [None]:
# launches per provider, most used provider first
(df.groupby("provider")
   .size()
   .reset_index(name='Launches')
   .sort_values('Launches', ascending=False))

In [None]:
# Launches per (org, repo, ref). The two totals are constant within their
# org/repo, so grouping by them as well only carries them into the result.
(
    df_.groupby(["org", "repo", "ref", "repo_counts", "org_counts"])
    .size()
    # give the column a nice name
    .reset_index(name='ref_counts')
    # busiest org first, then busiest repo within it, then busiest ref
    .sort_values(['org_counts', 'repo_counts', 'ref_counts'], ascending=False)
    .set_index(["org", 'repo', 'ref'])
)