In [83]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask import dataframe as dd 
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
sns.set_style("whitegrid")
from pprint import pprint
import sqlite3
import yaml
import re
from datetime import datetime
from datetime import timezone
import maya
from collections import Counter
from glob import glob
import ast
import json
import cProfile
import time
from sklearn.preprocessing import StandardScaler

try: # for pip >= 10
    from pip._internal.req import parse_requirements
except ImportError: # for pip <= 9.0.3
    from pip.req import parse_requirements

from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

import networkx as nx
import networkx.algorithms.community as nxcom

import bokeh.io
from bokeh.io import output_file, show
from bokeh.resources import INLINE
from bokeh.models import (BoxSelectTool, Circle, EdgesAndLinkedNodes, HoverTool,
                          MultiLine, NodesAndLinkedEdges, Plot, Range1d, TapTool,
                         BoxZoomTool, ResetTool, OpenURL, CustomJS, Column, SaveTool)
from bokeh.palettes import Spectral4
from bokeh.plotting import figure, output_notebook
from bokeh.models.graphs import from_networkx
from bokeh.models import TextInput, Button

from scipy.spatial.distance import cosine

%matplotlib inline

# Code for hiding seaborn warnings
import warnings
warnings.filterwarnings("ignore")

## Easier navigation:

------------------------------------

### 1. <a href='#setup'>Setup</a>
------------------------------------

## Models: 
### 2. <a href='#naive'>Naive</a>
### 3. <a href='#lru'>Least Recently Used (LRU)</a>
### 4. <a href='#landlord'>Basic LANDLORD</a>
### 5. <a href='#landlordplus'>LANDLORD++</a>

<a name='setup'></a>
## 1. Setup

<br>

In [3]:
con = sqlite3.connect('binder.sqlite')
launches_df = pd.read_sql('SELECT * FROM events', con)

In [4]:
with open('pentagon_df.txt') as f:
    reloaded_example = json.load(f)
q_df = pd.read_json(reloaded_example)

In [42]:
dep_df = pd.read_hdf('dependency_table_final.h5', 'df')
dep_df = dep_df.rename(columns=dep_df.loc["ref"]).drop("ref")

------------------------------------------------

In [6]:
launches_df["combined_ref"] = launches_df['ref'].fillna('') + launches_df['guessed_ref'].fillna('')
launches_df

Unnamed: 0,version,timestamp,provider,spec,origin,ref,guessed_ref,combined_ref
0,1,2018-11-03T00:00:00+00:00,GitHub,Qiskit/qiskit-tutorial/master,,,27d67cc4485ebfebae2b36b4856f1cea9fc7f693,27d67cc4485ebfebae2b36b4856f1cea9fc7f693
1,1,2018-11-03T00:00:00+00:00,GitHub,ipython/ipython-in-depth/master,,,7e5ce96cc9251083979efdfc393425f1229a4a68,7e5ce96cc9251083979efdfc393425f1229a4a68
2,1,2018-11-03T00:00:00+00:00,GitHub,QISKit/qiskit-tutorial/master,,,27d67cc4485ebfebae2b36b4856f1cea9fc7f693,27d67cc4485ebfebae2b36b4856f1cea9fc7f693
3,1,2018-11-03T00:01:00+00:00,GitHub,QISKit/qiskit-tutorial/master,,,27d67cc4485ebfebae2b36b4856f1cea9fc7f693,27d67cc4485ebfebae2b36b4856f1cea9fc7f693
4,1,2018-11-03T00:01:00+00:00,GitHub,jupyterlab/jupyterlab-demo/master,,,79184a07d47cf03787b39cfc345da98794d76554,79184a07d47cf03787b39cfc345da98794d76554
...,...,...,...,...,...,...,...,...
18230449,4,2021-06-06T23:58:00+00:00,GitHub,jupyterlab/jupyterlab-demo/master,gke.mybinder.org,5a5eb6bb04250b199a1cbb529e744075216a17a5,,5a5eb6bb04250b199a1cbb529e744075216a17a5
18230450,4,2021-06-06T23:58:00+00:00,GitHub,jupyterlab/jupyterlab-demo/master,gke.mybinder.org,5a5eb6bb04250b199a1cbb529e744075216a17a5,,5a5eb6bb04250b199a1cbb529e744075216a17a5
18230451,4,2021-06-06T23:58:00+00:00,GitHub,jupyterlab/jupyterlab-demo/master,gke.mybinder.org,5a5eb6bb04250b199a1cbb529e744075216a17a5,,5a5eb6bb04250b199a1cbb529e744075216a17a5
18230452,4,2021-06-06T23:58:00+00:00,GitHub,jupyterlab/jupyterlab-demo/master,gke.mybinder.org,5a5eb6bb04250b199a1cbb529e744075216a17a5,,5a5eb6bb04250b199a1cbb529e744075216a17a5


In [7]:
q_df.head()

Unnamed: 0,dependencies,total size,size_full,Contributors,Forks,Open Issues/PRs,SourceRank,Stars,Total releases,Version Number,...,dependencies full,Incoming Dep,Outgoing Dep,Degree,Authority,size,dependencies drained,dependencies drained count,Observed Outgoing Dep,time
cutecharts,[jinja2],1.296,11M /home/ubuntu/ve/randomenv/lib/python3.6/si...,2.0,56.0,2.0,11.0,503.0,3.0,"[1.2.0, 1.1.0, 1.0.0]",...,"[markupsafe, jinja2]",0,2,0.000431,0.0,0.188,[],0,2,0.396333
postgres,"[psycopg2-binary, psycopg2-pool]",0.144,11M /home/ubuntu/ve/randomenv/lib/python3.6/si...,6.0,17.0,16.0,12.0,55.0,12.0,"[3.0.0, 2.2.2, 2.2.1, 2.2.0, 2.1.2, 2.1.1, 2.1...",...,"[psycopg2-pool, psycopg2-binary]",0,2,0.000431,0.0,0.144,[],0,2,0.408333
gpustat,"[six, nvidia-ml-py3, psutil, blessings]",1.864,11M /home/ubuntu/ve/randomenv/lib/python3.6/si...,10.0,202.0,23.0,16.0,2479.0,11.0,"[1.0.0b1, 0.6.0, 0.5.0, 0.4.1, 0.4.0, 0.3.2, 0...",...,"[psutil, blessings, six, nvidia-ml-py3]",1,4,0.001078,1.7e-05,0.088,[],0,4,1.8506
df2gspread,"[argparse, google-api-python-client, gspread, ...",82.632,47M /home/ubuntu/ve/randomenv/lib/python3.6/si...,6.0,26.0,12.0,10.0,117.0,21.0,"[1.0.4, 1.0.3, 1.0.2, 1.0.1, 1.0.0, 0.2.5, 0.0...",...,"[idna, google-api-core, certifi, google-api-py...",0,31,0.006684,0.0,0.0,[],0,31,0.822531
ddeint,"[numpy, scipy]",87.04,62M /home/ubuntu/ve/randomenv/lib/python3.6/si...,,,,4.0,,4.0,"[0.2, 0.1.02, 0.1.01, 0.1.0]",...,"[scipy, numpy]",0,2,0.000431,0.0,0.04,[],0,2,1.020333


In [43]:
dep_df

Unnamed: 0,ab1be186a74c81c83f863bc3b6258fee46eeca56,6464ab1b75787590096275bf36b6e3035f23d2ab,f4d256f2e4e8688261871073f9e7f87cacd7486c,562d55fbc439fbf936f32fcb9e09fbfa1f145936,770ff3fac88cd8e4b49876a551514d3c3028740e,9420f894ac5aa0e37fef9141a05a7269553a08c3,e6d48e9afa449d6c6db8ed3e3b50e456a05bec5e,138564ad1b0d7282cc69aed1a1d98bd32d504e9f,0a97c8c82041738713463e121f355e29b076883b,afaed6cfbbbb5f83af2b4e55c2bc9c9d95e3f71d,...,0d7e825f6bcfb303c5ccc287c6ed47561407a822,900ebf102d4173a75d92d333fa1581dc158504bc,ea6101702573150f4815ce19ffa678b95b3249cb,77f5eb02f52fd186d86337e8a503109dcc52a4bc,72908049bc4a7968b148c2cff32a8a37a82f74da,c4d2c75d1807a1d1189b84bd6f4a0aafca5b8c53,959514fb6451184747ee0d26dc359fd87c1d6447,24698dfd180e2bb0de1805a8ce2040ba914d70df,3e4c9af0dfdd2be3ebbd7f71ad9f463db9b915d4,729d216ddfce5e20901b127e357c8f9b51185965
numpy,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
pandas,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
fuzzywuzzy,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
bs4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
geopandas,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
imdb-cli-tool,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pyforest,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hpbandster,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pyswip,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


------------------------------------------------

Similarity between two recipes/columns:

In [9]:
def cosine_sim(c1, c2):
    return 1 - cosine(c1, c2)

Combine two recipes/columns:

In [10]:
def combine_col(c1, c2):
    return 1 - (1 - c1) * (1 - c2)

Check if all of c1 is contained in c2:

In [11]:
def contains_image(c1, c2):
    # Previous command that resulted in huge runtime increase
    # set(c1[c1 == 1].index)
    
    s = set(c1.to_numpy().nonzero()[0])
    image = set(c2.to_numpy().nonzero()[0])
    
    is_subset = s.issubset(image)
    return is_subset
    

Container class:

In [12]:
class Container:
    def __init__(self, size, time):
        self.size = size
        self.time = time
    

Convert launches_db's timestamps from string to a unix int to match the version time column from q_df

In [29]:
def str2date2unix(s):
    dt = maya.parse(s).datetime()
    return dt.replace(tzinfo=timezone.utc).timestamp()

Note: a lot of launches won't be used - specs only has 34400 unique refs

In [65]:
len(launches_df["combined_ref"].unique())

233086

<a name='naive'></a>
## 2. Naive

<br>

In [82]:
def MODEL_naive():
    start = time.time()
    count = 0
    containers = []
    
    for index, row in launches_df[0:1000].iterrows():
        ref = row["combined_ref"]
        timestamp = str2date2unix(row["timestamp"])
        try:
            dep_list = dep_df[ref]
            count += 1
            
            q_df_equiv = q_df.loc[[a for a in 
                                       dep_df.iloc[dep_list.to_numpy().nonzero()[0]].index if a in q_df.index]]
            
            try:
                pass
            except:
                print("hi")
                pass
            
            
        except:
            pass
        
    
        
    end = time.time()
    print(end - start)
    return count

MODEL_naive()

12.683371305465698


115

In [58]:
launches_df[-1000:]

Unnamed: 0,version,timestamp,provider,spec,origin,ref,guessed_ref,combined_ref
18229454,4,2021-06-06T21:52:00+00:00,GitHub,explosion/spacy-io-binder/spacy.io,ovh.mybinder.org,59ccd22305599cdc87ba38f6af8049213a96489e,,59ccd22305599cdc87ba38f6af8049213a96489e
18229455,4,2021-06-06T21:52:00+00:00,GitHub,john-adeojo/AdvancedRegression/bc005403eba5e1a...,ovh.mybinder.org,bc005403eba5e1a12f2e2560493f48b635875d25,,bc005403eba5e1a12f2e2560493f48b635875d25
18229456,4,2021-06-06T21:52:00+00:00,GitHub,john-adeojo/AdvancedRegression/bc005403eba5e1a...,ovh.mybinder.org,bc005403eba5e1a12f2e2560493f48b635875d25,,bc005403eba5e1a12f2e2560493f48b635875d25
18229457,4,2021-06-06T21:52:00+00:00,GitHub,ipython/ipython-in-depth/master,gke.mybinder.org,7e5ce96cc9251083979efdfc393425f1229a4a68,,7e5ce96cc9251083979efdfc393425f1229a4a68
18229458,4,2021-06-06T21:52:00+00:00,GitHub,Dimoooooooooon/fastai-mybinder/HEAD,gke.mybinder.org,f34fee350abaa82add8a2625ae649695b39f38d3,,f34fee350abaa82add8a2625ae649695b39f38d3
...,...,...,...,...,...,...,...,...
18230449,4,2021-06-06T23:58:00+00:00,GitHub,jupyterlab/jupyterlab-demo/master,gke.mybinder.org,5a5eb6bb04250b199a1cbb529e744075216a17a5,,5a5eb6bb04250b199a1cbb529e744075216a17a5
18230450,4,2021-06-06T23:58:00+00:00,GitHub,jupyterlab/jupyterlab-demo/master,gke.mybinder.org,5a5eb6bb04250b199a1cbb529e744075216a17a5,,5a5eb6bb04250b199a1cbb529e744075216a17a5
18230451,4,2021-06-06T23:58:00+00:00,GitHub,jupyterlab/jupyterlab-demo/master,gke.mybinder.org,5a5eb6bb04250b199a1cbb529e744075216a17a5,,5a5eb6bb04250b199a1cbb529e744075216a17a5
18230452,4,2021-06-06T23:58:00+00:00,GitHub,jupyterlab/jupyterlab-demo/master,gke.mybinder.org,5a5eb6bb04250b199a1cbb529e744075216a17a5,,5a5eb6bb04250b199a1cbb529e744075216a17a5


In [79]:
xd = dep_df["ab1be186a74c81c83f863bc3b6258fee46eeca56"]
xd

numpy            1.0
pandas           1.0
fuzzywuzzy       1.0
bs4              1.0
geopandas        1.0
                ... 
imdb-cli-tool    0.0
pyforest         0.0
hpbandster       0.0
pyswip           0.0
gputil           0.0
Name: ab1be186a74c81c83f863bc3b6258fee46eeca56, Length: 5190, dtype: object

In [81]:
q_df[["size"]].loc[[a for a in dep_df.iloc[xd.to_numpy().nonzero()[0]].index if a in q_df.index]]

Unnamed: 0,size
numpy,25.0
pandas,47.0
fuzzywuzzy,0.084
bs4,0.856
geopandas,2.7
pycountry,30.0
matplotlib,35.0
seaborn,2.5
bokeh,59.0
sklearn,82.0
