# Loading

Create a generator function for importing the file

In [2]:
import urllib.request
import lzma
import base64
import re

def get_dependencies():
    localfile, info = urllib.request.urlretrieve("https://github.com/ogirardot/meta-deps/raw/master/pypi-deps.csv.lzma")
    with lzma.open(localfile, mode='r') as file:
        for line in file:
            package, version, deps = line.decode("utf-8").split('\t')
            deps = base64.b64decode(deps).decode("utf-8")
            deps = eval(deps)
            regex = r'\"*([\w\.]+)[><]?\=?.*\"*'
            
            def cleanDep(s):
                try:
                    match = re.search(regex, s)
                    return match.group(0)
                except:
                    return None
            
            deps = [cleanDep(x) for x in deps if cleanDep(x) is not None]
            
            yield package, deps
        
        


Store as dataframe

In [3]:
import pandas as pd
import numpy as np

rows = np.array([[None,None]])
for package, deps in get_dependencies():
    # Packages with no dependencies get blank string
    if deps:
        args= [[package, dep] for dep in deps]
    else:
        args = [[package, None]]
    rows = np.append(rows, args, axis=0)

        
df = pd.DataFrame(rows, columns=["package", "dependency"]).drop_duplicates()
df.head()

Unnamed: 0,package,dependency
0,,
1,0x10c-asm,
2,2gis,
3,3to2,
4,3to2_py3k,


In [4]:
deps = pd.Series(df.dependency.unique())
packages = pd.Series(df.package.unique())
new_deps = deps[~deps.isin(packages)]
new_df = pd.DataFrame({'package': new_deps, 'dependency': [None]*len(new_deps)})
df = pd.concat([df, new_df])
df = df.dropna(subset=['package'])

In [5]:
from timeit import default_timer, timeit

## Redshift

In Redshift we create a single table that will hold one "edge" per row.

In [2]:
import redshift_connector
conn = redshift_connector.connect(
     host='<PUT URL HERE>',
     database='<PUT DB NAME HERE>',
     user='awsuser',
     password='<PUT PASSWORD HERE>'
  )
conn.autocommit = True
cursor = conn.cursor()

Create the table

In [7]:
cursor.execute("DROP TABLE dependencies")
cursor.execute(
"""
    CREATE TABLE dependencies (
        package VARCHAR(150),
        dependency VARCHAR(150) NULL
    )
"""
)

<redshift_connector.cursor.Cursor at 0x7f61fbde1750>

Load the data

In [24]:
from itertools import product

# Time it
start = default_timer()

#cursor.write_dataframe(df, "dependencies")
cursor.executemany("INSERT INTO dependencies VALUES (%s, %s)", df.itertuples(index=False, name=None))

end = default_timer()
redshift_load_time = end - start
redshift_load_time

KeyboardInterrupt: 

# Neptune Graph

Clear existing data from graph

In [9]:
%%gremlin

g.V().drop()

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

Load in nodes and edges

In [10]:
from gremlin_python import statics
from gremlin_python.structure.graph import Graph
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.traversal import T
from gremlin_python.process.traversal import Order
from gremlin_python.process.traversal import Scope

In [11]:
def load_nodes(g, df):
    for package_name in df.package.unique():
        try:
            g.add_v('package').property(T.id, package_name).as_(package_name).next()
            #print("added package: " + package_name)
        except:
            print(f"Failed to add package {package_name}")

def load_edges(g, df):
    for index, row in df.iterrows():
        if row['dependency'] is not None:
            g.V(row['package']).add_e("depends_on").to(__.V(row['dependency'])).next()
            #print("Added dependency: " + row['package'] + "->" + row['dependency'])



In [12]:
remoteConnStr = 'wss://<PUT URL HERE>:8182/gremlin'
remoteConn = DriverRemoteConnection(remoteConnStr,'g')

g = Graph().traversal().withRemote(remoteConn)

start = default_timer()

load_nodes(g, df)
load_edges(g, df)

end = default_timer()
remoteConn.close()

neptune_load_time = end - start
neptune_load_time

244.29565478400036

In [13]:
%%gremlin
g.V()
g.E()

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

# Queries

## Helper Funcs

In [11]:
def timeRedshiftQuery(sqlText):
    cursor = conn.cursor()
    f = lambda : cursor.execute(sqlText)
    return timeit("f()")

def timeNeptuneQuery(f):
    g = Graph().traversal().withRemote(remoteConn)
    return timeit.timeit("f(g)")

## 1: Which package has the most direct dependencies and how many?

### Redshift

In [1]:
cursor = conn.cursor()
query = """
    SELECT TOP 1 *
    FROM 
    (
        SELECT package, count(*) as "dep_count"
        FROM dependencies
        WHERE dependency is not NULL
        GROUP BY package
    ) DepCounts
    ORDER BY dep_count DESC, package ASC

"""
cursor.execute(query)
cursor.fetchall()

NameError: name 'conn' is not defined

In [None]:
timeRedshiftQuery(query)

### Neptune

In [45]:
def query(g):
    g.V().hasLabel('package').inE('depends_on'). \
    groupCount().by('package').as_('group'). \
    order(Scope.local). \
        by('values', Order.desc). \
        by('keys', Order.asc). \
    unfold().limit(1).toList()

In [20]:
%%gremlin

g.V().hasLabel('package').in('depends_on').
groupCount().by(id).
order(Scope.local).
    by(values, Order.desc).
    by(keys, Order.asc).
unfold().limit(1)

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

# 3: Which packages depend on Pandas (directly and indirectly)?

## Redshift

In [61]:
query = """
    WITH RECURSIVE panda_deps(package, dependency, level) AS 
    (SELECT package, dependency, 0 AS level 
     FROM dependencies
     WHERE dependency = 'numpy'
     UNION ALL
     SELECT d.package, d.dependency, level + 1
     FROM dependencies d, panda_deps p
     WHERE d.dependency = p.package
    )
    SELECT package FROM panda_deps;

"""
cursor.execute(query)
cursor.fetchall()

(['ADAM-Tools'],
 ['BiologicalProcessNetworks'],
 ['dcmt'],
 ['django-instakit'],
 ['fastinterval'],
 ['gyroid'],
 ['hdf5-django'],
 ['lmj.c3d'],
 ['lmj.nethack'],
 ['lmj.rbm'],
 ['nplook'],
 ['nwalign'],
 ['oceans'],
 ['ocupy'],
 ['PyBUFR'],
 ['pyorbital'],
 ['python-consume'],
 ['pyucsc'],
 ['SimpleHist'],
 ['smith'],
 ['pyucsc'],
 ['canvas'],
 ['chebpy'],
 ['fitsio'],
 ['gceising'],
 ['gsw'],
 ['h5dj'],
 ['his2h5'],
 ['linguistic-helper-functions'],
 ['lmj.kohonen'],
 ['lmj.particle'],
 ['lmj.plot'],
 ['lmj.pursuit'],
 ['metaseq'],
 ['multichain_mcmc'],
 ['pyRserve'],
 ['readfmf'],
 ['RTM'],
 ['topzootools'],
 ['zipline'])

In [64]:
timeRedshiftQuery(query)

NameError: name 'f' is not defined

# Neptune