# Loading

Create a generator function for importing the file

In [18]:
import urllib.request
import lzma
import base64
import re

def get_dependencies():
    localfile, info = urllib.request.urlretrieve("https://github.com/ogirardot/meta-deps/raw/master/pypi-deps.csv.lzma")
    with lzma.open(localfile, mode='r') as file:
        for line in file:
            package, version, deps = line.decode("utf-8").split('\t')
            deps = base64.b64decode(deps).decode("utf-8")
            deps = eval(deps)
            regex = r'\"*([\w\.]+)[><]?\=?.*\"*'
            
            def cleanDep(s):
                try:
                    match = re.search(regex, s)
                    return match.group(0)
                except:
                    return None
            
            deps = [cleanDep(x) for x in deps if cleanDep(x) is not None]
            
            yield package, deps
        
        


Store as dataframe

In [19]:
import pandas as pd
import numpy as np

rows = np.array([[None,None]])
for package, deps in get_dependencies():
    # Packages with no dependencies get blank string
    if deps:
        args= [[package, dep] for dep in deps]
    else:
        args = [[package, None]]
    rows = np.append(rows, args, axis=0)

        
df = pd.DataFrame(rows, columns=["package", "dependency"]).drop_duplicates()
df.head()

Unnamed: 0,package,dependency
0,,
1,0x10c-asm,
2,2gis,
3,3to2,
4,3to2_py3k,


In [20]:
deps = pd.Series(df.dependency.unique())
packages = pd.Series(df.package.unique())
new_deps = deps[~deps.isin(packages)]
new_df = pd.DataFrame({'package': new_deps, 'dependency': [None]*len(new_deps)})
df = pd.concat([df, new_df])
df = df.dropna(subset=['package'])

In [21]:
from timeit import default_timer, timeit

## Redshift

In Redshift we create a single table that will hold one "edge" per row.

In [22]:
import redshift_connector
conn = redshift_connector.connect(
     host='data512-redshift-cluster.cdlpt3opcvmp.us-west-2.redshift.amazonaws.com',
     database='dev',
     user='awsuser',
     password='REDACTED'
  )
conn.autocommit = True
cursor = conn.cursor()

Create the table

In [7]:
cursor.execute("DROP TABLE dependencies")
cursor.execute(
"""
    CREATE TABLE dependencies (
        package VARCHAR(150),
        dependency VARCHAR(150) NULL
    )
"""
)

<redshift_connector.cursor.Cursor at 0x7f61fbde1750>

Load the data

In [24]:
from itertools import product

# Time it
start = default_timer()

#cursor.write_dataframe(df, "dependencies")
cursor.executemany("INSERT INTO dependencies VALUES (%s, %s)", df.itertuples(index=False, name=None))

end = default_timer()
redshift_load_time = end - start
redshift_load_time

KeyboardInterrupt: 

# Neptune Graph

Clear existing data from graph

In [9]:
%%gremlin

g.V().drop()

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

Load in nodes and edges

In [23]:
from gremlin_python import statics
from gremlin_python.structure.graph import Graph
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.traversal import T
from gremlin_python.process.traversal import Order
from gremlin_python.process.traversal import Scope

In [38]:
def load_nodes(g, df):
    for package_name in df.package.unique():
        try:
            g.add_v('package').property(T.id, package_name).as_(package_name).next()
            #print("added package: " + package_name)
        except:
            print(f"Failed to add package {package_name}")

def load_edges(g, df):
    for index, row in df.iterrows():
        if row['dependency'] is not None:
            g.V(row['package']).add_e("depends_on").to(__.V(row['dependency'])).next()
            #print("Added dependency: " + row['package'] + "->" + row['dependency'])



In [64]:
remoteConnStr = 'wss://database-1.cluster-c8vhkkn9knug.us-west-2.neptune.amazonaws.com:8182/gremlin'
remoteConn = DriverRemoteConnection(remoteConnStr,'g')

g = Graph().traversal().withRemote(remoteConn)

start = default_timer()

load_nodes(g, df)
load_edges(g, df)

end = default_timer()
remoteConn.close()

neptune_load_time = end - start
neptune_load_time

NameError: name 'DriverRemoteConnection' is not defined

In [36]:
%%gremlin
g.V()
g.E()

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

# Queries

## Helper Funcs

In [24]:
def timeRedshiftQuery(sqlTxt):
    cursor = conn.cursor()
    start = default_timer()
    cursor.execute(sqlTxt)
    end = default_timer()
    return end - start

def timeNeptuneQuery(f):
    start = default_timer()
    g = Graph().traversal().withRemote(remoteConn)
    end = default_timer()
    return end - start

## Redshift Recursive View
We save a view that has a row for each package and package it depends on either directly or indirectly.

## 1: Which package has the most direct dependencies and how many?

### Redshift

In [25]:
cursor = conn.cursor()
query = """
    SELECT TOP 1 *
    FROM 
    (
        SELECT package, count(*) as "dep_count"
        FROM dependencies
        WHERE dependency is not NULL
        GROUP BY package
    ) DepCounts
    ORDER BY dep_count DESC, package ASC

"""
cursor.execute(query)
cursor.fetchall()

(['Products.CMFPlone', 100],)

In [26]:
timeRedshiftQuery(query)

0.0028214300000399817

### Neptune

In [27]:
def query(g):
    g.V().hasLabel('package').inE('depends_on'). \
    groupCount().by('package').as_('group'). \
    order(Scope.local). \
        by('values', Order.desc). \
        by('keys', Order.asc). \
    unfold().limit(1).toList()

In [28]:
%%gremlin

g.V().hasLabel('package').in('depends_on').
groupCount().by(id).
order(Scope.local).
    by(values, Order.desc).
    by(keys, Order.asc).
unfold().limit(1)

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

# 2: Which package has the most total dependencies?

## Redshift

In [29]:
query = """
WITH RECURSIVE total_deps(package, dependency, level) AS 
    (SELECT package, dependency, 0 AS level 
     FROM dependencies
     UNION ALL
     SELECT d.package, d.dependency, level + 1
     FROM dependencies d, total_deps p
     WHERE 
        d.dependency = p.package AND
        d.dependency is not NULL AND
        level <= 50
    )
SELECT TOP 1 package, dep_count from
(
    SELECT package, count(dependency) as dep_count
    FROM total_deps
    GROUP BY package, dependency
) t
ORDER BY dep_count DESC, package ASC
"""
cursor.execute(query)
cursor.fetchall()

(['horae.planning', 130303],)

In [30]:
timeRedshiftQuery(query)

0.2521183139999721

## Neptune

In [32]:
%%gremlin

g.V().hasLabel('package').emit().repeat(__.in('depends_on')).until(
    inE().count().is(0).or(
    loops().count().is(15))
).
groupCount().by(id).
order(Scope.local).
    by(values, Order.desc).
    by(keys, Order.asc).
unfold().limit(1)

Tab(children=(Output(layout=Layout(overflow='scroll')),), _titles={'0': 'Error'})

# 3: Which packages depend on Products.CMFPlone (directly and indirectly)?

## Redshift

In [48]:

query = """
    WITH RECURSIVE CMFPlone(package, dependency, level) AS 
    (SELECT package, dependency, 0 AS level 
     FROM dependencies
     WHERE dependency = 'Products.CMFPlone'
     UNION ALL
     SELECT d.package, d.dependency, level + 1
     FROM dependencies d, CMFPlone p
     WHERE d.dependency = p.package
    )
    SELECT package, level FROM CMFPlone;

"""
cursor.execute(query)
cursor.fetchall()

(['collective.ie8nomore', 0],
 ['collective.installedpackages', 0],
 ['plone.app.event', 0],
 ['Products.BlingPortlet', 0],
 ['slc.clicktracker', 0],
 ['zest.ploneglossaryhighlight', 0],
 ['agsci.blognewsletter', 0],
 ['collective.folderishtraverse', 0],
 ['izug.ticketbox', 0],
 ['plone.app.layout', 0],
 ['Products.CMFPlomino', 0],
 ['Products.UserAndGroupSelectionWidget', 0],
 ['rer.portlet.advanced_static', 0],
 ['wildcard.cleanprint', 0],
 ['plone.app.event', 1],
 ['collective.contextimage', 1],
 ['collective.amberjack.core', 1],
 ['Products.UserField', 1],
 ['collective.amberjack.plonetour', 2],
 ['collective.amberjack.portlet', 2])

In [34]:
timeRedshiftQuery(query)

0.005593705999899612

# Neptune

In [35]:
%%gremlin

g.V('Products.CMFPlone').emit().repeat(in()).hasId(without('Products.CMFPlone'))

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

# 4: Which packages have multiple dependency?


## Redshift

In [36]:
query = """
WITH RECURSIVE total_deps(package, dependency, level) AS 
    (SELECT package, dependency, 0 AS level 
     FROM dependencies
     UNION ALL
     SELECT d.package, d.dependency, level + 1
     FROM dependencies d, total_deps p
     WHERE 
        d.dependency = p.package AND
        d.dependency is NOT NULL AND
        level <= 50
    )
SELECT DISTINCT package from
(
    SELECT package, count(dependency) as total_paths
    FROM total_deps
    GROUP BY package, dependency
    HAVING total_paths > 1
) t
"""
cursor.execute(query)
cursor.fetchall()

(['actdiag'],
 ['affinitic.verifyinterface'],
 ['afpy.xap'],
 ['AgileCLU'],
 ['agsci.blognewsletter'],
 ['al_papi'],
 ['alstat'],
 ['alto'],
 ['amonpy'],
 ['amo-validator'],
 ['android-benchmark-views'],
 ['anobii.api'],
 ['ansible'],
 ['anthill.querytool'],
 ['anybox.recipe.sysdeps'],
 ['anyvc'],
 ['appfy.recipe.gae'],
 ['appypi'],
 ['aptop'],
 ['aranha'],
 ['archetypes.configure'],
 ['arecibo'],
 ['areciboware'],
 ['argyle'],
 ['Art3dUtils'],
 ['artichoke'],
 ['asm.cms'],
 ['asm.cmsui'],
 ['asm.translation'],
 ['asqc'],
 ['assentio'],
 ['assetgen'],
 ['atomisator.readers'],
 ['authgoogle-middleware'],
 ['autoflake'],
 ['autojenkins'],
 ['automa'],
 ['aws.authrss'],
 ['baas'],
 ['bagit_profile'],
 ['balanced-ach'],
 ['bamboo.pantrybell'],
 ['banners'],
 ['bda.awstatsparser'],
 ['bda.blogview'],
 ['bda.calendar.base'],
 ['bda.plone.wfintranet'],
 ['beanstalk-stack'],
 ['beets'],
 ['betahaus.debug'],
 ['bezel'],
 ['bitmat'],
 ['bitsyblog'],
 ['blanc-basic-events'],
 ['blockdiagcontrib-c

In [37]:
timeRedshiftQuery(query)

0.03814578399988022

## Neptune

From each node, traverse out. Store visited nodes, and return if a visited node is found.

In [38]:
%%gremlin
g.V().as('a').
    repeat(aggregate('visited').out()).until(out().as('visited')).
    select('a').dedup()

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

# 5: How many packages are exactly 2 degrees of separation from Products.CMFPlone?

## Redshift

In [40]:
query = """
    WITH RECURSIVE CMFP_deps(package, dependency, level) AS 
    (SELECT package, dependency, 0 AS level 
     FROM dependencies
     WHERE dependency = 'Products.CMFPlone'
     UNION ALL
     SELECT d.package, d.dependency, level + 1
     FROM dependencies d, CMFP_deps p
     WHERE d.dependency = p.package
    ),
                  CMFP_packs(package, dependency, level) AS
    (SELECT package, dependency, 0 as level
     FROM dependencies
     WHERE package = 'Products.CMFPlone'
     UNION ALL
     SELECT d.package, d.dependency, level + 1
     FROM dependencies d, CMFP_packs p
     WHERE d.package = p.dependency
    )
    
    SELECT package 
    FROM CMFP_deps
    WHERE level = 1
    UNION 
    SELECT package
    FROM CMFP_packs 
    WHERE level = 1;

"""
cursor.execute(query)
cursor.fetchall()

(['collective.contextimage'],
 ['Products.UserField'],
 ['ExtensionClass'],
 ['Products.ATContentTypes >= 2.1.3'],
 ['Products.CMFCalendar'],
 ['Products.CMFEditions'],
 ['Products.CMFFormController'],
 ['Products.CMFQuickInstallerTool'],
 ['Products.CMFUid'],
 ['Products.PlacelessTranslationService'],
 ['Products.ResourceRegistries'],
 ['ZODB3'],
 ['five.localsitemanager'],
 ['plone.app.controlpanel'],
 ['plone.app.form'],
 ['plone.app.i18n'],
 ['plone.app.users'],
 ['plone.app.upgrade'],
 ['plone.app.vocabularies'],
 ['plone.app.workflow'],
 ['plone.contentrules'],
 ['plone.locking'],
 ['plone.registry'],
 ['plone.portlet.collection'],
 ['plonetheme.classic'],
 ['zope.dottedname'],
 ['zope.event'],
 ['zope.pagetemplate'],
 ['zope.site'],
 ['zope.tal'],
 ['zope.traversing'],
 ['Products.CMFCore'],
 ['Products.CMFDynamicViewFTI'],
 ['Products.DCWorkflow'],
 ['Products.PortalTransforms'],
 ['Products.statusmessages'],
 ['kss.core'],
 ['plone.app.jquerytools'],
 ['plone.app.layout >=1.1.

In [45]:
#get dependencies of CMFP
#get dependencies of dependencies of CMFP
#Union packages that depend on CMFP and packages of packages that depend on CMFP
query = """
    SELECT d.package 
    FROM (SELECT dependency
          FROM dependencies
          WHERE package = 'Products.CMFPlone') as p 
    JOIN dependencies d
    ON p.dependency = d.dependency
    WHERE d.package != 'Products.CMFPlone'
    UNION
    SELECT d2.dependency
    FROM (SELECT dependency
          FROM dependencies
          WHERE dependency = 'Products.CMFPlone') as p2
    JOIN dependencies d2
    ON p2.dependency = d2.package
    
"""
cursor.execute(query)
cursor.fetchall()

(['actdiag'],
 ['affinitic.verifyinterface'],
 ['afpy.xap'],
 ['agsci.blognewsletter'],
 ['anthill.querytool'],
 ['anybox.recipe.sysdeps'],
 ['archetypes.configure'],
 ['archetypes.fieldtraverser'],
 ['asm.cms'],
 ['asm.cmsui'],
 ['asm.translation'],
 ['automa'],
 ['aws.authrss'],
 ['aws.windowsplonecluster'],
 ['bda.calendar.base'],
 ['bda.intellidatetime'],
 ['bda.plone.finder'],
 ['betahaus.debug'],
 ['blockdiagcontrib-class'],
 ['bookreader'],
 ['bowerrecipe'],
 ['bud.nospam'],
 ['buildout.dumppickedversions'],
 ['buildout.eggnest'],
 ['busyflow.pivotal'],
 ['c2.app.replaceword'],
 ['causal'],
 ['clearwind.arecibo'],
 ['ClueDojo'],
 ['collective.akismet'],
 ['collective.amberjack.core'],
 ['collective.amberjack.portlet'],
 ['collective.anotherdynamicgroupsplugin'],
 ['collective.atspreadsheet'],
 ['collective.behavior.contactinfo'],
 ['collective.blog.star'],
 ['collective.chromatable'],
 ['collective.codemirror'],
 ['collective.collage.plonetruegallery'],
 ['collective.configviews

In [46]:
timeRedshiftQuery(query)

0.016223505999960253

## Neptune

In [47]:
%%gremlin

g.V('Products.CMFPlone').both().both().simplePath().dedup()

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…