# Loading

Create a generator function for importing the file

In [1]:
import urllib.request
import lzma
import base64
import re

def get_dependencies():
    localfile, info = urllib.request.urlretrieve("https://github.com/kevSweet/meta-deps/raw/master/pypi-deps.csv.lzma")
    with lzma.open(localfile, mode='r') as file:
        for line in file:
            package, version, deps = line.decode("utf-8").split('\t')
            deps = base64.b64decode(deps).decode("utf-8")
            deps = eval(deps)
            regex = r'\"*([\w\.]+)[><]?\=?.*\"*'
            
            def cleanDep(s):
                try:
                    match = re.search(regex, s)
                    return match.group(0)
                except:
                    return None
            
            deps = [cleanDep(x) for x in deps if cleanDep(x) is not None]
            
            yield package, deps
        
        


Store as dataframe

In [23]:
import pandas as pd
import numpy as np

rows = np.array([[None,None]])
for package, deps in get_dependencies():
    # Packages with no dependencies get blank string
    if deps:
        args= [[package, dep] for dep in deps]
    else:
        args = [[package, None]]
    rows = np.append(rows, args, axis=0)

        
df = pd.DataFrame(rows, columns=["package", "dependency"]).drop_duplicates()
df.head()

Unnamed: 0,package,dependency
1,0x10c-asm,
2,2gis,
3,3to2,
4,3to2_py3k,
5,42qucc,


In [24]:
deps = pd.Series(df.dependency.unique())
packages = pd.Series(df.package.unique())
new_deps = deps[~deps.isin(packages)]
new_df = pd.DataFrame({'package': new_deps, 'dependency': [None]*len(new_deps)})
df = pd.concat([df, new_df])
df = df.dropna(subset=['package'])

In [None]:
from timeit import default_timer

## Redshift

In Redshift we create a single table that will hold one "edge" per row.

In [None]:
import redshift_connector
conn = redshift_connector.connect(
     host='redshift-python-dep.cantgzv41ter.us-east-1.redshift.amazonaws.com',
     database='deps',
     user='awsuser',
     password='uGdipKs9WpJ8yU6'
  )
conn.autocommit = True
cursor = conn.cursor()

Create the table

In [None]:
cursor.execute("DROP TABLE dependencies")
cursor.execute(
"""
    CREATE TABLE dependencies (
        package VARCHAR(150),
        dependency VARCHAR(150) NULL
    )
"""
)

Load the data

In [None]:


# Time it
start = default_timer()

cursor.write_dataframe(df, "dependencies")

# for package, deps in get_dependencies():
#     # Packages with no dependencies get blank string
#     if deps:
#         args = list(product([package], deps))
#     else:
#         args = [(package, "")]
        
#     print(args)
    
#     for tup in args:
#         cursor.execute("INSERT INTO dependencies VALUES (%s, %s)", tup)
#     cursor.executemany("INSERT INTO dependencies VALUES (%s, %s)", args)

end = default_timer()
print(end - start)

# Neptune Graph

Clear existing data from graph

In [20]:
%%gremlin

g.V().drop()

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

Load in nodes

In [21]:
from gremlin_python import statics
from gremlin_python.structure.graph import Graph
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.traversal import T

graph = Graph()
remoteConnStr = 'wss://data516.cluster-crrfpo5iyyda.us-east-1.neptune.amazonaws.com:8182/gremlin'
remoteConn = DriverRemoteConnection(remoteConnStr,'g')
g = graph.traversal().withRemote(remoteConn)


for package_name in df.package.unique():
    try:
        g.add_v('package').property(T.id, package_name).as_(package_name).next()
        #print("added package: " + package_name)
    except:
        print(f"Failed to add package {package_name}")

remoteConn.close()

Failed to add package None


In [25]:
%%gremlin

g.V()

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

Add edges

In [33]:
remoteConn = DriverRemoteConnection(remoteConnStr,'g')
g = graph.traversal().withRemote(remoteConn)


for index, row in df.iterrows():
    print(row['dependency'])
    if row['dependency'] is not None:
        g.V(row['package']).add_e("depends_on").to(__.V(row['dependency'])).next()
        print("Added dependency: " + row['package'] + "->" + row['dependency'])
        
remoteConn.close()

None
None
None
None
None
None
None
argparse
Added dependency: aaargh->argparse
None
None
None
None
None
scipy
Added dependency: ABBA->scipy
None
None
"ToscaWidgets>0.9.7"
Added dependency: abl.jquery->"ToscaWidgets>0.9.7"
Add other requirements here
Added dependency: abl.jquery->Add other requirements here
"abl.util>=0.1"
Added dependency: abl.jquery.ui->"abl.util>=0.1"
"abl.jquery>=1.4"
Added dependency: abl.jquery.ui->"abl.jquery>=1.4"
"abl.jquery.plugins.form>=2.28"
Added dependency: abl.jquery.ui->"abl.jquery.plugins.form>=2.28"
None
None
Extra requirements: -*-
Added dependency: absolute32->Extra requirements: -*-
Extra requirements: -*-
Added dependency: abu.rpc->Extra requirements: -*-
protobuf>=2.4
Added dependency: abu.rpc->protobuf>=2.4
message
Added dependency: abu.rpc->message
None
None
None
None
None
None
None
setuptools
Added dependency: actdiag->setuptools
blockdiag>=1.2.0
Added dependency: actdiag->blockdiag>=1.2.0
Extra requirements: -*-
Added dependency: actdiag->Extr

In [34]:
%%gremlin

g.E()

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…