In [1]:
import json
import os

import dill as pkl
import matplotlib.pyplot as plt
import networkx as nx
from arango import ArangoClient

In [4]:
# Establish Client Connection
# Connection settings are read from the environment so that credentials do not
# have to live in the notebook. The fallbacks reproduce the original local
# setup, so the notebook still runs unchanged when no variables are set.
# NOTE(review): the fallback password is still committed here -- drop the
# default (and rotate the password) once ARANGO_PASSWORD is set everywhere.
arango_hosts = os.environ.get('ARANGO_HOSTS', 'http://127.0.0.1:8530')
arango_db_name = os.environ.get('ARANGO_DB', 'reddit_comp_db')
arango_username = os.environ.get('ARANGO_USERNAME', 'root')
arango_password = os.environ.get('ARANGO_PASSWORD', 'compdb_kyle')

client = ArangoClient(hosts=arango_hosts)

db = client.db(arango_db_name, username=arango_username, password=arango_password)

In [5]:
# Reuse the 'reddit' graph when the database already has it; otherwise create it.
reddit = db.graph('reddit') if db.has_graph('reddit') else db.create_graph('reddit')

In [6]:
# Posts vertex collection: fetch it if the graph already has it, else create it.
posts = (
    reddit.vertex_collection('posts')
    if reddit.has_vertex_collection('posts')
    else reddit.create_vertex_collection('posts')
)

In [7]:
# Load the pickled posts collection data from disk.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project.
posts_pickle_path = 'arango_formed_data/posts_collection.pkl'
with open(posts_pickle_path, mode='rb') as file:
    posts_collection = pkl.load(file)

In [8]:
# Insert data from the posts collection
# Import only when the collection is empty. The probe query is capped at a
# single result (and returns a constant instead of the document), so the
# emptiness check no longer pulls the whole collection to the client.
cursor = db.aql.execute('FOR doc IN posts LIMIT 1 RETURN 1')

if not list(cursor):
    posts.import_bulk(posts_collection)
else:
    print('document already populated')

In [9]:
# Comments vertex collection: fetch it if the graph already has it, else create it.
comments = (
    reddit.vertex_collection('comments')
    if reddit.has_vertex_collection('comments')
    else reddit.create_vertex_collection('comments')
)

In [10]:
# Load the pickled comments collection data from disk.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project.
comments_pickle_path = 'arango_formed_data/comments_collection.pkl'
with open(comments_pickle_path, mode='rb') as file:
    comments_collection = pkl.load(file)

In [11]:
# Insert data from the comments collection
# Import only when the collection is empty. The probe query is capped at a
# single result (and returns a constant instead of the document), so the
# emptiness check no longer pulls the whole collection to the client.
cursor = db.aql.execute('FOR doc IN comments LIMIT 1 RETURN 1')

if not list(cursor):
    comments.import_bulk(comments_collection)
else:
    print('document already populated')

In [12]:
# 'commented_on' edge collection (comments -> posts): define it on the graph
# the first time through, otherwise just fetch the existing one.
if not reddit.has_edge_definition('commented_on'):
    commented_on = reddit.create_edge_definition(
        edge_collection='commented_on',
        from_vertex_collections=['comments'],
        to_vertex_collections=['posts'],
    )
else:
    commented_on = reddit.edge_collection('commented_on')


In [13]:
# Load the pickled commented_on edge data from disk.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project.
commented_on_pickle_path = 'arango_formed_data/commented_on_edge_collection.pkl'
with open(commented_on_pickle_path, mode='rb') as file:
    commented_on_edge_collection = pkl.load(file)

In [14]:
# Insert data from the commented_on edge collection
# Import only when the collection is empty. The probe query is capped at a
# single result (and returns a constant instead of the document), so the
# emptiness check no longer pulls the whole collection to the client.
cursor = db.aql.execute('FOR doc IN commented_on LIMIT 1 RETURN 1')

if not list(cursor):
    commented_on.import_bulk(commented_on_edge_collection)
else:
    print('document already populated')

In [15]:
# Users vertex collection: fetch it if the graph already has it, else create it.
users = (
    reddit.vertex_collection('users')
    if reddit.has_vertex_collection('users')
    else reddit.create_vertex_collection('users')
)

In [16]:
# Load the pickled users collection data from disk.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project.
users_pickle_path = 'arango_formed_data/users_collection.pkl'
with open(users_pickle_path, mode='rb') as file:
    users_collection = pkl.load(file)

In [17]:
# Insert data from the users collection
# Import only when the collection is empty. The probe query is capped at a
# single result (and returns a constant instead of the document), so the
# emptiness check no longer pulls the whole collection to the client.
cursor = db.aql.execute('FOR doc IN users LIMIT 1 RETURN 1')

if not list(cursor):
    users.import_bulk(users_collection)
else:
    print('document already populated')

In [18]:
# 'posted' edge collection (users -> posts): define it on the graph the first
# time through, otherwise just fetch the existing one.
if not reddit.has_edge_definition('posted'):
    posted = reddit.create_edge_definition(
        edge_collection='posted',
        from_vertex_collections=['users'],
        to_vertex_collections=['posts'],
    )
else:
    posted = reddit.edge_collection('posted')

In [19]:
# Load the pickled posted edge data from disk.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project.
posted_pickle_path = 'arango_formed_data/posted_edge_collection.pkl'
with open(posted_pickle_path, mode='rb') as file:
    posted_edge_collection = pkl.load(file)

In [20]:
# Insert data from the posted edge collection
# Import only when the collection is empty. The probe query is capped at a
# single result (and returns a constant instead of the document), so the
# emptiness check no longer pulls the whole collection to the client.
cursor = db.aql.execute('FOR doc IN posted LIMIT 1 RETURN 1')

if not list(cursor):
    posted.import_bulk(posted_edge_collection)
else:
    print('document already populated')

In [21]:
# 'commented' edge collection (users -> comments): define it on the graph the
# first time through, otherwise just fetch the existing one.
if not reddit.has_edge_definition('commented'):
    commented = reddit.create_edge_definition(
        edge_collection='commented',
        from_vertex_collections=['users'],
        to_vertex_collections=['comments'],
    )
else:
    commented = reddit.edge_collection('commented')

In [22]:
# Load the pickled commented edge data from disk.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project.
commented_pickle_path = 'arango_formed_data/commented_edge_collection.pkl'
with open(commented_pickle_path, mode='rb') as file:
    commented_edge_collection = pkl.load(file)

In [23]:
# Insert data from the commented edge collection
# Import only when the collection is empty. The probe query is capped at a
# single result (and returns a constant instead of the document), so the
# emptiness check no longer pulls the whole collection to the client.
cursor = db.aql.execute('FOR doc IN commented LIMIT 1 RETURN 1')

if not list(cursor):
    commented.import_bulk(commented_edge_collection)
else:
    print('document already populated')

In [26]:
# Create a second graph that also includes comments in a thread.
#
# The new graph reuses the existing users/comments/posts vertex collections and
# the posted/commented/commented_on edge collections. They are attached when
# the graph is created (through `edge_definitions`, mirroring the definitions
# used for the 'reddit' graph) so that a fresh run does not depend on adding
# them by hand through the web interface.
#
# Comment threads could probably have been modelled as part of the commented_on
# edge collection, but to keep it simple they get their own edge collection
# called comment_thread.

if db.has_graph('reddit_w_threads'):
    reddit_w_threads = db.graph('reddit_w_threads')
else:
    reddit_w_threads = db.create_graph(
        'reddit_w_threads',
        edge_definitions=[
            {
                'edge_collection': 'commented_on',
                'from_vertex_collections': ['comments'],
                'to_vertex_collections': ['posts'],
            },
            {
                'edge_collection': 'posted',
                'from_vertex_collections': ['users'],
                'to_vertex_collections': ['posts'],
            },
            {
                'edge_collection': 'commented',
                'from_vertex_collections': ['users'],
                'to_vertex_collections': ['comments'],
            },
        ],
    )


In [30]:
# Load the pickled comment_thread edge data from disk.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project.
comment_thread_pickle_path = 'arango_formed_data/comment_thread_edge_collection.pkl'
with open(comment_thread_pickle_path, mode='rb') as file:
    comment_thread_edge_collection = pkl.load(file)

In [28]:
# 'comment_thread' edge collection (comments -> comments): define it on the
# threaded graph the first time through, otherwise just fetch the existing one.
if not reddit_w_threads.has_edge_definition('comment_thread'):
    comment_thread = reddit_w_threads.create_edge_definition(
        edge_collection='comment_thread',
        from_vertex_collections=['comments'],
        to_vertex_collections=['comments'],
    )
else:
    comment_thread = reddit_w_threads.edge_collection('comment_thread')

In [31]:
# Insert data from the comment_thread edge collection
# Import only when the collection is empty. The probe query is capped at a
# single result (and returns a constant instead of the document), so the
# emptiness check no longer pulls the whole collection to the client.
cursor = db.aql.execute('FOR doc IN comment_thread LIMIT 1 RETURN 1')

if not list(cursor):
    comment_thread.import_bulk(comment_thread_edge_collection)
else:
    print('document already populated')