In [6]:
import dill as pkl
from os import listdir
from datetime import datetime

In [2]:
# Load the cleaned Reddit posts that every create_*_collection function below
# takes as its `data` argument.
# NOTE(security): dill/pickle loading can execute arbitrary code embedded in
# the file -- only load pickles that were produced by a trusted source.
with open('cleaned_data/reddit_posts_clean.pkl','rb') as file:
    data = pkl.load(file)

In [7]:
def create_posts_collection(data:list) -> list:
    """
    Formats the posts for insertion into an arango collection
    Args:
        data: cleaned data containing all the posts (list). Every post must
            provide 'id', 'title', 'Reference_Date' (an object exposing
            ``strftime``), 'num_comments', 'num_upvotes' and 'body'.
            'tag' is optional.
    Returns:
        list: all posts formatted for arango insertion, in input order.
            The 'tag' key is only present on posts that had one.
    Raises:
        KeyError: if a post is missing one of the required keys
    """
    all_posts = []
    for post in data:
        ar_post = {'_id': 'posts/' + post['id'],
                   'title': post['title'],
                   'datetime': post['Reference_Date'].strftime('%m-%d-%Y, %H:%M:%S'),
                   'num_comments': post['num_comments'],
                   'num_upvotes': post['num_upvotes'],
                   'body': post['body']}

        # 'tag' is the only optional field. Test for it explicitly rather than
        # wrapping the lookup in a bare ``except``, which would also hide
        # unrelated failures (including KeyboardInterrupt).
        if 'tag' in post:
            ar_post['tag'] = post['tag']

        all_posts.append(ar_post)
    return all_posts

def create_comments_collection(data:list) -> list:
    """
    Formats the comments for insertion into an arango collection
    NOTE: there are some duplicate comments so we will eliminate them;
    only the first occurrence of each 'comment_id' is kept.
    Args:
        data: cleaned data containing all the posts (list). Each post must
            have a 'comments' iterable whose items provide 'comment_id',
            'body' and 'upvotes'.
    Returns:
        list: all comments formatted for arango insertion, in first-seen order
    Raises:
        KeyError: if a post or comment is missing one of the keys above
    """
    all_comments = []
    # A set gives O(1) membership tests; the previous list made the
    # de-duplication quadratic in the number of comments.
    seen_ids = set()
    for post in data:
        for comment in post['comments']:
            comment_id = comment['comment_id']
            if comment_id in seen_ids:
                continue
            seen_ids.add(comment_id)
            all_comments.append({'_id': 'comments/' + comment_id,
                                 'body': comment['body'],
                                 'upvotes': comment['upvotes']})

    return all_comments

def create_commeted_on_edge_collection(data:list) -> list:
    """
    Creates edge collection for the relationships between posts and comments.
    Each distinct comment yields one edge pointing from the comment to the
    first post it appears under; later duplicates of the same 'comment_id'
    are ignored.
    Args:
        data: cleaned data containing all the posts (list). Each post must
            have an 'id' and a 'comments' iterable whose items provide
            'comment_id'.
    Returns:
        list: all edges between posts and comments formatted for arango insertion
    Raises:
        KeyError: if a post or comment is missing one of the keys above
    """
    # A set gives O(1) membership tests; the previous list made the
    # de-duplication quadratic in the number of comments.
    seen_ids = set()
    id_root = 'commented_on/'
    all_edges = []

    for post in data:
        for comment in post['comments']:
            comment_id = comment['comment_id']
            if comment_id in seen_ids:
                continue
            seen_ids.add(comment_id)

            all_edges.append({'_id': id_root + post['id'] + "-" + comment_id,
                              '_from': 'comments/' + comment_id,
                              '_to': 'posts/' + post['id']})

    return all_edges

def create_users_collection(data:list) -> list:
    """
    Formats the users for insertion into an arango collection
    NOTE: have to be sure not to include duplicate users; a user is emitted
    once, the first time it is seen as either a post author or a commenter.
    Args:
        data: cleaned data containing all the posts (list). Each post must
            have an 'author' and a 'comments' iterable whose items provide
            'username'.
    Returns:
        list: all users formatted for arango insertion, in first-seen order
    Raises:
        KeyError: if a post or comment is missing one of the keys above
    """
    all_users = []
    # A set gives O(1) membership tests; the previous list made the
    # de-duplication quadratic in the number of users.
    seen_users = set()
    user_root = 'users/'

    def _add_user(username):
        # Record a user document the first time this username is encountered.
        if username not in seen_users:
            seen_users.add(username)
            all_users.append({'_id': user_root + username})

    for post in data:
        _add_user(post['author'])
        for comment in post['comments']:
            _add_user(comment['username'])
    return all_users

def create_posted_edge_collection(data:list) -> list:
    """
    Builds the "posted" edges that link each author to the post they wrote.
    Args:
        data: cleaned data containing all the posts (list)
    Returns:
        list: one edge per post (user -> post), formatted for arango insertion
    """
    def _to_edge(entry):
        # Edge key is "<post id>-<author>"; direction is user -> post.
        post_id = entry['id']
        author = entry['author']
        return {
            '_id': 'posted/' + post_id + '-' + author,
            '_from': 'users/' + author,
            '_to': 'posts/' + post_id,
            'date': entry['Reference_Date'].strftime('%m-%d-%Y, %H:%M:%S'),
        }

    return [_to_edge(entry) for entry in data]

def create_commented_edge_collection(data:list) -> list:
    """
    Creates edge collection for the relationships between comments and users.
    Each distinct comment yields one edge pointing from its author to the
    comment; later duplicates of the same 'comment_id' are ignored.
    Args:
        data: cleaned data containing all the posts (list). Each post must
            have a 'comments' iterable whose items provide 'comment_id'
            and 'username'.
    Returns:
        list: all edges between comments and users formatted for arango insertion
    Raises:
        KeyError: if a post or comment is missing one of the keys above
    """
    # A set gives O(1) membership tests; the previous list made the
    # de-duplication quadratic in the number of comments.
    seen_ids = set()
    all_edges = []
    id_root = 'commented/'

    for post in data:
        for comment in post['comments']:
            comment_id = comment['comment_id']
            if comment_id in seen_ids:
                continue
            seen_ids.add(comment_id)

            username = comment['username']
            # NOTE(review): unlike the other edge builders in this notebook,
            # this id joins its two parts with no '-' separator. Left as-is so
            # ids stay identical to any data already inserted -- confirm
            # before changing.
            all_edges.append({'_id': id_root + comment_id + username,
                              '_from': 'users/' + username,
                              '_to': 'comments/' + comment_id})

    return all_edges

def create_comment_thread_edge_collection(data:list) -> list:
    """
    Creates edge collection for the relationships between comments and other comments.
    This exists when comments are part of a thread: a comment carrying a
    'reply_to_id' yields one edge from itself to the comment it replies to.
    Comments without a usable 'reply_to_id' (missing or not a string) are
    skipped, and only the first linkable occurrence of each 'comment_id'
    produces an edge.
    Args:
        data: cleaned data containing all the posts (list). Each post must
            have a 'comments' iterable of dicts providing 'comment_id' and,
            optionally, 'reply_to_id'.
    Returns:
        list: all edges between comments and comments formatted for arango insertion
    Raises:
        KeyError: if a post lacks 'comments' or a comment lacks 'comment_id'
    """
    all_edges = []
    id_root = 'comment_thread/'
    # Ids of comments that already produced an edge. A set keeps the
    # membership test O(1) instead of scanning a growing list.
    linked_ids = set()
    for post in data:
        for comment in post['comments']:
            comment_id = comment['comment_id']
            if comment_id in linked_ids:
                continue

            # Explicit check replaces the former bare ``except: pass``: the
            # only expected "failure" is a comment with no parent to link to.
            reply_to_id = comment.get('reply_to_id')
            if not isinstance(reply_to_id, str):
                continue

            linked_ids.add(comment_id)
            all_edges.append({'_id': id_root + reply_to_id + '-' + comment_id,
                              '_from': 'comments/' + comment_id,
                              '_to': 'comments/' + reply_to_id})
    return all_edges

In [48]:
# Build the post documents, report how many were produced, and display the
# first one as a sanity check of the output format.
posts_collection = create_posts_collection(data)
print(len(posts_collection))
posts_collection[0]

598


{'_id': 'posts/pst_100',
 'title': 'What Are Your Moves Tomorrow, February 17, 2022',
 'datetime': '02-16-2022, 16:17:21',
 'num_comments': 12100,
 'num_upvotes': 292,
 'body': 'What Are Your Moves Tomorrow, February 17, 2022',
 'tag': 'Daily Discussion'}

In [20]:
# Persist the formatted post documents with dill.
# NOTE: open() in 'wb' mode does not create missing directories, so
# 'arango_formed_data/' must already exist.
with open('arango_formed_data/posts_collection.pkl','wb') as file:
    pkl.dump(posts_collection, file)

In [42]:
# Build the de-duplicated comment documents, report the count, and display
# the first one as a sample.
comments_collection = create_comments_collection(data)
print(len(comments_collection))
comments_collection[0]

13552


{'_id': 'comments/t1_hx84k5q',
 'body': ' Silly me for investing in calls on a company that manufactures very very high end GPUs and other essential computer components.  I should’ve bought calls in checks notes the food delivery company that delivers the wrong food, cold, to the wrong address',
 'upvotes': 125}

In [46]:
# Persist the formatted comment documents with dill.
with open('arango_formed_data/comments_collection.pkl','wb') as file:
    pkl.dump(comments_collection, file)

In [54]:
# Build the comment -> post edges, report the count, and display the first
# edge as a sample.
commented_on_edges = create_commeted_on_edge_collection(data)
print(len(commented_on_edges))
commented_on_edges[0]

13552


{'_id': 'commented_on/pst_100-t1_hx84k5q',
 '_from': 'comments/t1_hx84k5q',
 '_to': 'posts/pst_100'}

In [55]:
# Persist the comment -> post edges with dill.
with open('arango_formed_data/commented_on_edge_collection.pkl','wb') as file:
    pkl.dump(commented_on_edges, file)

In [59]:
# Build the de-duplicated user documents (post authors and commenters),
# report the count, and display the entry at index 5 as a sample.
users_collection = create_users_collection(data)
print(len(users_collection))
users_collection[5]

6868


{'_id': 'users/CyborgAlgoInvestor'}

In [60]:
# Persist the formatted user documents with dill.
with open('arango_formed_data/users_collection.pkl','wb') as file:
    pkl.dump(users_collection, file)

In [62]:
# Build the user -> post edges (one per post), report the count, and display
# the entry at index 5 as a sample.
posted_edges = create_posted_edge_collection(data)
print(len(posted_edges))
posted_edges[5]

598


{'_id': 'posted/pst_105-NyCWalker76',
 '_from': 'users/NyCWalker76',
 '_to': 'posts/pst_105',
 'date': '02-16-2022, 22:19:52'}

In [63]:
# Persist the user -> post edges with dill.
with open('arango_formed_data/posted_edge_collection.pkl','wb') as file:
    pkl.dump(posted_edges, file)

In [69]:
# Build the user -> comment edges, report the count, and display the entry
# at index 3 as a sample.
commented_edges = create_commented_edge_collection(data)
print(len(commented_edges))
commented_edges[3]

13552


{'_id': 'commented/t1_hx8g1phHereComesThe_Squeeze',
 '_from': 'users/HereComesThe_Squeeze',
 '_to': 'comments/t1_hx8g1ph'}

In [70]:
# Persist the user -> comment edges with dill.
with open('arango_formed_data/commented_edge_collection.pkl','wb') as file:
    pkl.dump(commented_edges, file)

In [8]:
# Build the reply -> parent comment edges (only comments that carry a
# 'reply_to_id' produce one), report the count, and display the first edge.
comment_thread_edges = create_comment_thread_edge_collection(data)
print(len(comment_thread_edges))
comment_thread_edges[0]

6797


{'_id': 'comment_thread/t1_i2h8z4u-t1_i2h92k1',
 '_from': 'comments/t1_i2h92k1',
 '_to': 'comments/t1_i2h8z4u'}

In [9]:
# Persist the reply -> parent comment edges with dill.
with open('arango_formed_data/comment_thread_edge_collection.pkl','wb') as file:
    pkl.dump(comment_thread_edges, file)