In [1]:
from os import listdir, makedirs
from os.path import isfile, join

import dill as pkl

# Data Merge and Cleaning

The raw data for this analysis is stored in separate .pkl files, one for each day of collection. The goal of this notebook is to combine the files into one JSON-like object (a list of dictionaries). After combining the items, we will clean the data so it is formatted for ArangoDB ingestion. This includes formatting strings and converting string fields to int types.

In [64]:
def aggregate_pkl_files(filepath:str) -> list:
    """
    Reads every pickle file in 'filepath' and combines their contents into a single list of dictionary objects.

    Files are read in sorted filename order so the combined list is the same on every run
    (listdir itself makes no ordering guarantee). Entries that are not regular files,
    e.g. a '.ipynb_checkpoints' folder, are skipped instead of crashing the load.

    Args:
        filepath: folder that contains the pkl files (str)
    Returns:
        list: the items stored under each file's 'main' key, combined into one list
    Raises:
        KeyError: if a loaded file has no 'main' key
    """
    all_data = []
    for filename in sorted(listdir(filepath)):
        full_path = join(filepath, filename)
        if not isfile(full_path):
            continue

        # NOTE: unpickling can execute arbitrary code; only point this at trusted files.
        with open(full_path, 'rb') as f:
            data = pkl.load(f)

        all_data.extend(data['main'])

    return all_data

def _strip_user_prefix(raw):
    """
    Returns the bare username from a 'u/username' or '/user/username/' style value.
    Values without such a prefix (for example a name that was already cleaned) are
    returned unchanged apart from stray leading/trailing slashes; non-string values
    are returned untouched.
    """
    if not isinstance(raw, str):
        return raw

    parts = raw.split('/')
    # 'u/name'      -> ['u', 'name']           -> the name is at index 1
    # '/user/name/' -> ['', 'user', 'name', ''] -> the name is at index 2
    name_index = 1 if parts[0] else 2
    if name_index < len(parts):
        return parts[name_index]

    # No prefix to remove: keep the value rather than raising IndexError.
    return raw.strip('/')


def format_usernames(data:list) -> list:
    """
    usernames and post authors are either in 'u/username' or '/user/username/' format.
    This strips them down to just the username. Values that carry no prefix are left
    as they are, so running the function again on cleaned data is a no-op.

    The posts are modified in place; the returned list is the same object as 'data'.

    Args:
        data: list containing all the posts (list)
    Returns:
        list: all the posts with correctly formatted usernames
    """
    for post in data:
        post['author'] = _strip_user_prefix(post['author'])

        for comment in post['comments']:
            comment['username'] = _strip_user_prefix(comment['username'])

    return data

# Multipliers for abbreviated counts. 'k' is the form seen in the scraped data (e.g. '5.1k');
# 'm' is handled the same way in case it appears.
_COUNT_SUFFIXES = {'k': 1_000, 'm': 1_000_000}


def _parse_count(raw) -> int:
    """
    Converts one scraped count to an int.

    Handles plain numbers ('292'), a trailing label ('12.1k comments'), abbreviated
    values with or without a decimal part ('5.1k', '101k'), negative values ('-1.2k'),
    thousands separators ('1,234') and the 'Vote' placeholder (treated as 0).
    Values that are already ints are returned unchanged, which makes the formatting
    functions safe to run more than once on the same data.

    Args:
        raw: the scraped value (str or int)
    Returns:
        int: the numeric count
    Raises:
        TypeError: if raw is neither a str nor an int
        ValueError: if raw is a str that does not start with a number
    """
    if isinstance(raw, int):
        return raw
    if not isinstance(raw, str):
        raise TypeError(f'expected a str or int count, got {type(raw).__name__}')

    tokens = raw.split()
    if not tokens:
        raise ValueError('count string is empty')

    # Only the first token carries the number; anything after it is a label.
    token = tokens[0].replace(',', '').lower()
    if token == 'vote':
        return 0

    multiplier = _COUNT_SUFFIXES.get(token[-1])
    if multiplier is None:
        return int(token)

    # round() absorbs float noise such as 12.1 * 1000 == 12100.000000000002
    return round(float(token[:-1]) * multiplier)


def format_num_comments(data:list) -> list:
    """
    The num_comments field is scraped as a string such as "7 comments"; from 1,000 comments
    upwards it is abbreviated, e.g. "1.1k comments". This converts the field to an int.

    The posts are modified in place; the returned list is the same object as 'data'.

    Args:
        data: list containing all the posts (list)
    Returns:
        list: all the posts with correctly formatted num_comments
    Raises:
        TypeError, ValueError: if a post's num_comments cannot be interpreted as a number
    """
    for post in data:
        post['num_comments'] = _parse_count(post['num_comments'])
    return data


def format_num_upvotes(data:list) -> list:
    """
    The num_upvotes field is scraped as a string. When the number of upvotes is zero it is
    'Vote' due to how it was scraped, and large values can be abbreviated as '5.1k'.
    For some comments (e.g. image comments) the number of upvotes was not captured; those
    comments are kept and their upvotes are set to zero.
    This converts the post 'num_upvotes' and comment 'upvotes' fields to int.

    The posts and comments are modified in place; the returned list is the same object as 'data'.

    Args:
        data: list containing all the posts (list)
    Returns:
        list: all the posts and comments with correctly formatted num_upvotes
    Raises:
        TypeError, ValueError: if a post's num_upvotes cannot be interpreted as a number
    """
    for post in data:
        post['num_upvotes'] = _parse_count(post['num_upvotes'])

        for comment in post['comments']:
            try:
                comment['upvotes'] = _parse_count(comment.get('upvotes'))
            except (TypeError, ValueError):
                # Missing or unparseable count: keep the comment and fall back to zero.
                comment['upvotes'] = 0
    return data

In [72]:
# Load every file in the 'data_pkl' folder and combine the posts into one list.
combined_posts = aggregate_pkl_files('data_pkl')
# Display the first post to inspect the raw structure before cleaning.
combined_posts[0]

{'title': 'What Are Your Moves Tomorrow, February 17, 2022',
 'Reference_Date': datetime.datetime(2022, 2, 16, 16, 17, 21, 799255),
 'body': 'What Are Your Moves Tomorrow, February 17, 2022',
 'num_comments': '12.1k comments',
 'tag': 'Daily Discussion',
 'num_upvotes': '292',
 'author': 'u/AutoModerator',
 'comments': [{'comment_id': 't1_hx84k5q',
   'username': '/user/SomeDumbassSays/',
   'body': ' Silly me for investing in calls on a company that manufactures very very high end GPUs and other essential computer components.  I should’ve bought calls in checks notes the food delivery company that delivers the wrong food, cold, to the wrong address',
   'upvotes': '125'},
  {'comment_id': 't1_hx84oxj',
   'username': '/user/NrdRage/',
   'body': " Don't forget the previously bankrupted electric car that's going to a rental model and missed top and bottom and has no path to profitability.",
   'upvotes': '24'},
  {'comment_id': 't1_hx8fxvn',
   'username': '/user/Jackol4ntrn/',
   'bod

In [73]:
# format_usernames edits the post dicts in place and returns the same list it was given,
# so fixed_names refers to the same (now modified) objects as combined_posts.
fixed_names = format_usernames(combined_posts)
# Display the first post to check the author/username fields.
fixed_names[0]

{'title': 'What Are Your Moves Tomorrow, February 17, 2022',
 'Reference_Date': datetime.datetime(2022, 2, 16, 16, 17, 21, 799255),
 'body': 'What Are Your Moves Tomorrow, February 17, 2022',
 'num_comments': '12.1k comments',
 'tag': 'Daily Discussion',
 'num_upvotes': '292',
 'author': 'AutoModerator',
 'comments': [{'comment_id': 't1_hx84k5q',
   'username': 'SomeDumbassSays',
   'body': ' Silly me for investing in calls on a company that manufactures very very high end GPUs and other essential computer components.  I should’ve bought calls in checks notes the food delivery company that delivers the wrong food, cold, to the wrong address',
   'upvotes': '125'},
  {'comment_id': 't1_hx84oxj',
   'username': 'NrdRage',
   'body': " Don't forget the previously bankrupted electric car that's going to a rental model and missed top and bottom and has no path to profitability.",
   'upvotes': '24'},
  {'comment_id': 't1_hx8fxvn',
   'username': 'Jackol4ntrn',
   'body': ' nvdia crushed ear

In [74]:
# format_num_comments also works in place: fixed_num_comments is the same list as fixed_names,
# with each post's 'num_comments' converted from a string to an int.
fixed_num_comments = format_num_comments(fixed_names)
# Display the first post to check the num_comments field.
fixed_num_comments[0]

{'title': 'What Are Your Moves Tomorrow, February 17, 2022',
 'Reference_Date': datetime.datetime(2022, 2, 16, 16, 17, 21, 799255),
 'body': 'What Are Your Moves Tomorrow, February 17, 2022',
 'num_comments': 12100,
 'tag': 'Daily Discussion',
 'num_upvotes': '292',
 'author': 'AutoModerator',
 'comments': [{'comment_id': 't1_hx84k5q',
   'username': 'SomeDumbassSays',
   'body': ' Silly me for investing in calls on a company that manufactures very very high end GPUs and other essential computer components.  I should’ve bought calls in checks notes the food delivery company that delivers the wrong food, cold, to the wrong address',
   'upvotes': '125'},
  {'comment_id': 't1_hx84oxj',
   'username': 'NrdRage',
   'body': " Don't forget the previously bankrupted electric car that's going to a rental model and missed top and bottom and has no path to profitability.",
   'upvotes': '24'},
  {'comment_id': 't1_hx8fxvn',
   'username': 'Jackol4ntrn',
   'body': ' nvdia crushed earnings: flat

In [75]:
# Final cleaning step: convert post 'num_upvotes' and comment 'upvotes' to ints.
# This is again done in place, so data_cl is the same list object as fixed_num_comments.
data_cl = format_num_upvotes(fixed_num_comments)
# Display the first post to check the upvote fields.
data_cl[0]

{'title': 'What Are Your Moves Tomorrow, February 17, 2022',
 'Reference_Date': datetime.datetime(2022, 2, 16, 16, 17, 21, 799255),
 'body': 'What Are Your Moves Tomorrow, February 17, 2022',
 'num_comments': 12100,
 'tag': 'Daily Discussion',
 'num_upvotes': 292,
 'author': 'AutoModerator',
 'comments': [{'comment_id': 't1_hx84k5q',
   'username': 'SomeDumbassSays',
   'body': ' Silly me for investing in calls on a company that manufactures very very high end GPUs and other essential computer components.  I should’ve bought calls in checks notes the food delivery company that delivers the wrong food, cold, to the wrong address',
   'upvotes': 125},
  {'comment_id': 't1_hx84oxj',
   'username': 'NrdRage',
   'body': " Don't forget the previously bankrupted electric car that's going to a rental model and missed top and bottom and has no path to profitability.",
   'upvotes': 24},
  {'comment_id': 't1_hx8fxvn',
   'username': 'Jackol4ntrn',
   'body': ' nvdia crushed earnings: flat dash 

In [76]:
# Create the output folder if needed so the save also works on a fresh checkout
# (open(..., 'wb') does not create missing parent directories).
makedirs('cleaned_data', exist_ok=True)

# Persist the cleaned posts for the ingestion step.
with open('cleaned_data/reddit_posts_clean.pkl', 'wb') as file:
    pkl.dump(data_cl, file)