In [1]:
import pandas as pd
import numpy as np
import json
from itertools import product

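The `json_to_csv` function converts one of our JSON files into a CSV file, keeping only the fields whose keys appear in the list `cols`. A little care has to be taken regarding which API the JSON came from: the Pushshift files are a flat list of posts, whereas the other files wrap each post in a `data` field that has to be unwrapped first.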

In [2]:
def json_to_csv(subreddit=None, pushshift=False, cols=None):
    """Convert one subreddit's JSON dump into a CSV restricted to ``cols``.

    Parameters
    ----------
    subreddit : str
        Name used to build both the input and the output file names.
    pushshift : bool, default False
        If True, read ``../data/json/pushshift_{subreddit}_2017.json`` and
        treat it as a flat list of post dicts.  Otherwise read
        ``../data/json/{subreddit}.json`` and unwrap the ``'data'`` entry of
        every item in the list.
    cols : list of str, optional
        Keys to keep from each post; they become the CSV columns, in this
        order.  ``None`` is treated as an empty list.

    Returns
    -------
    None
        The result is written to ``../data/csv/{subreddit}.csv`` (no index
        column).

    Notes
    -----
    A key that is absent from a post is stored as an empty string.  Columns
    whose values are all null are dropped, and exact duplicate rows are
    removed, before the file is written.
    """
    # A None sentinel instead of a shared mutable default list.
    cols = [] if cols is None else list(cols)

    if pushshift:
        json_path = f'../data/json/pushshift_{subreddit}_2017.json'
    else:
        json_path = f'../data/json/{subreddit}.json'

    # JSON text is UTF-8; be explicit so that reading does not depend on the
    # platform's locale encoding.
    with open(json_path, 'r', encoding='utf-8') as file:
        posts = json.load(file)

    if not pushshift:
        # Each item wraps the actual post under 'data'.  The loop variable is
        # deliberately not called `json`, to avoid shadowing the module.
        posts = [post['data'] for post in posts]

    # One list per requested column, in post order; absent keys become ''.
    table = {
        col: [post[col] if col in post else '' for post in posts]
        for col in cols
    }

    df = pd.DataFrame(data=table, columns=cols)
    df = df.dropna(axis=1, how='all').drop_duplicates()
    df.to_csv(f'../data/csv/{subreddit}.csv', index=False)

The `csv_clean` function just replaces missing values with `' '` (empty strings written to the CSV are read back as NaN). I'm doing this to avoid having posts with no selftext show up as NaN. This shouldn't mess with anything later, and it avoids feeding nulls into the tokenizing functions. I'm not sure whether they would handle nulls correctly, so I'm just going to avoid the issue entirely.

In [3]:
def csv_clean(subreddit, cols):
    """Replace missing values in ``cols`` with a single space and save a copy.

    Reads ``../data/csv/{subreddit}.csv``, replaces every NaN in the listed
    columns with ``' '`` and writes the result to
    ``../data/csv/{subreddit}_cleaned.csv`` (no index column).

    Parameters
    ----------
    subreddit : str
        Name used to build the input and output file names.
    cols : list of str
        Columns to clean.  A listed column that is absent from the file is
        treated as entirely missing and is added, filled with ``' '``,
        instead of raising ``KeyError``.

    Returns
    -------
    None
    """
    df = pd.read_csv(f'../data/csv/{subreddit}.csv')

    for col_name in cols:
        if col_name in df.columns:
            # NaN is the only value that is not equal to itself, so
            # `value == value` is False exactly for the missing entries.
            df[col_name] = df[col_name].map(
                lambda value: value if value == value else ' '
            )
        else:
            # The column is not in the file at all: every value is missing.
            df[col_name] = ' '

    df.to_csv(f'../data/csv/{subreddit}_cleaned.csv', index=False)

Now we'll convert each subreddit's JSON file to a `.csv` file, clean it up, and then put them all together into one big file that we only have to load once in the future.

In [4]:
# Fields kept from every post; these become the CSV columns.
cols = [
    'title',
    'num_comments',
    'score',
    'over_18',
    'locked',
    'stickied',
    'subreddit',
    'created_utc',
    'is_self',
    'selftext',
]

# Subreddits that are converted, cleaned and combined below.
subreddits = [
    'math',
    'learnmath',

    'python',
    'learnpython',

    'datascience',
    'statistics',
    'learnmachinelearning',

    'actuary',
    'chemistry',
    'physics',
]


# The 'all' dump is read through the non-Pushshift branch and is converted
# and cleaned on its own; it is not part of the combined file.
json_to_csv(subreddit='all', pushshift=False, cols=cols)
csv_clean(subreddit='all', cols=cols)

# Collect the cleaned frames and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
subreddit_frames = []

for subreddit in subreddits:
    json_to_csv(subreddit=subreddit, pushshift=True, cols=cols)
    csv_clean(subreddit, cols=cols)
    df_subreddit = pd.read_csv(f'../data/csv/{subreddit}_cleaned.csv')
    subreddit_frames.append(df_subreddit)

if subreddit_frames:
    all_posts = pd.concat(subreddit_frames, ignore_index=True)
else:
    # pd.concat refuses an empty list; fall back to an empty frame.
    all_posts = pd.DataFrame(columns=cols)

# No index=False here, so the row index is written as an unnamed first column.
all_posts.to_csv('../data/csv/combined_subreddits.csv')