In [None]:
# IMPORTS
import os
import json
from tqdm import tqdm
from multiprocessing import Pool
import sys

In [None]:
# DIRECTORIES
# DATA: directory the input files are read from; each input path is built as
#       os.path.join(DATA, filename).
# OUT:  directory the resulting JSON dictionaries are written to.
#       NOTE(review): nothing in this notebook creates OUT, so it presumably
#       has to exist before the generation cells are run -- confirm.
DATA = "/ais/hal9000/datasets/reddit/stance_pipeline/luo_data/raw_data"
OUT = "/ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/dictionaries/"

In [None]:
# Input files must be newline-delimited JSON (one JSON object per line); each object needs the fields: author, subreddit, parent_id

# Generate Author:Subreddit dictionary
def generate_mapping(filename):
    """Build and save the author -> subreddit -> count dictionary for one file.

    Reads ``os.path.join(DATA, filename)`` line by line, parses every
    non-blank line as a JSON object, and counts how many objects each author
    has in each subreddit. The nested dictionary
    ``{author: {subreddit: count}}`` is written as JSON to
    ``os.path.join(OUT, filename)``.

    Args:
        filename: Name of the input file inside ``DATA``. The output file in
            ``OUT`` gets the same name.

    Returns:
        None. The result is only written to disk.

    Raises:
        OSError: If the input file cannot be opened or the output file cannot
            be written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an object lacks the ``author`` or ``subreddit`` field.
    """
    # author -> {subreddit -> number of records seen}
    counts = {}

    in_path = os.path.join(DATA, filename)

    # The context manager guarantees the (multi-GB) input handle is closed,
    # even if a line fails to parse.
    with open(in_path, "r") as infile:
        print("Iterating over file : "+filename+"\n")

        for line in infile:
            # Skip blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads fail on them.
            if not line.strip():
                continue

            obj = json.loads(line)

            author = obj['author']
            subreddit = obj['subreddit']

            # Create the author's bucket on first sight, then increment the
            # occurrence counter for this subreddit.
            per_author = counts.setdefault(author, {})
            per_author[subreddit] = per_author.get(subreddit, 0) + 1

    # Dump to file
    out_path = os.path.join(OUT, filename)
    with open(out_path, "w") as f:
        json.dump(counts, f)
    

        
    

In [None]:
# Top level version
def generate_mapping_top(filename):
    """Build and save the author -> subreddit -> count dictionary, top level only.

    Same counting as the full mapping, but an object is only counted when its
    ``parent_id`` starts with ``"t3"``; every other object is skipped. Reads
    ``os.path.join(DATA, filename)`` line by line and writes the nested
    dictionary ``{author: {subreddit: count}}`` as JSON to ``OUT`` under the
    input's base name with ``_top.json`` appended
    (``RC_2019-01.json`` -> ``RC_2019-01_top.json``).

    Args:
        filename: Name of the input file inside ``DATA``.

    Returns:
        None. The result is only written to disk.

    Raises:
        OSError: If the input file cannot be opened or the output file cannot
            be written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object lacks ``parent_id``, or a counted object
            lacks ``author`` or ``subreddit``.
    """
    # author -> {subreddit -> number of top-level records seen}
    counts = {}

    in_path = os.path.join(DATA, filename)

    # The context manager guarantees the (multi-GB) input handle is closed,
    # even if a line fails to parse.
    with open(in_path, "r") as infile:
        print("Iterating over file : "+filename+"\n")

        for line in infile:
            # Skip blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads fail on them.
            if not line.strip():
                continue

            obj = json.loads(line)

            # Top-level check: keep only objects whose parent_id has the
            # "t3" prefix.
            if not obj['parent_id'].startswith("t3"):
                continue

            author = obj['author']
            subreddit = obj['subreddit']

            # Create the author's bucket on first sight, then increment the
            # occurrence counter for this subreddit.
            per_author = counts.setdefault(author, {})
            per_author[subreddit] = per_author.get(subreddit, 0) + 1

    # Dump to file. splitext removes the real extension rather than blindly
    # chopping the last five characters, so the result is identical for
    # "*.json" inputs and still sensible for any other name.
    stem, _ext = os.path.splitext(filename)
    out_path = os.path.join(OUT, stem + "_top.json")
    with open(out_path, "w") as f:
        json.dump(counts, f)

In [None]:
# GENERATE NON TOP LEVEL
# One input file per month of 2019: RC_2019-01.json ... RC_2019-12.json
targets = [f"RC_2019-{month:02d}.json" for month in range(1, 13)]

# 2 Minutes / 2 Files w 2 Cores
# ~6-8GB -> ~100MB
with Pool(2) as p:
    # tqdm has to wrap the lazy imap iterator itself: wrapping
    # list(p.imap(...)) would finish all the work before the bar is even
    # created, so it would never advance. The outer list() drives the
    # iteration so the bar ticks once per finished file.
    r = list(tqdm(p.imap(generate_mapping, targets), total=len(targets)))

In [None]:
# GENERATE TOP LEVEL
# One input file per month of 2019: RC_2019-01.json ... RC_2019-12.json
targets = [f"RC_2019-{month:02d}.json" for month in range(1, 13)]

# 2 Minutes / 2 Files w 2 Cores
# ~6-8GB -> ~100MB
with Pool(2) as p:
    # tqdm has to wrap the lazy imap iterator itself: wrapping
    # list(p.imap(...)) would finish all the work before the bar is even
    # created, so it would never advance. The outer list() drives the
    # iteration so the bar ticks once per finished file.
    r = list(tqdm(p.imap(generate_mapping_top, targets), total=len(targets)))

In [None]:
# NEXT NOTEBOOK ==> GraphGenNX