In [1]:
#IMPORTS
import networkx as nx
import os
import json
import sys

In [2]:
# DIRECTORIES
DATA = "/ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/dictionaries/"
OUT = '/ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/graphs/'

In [3]:
# GENERATING CONDENSED AUTHOR-SUBREDDIT DICTIONARY (COMBINE ALL 12 MONTHS INTO ONE DICTIONARY)
CONDENSED = {}

# READ IN MONTH BY MONTH
def condense(filename):
    # Setting path to file to read in
    file_path = os.path.join(DATA, filename)

    print(f"Reading {file_path} ...")

    # Open the data
    DICT = {}
    with open(file_path, 'r') as json_file:
        DICT = json.load(json_file)
    
    # Iterate over all authors
    for author in list(DICT.keys()):

        # Key check
        if author not in CONDENSED:
            CONDENSED[author] = {}

        # Add up subreddit interactions
        for subreddit in DICT[author]:

            # Key check
            if subreddit not in CONDENSED[author]:
                CONDENSED[author][subreddit] = 0
            
            # Incrementing    
            CONDENSED[author][subreddit] += DICT[author][subreddit]


In [4]:
# TOLERANCE: Only consider subreddits of an author where they have posted >= TOLERANCE times

def generate_graph(TOLERANCE):
    # Create an empty graph
    g = nx.Graph()

    # Adding edges and nodes
    for author in CONDENSED:

        # Keeping subreddits where posts >= TOLERANCE
        keep = []
        for subreddit in CONDENSED[author]:
            if CONDENSED[author][subreddit] >= TOLERANCE:
                keep.append(subreddit)

        # If less than two subreddits left, cannot create an edge, no cross-community engagement
        if len(keep) < 2:
            continue

        # Add vertices
        for subreddit in keep:
            # Add vertices, duplicates taken care of by NX
            g.add_node(subreddit)

        # Add edges
        for subreddit1 in keep:
            for subreddit2 in keep:
                # No self cycles
                if subreddit1 == subreddit2:
                    continue
                
                # Adding 0.5 instead of 1 since each edge is considered twice in this nested loop
                # That is, weights are added for A-->B and B-->A, hence 0.5 + 0.5 = 1 as we want
                
                # Check if the edge already exists
                if g.has_edge(subreddit1, subreddit2):
                    g[subreddit1][subreddit2]['weight'] += 0.5
                # New edge
                else:
                    g.add_edge(subreddit1, subreddit2)
                    g[subreddit1][subreddit2]['weight'] = 0.5
        
    return g

In [5]:
# Generate the CONDENSED dictionary
# Considering all types of posts (Top and Non-Top)
for filename in os.listdir(DATA):
    if "_top" not in filename:
        condense(filename)

Reading /ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/dictionaries/RC_2019-11.json ...
Reading /ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/dictionaries/RC_2019-09.json ...
Reading /ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/dictionaries/RC_2019-05.json ...
Reading /ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/dictionaries/RC_2019-01.json ...
Reading /ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/dictionaries/RC_2019-04.json ...
Reading /ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/dictionaries/RC_2019-10.json ...
Reading /ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/dictionaries/RC_2019-08.json ...
Reading /ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/dictionaries/RC_2019-06.json ...
Reading /ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis/dictionaries/RC_2019-02.j

In [7]:
# Generating size attribute for each community:
POSTS = {}
UNIQUE_USERS = {}

for author in CONDENSED:
    for subreddit in CONDENSED[author]:

        # Key Check
        if subreddit not in POSTS:
            POSTS[subreddit] = 0
        
        if subreddit not in UNIQUE_USERS:
            UNIQUE_USERS[subreddit] = 0

        # Increment
        POSTS[subreddit] += CONDENSED[author][subreddit]
        UNIQUE_USERS[subreddit] += 1


In [18]:
# Generate graphs
# 100 ~ 10s
# 20 ~ 15s
# 10 ~ 30s
# 5 ~ 60s
TOLERANCE = 5
g = generate_graph(TOLERANCE)

In [16]:
# Post Process
# Set all subreddits loyal users to 0, preparing for next step in pipeline
# Set the size attribute to the number of posts made in that subreddit

for subreddit in g.nodes():    
    # Loyalty
    g.nodes[subreddit]['loyal_users'] = 0

    # Posts
    g.nodes[subreddit]['posts'] = POSTS[subreddit]

    # Unique Users
    g.nodes[subreddit]['unique_users'] = UNIQUE_USERS[subreddit]

In [17]:
# Save graph
nx.write_graphml_lxml(g, OUT + f"2019_NX_T{TOLERANCE}.graphml")

In [None]:
# NEXT NOTEBOOK ==> ComputeLoyalty