In [1]:
# IMPORTS
import os
import json
import sys
import graph_tool.all as gt

In [2]:
# DIRECTORIES
# DATA: folder that is scanned for the author/subreddit JSON input files.
DATA = "/ais/hal9000/datasets/reddit/stance_pipeline/luo_data/network_analysis"
# OUT: folder the generated graph files are written to (kept with its trailing slash).
OUT = "/h/118/prabuddha/graphs/"


In [3]:
# GENERATING CONDENSED AUTHOR-SUBREDDIT DICTIONARY
# Maps author -> {subreddit -> summed count}; condense() fills it in place.
CONDENSED = {}


# READ IN MONTH BY MONTH
def condense(filename):
    """Fold one JSON file from DATA into the global CONDENSED totals.

    The file must decode to a mapping of ``author -> {subreddit: count}``.
    Every count is added to whatever CONDENSED already holds for that
    author/subreddit pair; missing authors and subreddits are created.

    Args:
        filename: Name of a file inside DATA (it is joined onto DATA, so it
            is not a full path).

    Returns:
        None. CONDENSED is mutated in place, so calling this twice with the
        same file adds that file's counts twice.
    """
    with open(os.path.join(DATA, filename), 'r') as handle:
        loaded = json.load(handle)

    for author, counts in loaded.items():
        # setdefault registers the author even when it has no subreddits
        totals = CONDENSED.setdefault(author, {})

        for subreddit in counts:
            totals[subreddit] = totals.get(subreddit, 0) + counts[subreddit]


In [4]:
# Build CONDENSED from the files found in DATA.
# Files with "_top" in their name are skipped; invert the check below if the
# top-level data is wanted instead.

# condense() accumulates into CONDENSED in place, so start from an empty
# dictionary: otherwise re-running this cell adds every file a second time.
CONDENSED.clear()

# sorted() makes the load order (and therefore the key order of CONDENSED)
# reproducible; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(DATA)):
    if "_top" in filename:
        continue
    # Sub-directories cannot be opened as JSON files; skip them instead of crashing.
    if not os.path.isfile(os.path.join(DATA, filename)):
        continue
    condense(filename)

# Print size of data structure
# NOTE: sys.getsizeof is shallow - it reports only the outer dict's own table,
# not the nested per-author dicts, so it understates the real footprint
# (original author's note: 2 minutes to load all years, 3 GB in memory space).
print(sys.getsizeof(CONDENSED))

246065920


In [17]:
# TOLERANCE: The number of connections required for an edge to be made
def generate_graph(TOLERANCE):
    """Build the undirected, weighted subreddit graph from CONDENSED.

    For every author in CONDENSED, the subreddits whose count is at least
    ``TOLERANCE`` are kept. Authors with fewer than two kept subreddits are
    ignored. Every unordered pair of an author's kept subreddits is linked
    by an edge, and the edge weight counts how many authors produced that pair.

    Args:
        TOLERANCE: Minimum count an author must have in a subreddit for that
            subreddit to be linked through that author.

    Returns:
        A tuple ``(g, labels, edge_weights, V)``:
            g: the undirected ``gt.Graph``.
            labels: "string" vertex property map holding the subreddit name.
            edge_weights: "double" edge property map holding the author count.
            V: dict mapping subreddit name -> vertex descriptor.
    """
    # Create an empty graph
    g = gt.Graph(directed=False)

    # Labels for each vertex
    labels = g.new_vertex_property("string")

    # Edge weights
    edge_weights = g.new_edge_property("double")

    # Vertex set: subreddit name -> vertex
    V = {}

    # Existing edges keyed by the ordered name pair. A dict lookup replaces
    # the former `edge in g.edges()` test, which walked every edge of the
    # graph for every pair and made the build time explode.
    E = {}

    # Adding edges and nodes
    for author in CONDENSED:
        counts = CONDENSED[author]

        # Keeping subreddits where posts >= TOLERANCE
        keep = [subreddit for subreddit in counts if counts[subreddit] >= TOLERANCE]

        # Discard if less than two subreddits are above tolerance
        if len(keep) < 2:
            continue

        # Add vertices
        for subreddit in keep:
            if subreddit not in V:
                V[subreddit] = g.add_vertex()
                labels[V[subreddit]] = subreddit

        # Add edges: each unordered pair once, no self loops
        for i, subreddit1 in enumerate(keep):
            for subreddit2 in keep[i + 1:]:
                # The graph is undirected, so (a, b) and (b, a) are one edge
                if subreddit1 < subreddit2:
                    pair = (subreddit1, subreddit2)
                else:
                    pair = (subreddit2, subreddit1)

                edge = E.get(pair)
                if edge is None:
                    # New edge
                    edge = g.add_edge(V[subreddit1], V[subreddit2])
                    E[pair] = edge
                    edge_weights[edge] = 1
                else:
                    # Increment edge weight if edge already exists
                    edge_weights[edge] += 1

    return g, labels, edge_weights, V

In [18]:
# Generate graphs
# TOLERANCE stays a notebook-level name because the save cell embeds it in the
# output filename.
TOLERANCE = 100
# g and labels are consumed by the save and visualization cells further down.
g, labels, edge_weights, V = generate_graph(TOLERANCE)

KeyboardInterrupt: 

In [None]:
# Save graph
# Graph.save() only writes *internal* property maps, so attach the vertex
# labels and edge weights to the graph first; otherwise the GraphML file
# would contain the bare topology without subreddit names or weights.
g.vertex_properties["label"] = labels
g.edge_properties["weight"] = edge_weights

# Make sure the target folder exists before writing into it.
os.makedirs(OUT, exist_ok=True)
g.save(os.path.join(OUT, f"2019_GT_T{TOLERANCE}.graphml"))

In [16]:
# VISUALIZATION

# Graphical output
# Force-directed (SFDP) layout positions for every vertex
pos = gt.sfdp_layout(g)

# Vertex size grows with degree: 10 * (2 * degree + 0.4)
deg = g.degree_property_map("out")
deg.a = 10 * (deg.a * 2 + 0.4)

# Fill colour is driven by the same degree-based value, scaled by 255
b = deg.copy()
b.a = b.a * 255

gt.graph_draw(
    g,
    pos=pos,
    vertex_text=labels,
    vertex_size=deg,
    vertex_text_position="centered",
    vertex_fill_color=b,
    edge_pen_width=2,
    output_size=(5000, 5000),
    output="/u/prabuddha/efficiency/graphs/graph5.pdf",
)


<VertexPropertyMap object with value type 'vector<double>', for Graph 0x7fa06f94c550, at 0x7f9fae935290>