In the previous notebook, I found the most common subreddits based on their overall frequencies. However, the average individual is not guaranteed to vote in an election. Those who are already politically inclined are far more likely to vote, so their stance matters most. Let's therefore figure out a different metric for determining relevant subreddits, using the unique user IDs in 'r/politics'.

In [1]:
import networkx as nx
import pickle
import os
import sys
import itertools as it

In [2]:
def file2graph(directory, file, n=100, k=10):
    """Build a weighted graph of subreddits linked by shared users.

    The pickle at ``directory/file`` is loaded and treated as a mapping from
    subreddit name to a per-subreddit mapping that has a ``'unique_users'``
    entry. That entry must support ``len()`` and ``.intersection()``
    (presumably a set of user ids -- confirm against the code that writes
    the stats files).

    Args:
        directory: Directory containing the pickled stats file.
        file: Name of the stats file inside ``directory``.
        n: A subreddit is kept only if it has strictly more than ``n``
            unique users.
        k: An edge is added only if two subreddits share at least ``k``
            unique users.

    Returns:
        A 3-tuple ``(graph, names, values)``:

        * ``graph`` -- ``nx.Graph`` whose nodes are integer indices and whose
          edges carry ``weight`` = number of users common to both subreddits.
        * ``names`` -- dict mapping subreddit name -> integer index.
        * ``values`` -- dict mapping integer index -> subreddit name.

        Nodes are only created through ``add_edge``, so a subreddit that
        passes the ``n`` filter but shares fewer than ``k`` users with every
        other subreddit appears in ``names``/``values`` but not in ``graph``.
    """
    # NOTE: unpickling can execute arbitrary code; only load stats files
    # produced by a trusted source.
    # The context manager closes the file handle deterministically instead of
    # leaving it to the garbage collector.
    with open(os.path.join(directory, file), 'rb') as stats_file:
        subreddits = pickle.load(stats_file)

    greater_than_n_names = {}
    greater_than_n_values = {}
    # (index, unique_users) for every subreddit that passes the size filter,
    # collected once so the pairwise loop below does no repeated lookups.
    qualifying = []

    for sub, stats in subreddits.items():
        users = stats.get('unique_users')
        if len(users) > n:
            index = len(qualifying)
            greater_than_n_names[sub] = index
            greater_than_n_values[index] = sub
            qualifying.append((index, users))
    print("Built dictionary structures for {}.".format(file), end='\r')

    subreddit_graph = nx.Graph()

    # Every unordered pair of qualifying subreddits is compared exactly once.
    for (left_index, left_users), (right_index, right_users) in it.combinations(qualifying, 2):
        joint_user_count = len(left_users.intersection(right_users))

        # Skip weakly connected pairs to keep the graph sparse.
        if joint_user_count < k:
            continue

        subreddit_graph.add_edge(left_index, right_index, weight=joint_user_count)

    print("Created graph for {}.".format(file), end='\r', flush=True)
    return subreddit_graph, greater_than_n_names, greater_than_n_values


def pickleGraph(directory, file, graph_structures):
    """Pickle ``graph_structures`` to ``directory/file`` and report it.

    Args:
        directory: Directory the pickle is written into.
        file: Name of the output file inside ``directory``.
        graph_structures: Any picklable object; the caller in this notebook
            passes the tuple returned by ``file2graph``.

    Returns:
        None. Prints a confirmation line once the file has been written.
    """
    # The context manager guarantees the data is flushed and the handle is
    # closed before the confirmation is printed.
    with open(os.path.join(directory, file), 'wb') as out_file:
        pickle.dump(graph_structures, out_file)
    print("Pickled graph for {}.".format(file), flush=True)

In [3]:
%%time
# # Ubuntu
stats_directory = "/media/jayckaiser/My Passport/reddit/stats/"
graphs_directory = "/media/jayckaiser/My Passport/reddit/user_graphs/"

# files_to_do = sorted(os.listdir(stats_directory))
files_to_do = ['RC_2012-03.pkl']

for file in files_to_do:
    if file >= "RC_2014-07.pkl":  # it crashed, so here is a way to keep going.
        pickleGraph(graphs_directory, file,   file2graph(stats_directory, file))

Pickled graph for RC_2014-07.pkl.C_2014-07.pkl.
Pickled graph for RC_2014-08.pkl.C_2014-08.pkl.
Pickled graph for RC_2014-09.pkl.C_2014-09.pkl.
Pickled graph for RC_2014-10.pkl.C_2014-10.pkl.
Pickled graph for RC_2014-11.pkl.C_2014-11.pkl.
Pickled graph for RC_2014-12.pkl.C_2014-12.pkl.
Pickled graph for RC_2015-01.pkl.C_2015-01.pkl.
Pickled graph for RC_2015-02.pkl.C_2015-02.pkl.
Pickled graph for RC_2015-03.pkl.C_2015-03.pkl.
Pickled graph for RC_2015-04.pkl.C_2015-04.pkl.
Pickled graph for RC_2015-05.pkl.C_2015-05.pkl.
Pickled graph for RC_2015-06.pkl.C_2015-06.pkl.
Pickled graph for RC_2015-07.pkl.C_2015-07.pkl.
Pickled graph for RC_2015-08.pkl.C_2015-08.pkl.
Pickled graph for RC_2015-09.pkl.C_2015-09.pkl.
Pickled graph for RC_2015-10.pkl.C_2015-10.pkl.
Pickled graph for RC_2015-11.pkl.C_2015-11.pkl.
Pickled graph for RC_2015-12.pkl.C_2015-12.pkl.
Pickled graph for RC_2016-01.pkl.C_2016-01.pkl.
Pickled graph for RC_2016-02.pkl.C_2016-02.pkl.
Pickled graph for RC_2016-03.pkl.C_2016-