In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import json, time, datetime, math
import operator

now = datetime.datetime.now()
ROOT_PATH = 'data/'
DATA_PATH = '-'.join(str(x) for x in [(now.year), now.month, now.day])
LOAD_DATA_PATH = '2018-2-6'

def sorted_map(map):
    ms = sorted(map.items(), key=operator.itemgetter(1), reverse=True)
    return ms

def sorted_df(cen):
    return pd.DataFrame(sorted(cen.items(), key=operator.itemgetter(1), reverse=True))

def print_progress(iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, errors = 0, fill = '+'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s - errors: %s' % (prefix, bar, percent, suffix, errors), end = '\r')
    # Print New Line on Complete
    if iteration == (total - 2): 
        print()

In [2]:
submissions = pd.read_pickle(ROOT_PATH + LOAD_DATA_PATH + '/' + 'submissions.pkl')
comments = pd.read_pickle(ROOT_PATH + LOAD_DATA_PATH + '/' + 'comments.pkl')
comments[['parent_type','parent_id']] = comments['parent_id'].str.split('_',expand=True)

In [3]:
comments = comments.query('author != "[deleted]"').reset_index(drop=True)

In [4]:
authors_1 = pd.DataFrame({
    'author': submissions['author']
})

authors_2 = pd.DataFrame({
    'author': comments['author']
}) 

authors = pd.concat([authors_1, authors_2]).drop_duplicates().reset_index(drop=True)
authors.info()

print('num_comments:', comments.query('parent_type=="t1"')['parent_type'].count())
print('num_links:', comments.query('parent_type=="t3"')['parent_type'].count())
print('total:', comments['id'].count())

#625480

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290763 entries, 0 to 290762
Data columns (total 1 columns):
author    290763 non-null object
dtypes: object(1)
memory usage: 2.2+ MB
num_comments: 1232827
num_links: 625480
total: 1858307


In [5]:
edges_sub = pd.merge(
    comments[comments['parent_type'] == 't3'].reset_index(),
    submissions, 
    left_on='parent_id', 
    right_on='id', 
    how='left', 
    suffixes=('_left', '_right')
)
edges_sub = edges_sub.query('author_left != "[deleted]" & author_right != "[deleted]"').reset_index(drop=True)
edges_sub = edges_sub.query('author_left != author_right').reset_index(drop=True)
#print(edges_sub.info())
edge_fields_sub = ['author_left', 'author_right', 'score_left', 'id_left', 'id_right', 'subreddit_left']
edges_sub = edges_sub[edge_fields_sub]
edges_sub['weight'] = edges_sub['score_left']
edges_sub['type'] = 'com_on_sub'
edges_sub = edges_sub.rename(columns={'subreddit_left': 'subreddit'})
edges_sub.columns
#edges_sub['subreddit'] = edges_sub['subreddit_left']
#edges_sub.drop(['subreddit_left'], axis = 1, inplace = True)

Index(['author_left', 'author_right', 'score_left', 'id_left', 'id_right',
       'subreddit', 'weight', 'type'],
      dtype='object')

In [6]:
edges_com = pd.merge(
    comments[comments['parent_type'] == 't1'].reset_index(), 
    comments, 
    left_on='parent_id', 
    right_on='id', 
    how='left', 
    suffixes=('_left', '_right')
)

edge_fields = ['author_left', 'author_right', 'score_left', 'id_left', 'id_right', 'subreddit_left']
edges_com = edges_com[edge_fields]
edges_com['weight'] = edges_com['score_left']
edges_com['type'] = 'com_on_com'
edges_com = edges_com.rename(columns={'subreddit_left': 'subreddit'})
edges_com.columns

Index(['author_left', 'author_right', 'score_left', 'id_left', 'id_right',
       'subreddit', 'weight', 'type'],
      dtype='object')

In [7]:
#edges_com = edges_com[0:10][['author_left', 'author_right']]
print(edges_sub[0:5][['author_left', 'author_right']])

print(edges_com[0:5][['author_left', 'author_right']])
#print(edges_com[['author_left', 'author_right']].query('author_right'))
print(edges_sub.columns)
print()
print(edges_com.columns)
#print(comments.sort_values(by='score', ascending=False)[-50:-1]['score'])

#comments.subreddit.unique()

    author_left          author_right
0    taddraughn  SuperficialPickle444
1      Sprint99  SuperficialPickle444
2  ianufyrebird  SuperficialPickle444
3     Nrimelman  SuperficialPickle444
4    Gabe121411  SuperficialPickle444
  author_left  author_right
0   garlicbot    taddraughn
1   garlicbot      Sprint99
2   garlicbot  ianufyrebird
3   garlicbot           NaN
4  taddraughn    taddraughn
Index(['author_left', 'author_right', 'score_left', 'id_left', 'id_right',
       'subreddit', 'weight', 'type'],
      dtype='object')

Index(['author_left', 'author_right', 'score_left', 'id_left', 'id_right',
       'subreddit', 'weight', 'type'],
      dtype='object')


In [8]:
submissions.columns

Index(['author', 'subreddit_id', 'subreddit', 'score', 'num_comments', 'id',
       'created_utc', 'retrieved_on', 'num_crossposts', 'title', 'url',
       'stickied', 'pinned', 'gilded'],
      dtype='object')

In [9]:
g = nx.from_pandas_dataframe(
    pd.concat([edges_sub, edges_com], ignore_index=True), 
    'author_left', 
    'author_right', 
    ['weight', 'score_left', 'id_left', 'id_right', 'subreddit', 'type'], 
    create_using=nx.MultiDiGraph()
)
print(nx.info(g))

Name: 
Type: MultiDiGraph
Number of nodes: 274987
Number of edges: 1836658
Average in degree:   6.6791
Average out degree:   6.6791


In [10]:
deg_cen = sorted_df(nx.degree_centrality(g))
print(50*'-')
print(deg_cen[0:50])

--------------------------------------------------
                       0         1
0   SuperficialPickle444  0.135974
1              garlicbot  0.062123
2          AutoModerator  0.026521
3                    NaN  0.025518
4      DigitalizedOrange  0.024274
5                   mvea  0.021037
6             skyler4722  0.020561
7                  tippr  0.020205
8                  b1tcc  0.018365
9            rBitcoinMod  0.016946
10               Z_staff  0.015917
11            mungojelly  0.013633
12                 fabwa  0.012633
13  DestroyerOfShitcoins  0.012313
14   Religion__of__Peace  0.010306
15              jessquit  0.009971
16           remi9martin  0.009237
17              rdar1999  0.008830
18                 --orb  0.008520
19              coin2k17  0.008459
20            KnifeOfPi2  0.008437
21         unitedstatian  0.008426
22              perogies  0.008037
23              Fulvio55  0.007706
24                speckz  0.007502
25   censorship_notifier  0.007019
26  

In [None]:
betweenness_cen = sorted_df(nx.betweenness_centrality(g, k=55, weight='weight'))
print(50*'-')
print(betweenness_cen[0:50])