In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import json, time, datetime, math
import operator

now = datetime.datetime.now()
ROOT_PATH = 'data/'
DATA_PATH = '-'.join(str(x) for x in [(now.year), now.month, now.day])
LOAD_DATA_PATH = '2018-2-6'

def sorted_map(map):
    ms = sorted(map.items(), key=operator.itemgetter(1), reverse=True)
    return ms

def sorted_df(cen):
    return pd.DataFrame(sorted(cen.items(), key=operator.itemgetter(1), reverse=True))

def print_progress(iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, errors = 0, fill = '+'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s - errors: %s' % (prefix, bar, percent, suffix, errors), end = '\r')
    # Print New Line on Complete
    if iteration == (total - 2): 
        print()

In [21]:
submissions = pd.read_pickle(ROOT_PATH + LOAD_DATA_PATH + '/' + 'submissions.pkl').query('subreddit!="garlicoin"')
comments = pd.read_pickle(ROOT_PATH + LOAD_DATA_PATH + '/' + 'comments.pkl').query('subreddit!="garlicoin"')
comments[['parent_type','parent_id']] = comments['parent_id'].str.split('_',expand=True)

In [22]:
comments = comments.query('author != "[deleted]"').reset_index(drop=True)

In [23]:
authors_1 = pd.DataFrame({
    'author': submissions['author']
})

authors_2 = pd.DataFrame({
    'author': comments['author']
}) 

authors = pd.concat([authors_1, authors_2]).drop_duplicates().reset_index(drop=True)
authors.info()

print('num_comments:', comments.query('parent_type=="t1"')['parent_type'].count())
print('num_links:', comments.query('parent_type=="t3"')['parent_type'].count())
print('total:', comments['id'].count())

#625480

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242121 entries, 0 to 242120
Data columns (total 1 columns):
author    242121 non-null object
dtypes: object(1)
memory usage: 1.8+ MB
num_comments: 1172934
num_links: 540112
total: 1713046


In [24]:
edges_sub = pd.merge(
    comments[comments['parent_type'] == 't3'].reset_index(),
    submissions, 
    left_on='parent_id', 
    right_on='id', 
    how='left', 
    suffixes=('_left', '_right')
)
edges_sub = edges_sub.query('author_left != "[deleted]" & author_right != "[deleted]"').reset_index(drop=True)
edges_sub = edges_sub.query('author_left != author_right').reset_index(drop=True)
edge_fields_sub = ['author_left', 'author_right', 'score_left', 'id_left', 'id_right', 'subreddit_left']
edges_sub = edges_sub[edge_fields_sub]
edges_sub['weight'] = edges_sub['score_left']
edges_sub['type'] = 'com_on_sub'
edges_sub = edges_sub.rename(columns={'subreddit_left': 'subreddit'})
edges_sub.columns

Index(['author_left', 'author_right', 'score_left', 'id_left', 'id_right',
       'subreddit', 'weight', 'type'],
      dtype='object')

In [25]:
edges_com = pd.merge(
    comments[comments['parent_type'] == 't1'].reset_index(), 
    comments, 
    left_on='parent_id', 
    right_on='id', 
    how='left', 
    suffixes=('_left', '_right')
)

edge_fields = ['author_left', 'author_right', 'score_left', 'id_left', 'id_right', 'subreddit_left']
edges_com = edges_com[edge_fields]
edges_com['weight'] = edges_com['score_left']
edges_com['type'] = 'com_on_com'
edges_com = edges_com.rename(columns={'subreddit_left': 'subreddit'})
edges_com.columns

Index(['author_left', 'author_right', 'score_left', 'id_left', 'id_right',
       'subreddit', 'weight', 'type'],
      dtype='object')

In [7]:
#edges_com = edges_com[0:10][['author_left', 'author_right']]
print(edges_sub[0:5][['author_left', 'author_right']])

print(edges_com[0:5][['author_left', 'author_right']])
#print(edges_com[['author_left', 'author_right']].query('author_right'))
print(edges_sub.columns)
print()
print(edges_com.columns)
#print(comments.sort_values(by='score', ascending=False)[-50:-1]['score'])

#comments.subreddit.unique()

    author_left          author_right
0    taddraughn  SuperficialPickle444
1      Sprint99  SuperficialPickle444
2  ianufyrebird  SuperficialPickle444
3     Nrimelman  SuperficialPickle444
4    Gabe121411  SuperficialPickle444
  author_left  author_right
0   garlicbot    taddraughn
1   garlicbot      Sprint99
2   garlicbot  ianufyrebird
3   garlicbot           NaN
4  taddraughn    taddraughn
Index(['author_left', 'author_right', 'score_left', 'id_left', 'id_right',
       'subreddit', 'weight', 'type'],
      dtype='object')

Index(['author_left', 'author_right', 'score_left', 'id_left', 'id_right',
       'subreddit', 'weight', 'type'],
      dtype='object')


In [26]:
g = nx.from_pandas_dataframe(
    pd.concat([edges_sub, edges_com], ignore_index=True), 
    'author_left', 
    'author_right', 
    ['weight', 'score_left', 'id_left', 'id_right', 'subreddit', 'type'], 
    create_using=nx.MultiDiGraph()
)
print(nx.info(g))

Name: 
Type: MultiDiGraph
Number of nodes: 226406
Number of edges: 1691975
Average in degree:   7.4732
Average out degree:   7.4732


In [27]:
deg_cen = sorted_df(nx.degree_centrality(g))
print(50*'-')
print(deg_cen[0:10])

--------------------------------------------------
                      0         1
0         AutoModerator  0.032212
1                   NaN  0.029991
2                  mvea  0.025552
3                 tippr  0.024500
4                 b1tcc  0.022305
5           rBitcoinMod  0.020583
6               Z_staff  0.019333
7            mungojelly  0.016559
8                 fabwa  0.015344
9  DestroyerOfShitcoins  0.014956


*PageRank top users*

In [28]:
page_rank = sorted_df(nx.pagerank_scipy(g, alpha=0.6))
print(page_rank[0:50])

                       0         1
0                   mvea  0.004880
1                Z_staff  0.004077
2                 speckz  0.001801
3            rBitcoinMod  0.001589
4                 Amidza  0.001582
5                    NaN  0.001288
6                  b1tcc  0.001089
7         pixel-gamer-3D  0.000955
8                  fabwa  0.000926
9             AdamCannon  0.000872
10           maxwellhill  0.000739
11       HudsonRiverLine  0.000696
12        MichaelRahmani  0.000674
13                 tag65  0.000666
14             BluntLord  0.000645
15   A_Internet_Stranger  0.000624
16           AzzIRATIBOY  0.000588
17         Gabriel-Lewis  0.000576
18          delumstudios  0.000557
19      acacia-club-road  0.000552
20       Elementz_Reborn  0.000517
21            ttminh1997  0.000506
22             iAmTheTot  0.000503
23             lriccardo  0.000503
24            TheBomber9  0.000486
25        GloriousGe0rge  0.000468
26            Scary_Wasp  0.000467
27               kus

                       0         1
0   SuperficialPickle444  0.024455
1             CaseyDafuq  0.011067
2      DigitalizedOrange  0.008305
3              ranjiruku  0.007619
4                 F4rg0_  0.007612
5               MegaOhms  0.007594
6              imightmax  0.007593
7         emperoraugusto  0.007593
8                   mvea  0.003787
9             skyler4722  0.003668
10   Religion__of__Peace  0.003203
11               Z_staff  0.002425
12            AdamCannon  0.001747
13                   NaN  0.001585
14           rBitcoinMod  0.001542
15           remi9martin  0.001469
16                Amidza  0.001293
17         stansellj1983  0.001256
18                 b1tcc  0.001215
19                 fabwa  0.001171
20            Darksirius  0.001169
21                speckz  0.001162
22         ButtsOfficial  0.001145
23            Chichigami  0.001093
24       I_PUNCH_INFANTS  0.001032
25     ChopBangBuzzDylan  0.000902
26           kevindongyt  0.000848
27                 t

In [None]:
betweenness_cen = sorted_df(nx.betweenness_centrality(g, k=256, weight='weight'))
print(50*'-')
print(betweenness_cen[0:50])
betweenness_cen.to_pickle(ROOT_PATH + LOAD_DATA_PATH + '/' + 'betweenness_large.pkl')