In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import pickle
from time import time

In [2]:
# Load the pickled post-count mapping into a fresh dict and report size and load time.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
counts = {}
print('Loading post counts...')
t = time()
with open('app/data/post_counts.pickle', 'rb') as handle:
    counts.update(pickle.load(handle))
# The entries loaded here are post counts, so the status line names them as such.
print('loaded {} post counts in {} s'.format(len(counts), time() - t))

Loading post counts...
loaded 51278 embeddings in 0.295137882232666 s


In [7]:
# Number of keys in the loaded post-count mapping.
len(counts)

51278

In [8]:
# Keys of `counts`, ordered from the smallest to the largest sum of their values.
s = sorted(counts, key=lambda name: sum(counts[name]))

In [10]:
# Last ten keys of the ascending sort, i.e. the ten largest totals.
s[-10:]

['me_irl',
 'dirtykikpals',
 'funny',
 'newsbotbot',
 'showerthoughts',
 'ice_poseidon',
 'rocketleagueexchange',
 'the_donald',
 'autonewspaper',
 'askreddit']

In [15]:
# First key of the ascending sort (smallest total) together with that total.
print(s[0], sum(counts[s[0]]))

spam 0.0


In [16]:
# Last key of the ascending sort (largest total) together with that total.
print(s[-1], sum(counts[s[-1]]))

askreddit 3004130.0


In [17]:
# Largest single value found in any of the stored sequences.
print(max(map(max, counts.values())))

16056.0


In [3]:
%%time
for k in counts:
    counts[k] = counts[k][:365]

CPU times: user 34.5 ms, sys: 8.96 ms, total: 43.4 ms
Wall time: 42 ms


In [13]:
# Length of the sequence stored under 'askreddit'.
len(counts['askreddit'])

365

In [4]:
# Write `counts` to the pickle file with the highest available pickle protocol.
# This is the same path the counts were loaded from, so the file is overwritten.
with open('app/data/post_counts.pickle', 'wb') as out_file:
    pickle.dump(counts, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
# Read the post-count table from CSV into a DataFrame.
df = pd.read_csv('app/data/post_counts.csv')

In [4]:
# Preview the first ten rows of the table.
df.head(10)

Unnamed: 0,subreddit,day,total
0,CircleofTrust,92,192789
1,CircleofTrust,93,139672
2,thanosdidnothingwrong,190,119575
3,CircleofTrust,94,55236
4,thanosdidnothingwrong,189,41484
5,CircleofTrust,95,30424
6,movietvreview,272,23004
7,inthesoulstone,190,20621
8,u_Head_Evidence,264,17634
9,u_TallPrune,264,17606


In [7]:
%%time
pivot = df.pivot(index='subreddit', columns='day', values='total')

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.34 µs


In [10]:
# Display the pivoted table.
pivot

day,1,2,3,4,5,6,7,8,9,10,...,356,357,358,359,360,361,362,363,364,365
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
*cohold00009,,,,,,,,,,,...,,,,,,,,,,
*polhold00214,,,,,,,,,,,...,,,,,,,,,,
*polhold999999,,,,,,,,,,,...,,,,,,,,,,
*tmhold00193,,,,,,,,,,,...,,,,,,,,,,
*tmhold00340,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzzz,,,,,,1.0,,,,,...,,,,,,,1.0,,,
zzzzz,,,,,,,,,,,...,,,,,,,,,,
zzzzzzzzzz,,,,,,,,,,,...,,,,,,,,,,
zzzzzzzzzzze,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Distinct values of the `subreddit` column.
subreddits = set(df['subreddit'])

In [13]:
# Number of distinct subreddit names in the table.
len(subreddits)

1661701

In [14]:
# Load the pickled subreddit embeddings.
# NOTE(review): pickle.load can execute arbitrary code; only use with a trusted file.
with open('app/data/subreddit_embeddings.pickle', 'rb') as emb_file:
    embeddings = pickle.load(emb_file)

In [34]:
# Normalise subreddit names to lower case with the vectorised string accessor.
# Unlike a per-row `x.lower()` call, this leaves missing values as NaN instead
# of raising, and avoids one Python-level call per row.
df['subreddit'] = df['subreddit'].str.lower()

In [35]:
%%time
filtered = df[df.apply(lambda row: row.subreddit in embeddings, axis=1)]

CPU times: user 3min 42s, sys: 1.18 s, total: 3min 43s
Wall time: 3min 43s


In [36]:
# Filtered rows with the largest `total` values (default head size).
filtered.sort_values(by='total', ascending=False).head()

Unnamed: 0,subreddit,day,total
10,ice_poseidon,156,16056
13,askreddit,337,11887
14,ice_poseidon,125,11648
16,askreddit,364,11282
17,askreddit,362,10842


In [37]:
# All filtered rows belonging to 'askreddit'.
filtered[filtered.subreddit == 'askreddit']

Unnamed: 0,subreddit,day,total
13,askreddit,337,11887
16,askreddit,364,11282
17,askreddit,362,10842
18,askreddit,338,10824
20,askreddit,323,10803
...,...,...,...
623,askreddit,139,6757
629,askreddit,125,6750
638,askreddit,55,6705
641,askreddit,118,6696


In [20]:
# Total number of rows in the table.
len(df)

14433360

In [21]:
# One zero-filled array of length 366 for every key in `embeddings`.
d = {name: np.zeros(366) for name in embeddings}

In [25]:
def f(row):
    """Write one row's total into the module-level mapping ``d``.

    Reads ``row.subreddit``, ``row.day`` and ``row.total`` and stores the total
    at position ``row.day - 1`` of the sequence held under ``d[row.subreddit]``.
    Returns None; the only effect is the in-place write.
    """
    value = row.total
    series = d[row.subreddit]
    series[row.day - 1] = value

In [38]:
%%time
filtered.apply(f, axis=1)

CPU times: user 4min, sys: 464 ms, total: 4min 1s
Wall time: 4min 1s


10          None
13          None
14          None
16          None
17          None
            ... 
14433337    None
14433341    None
14433346    None
14433355    None
14433358    None
Length: 6691486, dtype: object

In [39]:
# Inspect the filled array for 'askreddit'.
d['askreddit']

array([ 6841.,  7212.,  7670.,  7207.,  7607.,  7123.,  7599.,  7727.,
        7736.,  7886.,  8055.,  7748.,  6998.,  7220.,  7679.,  7575.,
        8089.,  8320.,  7587.,  7322.,  7504.,  8280.,  8482.,  9030.,
        8539.,  7847.,  7418.,  7672.,  8307.,  8531.,  9232.,  8724.,
        7834.,  7768.,  7900.,  8332.,  8732.,  8678.,  8635.,  7882.,
        7435.,  7923.,  8507.,  8490.,  8055.,  8039.,  7771.,  7505.,
        8107.,  8225.,  8896.,  8500.,  8231.,  7257.,  6705.,  7454.,
        8121.,  8224.,  7747.,  7915.,  7080.,  6821.,  7320.,  7882.,
        7999.,  7988.,  7789.,  6855.,  6789.,  6822.,  7880.,  7817.,
        7975.,  7363.,  7583.,  7169.,  7847.,  8192.,  8343.,  8289.,
        7873.,  8061.,  7108.,  7372.,  7799.,  8326.,  8407.,  8683.,
        8409.,  7801.,  7859.,  7871.,  8605.,  8384.,  8538.,  8134.,
        7617.,  8237.,  8884.,  8569.,  8485.,  8227.,  7797.,  7372.,
        7380.,  8539.,  8371.,  8278.,  8338.,  7379.,  6959.,  7717.,
      

In [40]:
# Write the filled mapping `d` to the post-count pickle file,
# using the highest available pickle protocol.
with open('app/data/post_counts.pickle', 'wb') as out_file:
    pickle.dump(d, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# NOTE(review): disabled iterrows-based variant of the fill loop; it creates a
# 366-element list for any subreddit not yet in `d`. Consider deleting this
# cell if it is no longer needed.
# %%time
# for i, row in df.iterrows():
#     s = row.subreddit
#     if s not in d:
#         d[s] = [0]*366
#     d[s][row.day-1] = row.total