In [None]:
import sys
import os

# Put <cwd>/reddit_download at the front of the import search path so the
# modules inside that folder can be imported by the cells below.
sys.path.insert(0, f"{os.getcwd()}/reddit_download")

In [None]:
import sys

import matplotlib.pyplot as plt

sys.path.append('../..')
from plotting.matplotlib_setup import configure_latex, savefig, set_size_decorator, savefig, thiner_border

# Locations inside the report folder: the LaTeX source and the image folder.
tex_dir = 'porocilo/main.tex'
images_dir = 'porocilo/images'

# Configure matplotlib through the project helper, handing it the image
# folder as the global save path.
configure_latex(global_save_path=images_dir, style=['science', 'notebook'])

%config InlineBackend.figure_format = 'pdf'

In [None]:
from reddit_download.RWV.pushshift.time_utils import timestamp_to_utc
from reddit_download.RWV.pushshift.utils import build_df, apply_df_time_transforms

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load and preprocess

In [None]:
# TODO: make this faster and more efficient
# Build the comment frame first and the post frame second, both read from
# <cwd>/reddit_download.
df_comments = build_df(file_path=f"{os.getcwd()}/reddit_download", content_type='comment')
df_posts = build_df(file_path=f"{os.getcwd()}/reddit_download", content_type='post')

In [None]:
# Run the project's time transforms on the comments frame and rebind the name
# to the returned frame.
# NOTE(review): which columns are added or changed is decided inside
# apply_df_time_transforms — confirm there.
df_comments = apply_df_time_transforms(df_comments)

In [None]:
# Remove rows authored by '[deleted]' and by 'AutoModerator' from both frames.
# Order and semantics match the step-by-step version: comments before posts,
# '[deleted]' before 'AutoModerator', each drop done in place by index label.
for frame in (df_comments, df_posts):
    for author in ('[deleted]', 'AutoModerator'):
        rows = frame.index[(frame['author'] == author).values]
        frame.drop(rows, inplace=True)

In [None]:
# Give both frames the same column vocabulary: comments expose their parent
# post as "post_id", and both frames call the creation time "timestamp".
df_comments = df_comments.rename(columns={"link_id": "post_id", "created_utc": "timestamp"})
df_posts = df_posts.rename(columns={"created_utc": "timestamp"})

In [None]:
# Keep only the second '_'-separated field of each post_id.
# NOTE(review): presumably the values are prefixed ids such as "t3_<id>" — confirm.
df_comments['post_id'] = df_comments['post_id'].map(lambda link: link.split('_')[1])

In [None]:
# df_comments.sort_values(by='post_id', inplace=True)
# df_posts.sort_values(by='post_id', inplace=True)

In [None]:
# Preview the first rows of the comments frame.
df_comments.head()

In [None]:
# Preview the first rows of the posts frame.
df_posts.head()

In [None]:
# from pandarallel import pandarallel

# pandarallel.initialize(nb_workers=12, progress_bar=True, use_memory_fs=None)

# post_ids = df_posts['post_id'].unique()

# def check_post_id(x, post_ids):
#     if x in post_ids:
#         return x
#     else:
#         return 0
    
# df_comments['post_id'] = df_comments['post_id'].parallel_apply(check_post_id, args=(post_ids, ))

In [None]:
# Drop comments whose post_id equals the integer sentinel 0. In this notebook
# that sentinel is only produced by the commented-out check_post_id cell
# above, so with that cell disabled this is expected to match nothing.
unmatched = df_comments.index[(df_comments['post_id'] == 0).values]
df_comments.drop(unmatched, inplace=True)

# Link comments to posts

In [None]:
# Lookup table from post_id to the post's timestamp. If a post_id occurs more
# than once, the last occurrence wins (plain dict construction).
id_to_timestamp = dict(zip(df_posts['post_id'], df_posts['timestamp']))

import swifter

def func(x, mapping):
    """Look up ``x`` in ``mapping``, returning ``-1`` when the key is absent.

    Parameters
    ----------
    x : hashable
        Key to look up.
    mapping : mapping
        Container indexed with ``mapping[x]``.

    Returns
    -------
    object
        ``mapping[x]`` if the lookup succeeds, otherwise the sentinel ``-1``.
    """
    try:
        value = mapping[x]
    except KeyError:
        # Unknown key: report it with the sentinel instead of raising.
        value = -1
    return value

# Attach the parent post's timestamp to every comment; func returns -1 for
# post ids that are missing from id_to_timestamp.
df_comments['post_time'] = df_comments['post_id'].swifter.apply(func, args=(id_to_timestamp,))

# Discard the comments whose parent post could not be found.
orphaned = df_comments.index[(df_comments['post_time'] == -1).values]
df_comments.drop(orphaned, inplace=True)

In [None]:
# Order both frames by ascending score, in place (comments first, then posts).
for frame in (df_comments, df_posts):
    frame.sort_values(by='score', inplace=True)

# Times from post to comment

In [None]:
from benford_helper_functions import get_first_digit, benfords_test, construct_log_bins
from random_helper_functions import get_bitstring
from NIST_tests import RNG_test

In [None]:
# Delay between each comment and its parent post, as raw arrays.
elapsed = df_comments['timestamp'].values - df_comments['post_time'].values

# Keep only delays strictly greater than 1; this also removes zero and
# negative differences.
times = elapsed[elapsed > 1]

In [None]:
# Histogram of the post-to-comment delays between 1 and 600 (10 minutes).
fig, ax = set_size_decorator(plt.subplots, fraction=0.5, ratio='4:3')(1, 1)

ax.hist(times, bins=50, range=[1, 60 * 10], histtype='step')
ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
# Raw string: '\D' is not a valid escape sequence, and newer Python versions
# warn about it in normal string literals. The rendered label is unchanged.
ax.set_xlabel(r'$\Delta t$ [s]')
ax.set_ylabel('$N$')
# savefig('reddit_post_activity_10min', tight_layout=False)

In [None]:
# Histogram of the delays between 16 h and 32 h on a logarithmic count axis,
# with a dashed marker at 86400 s (24 h).
fig, ax = set_size_decorator(plt.subplots, fraction=0.5, ratio='4:3')(1, 1)

# Set the scale on the axes object rather than through the pyplot state
# machine, matching the other histogram cells.
ax.set_yscale('log')
ax.hist(times, bins=50, range=[60 * 60 * 16, 60 * 60 * 32], histtype='step')
ax.axvline(86400, lw=1, c='C3', ls='--')
# Raw string: '\D' is not a valid escape sequence in a normal string literal.
ax.set_xlabel(r'$\Delta t$ [s]')
ax.set_ylabel('$N$')
# savefig('reddit_post_activity_16h_to32h', tight_layout=False)

In [None]:
# Histogram of the delays within the first day (1 .. 86400) on a logarithmic
# count axis.
fig, ax = set_size_decorator(plt.subplots, fraction=0.5, ratio='4:3')(1, 1)

ax.set_yscale('log')

ax.hist(times, bins=50, range=[1, 60 * 60 * 24 * 1], histtype='step')
# Raw string: '\D' is not a valid escape sequence in a normal string literal.
ax.set_xlabel(r'$\Delta t$ [s]')
ax.set_ylabel('$N$')
# savefig('reddit_post_activity_1day', tight_layout=False)

In [None]:
from benford_helper_functions import do_full_rng_test

In [None]:
def reshape_and_truncate(arr, shape):
    """Reshape ``arr`` to ``shape``, dropping trailing elements that do not fit.

    Parameters
    ----------
    arr : array_like
        Input data. It is read in flattened order through ``.flat``.
    shape : int or sequence of int
        Target shape. An entry of ``-1`` marks a dimension whose size is
        inferred: the largest number of elements that is a multiple of the
        product of the explicit dimensions is kept.

    Returns
    -------
    numpy.ndarray
        Array of the requested shape holding the leading elements of ``arr``.

    Raises
    ------
    ValueError
        Raised by ``reshape`` when the kept elements cannot fill ``shape``
        (for example ``arr`` is too small for a fully explicit shape).
    """
    arr = np.asarray(arr)
    shape = (shape,) if np.isscalar(shape) else tuple(shape)

    # Integer product of the explicit dimensions. Without an integer dtype,
    # np.prod([]) is the float 1.0, which made shape == (-1,) fail because a
    # float cannot be used as a slice bound.
    explicit_size = int(np.prod([n for n in shape if n != -1], dtype=np.int64))

    if -1 in shape:  # implicit array size
        # A zero-sized explicit dimension keeps nothing; reshape then decides
        # whether the requested shape is valid.
        desired_size = arr.size // explicit_size * explicit_size if explicit_size else 0
    else:
        desired_size = explicit_size
    return arr.flat[:desired_size].reshape(shape)

In [None]:
# Split the delays into 75 consecutive chunks; np.array_split allows chunk
# lengths to differ when the total is not divisible by 75.
split_times = np.array_split(times, 75)

In [None]:
# Run the full test suite on every chunk. Within a chunk, consecutive delays
# are grouped in threes and each group is replaced by its product.
split_results = []

for chunk in split_times:
    f = 3
    n_groups = len(chunk) // f

    grouped = reshape_and_truncate(chunk, (n_groups, f))
    products = np.abs(grouped.astype(np.float64)).prod(axis=1)

    f1s, fd, fracs, chi2_tests, ks_tests, df = do_full_rng_test(
        products, rng_test=True, walk=False, end_bits=-1
    )

    split_results.append([f1s, fd, fracs, chi2_tests, ks_tests, df])

In [None]:
# One row per chunk: the first row of the last object stored for that chunk
# (the `df` returned by do_full_rng_test), with every entry cast to float.
# NOTE(review): presumably these are the p-values of the RNG tests — confirm.
p_matrix = np.array([
    [float(value) for value in outcome[-1].iloc[0].values]
    for outcome in split_results
])

In [None]:
from stat_tests import chi2_test, ks_test

# 4x4 grid of histograms, one per column of p_matrix, with the last panel
# hidden.
# NOTE(review): assumes p_matrix has at most 16 columns (15 visible panels) — confirm.
fig, ax = set_size_decorator(plt.subplots, fraction=1.5, ratio='4:3')(4, 4)
ax[-1, -1].set_visible(False)
axs = ax.flatten()

bins = 10
for i in range(p_matrix.shape[1]):
    m = p_matrix[:, i]  # column i across all chunks

    t1 = chi2_test(m, n_bins=bins)
    t2 = ks_test(m)

    # Second item of the chi2_test result; printed once after the loop.
    crit = t1[1]

    axs[i].hist(m, histtype='step', lw=2, bins=bins)
    # Raw f-string: '\c' is not a valid escape sequence in a normal string
    # literal. The rendered annotation text is unchanged.
    axs[i].annotate(rf'$\chi^2={t1[0][0][0]:.2f}$', xy=(0.5, 0.1), xycoords='axes fraction', fontsize=10)
    axs[i].set_title(f'test {i+1}')

print(crit)
# savefig('p_test_dist')

In [None]:
# Number of consecutive delays multiplied together: 1, 2, ..., 20.
fs = np.arange(1, 21, 1)

# For every f: divide the delays by f**2, group them f at a time (dropping
# the remainder) and store the product of each group.
lognorms = [
    np.abs(
        reshape_and_truncate(times / f**2, (len(times) // f, f)).astype(np.float64)
    ).prod(axis=1)
    for f in fs
]

In [None]:
# Distribution of log10 of the products for the first four product lengths
# and for the last one.
fig, ax = set_size_decorator(plt.subplots, fraction=0.5, ratio='4:3')(1, 1)

# Label every curve with its actual number of factors taken from fs. The
# previous hard-coded legend mixed list indices ('0'..'3') with a factor
# count ('20'), although the plotted curves are f = 1, 2, 3, 4 and 20.
for k in (0, 1, 2, 3, -1):
    ax.hist(np.log10(lognorms[k]), bins=100, histtype='step', label=str(fs[k]))

ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))

ax.legend()

ax.set_xlabel(r'$\log X$')
ax.set_ylabel(r'$N$')

# savefig('reddit_lognorms', tight_layout=False)

In [None]:
# Repeat the full test for every product length in fs, keeping everything
# except the third returned item.
results = []
for f in fs:
    n_groups = len(times) // f

    # Delays are divided by f**2 before being multiplied f at a time.
    # NOTE(review): presumably this keeps the products in a workable range — confirm.
    grouped = reshape_and_truncate(times / f**2, (n_groups, f))
    products = np.abs(grouped.astype(np.float64)).prod(axis=1)

    f1s, fd, _, chi2_tests, ks_tests, df = do_full_rng_test(
        products, rng_test=True, end_bits=10**5, walk=False
    )

    results.append([f1s, fd, chi2_tests, ks_tests, df])

In [None]:
# Turn the per-f results into parallel lists, one entry per element of fs.
# NOTE(review): the nesting of the chi2/KS results is taken as-is from
# do_full_rng_test — confirm the meaning of each level there.
chi2, chi2crit = [], []
ks, kscrit = [], []
f1, first, dfs = [], [], []

for f1s, fd, chi2_tests, ks_tests, df in results:
    chi2_head = chi2_tests[0]
    ks_head = ks_tests[0]

    chi2.append(chi2_head[0][0][0])
    chi2crit.append(chi2_head[1])

    ks.append(ks_head[0][0][0])
    kscrit.append(ks_head[1][0])

    f1.append(f1s[0])
    first.append(fd[0][0])
    dfs.append(df)

In [None]:
# Chi-square statistic and its critical value against the product length.
fig, ax = set_size_decorator(plt.subplots, fraction=0.5, ratio='4:3')(1, 1)
ax.set_yscale('log')

# Statistic first, critical value second; each is a thin line plus markers.
for series, label in ((chi2, r'$\chi^2$'), (chi2crit, r'$\chi^2_*$')):
    ax.plot(fs, series, lw=1, label=label)
    ax.scatter(fs, series, s=6)

ax.set(ylabel=r'$\chi^2$', xlabel=r'$\Pi_{i=1}^N$')
ax.legend()

# savefig('reddit_times_chi2')

In [None]:
# KS statistic and its critical value against the product length.
fig, ax = set_size_decorator(plt.subplots, fraction=0.5, ratio='4:3')(1, 1)
ax.set_yscale('log')

# Statistic first, critical value second; each is a thin line plus markers.
for series, label in ((ks, r'$d$'), (kscrit, r'$d_*$')):
    ax.plot(fs, series, lw=1, label=label)
    ax.scatter(fs, series, s=6)

ax.legend()
ax.set(ylabel=r'KS', xlabel=r'$\Pi_{i=1}^N$')

# savefig('reddit_times_ks')

In [None]:
# Absolute distance of the stored first-digit value from log10(2), the
# Benford frequency of a leading 1, against the product length.
fig, ax = set_size_decorator(plt.subplots, fraction=0.5, ratio='4:3')(1, 1)
ax.set_yscale('log')

# Computed once and reused for the line and the markers.
deviation = abs(first - np.log10(2))
ax.plot(fs, deviation, lw=1)
ax.scatter(fs, deviation, s=6)

ax.set(ylabel=r'$|n_1 - \log_{10}2|$', xlabel=r'$\Pi_{i=1}^N$')

# savefig('reddit_times_n1')

In [None]:
# The f1 value (first item of f1s for each f) against the product length.
fig, ax = set_size_decorator(plt.subplots, fraction=0.5, ratio='4:3')(1, 1)

ax.plot(fs, f1, lw=1)
ax.scatter(fs, f1, s=6)

ax.set(ylabel=r'$f_1$', xlabel=r'$\Pi_{i=1}^N$')

# savefig('reddit_times_f1')

In [None]:
# does not work, because we do not know the exact functional dependence g(x)
# from benford_helper_functions import normalize
# from numba import njit

# @njit
# def reject(us, g, bins, h2):
#     ys = []
#     for i, u in enumerate(us):
#         x = g[i]
#         fx = h2
        
#         ind = np.argmin(np.abs(bins - x))
#         gx = g[ind]
        
#         if u <= fx / gx:
#             ys.append(x)
    
#     return ys


# def uniform_from_any(g, us=None):
#     """g -> distribution used for making random numbers, u -> U(0, 1) numbers"""
#     us = np.random.uniform(size=len(g))
    
#     g = g / np.max(g)
    
#     pdf, bins = np.histogram(g, int(np.sqrt(len(g))), density=True)
#     bins = bins[:-1]
    
#     s, pairs = [], []
#     for i in range(len(bins)):
#         h1, b1 = pdf[i], bins[i]
#         for j in range(len(bins)):
#             h2, b2 = pdf[j], bins[j]
#             S = b2 - b1 * h2
#             s.append(S)
#             pairs.append([h1, h2, b1, b2])

#     s = np.array(s)
#     ind = np.argsort(s)[::-1]
#     res = pairs[ind[0]]

#     h1, h2, b1, b2 = res
    
#     plt.plot(bins, pdf)
#     plt.scatter([b1, b2], [h2, h2])
    
#     ys = reject(us, g, bins, h2)

#     return ys

# y = uniform_from_any(times[times < 86400])
# plt.hist(y)

# Length of comments

In [None]:
# Character count of every comment body.
df_comments['body_len'] = df_comments['body'].map(len)

# Remove comments with an empty body, in place.
empty_rows = df_comments.index[(df_comments['body_len'] <= 0).values]
df_comments.drop(empty_rows, inplace=True)

# Re-sort by ascending score before extracting the lengths as a raw array.
df_comments.sort_values(by='score', inplace=True)
body_len = df_comments['body_len'].values

In [None]:
# Histogram of log10 of the comment lengths, 50 bins.
plt.hist(np.log10(body_len), bins=50)
plt.show()

In [None]:
# Run the full test on the comment lengths; the third returned item is
# discarded. Unlike the delay tests above, this call passes walk=True and
# end_bits=10**6.
# NOTE(review): the meaning of `walk` and `end_bits` is defined in do_full_rng_test — confirm there.
f1s, first_digits, _, chi2_tests, ks_tests, df = do_full_rng_test(body_len, rng_test=True, end_bits=10**6, walk=True)

In [None]:
# Display the KS results returned for the comment lengths.
ks_tests

# Length of names

In [None]:
# Character count of every author name.
df_comments['author_len'] = df_comments['author'].map(len)

In [None]:
# Author-name lengths as a raw array for plotting.
author_len = df_comments['author_len'].values

In [None]:
# Histogram of the name lengths between 3 and 20 with 17 bins, i.e. one bin
# per unit of length.
plt.hist(author_len, range=(3, 20), bins=17)
plt.show()