In [None]:
import sys
import os

sys.path.insert(0, os.getcwd() + '/reddit_download')

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

sys.path.append('../..')
from plotting.matplotlib_setup import configure_latex, savefig, set_size_decorator, savefig, thiner_border

# Path of the main .tex file and of the images directory, both under 'porocilo/'.
tex_dir, images_dir = 'porocilo/main.tex', 'porocilo/images'

# Project helper: applies the listed matplotlib styles and receives images_dir as the
# global save path (presumably the directory savefig() writes to — confirm in plotting.matplotlib_setup).
configure_latex(style=['science', 'notebook'], global_save_path=images_dir)

%config InlineBackend.figure_format = 'pdf'

## load data

In [None]:
# Read the comment and post tables; '\n' is forced as the row terminator.
df_comments = pd.read_csv('comments.csv', lineterminator='\n')
df_posts = pd.read_csv('posts.csv', lineterminator='\n')

# Drop the author/identifier/link columns in place (this mutates both frames).
df_comments.drop(columns=['author', 'post_id', 'parent_id', 'permalink'], inplace=True)
df_posts.drop(columns=['author', 'post_id', 'num_comments', 'permalink'], inplace=True)

In [None]:
# Pass every body through str() so non-string entries (presumably NaN from empty bodies) become text.
df_comments['body'] = df_comments['body'].apply(lambda x: str(x))

In [None]:
# Sort the comments in place by descending timestamp.
df_comments.sort_values(by=['timestamp'], inplace=True, ascending=False)

# RNG

In [None]:
from benford_helper_functions import str_to_bits, get_bitstring, binary_tree_walk
from random_helper_functions import split_to_arr, bin_str_to_matrix
from NIST_tests import RNG_test

In [None]:
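# NOTE: subreddit-specific vocabulary leaks heavily into this word list (e.g. r/Genshin_Impact terms such as 'Zhongli', 'Ganyu', 'resin').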
top_1000_words = ['the', 'to', 'I', 'a', 'and', 'of', 'is', 'in', 'that', 'you', 'it', 'for', 'was', 'my', 'with', 'on', 'but', 'have', 'be', 'not', 'are', 'just', 'like', 'as', 'or', 'so', 'they', 'this', 'at', 'if', 'me', 'can', 'your', 'The', 'get', 'about', 'from', 'would', 'all', 'one', 'do', 'an', 'people', 'when', 'up', 'out', 'more', 'what', 'her', 'because', "don't", "I'm", 'we', 'had', 'he', 'i', 'some', 'think', 'will', "it's", 'by', 'them', 'really', 'their', 'how', 'has', 'no', 'only', 'know', 'who', 'even', 'than', 'good', 'there', 'time', 'she', 'his', 'then', 'It', 'other', 'If', 'got', 'want', 'You', 'still', 'much', 'were', 'make', 'been', 'also', 'being', 'it.', 'go', 'into', 'any', 'could', 'see', 'never', 'My', 'very', 'But', 'And', 'need', 'way', 'use', "It's", 'which', '-', 'most', 'first', 'going', 'him', 'after', 'something', 'where', 'This', 'too', 'I’m', 'say', 'same', 'should', 'lot', 'back', 'over', 'did', 'better', 'A', 'actually', 'now', 'every', 'So', 'pretty', 'always', 'why', 'don’t', 'someone', 'They', 'off', 'those', 'it’s', 'feel', 'since', 'That', "I've", 'work', 'thing', 'take', 'before', 'things', 'new', 'while', 'probably', "you're", 'am', 'many', 'its', 'years', 'love', 'around', 'game', 'made', 'said', "didn't", '2', 'sure', 'right', 'our', 'best', 'getting', "that's", "can't", 'Not', "doesn't", 'We', 'does', 'down', 'few', 'find', 'used', 'day', 'He', 'bad', 'What', 'enough', 'without', 'long', 'me.', 'ever', 'doing', 'look', 'thought', 'two', 'give', 'life', 'well', 'having', 'might', 'makes', 'In', 'different', 'little', 'anything', 'through', 'it,', 'these', 'already', 'try', 'both', 'When', 'put', "I'd", 'character', 'shit', 'mean', 'last', 'Just', 'great', 'own', 'characters', 'us', 'trying', 'until', 'another', '3', 'No', 'least', 'went', 'keep', 'There', 'point', 'person', 'old', "isn't", 'here', 'She', 'big', '&gt;', 'using', 'It’s', 'hard', 'that.', 'come', 'play', 'next', 'everyone', 'For', 'able', "That's", 
'them.', 'end', 'guy', 'bit', '5', 'world', 'tell', 'kind', 'money', 'part', 'help', 'whole', 'maybe', 'everything', 'remember', 'As', 'show', 'How', 'lol', 'time.', 'high', 'once', 'year', 'damage', 'less', 'live', 'though', '4', 'seen', 'each', 'nothing', 'told', 'may', 'stuff', 'start', 'team', 'fucking', 'saying', 'friends', "I'll", 'started', 'gonna', 'literally', 'main', 'making', 'real', 'away', 'reason', 'far', 'guess', 'anyone', 'such', "they're", '1', 'looking', 'wanted', 'I’ve', 'definitely', 'watch', "there's", 'came', 'believe', 'between', 'gets', 'read', 'friend', 'talking', 'almost', 'man', 'you.', 'times', 'let', 'myself', 'care', 'school', 'Is', 'nice', 'else', 'understand', 'story', 'Also', 'seems', 'dont', 'found', 'done', 'saw', 'level', 'either', 'second', 'Oh', 'full', 'change', 'didn’t', 'set', 'buy', 'fuck', 'me,', 'Why', 'hate', 'place', 'instead', 'looks', 'kids', 'hope', 'called', 'anime', 'post', 'Then', 'run', 'hit', 'free', 'name', 'fun', "wouldn't", 'All', 'heard', 'too.', '10', 'left', 'idea', 'One', 'stop', 'Or', 'you’re', 'took', 'worth', 'usually', 'family', 'playing', 'job', 'ask', 'movie', 'during', "wasn't", 'call', 'single', 'quite', 'tried', 'Yeah', 'can’t', 'that’s', 'girl', 'home', 'pay', 'comes', 'top', 'kinda', 'banner', 'basically', 'Do', 'means', 'wrong', 'Thank', 'small', 'support', 'super', 'days', 'question', 'Because', 'again', 'talk', 'out.', 'Like', "won't", 'build', 'now.', 'water', 'up.', 'Maybe', 'Well', 'fact', 'People', 'that,', 'against', 'rather', 'thinking', 'At', 'wish', 'half', 'car', 'though.', 'problem', 'mind', 'women', 'ones', 'games', 'working', 'cause', 'under', '*', 'house', 'couple', 'especially', 'this.', 'To', 'entire', 'sex', 'side', "Don't", 'completely', 'food', "he's", 'goes', 'asked', 'likely', 'close', 'pull', 'Now', 'mom', 'I’d', 'later', 'comment', 'matter', 'watching', 'weird', 'doesn’t', 'parents', "aren't", 'absolutely', 'there.', 'felt', 'Even', 'u', 'video', 'hear', '&amp;', 
'Your', 'happened', 'amount', 'hours', 'kid', "she's", 'star', 'sounds', 'wait', 'Some', 'one.', 'knew', 'eat', 'happy', 'seem', ':)', 'others', 'guys', 'well.', 'leave', 'months', 'often', 'open', 'cool', 'head', 'kill', 'yet', 'country', 'works', 'case', 'taking', 'needs', '+', 'coming', 'power', 'you,', 'says', 'based', 'im', "haven't", 'sense', 'become', 'whatever', 'day.', 'exactly', 'lost', 'rest', 'sometimes', 'similar', 'Zhongli', 'crit', 'due', 'night', 'must', 'lol.', 'him.', 'dps', 'time,', 'weapon', 'enjoy', 'Yeah,', 'experience', 'easy', 'Thanks', "There's", 'agree', 'spend', 'gave', 'human', 'body', 'her.', 'certain', 'turn', 'men', 'etc.', 'answer', 'check', 'normal', 'ago', 'unless', 'dad', 'yourself', '6', 'fine', 'life.', 'move', 'That’s', 'favorite', 'music', 'ass', 'song', 'Most', 'Good', 'huge', 'burst', 'actual', 'seeing', 'week', 'space', 'attack', 'die', 'running', 'all.', 'takes', 'black', 'watched', 'them,', 'worked', 'add', 'past', 'again.', 'woman', ',', 'Also,', 'save', 'living', "couldn't", 'room', 'Its', 'deal', 'outside', 'people.', 'energy', 'type', 'per', 'played', 'system', 'low', 'content', 'phone', 'number', 'current', 'true', 'face', 'possible', 'feels', 'Yes', '.', 'good.', 'episode', '20', 'gives', 'chance', 'here.', 'behind', 'straight', 'looked', 'mostly', 'I’ll', 'event', 'early', 'feeling', 'wife', 'isn’t', 'three', 'stay', 'learn', 'amazing', 'hot', 'is.', 'sound', 'shield', 'physical', 'After', "you'll", 'happen', 'yeah', 'course', 'front', 'way.', 'minutes', 'middle', 'asking', 'fight', 'extra', 'thank', 'important', 'original', 'shows', '?', 'imagine', 'sleep', 'stupid', 'hell', 'sort', 'Are', 'finally', 'higher', 'damn', 'series', 'needed', 'hand', 'artifacts', 'together', '100%', 'honestly', 'on.', 'given', 'wants', 'ended', 'random', "You're", 'worst', 'wanna', 'child', 'On', 'break', 'social', 'Yes,', 'DPS', 'Probably', 'specific', 'US', 'worse', 'Can', 'interesting', 'game.', 'thing.', 'do.', 'giving', 'scene', 
'bring', 'issue', 'near', 'turned', 'rate', 'they’re', 'thanks', 'meant', 'clear', 'bunch', 'line', 'pick', 'now,', 'is,', 'multiple', 'death', 'dead', 'up,', 'order', 'years.', 'supposed', 'decided', 'girls', 'Being', 'common', 'word', 'god', 'abyss', 'With', 'version', 'future', 'it?', 'simply', 'strong', 'season', 'Well,', 'please', 'yes', 'sorry', 'longer', 'though,', '(and', 'large', 'white', "we're", 'Have', 'light', 'weeks', 'difference', 'age', 'Same', 'easier', 'reading', 'issues', 'fast', 'loved', 'Never', 'along', 'work.', 'account', 'Hu', '8', 'dog', 'company', 'young', 'cryo', 'No,', 'this,', 'pyro', 'piece', 'lose', 'Which', 'totally', 'cut', '(I', 'building', 'happens', 'alone', 'older', 'not.', 'electro', 'general', 'except', 'telling', '2.', 'realize', 'Venti', 'short', '/', 'easily', 'kept', 'decent', 'wasn’t', 'group', 'funny', 'hold', 'walk', '30', 'towards', 'Im', 'Did', 'Ganyu', 'better.', 'extremely', 'out,', 'currently', 'there’s', 'Eula', 'dude', 'American', 'Any', 'bought', 'consider', '&amp;#x200B;', 'glad', 'spent', 'coffee', 'Only', 'Bennett', 'knows', 'control', 'quality', 'waiting', 'late', 'soon', 'class', 'personal', 'Every', 'within', 'mine', 'met', 'several', 'allowed', 'God', 'Lol', 'moment', 'liked', 'cannot', 'inside', 'month', 'themselves', 'standard', 'crazy', 'forget', '7', 'thats', 'party', 'relationship', 'said,', '"I', 'perfect', 'dmg', 'wouldn’t', 'across', 'health', 'drink', 'brother', 'list', 'built', 'sad', 'expect', 'example', 'recommend', 'voice', 'to.', 'book', 'self', 'stuck', 'wonder', 'taste', 'pain', 'stopped', 'choose', 'drop', 'beat', 'mean,', 'state', 'personally', 'floor', 'in.', '12', 'paid', 'sub', 'cost', 'compared', 'pity', 'Was', 'stand', 'Please', 'assume', 'died', 'depends', 'poor', '=', 'right?', 'fan', 'brain', 'whether', 'Of', 'hour', 'Diluc', 'listen', 'weapons', 'Does', 'mental', 'off.', 'explain', 'players', 'there,', 'prefer', 'lots', 'eating', 'gay', 'killed', 'children', 'mother', 
'eventually', 'know,', 'hair', 'E', 'Who', '1.', "you've", 'questions', 'area', 'figure', 'Genshin', 'His', 'fucked', 'well,', 'bed', 'known', 'shot', 'above', 'date', 'Jean', 'too,', 'door', 'gotten', 'day,', 'much.', 'point.', 'drive', 'cant', 'Klee', 'Diona', 'things.', 'generally', 'eyes', 'artifact', 'shit.', 'government', 'fall', 'enemies', 'problems', 'following', 'opinion', 'lower', 'form', 'Those', 'boss', 'considered', 'Reddit', 'red', 'skill', 'yes,', 'Go', 'public', 'learned', 'movies', 'popular', 'pulled', 'you!', 'ok', 'OP', 'mention', 'changed', 'art', 'war', 'he’s', 'recently', 'fit', 'luck', 'college', 'hurt', 'people,', 'wear', 'lmao', 'resin', 'tho', 'miss', 'sister', 'gone', 'stars', '15', 'New', 'fully', 'average', 'walking', 'taken', 'back.', 'more.', 'one,', 'interested', ':(']

In [None]:
from text_rng import TextRng

In [None]:
# df_comments.sort_values(by=['score'], inplace=True, ascending=False)

In [None]:
# Estimate of bits/s.
# Build the text-based generator over the first 10**6 comment bodies. The keyword
# groups mirror the arguments used by the notebook-level helpers further down
# (utf8_bits, multi_mix, text_lognormal_dist) — NOTE(review): confirm against TextRng.
TR = TextRng(text=df_comments['body'].values[:10**6], 
             utf8_kwargs={"utf8_bit_pos": -1, "top_words": top_1000_words[:100], "remove_spaces": True},
             mixing_kwargs = {"n_mixes": 1, "chunks": 16},
             lognormal_kwargs = {"n": 8, "d": 6, "div": 1e6},
             bit_generation="bitstring")

In [None]:
from benford_helper_functions import str_to_bits

In [None]:
# Convert the text held by TR with str_to_bits, this time with remove_spaces=False.
all_bits = str_to_bits(TR.text, remove_spaces=False)

In [None]:
# Length of all_bits, shown in scientific notation.
f'{len(all_bits):.3e}'

In [None]:
# Average length of all_bits per comment. The divisor 10**6 mirrors the slice passed to
# TextRng above — NOTE(review): it overstates the count if fewer than 10**6 comments exist.
bits_per_comment = len(all_bits) / 10**6
bits_per_comment

In [None]:
# Run the generator; the cells below use len(bits) as the amount of generated output.
bits = TR.run()

In [None]:
# Ratio of input length (all_bits) to generated output length (bits).
r = len(all_bits) / len(bits)
r

In [None]:
# bits_per_comment / r reduces to len(bits) / 10**6, i.e. generated bits per comment.
# NOTE(review): the factor 100 is not explained here — presumably an assumed rate of
# comments per second for the bits/s estimate; confirm.
(bits_per_comment / r) * 100

## testing code

In [None]:
def make_bit_chunk(bits, n):
    """Cut ``bits`` into ``n`` consecutive pieces of equal length.

    Every piece holds ``len(bits) // n`` items; anything left over at the end
    of ``bits`` after the n-th piece is discarded.

    Returns a list with the ``n`` slices, in order.
    """
    piece_len = len(bits) // n
    pieces = []
    offset = 0
    for _ in range(n):
        pieces.append(bits[offset:offset + piece_len])
        offset += piece_len
    return pieces


def make_bit_chunks(bits, n=32, splits=2, prnt=False):
    """Recursively cut ``bits`` into ``n`` pieces, ``splits`` additional times.

    ``bits`` is first divided into ``n`` chunks with ``make_bit_chunk``; each
    chunk is then divided into ``n`` again, repeated ``splits`` times.

    Returns a tuple ``(chunks, end_parts, elements)`` where ``end_parts`` is
    ``n ** (splits + 1)`` and ``elements`` is ``len(bits) // end_parts``.
    With ``prnt=True`` both numbers are printed first.
    """
    end_parts = n**(splits + 1)
    elements = len(bits) // n**(splits + 1)
    if prnt:
        print(f'end parts: {end_parts} with {elements} elements')

    # First level of the subdivision.
    current_level = make_bit_chunk(bits, n)

    # Each further pass replaces every chunk by its n sub-chunks, keeping order.
    for _ in range(splits):
        current_level = [
            piece
            for chunk in current_level
            for piece in make_bit_chunk(chunk, n)
        ]

    return current_level, end_parts, elements


def make_bitstring_from_chunks(bits, num_bits=None, **kwargs):
    """Interleave the chunks of ``bits`` into one new string.

    ``bits`` is cut with ``make_bit_chunks(bits, **kwargs)``. The output takes
    the first item of every chunk, then the second item of every chunk, and
    so on.

    bits: str
        Sequence to mix
    num_bits: int or None
        If truthy, stop as soon as this many items have been produced and
        return exactly ``num_bits`` of them (the previous implementation
        returned ``num_bits + 1``)
    **kwargs:
        Passed on to ``make_bit_chunks`` (``n``, ``splits``, ``prnt``)

    Returns the interleaved string.
    """
    bits_chunked, n_chunks, elements = make_bit_chunks(bits, **kwargs)

    # Collect pieces in a list and join once instead of growing a string
    # character by character.
    pieces = []
    for i in range(elements):
        pieces.extend(bits_chunked[j][i] for j in range(n_chunks))
        if num_bits and len(pieces) >= num_bits:
            return ''.join(pieces[:num_bits])

    return ''.join(pieces)


def multi_mix(st, n_mixes=None, chunks=None):
    """Apply the chunk-interleaving mix to ``st`` repeatedly.

    st: str
        Sequence to mix
    n_mixes: int or None
        Number of mixing passes; defaults to the number of chunks
    chunks: int or None
        Number of chunks per pass; defaults to ``int(sqrt(len(st))) - 1``,
        but never less than 1

    Returns the mixed string. If a pass reproduces the starting string, a
    message is printed and the result of the pass before it is returned
    (the starting string itself when that happens on the very first pass).
    """
    starting_st = st

    if chunks is None:
        # Very short inputs would otherwise yield 0 or -1 chunks, which the
        # chunking helper cannot divide by.
        n = max(int(np.sqrt(len(st))) - 1, 1)
    else:
        n = chunks
    print(f'chunks: {n}')

    if n_mixes is None:
        n_mixes = n

    # Initialised up front: previously a repetition on the first pass hit an
    # unbound ``old_st`` and raised UnboundLocalError.
    old_st = starting_st
    for i in tqdm(range(n_mixes)):
        st = make_bitstring_from_chunks(st, n=n, splits=0)
        if st == starting_st:
            print('sequence repeated! returnig last good combination!')
            return old_st
        old_st = st

    return st

In [None]:
def utf8_bits(text, utf8_bit_pos=-1, n_top_word_replace=100):
    """Pick one character from every space-separated token of the encoded text.

    text: iterable of str
        Pieces of text; they are concatenated without a separator
    utf8_bit_pos: int
        Index of the character taken from each token (-1 = last)
    n_top_word_replace: int
        How many leading entries of ``top_1000_words`` are passed to
        ``str_to_bits`` as ``to_replace``

    Returns a string with the picked characters in order. Tokens that are too
    short for ``utf8_bit_pos`` are reported with ``print`` and skipped.

    NOTE(review): the token format depends on ``str_to_bits``; presumably each
    token is the bit pattern of one character — confirm in
    benford_helper_functions.
    """
    full_text = ''.join(text)
    spaces_bits = str_to_bits(full_text, to_replace=top_1000_words[:n_top_word_replace], remove_spaces=True)

    picked = []
    for b in spaces_bits.split(" "):
        try:
            picked.append(b[utf8_bit_pos])
        except IndexError as e:
            # Only a too-short (e.g. empty) token is expected here; any other
            # error is a real bug and is allowed to propagate.
            print(e, b)

    # Single join instead of repeated string concatenation.
    return ''.join(picked)

In [None]:
def make_ints_with_n_bits(bits, n):
    """Parse ``bits`` as consecutive ``n``-character binary numbers.

    Only complete groups are used; a trailing partial group is ignored.
    Groups that evaluate to zero are left out, and their count is printed.

    Returns a NumPy array with the non-zero integers in order.
    """
    group_count = len(bits) // n

    values = [int(bits[k * n:(k + 1) * n], 2) for k in range(group_count)]
    nonzero = [value for value in values if value != 0]

    print(f'{len(values) - len(nonzero)} total zeros')
    return np.array(nonzero)


def reshape_and_truncate(arr, shape):
    """Reshape ``arr`` to ``shape``, dropping trailing items that do not fit.

    arr: np.ndarray
        Input array; it is read in flat order
    shape: tuple of int
        Target shape; one entry may be -1 to let NumPy infer that dimension

    Returns a new array of the requested shape built from the leading items
    of ``arr``.
    """
    # An explicit integer dtype keeps the product an integer even when no
    # fixed dimensions remain (shape == (-1,)): np.prod([]) is otherwise the
    # float 1.0, which cannot be used as a slice bound.
    desired_size_factor = np.prod([n for n in shape if n != -1], dtype=np.int64)
    if -1 in shape:  # implicit array size
        # Largest multiple of the fixed dimensions that fits into arr.
        desired_size = arr.size // desired_size_factor * desired_size_factor
    else:
        desired_size = desired_size_factor
    return arr.flat[:desired_size].reshape(shape)


def text_lognormal_dist(bits, n, d, div=1):
    """
    Multiply groups of ``d`` consecutive ``n``-bit integers read from ``bits``.

    bits: str
        Sequence of bits
    n: int
        Number of bits combined into one integer
    d: int
        Number of integers multiplied together per output value
    div: number
        Every integer is divided by this value before the multiplication

    Returns a 1-D array with one product per complete group of ``d``
    integers. Integers equal to zero are removed by ``make_ints_with_n_bits``
    before grouping.
    """
    values = make_ints_with_n_bits(bits, n=n)
    row_count = len(values) // d
    grouped = reshape_and_truncate(values, (row_count, d))
    scaled = grouped / div
    return np.prod(scaled, axis=1)

In [None]:
def make_float_chunks(fl, n):
    """Cut ``fl`` into ``n`` consecutive slices of ``len(fl) // n`` items each.

    Items left over after the n-th slice are dropped. Returns the slices as a
    list, in order.
    """
    size = len(fl) // n
    edges = [k * size for k in range(n + 1)]
    return [fl[lo:hi] for lo, hi in zip(edges, edges[1:])]

def make_floatarr_from_chunks(fl, num_fl=None, n=2): # n -> chunks
    """Interleave ``n`` equally long consecutive chunks of ``fl``.

    The output takes the first item of every chunk, then the second item of
    every chunk, and so on. Items beyond ``n * (len(fl) // n)`` are dropped.

    fl: sequence or np.ndarray
        Values to mix
    num_fl: int or None
        If truthy, the result is cut to exactly ``num_fl`` items (the previous
        implementation returned a Python list with ``num_fl + 1`` items here)
    n: int
        Number of chunks

    Returns a new NumPy array in every case.
    """
    values = np.asarray(fl)
    per_chunk = len(fl) // n
    tail = values.shape[1:]  # keeps row-wise behaviour for inputs with more than one axis

    # Chunk j occupies row j; swapping the first two axes and flattening
    # visits item i of every chunk before item i + 1 — the same order the
    # nested Python loops produced.
    stacked = values[:n * per_chunk].reshape((n, per_chunk) + tail)
    mixed = stacked.swapaxes(0, 1).reshape((n * per_chunk,) + tail).copy()

    if num_fl:
        mixed = mixed[:num_fl]
    return mixed

def multi_floatarr_from_chunks(fl, n_mixes, **kwargs):
    """Run ``make_floatarr_from_chunks`` ``n_mixes`` times in a row.

    Each pass receives the output of the previous one; ``kwargs`` are passed
    to every call. Progress is shown with tqdm. Returns the final values as a
    NumPy array.
    """
    mixed = fl
    for _ in tqdm(range(n_mixes)):
        mixed = make_floatarr_from_chunks(mixed, **kwargs)
    return np.array(mixed)

In [None]:
from stat_tests import chi2_test, ks_test


def bitstring_rng_test(rng_bits, take):
    """Run ``RNG_test`` on consecutive windows of ``take`` bits.

    The number of windows is ``len(rng_bits) // take``, but at least one, so a
    string shorter than ``take`` is still tested once. A progress line is
    printed before each window. Returns the list of ``RNG_test`` results in
    window order.
    """
    window_count = max(len(rng_bits) // take, 1)

    results = []
    start = 0
    for index in range(window_count):
        print(index, '/', window_count)
        window = rng_bits[start:start + take]
        results.append(RNG_test(window))
        start += take

    return results


def rng_all_comments_stat_tests(df, df_col, n_comments, bit_pos=-1, take=10**6, use_walk=False):
    """Run the whole text -> random bits pipeline batch by batch and test each batch.

    df: pd.DataFrame
        Table holding the text
    df_col: str
        Name of the text column in ``df``
    n_comments: int
        Rows per batch; a trailing incomplete batch is ignored
    bit_pos: int
        Passed to ``utf8_bits`` as ``utf8_bit_pos``
    take: int
        Window length handed to ``bitstring_rng_test``
    use_walk: bool
        Build the output bits with ``binary_tree_walk`` instead of
        ``get_bitstring``

    Returns ``(stat_results, rng_results)``: per batch, the
    ``[chi2_test(u), ks_test(u)]`` pair and the list returned by
    ``bitstring_rng_test``.
    """
    it = len(df[df_col]) // n_comments

    stat_results, rng_results = [], []

    for i in range(it):
        print(i, '/', it)
        # Slice the frame that was passed in. The previous version read the
        # notebook-global ``df_comments`` here and silently ignored ``df``.
        batch = df[df_col].values[i*n_comments:(i+1)*n_comments]
        comment_bits = utf8_bits(batch, utf8_bit_pos=bit_pos)
        comment_bits = multi_mix(comment_bits, n_mixes=1, chunks=16)

        prod = text_lognormal_dist(comment_bits, n=8, d=6, div=1e6)

        # Fractional part of log10 of every product.
        u = np.log10(prod) % 1
        chi2, ks = chi2_test(u), ks_test(u)

        if use_walk:
            rng_bits = binary_tree_walk(u).astype(str)
        else:
            rng_bits = get_bitstring(u)

        print(f'NUM BITS: {len(rng_bits)}')
        rng_bits = "".join(rng_bits)

        res = bitstring_rng_test(rng_bits, take=take)

        stat_results.append([chi2, ks])
        rng_results.append(res)

    return stat_results, rng_results

In [None]:
import pickle

In [None]:
# results = rng_all_comments_stat_tests(df_comments, 'body', 10**6, bit_pos=-2)

# pickle.dump(results, open("results_bit_m2.p", "wb"))
# results = pickle.load(open("results_bit_m2.p", "rb"))

In [None]:
# results = rng_all_comments_stat_tests(df_comments, 'body', 10**6, bit_pos=-3)

# pickle.dump(results, open("results_bit_m3.p", "wb"))
# results = pickle.load(open("results_bit_m3.p", "rb"))

In [None]:
# results = rng_all_comments_stat_tests(df_comments, 'body', 10**6, bit_pos=-4)

# pickle.dump(results, open("results_bit_m4.p", "wb"))
# results = pickle.load(open("results_bit_m4.p", "rb"))

In [None]:
# results = rng_all_comments_stat_tests(df_comments, 'body', 10**6, bit_pos=4, use_walk=True)

# pickle.dump(results, open("results_bit_m5_walk.p", "wb"))
# results = pickle.load(open("results_bit_m5_walk.p", "rb"))

In [None]:
# NOTE(security): pickle.load can execute arbitrary code — only load result files
# that were produced by this notebook.
def _load_pickle(path):
    """Unpickle one file and close its handle afterwards."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


res1 = _load_pickle("results_bit_m1.p")
res2 = _load_pickle("results_bit_m2.p")
res3 = _load_pickle("results_bit_m3.p")
res4 = _load_pickle("results_bit_m4.p")
res5 = _load_pickle("results_bit_m5.p")

In [None]:
def get_p_results(results):
    """Gather the ``'p'`` columns of all RNG test reports into one array.

    ``results[1]`` is iterated batch by batch and, within a batch, report by
    report; each report's ``'p'`` column is converted to float32.

    Returns an array with one row per report.
    """
    rows = [
        report['p'].values.astype(np.float32)
        for batch in results[1]
        for report in batch
    ]
    return np.array(rows)


def get_stat_results(results):
    """Split the per-batch statistics in ``results[0]`` into four parallel lists.

    Every entry of ``results[0]`` is indexed as ``[chi2_output, ks_output]``.
    From each output, element ``[0][0][0]`` is taken as the statistic and
    element ``[1]`` as the critical value.

    Returns ``(chi2, ks, chi2_crit, ks_crit)``.
    """
    chi2, ks = [], []
    chi2_crit, ks_crit = [], []

    for entry in results[0]:
        chi2_output = entry[0]
        chi2.append(chi2_output[0][0][0])
        chi2_crit.append(chi2_output[1])

        ks_output = entry[1]
        ks.append(ks_output[0][0][0])
        ks_crit.append(ks_output[1])

    return chi2, ks, chi2_crit, ks_crit

In [None]:
# Collect all loaded result sets, then keep only the first three (res1..res3).
results = [res1, res2, res3, res4, res5]
results = results[:3]

# One array of test p-values per retained result set.
p_results = [get_p_results(i) for i in results]

In [None]:
# 4x4 grid of axes; the last panel is hidden because only 15 tests are drawn.
fig, ax = set_size_decorator(plt.subplots, fraction=1.5, ratio='4:3')(4, 4)
ax[-1, -1].set_visible(False)
axs = ax.flatten()

# crit: 24.724970311318277
for i in range(15):
    labels = []
    for j in range(len(p_results)):
        p = p_results[j]
        _, bins, _ = axs[i].hist(p[:, i], histtype='step', bins=10)
        # NOTE(review): hist() returns the bin edges, so len(bins) is 11 for
        # bins=10 — confirm that chi2_test expects this value as n_bins.
        c2 = chi2_test(p[:, i], n_bins=len(bins))
        # Raw f-string: '\c' is not a valid escape sequence and triggers a
        # warning in a normal string literal; the rendered label is unchanged.
        labels.append(rf'bit {-j-1}: $\chi^2=${c2[0][0][0]:.2f}')

    axs[i].legend(labels, loc='lower left', fontsize=5)
    axs[i].set_title(f'test {i+1}')

savefig('text_rng_p_dists')

In [None]:
# Print, for every retained result set, the chi-square values next to their critical values.
# The KS values are unpacked as well but not shown.
for i, res in enumerate(results):
    chi2, ks, chi2_crit, ks_crit = get_stat_results(res)
    print(chi2, chi2_crit)
    print()

## more testing

In [None]:
# Manual, single-batch walk through the same steps as rng_all_comments_stat_tests.
n_comments = 10**6

# One character per encoded token from the first n_comments bodies (default utf8_bit_pos=-1).
comment_bits = utf8_bits(df_comments['body'].values[:n_comments])

In [None]:
# One interleaving pass over 16 chunks.
comment_bits = multi_mix(comment_bits, n_mixes=1, chunks=16)

# Products of 6 consecutive 8-bit integers, each integer divided by 1e6 first.
prod = text_lognormal_dist(comment_bits, n=8, d=6, div=1e6)

In [None]:
# Number of products obtained.
prod.shape

In [None]:
from stat_tests import chi2_test, ks_test

# Fractional part of log10 of every product.
u = np.log10(prod) % 1

# u = multi_floatarr_from_chunks(u, n_mixes=1, n=2)

# for i in range(10):
#     u = np.concatenate((u[1::2], u[::2]))

# Run both statistical tests on u (presumably uniformity tests — confirm in stat_tests).
chi2_test(u), ks_test(u)

In [None]:
# Turn u into output bits with get_bitstring and join the pieces into one string.
rng_bits = get_bitstring(u)
rng_bits = "".join(rng_bits)

# rng_bits = binary_tree_walk(u).astype(str)
# rng_bits = "".join(rng_bits)

In [None]:
# Number of generated bits.
len(rng_bits)

In [None]:
# Run the RNG test battery on the first 10**6 generated bits.
RNG_test(rng_bits[:10**6])