In [2]:
from multiprocessing import cpu_count
from pathlib import Path

from sklearn.model_selection import KFold

import dask.dataframe as ddf
import numpy as np
import pandas as pd
import spacy
import swifter
from cleantext import clean
import json
from tqdm import tqdm
from fastai.text import *

from functools import lru_cache 

pd.options.display.max_colwidth = 1000

import dask

from dask.diagnostics import ProgressBar

n_cores = cpu_count()
tqdm.pandas()

In [42]:
# Maximum number of tokens kept per text by truncate() below.
max_tokens_per_comment = 200

# Blank English pipeline; only its tokenizer is used (see truncate()).
# NOTE(review): the original author flagged this module-level object as
# "a problem with multiprocessing" -- presumably because it has to be shipped
# to the dask worker processes; confirm before changing how it is created.
tok = spacy.blank('en')

# truncate individual comments
def truncate(text, max_tokens_per_comment=max_tokens_per_comment):
    """Return `text` cut down to at most `max_tokens_per_comment` tokens.

    The text is tokenized with the module-level `tok` tokenizer and the text
    covered by the leading tokens is returned.

    Args:
        text: The string to shorten.
        max_tokens_per_comment: Maximum number of tokens to keep. The default
            is bound to the module-level setting when this cell is executed.

    Returns:
        The truncated text, or '' when `text` contains no tokens.
    """
    kept = tok.tokenizer(text)[:max_tokens_per_comment]
    # Guard the empty case explicitly: merging an empty span does not yield a
    # token, so str() of the result would not be the empty string.
    if len(kept) == 0:
        return ''
    # Span.text replaces the deprecated Span.merge() round-trip and does not
    # mutate the underlying Doc.
    return kept.text

def clean_and_truncate(text, linebreaks_token=False):
    """Normalize `text` with `clean` and shorten it with `truncate`.

    Args:
        text: Raw text to process.
        linebreaks_token: When true, every newline in the cleaned text is
            replaced by the literal marker ' xx_linebreak ' before truncation.

    Returns:
        The cleaned (and optionally newline-marked) text after truncation.
    """
    cleaned = clean(text, lower=False, no_urls=True,
                    no_emails=True, zero_digits=True)

    if not linebreaks_token:
        return truncate(cleaned)

    length_before = len(cleaned)
    marked = cleaned.replace('\n', ' xx_linebreak ')
    # Sanity checks: either nothing was replaced, or the marker is present,
    # and no raw newline may survive.
    assert length_before == len(marked) or 'xx_linebreak' in marked
    assert '\n' not in marked

    return truncate(marked)

In [43]:
# Flatten the JSON-lines dump into one record per post. Every post is
# annotated with its position in the thread (`rank`), the line number of its
# thread (`thread_id`) and the thread-level title / post kind / subreddit.
data_in = []
with open('/mnt/data/datasets/reddit-coarse-discourse/coarse_discourse_dump_reddit.json', encoding='utf-8') as f:
    for i, l in enumerate(f):
        j = json.loads(l)
        posts = j['posts']
        # Use the flag's value, not just the key's presence, so an explicit
        # `"is_self_post": false` is not tagged as a self post.
        post_kind = 'xx_selfpost' if j.get('is_self_post') else 'xx_linkpost'
        for rank, p in enumerate(posts):
            p['rank'] = rank
            p['thread_id'] = i
            p['title'] = j['title']
            p['is_self_post'] = post_kind
            p['subreddit'] = j['subreddit']
            # Annotations are not needed downstream; tolerate posts without them.
            p.pop('annotations', None)
        data_in += posts

In [44]:
data_in

[{'id': 't3_1bx6qw',
  'majority_link': 'none',
  'majority_type': 'announcement',
  'is_first_post': True,
  'body': "4/7/13  \n\n7/27/12  \n\nhttp://www.imdb.com/title/tt0073440/reference\n\nIt was only a few minutes into Robert Altman's homespun epic *Nashville* that I got the feeling I was watching a great movie. By the end it could not be denied. Now I'm sure it helps that I'm a musician, since this created an immediate connection to the subject matter. I spent a portion of the movie with my Telecaster in my lap trying to play along with the characters who all seem to be really playing and singing these songs. However I also am *not* a fan of country western, so that could have easily been a turn off.  \n\nTo begin describing the action in the film is daunting. I can't even process a lot of what I saw. This movie is extremely dense, and the first 30 minutes or so are spent just trying to figure out who people are. Their relationships to one another - some of which are purely incid

In [45]:
len(data_in)

116357

In [46]:
data_in[12]

{'in_reply_to': 't3_259tbh',
 'post_depth': 1,
 'id': 't1_chrzb0h',
 'majority_link': 't3_259tbh',
 'majority_type': 'appreciation',
 'body': ">very specific items that aren't available everywhere\n\nI find this funny, because I'm in the US and find it practically impossible to find gelatin in sheet form, digestive cookies, and Ribena light.\n\nI was, however, inspired by your recipe. to just wing it with what I *could* find.  I used sugar-free raspberry Jell-O and pre-made mini graham cracker crusts.  They came out entirely too Jell-O-like in texture, but still tasty, and I'll be tweaking them and trying again.  Thanks for the idea!",
 'rank': 1,
 'thread_id': 2,
 'title': '122cal black currant cheesecake!',
 'is_self_post': 'xx_selfpost',
 'subreddit': '1200isplenty'}

In [47]:
df = pd.DataFrame(data_in)

In [48]:
# df = df[df['body'].str.len() > 0]

In [9]:
# One row per thread: the distinct, non-null authors in order of first appearance.
all_authors = (
    df.groupby('thread_id')
      .agg({'author': lambda names: names.dropna().unique().tolist()})
      .reset_index()
      .rename(columns={'author': 'thread_authors'})
)

In [10]:
all_authors

Unnamed: 0,thread_id,thread_authors
0,0,"[DTX120, mcgrewf10]"
1,1,"[Keatonus, answerorreply]"
2,2,"[foodandweight, LinkFixerBotSnr, autowikibot, 10andback, Gingerstop]"
3,3,"[stanleyfarnsworth, yourenzyme, 2wheeljunkie, YourMothersAss, Vibora]"
4,4,"[WhatDaFlogNut, souobixo, espike845, savais, Luckyguny]"
5,5,"[MyMotivation, MyDefenceLevelisJuan, life_is_football, Oshlavv, MxChamp24, kingphilco]"
6,6,"[NoYoureWrongg, marcellman, celery_under, dopiates, SammehXD, Entycle, ColorMePanda, IzziTheEpic]"
7,7,"[Sailorgalaxy, MlLADY, Stuck1nher, 0urlasthope, SanfewOfFi, ConstantRager17]"
8,8,"[GrandmaTitz, Sly34me, legoknight, ostapko]"
9,9,"[Nhite, afriendRS, IFadedxMotionI, Quafleonrs, SanfewOfFi, Haavoittuva, ShaunDreclin]"


In [11]:
df = df.merge(all_authors, on = 'thread_id')

In [12]:
df

Unnamed: 0,author,body,id,in_reply_to,is_first_post,is_self_post,majority_link,majority_type,post_depth,rank,subreddit,thread_id,title,url,thread_authors
0,DTX120,"4/7/13 \n\n7/27/12 \n\nhttp://www.imdb.com/title/tt0073440/reference\n\nIt was only a few minutes into Robert Altman's homespun epic *Nashville* that I got the feeling I was watching a great movie. By the end it could not be denied. Now I'm sure it helps that I'm a musician, since this created an immediate connection to the subject matter. I spent a portion of the movie with my Telecaster in my lap trying to play along with the characters who all seem to be really playing and singing these songs. However I also am *not* a fan of country western, so that could have easily been a turn off. \n\nTo begin describing the action in the film is daunting. I can't even process a lot of what I saw. This movie is extremely dense, and the first 30 minutes or so are spent just trying to figure out who people are. Their relationships to one another - some of which are purely incidental - slowly become clear as things progress. It's an ensemble cast with no clear lead and lots of overlapping co...",t3_1bx6qw,,True,xx_selfpost,none,announcement,,0,100movies365days,0,DTX120: #87 - Nashville,https://www.reddit.com/r/100movies365days/comments/1bx6qw/dtx120_87_nashville/,"[DTX120, mcgrewf10]"
1,mcgrewf10,I've wanted to watch this for a long time. I was also turned off by the country western aspect.,t1_c9b2nyd,t3_1bx6qw,,xx_selfpost,t3_1bx6qw,elaboration,1.0,1,100movies365days,0,DTX120: #87 - Nashville,,"[DTX120, mcgrewf10]"
2,DTX120,"You strike me as the type who would appreciate it. I would give it a go. This is also my first Altman film so I didn't really know what to expect, except that people always compare PTA's Boogie Nights and Magnolia as being influenced by Altman. Magnolia is probably the best analog in terms of structure (having no lead character) but it is stylistically very different, much more melodramatic and transparently earnest.",t1_c9b30i1,t1_c9b2nyd,,xx_selfpost,t1_c9b2nyd,elaboration,2.0,2,100movies365days,0,DTX120: #87 - Nashville,,"[DTX120, mcgrewf10]"
3,mcgrewf10,"Yeah, I've always heard that Altman was famous for his ensemble casts. But I, too, have never seen an Altman.",t1_c9b6sj0,t1_c9b30i1,,xx_selfpost,t1_c9b30i1,elaboration,3.0,3,100movies365days,0,DTX120: #87 - Nashville,,"[DTX120, mcgrewf10]"
4,Keatonus,"Alright guys, little background about myself. I'm a good looking, 23 year old male, have had modest success in the past with women, but have decided that modest just isn't good enough anymore.\n\nThe problem has never been a lack of attention, or opportunity. It's just been not having the killer instinct and extreme AA, to the point where women will basically EYE FUCK me, and I still don't have the testicular fortitude to say anything unless they say something first.\n\nMy goal is to approach 100 sets by the end of April, and hopefully break my AA.",t3_omv7p,,True,xx_selfpost,none,announcement,,0,100sets,1,"Male, 23 years old. Going for 100 sets!",https://www.reddit.com/r/100sets/comments/omv7p/male_23_years_old_going_for_100_sets/,"[Keatonus, answerorreply]"
5,Keatonus,"**January 16th 3 Sets:** \nWent out shopping with my grandma as I visited her. We go to a Factory store. Opened 3 sets.\n\n* First Set: was the changing room gal, HB 6 cute asian gal. Asked her a lot of questions about what i was wearing. Made her laugh several times with corny jokes, introduced her to my grandma, and we all talked for a minute after I was done picking out the clothes I wanted and left. \n\n* Second set: was this older gal, mid 40's. Wouldn't give her an HB because it was more of just a casual chat about the outfit I put on. She said she didn't like it, I responded with ""aw, I'm not attractive?!"" and she said ""I never said that haha"" and grinned. I gave her a wink as she continued laughing and went into her stall.\n\n* Third set: **NUMBER CLOSE** As me and my Grandma are waiting in line to be helped checking out, and HB7 walks to the register and helps us. We IMMEDIATELY small talk. Exchanging intellectual talking points such as martyrs and assassinations, making ...",t1_c3igqif,t3_omv7p,,xx_selfpost,t3_omv7p,elaboration,1.0,1,100sets,1,"Male, 23 years old. Going for 100 sets!",,"[Keatonus, answerorreply]"
6,,,t1_c3imrkb,t1_c3igqif,,xx_selfpost,t1_c3igqif,humor,2.0,2,100sets,1,"Male, 23 years old. Going for 100 sets!",,"[Keatonus, answerorreply]"
7,answerorreply,"dude, these sets are awesome. You're doing great. Sounds like you're a natural at meeting people once you get past the AA.",t1_c3ij8z0,t3_omv7p,,xx_selfpost,t3_omv7p,appreciation,1.0,3,100sets,1,"Male, 23 years old. Going for 100 sets!",,"[Keatonus, answerorreply]"
8,Keatonus,"Thanks man! Yeah I'm trying to just keep the ""who cares have fun"" attitude on. Because normally I freak out about what to say. But I find if I just bring a subject up and get some momentum going, I'm actually a pretty good conversationalist.",t1_c3inx9t,t1_c3ij8z0,,xx_selfpost,t1_c3ij8z0,appreciation,2.0,4,100sets,1,"Male, 23 years old. Going for 100 sets!",,"[Keatonus, answerorreply]"
9,Keatonus,"Ok. Update! Sorry I haven't been doing this day to day like I should be. Let me see what I can remember\n\n**January 17-23 Sets:** Various sets I remember opening, unfortunately no number closes. Just good old fashioned chit chatting.\n\n* 4th set: During the 49ers and Giants game I went to grab a bite to eat at the local burger joint. Before ordering since no line was behind me I decided to small talk with the Female at the register HB5. Mostly talked about sports, a little about myself, and her coworker HB6 joined in the conversation. This lasted about 5 minutes, learned both had boyfriends through small talk and proceeded to order.\n\n* 5th set: Also small talked with the female who works at my local grocery chain. She was new there, we talked about the rain of all things haha. Male coworker semi interrupted us, still small talked with both. Then ordered my drink.\n\nBlahhh, I know there's more than this...Hence why I should try to update every day. Be back soon guys.",t1_c3k2lc1,t3_omv7p,,xx_selfpost,t3_omv7p,elaboration,1.0,5,100sets,1,"Male, 23 years old. Going for 100 sets!",,"[Keatonus, answerorreply]"


In [13]:
def user_index(row):
    """Return the placeholder token for the author of `row`.

    Looks up row['author'] in row['thread_authors'] and returns
    'xx_user_<position>'; returns '' when the author is not in that list.
    """
    thread_authors = row['thread_authors']
    poster = row['author']
    if poster not in thread_authors:
        return ""
    return "xx_user_" + str(thread_authors.index(poster))

In [14]:
df['user_index'] = df.apply(user_index, axis=1)

In [15]:
def replace_user(row):
    """Replace mentions of thread authors in row['body'] with placeholders.

    Each name in row['thread_authors'] is replaced by 'xx_user_<position>',
    where position is the name's index in that list.

    All names are substituted in a single pass, longest name first, so that
      * a name that is a prefix of another (e.g. 'Sly' / 'Sly34me') cannot
        clobber the longer one, and
      * a name such as 'user' cannot match inside a placeholder that was
        inserted for a different author.
    Matching is still plain substring matching (no word boundaries).

    Returns:
        The rewritten text, or '' when the body is missing.
    """
    import re

    text = row['body']
    if pd.isna(text):
        return ''

    # First occurrence wins for duplicate names; empty names are skipped
    # because they would match at every position.
    placeholders = {}
    for i, u in enumerate(row['thread_authors']):
        if u:
            placeholders.setdefault(u, f"xx_user_{i}")
    if not placeholders:
        return text

    longest_first = sorted(placeholders, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(name) for name in longest_first))
    return pattern.sub(lambda match: placeholders[match.group(0)], text)

In [16]:
df['text'] = df.apply(replace_user, axis=1)

In [19]:
df[df['text'].str.contains('xx_user')]

Unnamed: 0,author,body,id,in_reply_to,is_first_post,is_self_post,majority_link,majority_type,post_depth,rank,subreddit,thread_id,title,url,thread_authors,user_index,text
14,LinkFixerBotSnr,/r/keto \n\n*****\n[^report ^a ^**problem**](http://reddit.com/r/LinkFixerBotSnr) ^| [^delete ^comment](http://www.reddit.com/message/compose?to=LinkFixerBotSnr&subject=Comment%20Deletion%20%28Parent%20Commenter%20Only%29&message=%2Bdelete+chs0fj3) ^| [^source ^code](http://github.com/WinneonSword/LFB) ^| [^contact ^developer](http://reddit.com/user/WinneonSword),t1_chs0fj3,t1_chs0eqa,,xx_selfpost,t1_chs0eqa,elaboration,3.0,3,1200isplenty,2,122cal black currant cheesecake!,,"[foodandweight, LinkFixerBotSnr, autowikibot, 10andback, Gingerstop]",xx_user_1,/r/keto \n\n*****\n[^report ^a ^**problem**](http://reddit.com/r/xx_user_1) ^| [^delete ^comment](http://www.reddit.com/message/compose?to=xx_user_1&subject=Comment%20Deletion%20%28Parent%20Commenter%20Only%29&message=%2Bdelete+chs0fj3) ^| [^source ^code](http://github.com/WinneonSword/LFB) ^| [^contact ^developer](http://reddit.com/user/WinneonSword)
16,autowikibot,"#####&#009;\n\n######&#009;\n\n####&#009;\n [**Digestive biscuit**](https://en.wikipedia.org/wiki/Digestive%20biscuit): [](#sfw) \n\n---\n\n>A __digestive biscuit__, sometimes described as a __sweet-meal biscuit__, is a semi-[sweet](https://en.wikipedia.org/wiki/Sweetness) [biscuit](https://en.wikipedia.org/wiki/Biscuit) (usually known in [American English](https://en.wikipedia.org/wiki/American_English) as a ""cookie"" ) that originated in the [United Kingdom](https://en.wikipedia.org/wiki/United_Kingdom) and is popular worldwide. The term ""digestive"" is derived from the belief that they had [antacid](https://en.wikipedia.org/wiki/Antacid) [properties](https://en.wikipedia.org/wiki/Chemical_property) due to the use of [sodium bicarbonate](https://en.wikipedia.org/wiki/Sodium_bicarbonate) when they were first developed. Historically, some producers used [diastatic](https://en.wikipedia.org/wiki/%C2%B0Lintner) [malt extract](https://en.wikipedia.org/wiki/Malt) to ""digest"" some of the...",t1_chs51gj,t1_chs51e3,,xx_selfpost,t1_chs51e3,elaboration,4.0,5,1200isplenty,2,122cal black currant cheesecake!,,"[foodandweight, LinkFixerBotSnr, autowikibot, 10andback, Gingerstop]",xx_user_2,"#####&#009;\n\n######&#009;\n\n####&#009;\n [**Digestive biscuit**](https://en.wikipedia.org/wiki/Digestive%20biscuit): [](#sfw) \n\n---\n\n>A __digestive biscuit__, sometimes described as a __sweet-meal biscuit__, is a semi-[sweet](https://en.wikipedia.org/wiki/Sweetness) [biscuit](https://en.wikipedia.org/wiki/Biscuit) (usually known in [American English](https://en.wikipedia.org/wiki/American_English) as a ""cookie"" ) that originated in the [United Kingdom](https://en.wikipedia.org/wiki/United_Kingdom) and is popular worldwide. 
The term ""digestive"" is derived from the belief that they had [antacid](https://en.wikipedia.org/wiki/Antacid) [properties](https://en.wikipedia.org/wiki/Chemical_property) due to the use of [sodium bicarbonate](https://en.wikipedia.org/wiki/Sodium_bicarbonate) when they were first developed. Historically, some producers used [diastatic](https://en.wikipedia.org/wiki/%C2%B0Lintner) [malt extract](https://en.wikipedia.org/wiki/Malt) to ""digest"" some of the..."
165,BrotherOmad,Thank you both M a g 3 and SpiritOfFi got it!,t1_crd71jb,t1_crd6wsl,,xx_selfpost,t1_crd6wsl,appreciation,2.0,3,2007scape,20,What's going on here? Can't choose damaged book.,,"[BrotherOmad, SpiritOfFi, M__A___G___3]",xx_user_0,Thank you both M a g 3 and xx_user_1 got it!
306,The_Jims,"/u/Ilovetmac and /u/BuckeyeLeaves, would you guys be able to start a game, then have the Suns quit out? Sorry for the inconvenience, but for whatever reason 2K just doesn't have your other games on the admin schedule.",t1_ce6fm2d,t3_1tc3xq,,xx_selfpost,t3_1tc3xq,question,1.0,1,2k14oa,39,12/20 CONFERENCE SEMI-FINALS/FINALS GAME THREAD,,"[The_Jims, BuckeyeLeaves, IamLordFlacko, Bangers69, PopeNimrod]",xx_user_0,"/u/Ilovetmac and /u/xx_user_1, would you guys be able to start a game, then have the Suns quit out? Sorry for the inconvenience, but for whatever reason 2K just doesn't have your other games on the admin schedule."
357,justinjustin7,I'd guess that the issue is with the card reader as profblackjack suggested. I had the same issue the other day with an sdhc in my card reader that doesn't work. I ended up [using my Wii with homebrew](https://www.reddit.com/r/AskReddit/comments/2uea6t/what_kind_of_electronic_life_hacks_have_you/co7m67d) to transfer the data. Either get a new reader (get one that is sdhc capable for your higher memory needs) or if you have a homebrew enabled Wii you can try doing something similar to what I did.,t1_codyoy8,t3_2v2t04,,xx_selfpost,t3_2v2t04,answer,1.0,4,3ds,44,Help with 3DS SD card problems.,,"[ElvenHero, profblackjack, justinjustin7]",xx_user_2,I'd guess that the issue is with the card reader as xx_user_1 suggested. I had the same issue the other day with an sdhc in my card reader that doesn't work. I ended up [using my Wii with homebrew](https://www.reddit.com/r/AskReddit/comments/2uea6t/what_kind_of_electronic_life_hacks_have_you/co7m67d) to transfer the data. Either get a new reader (get one that is sdhc capable for your higher memory needs) or if you have a homebrew enabled Wii you can try doing something similar to what I did.
482,guPPer,"I'll be cheering on from Melbourne, at 5am, the sun might just start showing some rays. I'll with you in spirit /u/Waldinator !",t1_cenlb4f,t3_1v0rf9,,xx_selfpost,t3_1v0rf9,agreement,1.0,3,49ers,58,A message from Down Under,,"[Waldinator, Jussi56, guPPer, niteaurora, beall49, gwo, EndersBuggers, IvyGold, kahrinn, wotsupdog, ForeverFun, jroc83]",xx_user_2,"I'll be cheering on from Melbourne, at 5am, the sun might just start showing some rays. I'll with you in spirit /u/xx_user_0 !"
583,avazah,"Well, your measurements put you at 30E, and these bras are 32E. That's one cup size larger than 30E, so it's no wonder the pour moi doesn't fit. It may fit in 30E, I can't really say, or 32DD, but I believe it is too big for you here.\n\nI also agree with what ButTheBoobies says about the elegance. Can't really make a call on whether it fits or not just yet!",t1_c9vunc6,t1_c9vq1mq,,xx_selfpost,t1_c9vq1mq,answer,3.0,3,abrathatfits,67,Got 2 bras in same size. One seems to fit perfectly while the other doesn't fit. Help me determine which size to get for the second one!,,"[halpmewithbra, ButTheBoobies, avazah, unseenarchives, t_maia]",xx_user_2,"Well, your measurements put you at 30E, and these bras are 32E. That's one cup size larger than 30E, so it's no wonder the pour moi doesn't fit. It may fit in 30E, I can't really say, or 32DD, but I believe it is too big for you here.\n\nI also agree with what xx_user_1 says about the elegance. Can't really make a call on whether it fits or not just yet!"
585,unseenarchives,I believe it's just an incompatible shape. /u/t_maia recommended it for me and I am very much the opposite of shallow. Narrow root and tons of projection.,t1_c9vr2xe,t1_c9vq7xu,,xx_selfpost,t1_c9vq7xu,elaboration,4.0,5,abrathatfits,67,Got 2 bras in same size. One seems to fit perfectly while the other doesn't fit. Help me determine which size to get for the second one!,,"[halpmewithbra, ButTheBoobies, avazah, unseenarchives, t_maia]",xx_user_3,I believe it's just an incompatible shape. /u/xx_user_4 recommended it for me and I am very much the opposite of shallow. Narrow root and tons of projection.
586,t_maia,">/u/t_maia recommended it for me \n\nYou got the wrong bra I think. \n\nPourMoi? Amour is available in two versions, one padded plunge (which is recommended for shallow shapes, PM-1500) and an unpadded balcony bra (which you got, PM-1502). \n\nSearch for PourMoi? Amour in this subreddit and you'll find fitchecks showing the other bra. \n\nYou can also compare the two at bratabase. \n\nhttp://www.bratabase.com/browse/pour-moi/amour-padded-pm-1500/\nhttp://www.bratabase.com/browse/pour-moi/amour-underwired-non-padded-pm-1502/\n\nIn addition I think the PourMoi? Amour sits too low just like the CK Elegance. You might want to try this bra again with the wires probably in the inframammary fold. \n\nButTheBoobies already covered the rest. \n\nBTW, since you got the unpadded now, could you please enter it into bratabase before you send it back? Thanks.",t1_c9wcvzn,t1_c9vr2xe,,xx_selfpost,t1_c9vr2xe,elaboration,5.0,6,abrathatfits,67,Got 2 bras in same size. One seems to fit perfectly while the other doesn't fit. Help me determine which size to get for the second one!,,"[halpmewithbra, ButTheBoobies, avazah, unseenarchives, t_maia]",xx_user_4,">/u/xx_user_4 recommended it for me \n\nYou got the wrong bra I think. \n\nPourMoi? Amour is available in two versions, one padded plunge (which is recommended for shallow shapes, PM-1500) and an unpadded balcony bra (which you got, PM-1502). \n\nSearch for PourMoi? Amour in this subreddit and you'll find fitchecks showing the other bra. \n\nYou can also compare the two at bratabase. \n\nhttp://www.bratabase.com/browse/pour-moi/amour-padded-pm-1500/\nhttp://www.bratabase.com/browse/pour-moi/amour-underwired-non-padded-pm-1502/\n\nIn addition I think the PourMoi? Amour sits too low just like the CK Elegance. You might want to try this bra again with the wires probably in the inframammary fold. \n\nxx_user_1 already covered the rest. 
\n\nBTW, since you got the unpadded now, could you please enter it into bratabase before you send it back? Thanks."
592,crazymusicalgenius96,"That's totally fine! The recommendations /u/Goddess_Keira gave sound pretty good, I hope you can find some you like! You don't have to worry about the gore tacking in a wire-free bra, as they aren't designed to do that, but the other signs of good fit will still be important to look out for.",t1_cn60kfs,t1_cn5ykrq,,xx_selfpost,t1_cn5ykrq,,3.0,4,abrathatfits,68,I'm desperate and need help finding comfortable wireless bras.,,"[inbedwithabook, crazymusicalgenius96, Goddess_Keira, HorchataBorracha, gigglesmcbug, BotticellusRex, lizzyborden42, lo_dolly_lolita]",xx_user_1,"That's totally fine! The recommendations /u/xx_user_2 gave sound pretty good, I hope you can find some you like! You don't have to worry about the gore tacking in a wire-free bra, as they aren't designed to do that, but the other signs of good fit will still be important to look out for."


In [20]:
# Clean and truncate every comment body in parallel worker processes.
# npartitions must be a positive integer: `n_cores / 4` is a float in
# Python 3 and drops below 1 on machines with fewer than 4 cores.
with ProgressBar():
    df['text_proc'] = ddf.from_pandas(df, npartitions=max(1, n_cores // 4)).map_partitions(
        lambda dfx: dfx['text'].apply(lambda x: clean_and_truncate(x, linebreaks_token=True))).compute(scheduler="processes")

[########################################] | 100% Completed |  1min  0.9s


In [21]:
# Clean and truncate every thread title in parallel worker processes.
# npartitions must be a positive integer: `n_cores / 4` is a float in
# Python 3 and drops below 1 on machines with fewer than 4 cores.
with ProgressBar():
    df['title_proc'] = ddf.from_pandas(df, npartitions=max(1, n_cores // 4)).map_partitions(
        lambda dfx: dfx['title'].apply(lambda x: clean_and_truncate(x, linebreaks_token=True))).compute(scheduler="processes")

[########################################] | 100% Completed | 33.8s


In [22]:
# Give root posts explicit sentinel values (depth 0, parent -1) so the
# thread-walking code below can compare against them instead of NaN.
for column, sentinel in (('post_depth', 0), ('in_reply_to', -1)):
    df[column] = df[column].fillna(sentinel)

In [23]:
def get_before_in_level(thread_id, post_id, depth, reply_to, frame=None):
    """Collect the posts on one reply level up to and including `post_id`.

    Selects the rows that belong to `thread_id`, sit at `depth` and reply to
    `reply_to`, then walks them in frame order until `post_id` is reached.

    Args:
        thread_id: Value to match in the 'thread_id' column.
        post_id: Value of the 'id' column at which to stop.
        depth: Value to match in the 'post_depth' column.
        reply_to: Value to match in the 'in_reply_to' column.
        frame: DataFrame to search. Defaults to the notebook-global `df`,
            which keeps existing four-argument calls working.

    Returns:
        A list of row Series ending with the row whose id is `post_id`, or
        None (after printing the lookup key) when no such row is found.
    """
    if frame is None:
        frame = df

    same_level = frame[(frame['thread_id'] == thread_id)
                       & (frame['post_depth'] == depth)
                       & (frame['in_reply_to'] == reply_to)]
    res = []
    for _, row in same_level.iterrows():
        res.append(row)
        if row['id'] == post_id:
            return res

    # Not expected to happen: the requested post is not on this level.
    print(thread_id, post_id, depth)
    return None

In [24]:
def prepend_previous(row, df):
    """Build the context string for one post from its preceding posts.

    Starting at the post in `row`, the function walks up the reply chain one
    depth level at a time. On every level it collects (via
    get_before_in_level) the posts up to and including the current one, then
    moves on to the parent of the last collected post. The collected posts are
    ordered by 'rank' and serialized into a single marker-delimited string.

    Args:
        row: The post to build the context for; reads 'thread_id',
            'post_depth', 'id' and 'in_reply_to'.
        df: DataFrame used to look up parent posts and, when a parent is
            missing, the closest lower-ranked post of the same thread.

    Returns:
        The serialized context string.
    """
    thread_id = row['thread_id']
    depth = row['post_depth']
    post_id = row['id']
    reply_to_id = row['in_reply_to']

    collected = []

    while depth >= 0:
        level_posts = get_before_in_level(thread_id, post_id, depth, reply_to_id)
        if level_posts is None:
            # Happens when the expected post (e.g. the root) is not in the data.
            print('fail: ', thread_id, post_id, depth, reply_to_id)
        else:
            collected.extend(level_posts)
            # Continue with the parent of the last post collected on this level.
            post_id = level_posts[-1]["in_reply_to"]
            if str(post_id) != str(-1):
                parent = df[df['id'] == post_id]
                if len(parent) > 0:
                    reply_to_id = parent.iloc[0]['in_reply_to']
                else:
                    print('FUCK, there is something wrong')

                    # The parent is missing: fall back to the post of this
                    # thread with the highest rank below the current one.
                    rank = 0
                    if len(level_posts) > 0:
                        rank = level_posts[-1]
                        if 'rank' in rank:
                            rank = rank['rank']

                    earlier = df[(df['rank'] < rank) & (df['thread_id'] == thread_id)]
                    if len(earlier) > 0:
                        nearest = earlier.sort_values('rank', ascending=True).iloc[-1]
                        post_id = nearest["id"]
                        reply_to_id = nearest['in_reply_to']
                        # +1 compensates for the decrement at the end of the loop.
                        depth = nearest['post_depth'] + 1

        depth -= 1

    columns = ('subreddit', 'user_index', 'is_self_post', 'rank', 'text_proc', 'title_proc')
    context = pd.DataFrame({name: [post[name] for post in collected] for name in columns})
    # Sorting by rank restores the order of the posts within the thread.
    context = context.sort_values('rank', ascending=True)

    # NOTE(review): the string both starts and ends with 'xx_thread_end'; the
    # leading marker may have been meant as a start marker -- kept as is.
    pieces = ['xx_thread_end ']
    for _, post in context.iterrows():
        if post['rank'] == 0:
            # The root post additionally carries the title and the post kind.
            pieces.append(' xx_comment_start ' + post['title_proc'] + ' xx_sep ' + post['text_proc'] + ' xx_sep ' + post['is_self_post'] + ' xx_sep2 ' + post['user_index'] + ' xx_sep2 ' + post['subreddit'] + ' xx_comment_end ')
        else:
            pieces.append(' xx_comment_start ' + post['text_proc'] + ' xx_sep2 ' + post['user_index'] + ' xx_sep2 ' + post['subreddit'] + ' xx_comment_end ')
    pieces.append(' xx_thread_end')

    return ''.join(pieces)

In [25]:
# Build the context string for every post in parallel worker processes.
# The frame is indexed by thread_id before partitioning and each partition is
# handed to prepend_previous as its lookup frame.
# npartitions must be a positive integer: `n_cores / 2` is a float in Python 3.
# NOTE(review): `meta=('str')` is just the string 'str', not a (name, dtype)
# tuple -- kept unchanged; confirm it is what dask should be told here.
with ProgressBar():
    dask_result = ddf.from_pandas(df.set_index('thread_id'), npartitions=max(1, n_cores // 2)).map_partitions(
        lambda dfx: dfx.reset_index().apply(prepend_previous, axis=1, df=dfx.reset_index()), meta=('str')).compute(scheduler="processes")

[                                        ] | 0% Completed | 39.8sFUCK, there is something wrong
[                                        ] | 0% Completed | 40.0sFUCK, there is something wrong
[                                        ] | 0% Completed | 52.6sFUCK, there is something wrong
[                                        ] | 0% Completed | 52.7sFUCK, there is something wrong
[                                        ] | 0% Completed |  1min 15.7sFUCK, there is something wrong
[                                        ] | 0% Completed |  1min 15.8sFUCK, there is something wrong
[                                        ] | 0% Completed |  1min 52.3sFUCK, there is something wrong
[                                        ] | 0% Completed |  1min 54.0sFUCK, there is something wrong
[                                        ] | 0% Completed |  2min 25.5sFUCK, there is something wrong
[                                        ] | 0% Completed |  2min 26.1sFUCK, there is something wrong
[   

[                                        ] | 0% Completed |  5min 35.9sFUCK, there is something wrong
[                                        ] | 0% Completed |  5min 44.9sFUCK, there is something wrong
[                                        ] | 0% Completed |  5min 46.7sFUCK, there is something wrong
[                                        ] | 0% Completed |  5min 47.1sFUCK, there is something wrong
[                                        ] | 0% Completed |  5min 47.8sFUCK, there is something wrong
[                                        ] | 0% Completed |  5min 47.9sFUCK, there is something wrong
FUCK, there is something wrong
[                                        ] | 0% Completed |  5min 48.7sFUCK, there is something wrong
[                                        ] | 0% Completed |  5min 49.1sFUCK, there is something wrong
[                                        ] | 0% Completed |  5min 49.3sFUCK, there is something wrong
[                                        ] | 0% Com

FUCK, there is something wrong
[                                        ] | 0% Completed |  8min  0.6sFUCK, there is something wrong
[                                        ] | 0% Completed |  8min  3.1sFUCK, there is something wrong
[                                        ] | 0% Completed |  8min  3.9sFUCK, there is something wrong
[                                        ] | 0% Completed |  8min  6.4sFUCK, there is something wrong
[                                        ] | 0% Completed |  8min 21.5sFUCK, there is something wrong
[                                        ] | 0% Completed |  8min 21.6sFUCK, there is something wrong
[                                        ] | 0% Completed |  8min 22.2sFUCK, there is something wrong
[                                        ] | 0% Completed |  8min 22.5sFUCK, there is something wrong
[                                        ] | 0% Completed |  8min 27.3sFUCK, there is something wrong
[                                        ] | 0% Com

[                                        ] | 0% Completed | 11min 43.7sFUCK, there is something wrong
FUCK, there is something wrong
[                                        ] | 0% Completed | 11min 45.0sFUCK, there is something wrong
[                                        ] | 0% Completed | 12min  1.8sFUCK, there is something wrong
[                                        ] | 0% Completed | 12min 15.6sFUCK, there is something wrong
[                                        ] | 0% Completed | 12min 15.7sFUCK, there is something wrong
[                                        ] | 0% Completed | 12min 27.6sFUCK, there is something wrong
[                                        ] | 0% Completed | 12min 44.3sFUCK, there is something wrong
[                                        ] | 0% Completed | 12min 44.4sFUCK, there is something wrong
[                                        ] | 0% Completed | 12min 44.6sFUCK, there is something wrong
[                                        ] | 0% Com

[####                                    ] | 10% Completed | 15min  1.2sFUCK, there is something wrong
[####                                    ] | 10% Completed | 15min  1.4sFUCK, there is something wrong
[####                                    ] | 10% Completed | 15min 14.2sFUCK, there is something wrong
[####                                    ] | 10% Completed | 15min 14.5sFUCK, there is something wrong
[####                                    ] | 10% Completed | 15min 15.0sFUCK, there is something wrong
[########                                ] | 20% Completed | 15min 24.9sFUCK, there is something wrong
[########                                ] | 20% Completed | 15min 25.0sFUCK, there is something wrong
[########                                ] | 20% Completed | 15min 25.1sFUCK, there is something wrong
[########                                ] | 20% Completed | 15min 25.3sFUCK, there is something wrong
[########                                ] | 20% Completed | 15min 30.1sF

In [26]:
# `.values` drops the index, so this assignment is purely positional.
# NOTE(review): it therefore assumes dask_result comes back in the same row
# order as df -- confirm, or join on the post id instead.
df['text_final'] = dask_result.values

In [64]:
df.to_pickle('reddit.pkl')

In [3]:
df = pd.read_pickle('reddit.pkl')

In [4]:
# 10-fold split over thread ids (not over posts), so all posts of a thread
# end up on the same side of a split. The seed is fixed for reproducibility.
kf = KFold(n_splits=10, random_state=1312, shuffle=True)
# One sample per thread id in 0..max. Use max() rather than the last row's
# value so the count does not depend on the row order of df.
xx = [[i] for i in range(int(df['thread_id'].max()) + 1)]

# folds[k] == [train_thread_ids, validation_thread_ids]
folds = [[train_index, test_index] for train_index, test_index in kf.split(xx)]

In [5]:
fold_id = 0

In [6]:
folds[0][0][20]

25

In [7]:
df.shape

(116357, 20)

In [8]:
df_tr = df.set_index('thread_id').loc[folds[fold_id][0]].reset_index()

In [9]:
df_tr.shape

(104760, 20)

In [10]:
df_vl = df.set_index('thread_id').loc[folds[fold_id][1]].reset_index()

In [11]:
df_vl.shape

(11597, 20)

In [12]:
len(df_vl) + len(df_tr)

116357

In [13]:
assert len(df_vl) + len(df_tr) == len(df)

In [14]:
df_tr['text_final'].iloc[3]

"xx_thread_end  xx_comment_start DTX000: #00 - Nashville xx_sep 0/0/00 0/00/00 <URL> xx_linebreak It was only a few minutes into Robert Altman's homespun epic *Nashville* that I got the feeling I was watching a great movie. By the end it could not be denied. Now I'm sure it helps that I'm a musician, since this created an immediate connection to the subject matter. I spent a portion of the movie with my Telecaster in my lap trying to play along with the characters who all seem to be really playing and singing these songs. However I also am *not* a fan of country western, so that could have easily been a turn off. To begin describing the action in the film is daunting. I can't even process a lot of what I saw. This movie is extremely dense, and the first 00 minutes or so are spent just trying to figure out who people are. Their relationships to one another - some of which are purely incidental - slowly become clear as things progress. It's an ensemble cast with no clear lead and lots of

In [15]:
df_tr.shape

(104760, 20)

In [16]:
df_vl.shape

(11597, 20)

In [17]:
df_tr = df_tr.dropna(subset=['body'])
df_vl = df_vl.dropna(subset=['body'])

In [18]:
df_tr.shape

(102998, 20)

In [19]:
df_vl.shape

(11420, 20)

In [20]:
cut = 1400
exp = 're_exp_f_' + str(cut) + '_' + str(fold_id)
exp

're_exp_f_1400_0'

In [21]:
data_lm = TextLMDataBunch.from_df('/mnt/data/group07/johannes/reddit/' + exp, df_tr, df_vl, max_vocab=30000, text_cols='text_final', tokenizer=Tokenizer(cut_n_from_behind=cut - 2))

In [22]:
data_lm.save(cache_name='tmp_lm')

In [23]:
df_tr_cl = df_tr.dropna(subset=['majority_type', 'body'])

In [24]:
# drop other

In [25]:
df_tr_cl = df_tr_cl[df_tr_cl['majority_type']!='other']

In [26]:
df_tr_cl.shape

(90205, 20)

In [27]:
df_vl_cl = df_vl.dropna(subset=['majority_type', 'body'])

In [28]:
df_vl_cl = df_vl_cl[df_vl_cl['majority_type']!='other']

In [29]:
df_vl_cl.shape

(10131, 20)

In [30]:
data_clas = TextClasDataBunch.from_df('/mnt/data/group07/johannes/reddit/' + exp, df_tr_cl, df_vl_cl,
                                          vocab=data_lm.train_ds.vocab, bs=64, text_cols=['text_final'], label_cols='majority_type' ,tokenizer=Tokenizer(cut_n_from_behind=cut -2))

In [31]:
data_clas.save(cache_name='tmp_cl')