In [1]:
import pandas as pd
import ujson
import json
pd.options.display.max_colwidth = 5000
import matplotlib

In [2]:
# Subreddits to load. Each one is mapped to exactly one class label below:
# 'political' / 'nonpolitical' are the two labelled classes, and 'canada'
# is set aside under the label 'test'.
subreddit_list = ['canada', 'liberal', 'conservative', 'politics',
                  "twoxchromosomes", "showerthoughts", "tifu"]
political = ['liberal', 'conservative', 'politics']
nonpolitical = ["twoxchromosomes", "showerthoughts", "tifu"]

# Read the feather files, tagging every row with the class of its subreddit.
# The per-subreddit frames are collected in a list and concatenated once:
# growing `data` with pd.concat inside the loop re-copies every previously
# loaded row on each iteration and starts from an empty frame.
frames = []
for sub in subreddit_list:
    rf = pd.read_feather('../data/feather_files/RS_2019_'+sub+'_df.feather')
    if sub in political:
        rf['class'] = 'political'
    elif sub in nonpolitical:
        rf['class'] = 'nonpolitical'
    elif sub == 'canada':
        rf['class'] = 'test'
    else:
        # Fallback for a subreddit that is in none of the lists above.
        # Unreachable with the current subreddit_list; label spelling kept
        # unchanged so any existing consumer of it keeps working.
        rf['class'] = 'Unkown'
    frames.append(rf)
data = pd.concat(frames) if frames else pd.DataFrame()

# Keep only posts whose body is longer than 10 characters (bodies of exactly
# 10 characters are dropped too). reset_index() without drop=True keeps each
# row's previous per-file index as a regular column named 'index'.
data = data[(data.selftext.astype(str).str.len()>10)].reset_index()
print(data.groupby('class').describe(percentiles=[.5]))

# Combine the title with the post body into a single 'text' column.
data['text'] = data.title + " " + data.selftext
# The two source columns are no longer needed once 'text' exists.
data = data.drop(columns=["title", "selftext"])

                index                                                       \
                count          mean           std   min      50%       max   
class                                                                        
nonpolitical  14804.0   7282.288503   8386.425392   0.0   5001.0   41450.0   
political       885.0  43950.302825  47488.106523  40.0  16476.0  139049.0   
test           2147.0   7482.789474   4476.914083   1.0   7538.0   17225.0   

             num_comments                                                  \
                    count         mean          std   min    50%      max   
class                                                                       
nonpolitical      14804.0   143.823021   457.592274  15.0   26.0  16639.0   
political           885.0  3414.674576  7093.978640  15.0  177.0  52183.0   
test               2147.0    71.351653   204.972232  15.0   31.0   6441.0   

                score                                               

In [3]:
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
import random
import swifter

In [4]:
def clean_text(text):
    """
    Normalise a raw post into a space-separated string of lemmas.

    Input : text - the raw string to clean
    Output: cleaned text - the surviving tokens, lowercased and lemmatized,
            joined with single spaces
    Process (applied to every token produced by word_tokenize):
        1. Remove tokens that are not purely alphabetic
        2. Remove tokens of less than 3 characters
        3. Remove tokens whose lowercase form is an English stopword
        4. Transform the surviving tokens to lowercase
        5. Lemmatize them - first as verbs, then as nouns
        * Filtering (1-3) happens on the original token before it is
          lemmatized, so the length check uses the un-lemmatized word.
    """
    # Build the stopword set once per call. The lookup used to call
    # stopwords.words("english") inside the per-token check, which rebuilt
    # the whole list and scanned it linearly for every single token.
    stop_words = set(stopwords.words("english"))
    lemma = WordNetLemmatizer()

    def clean(word):
        # True when the token should be kept.
        if not word.isalpha() or len(word) < 3:
            return False
        return word.lower() not in stop_words

    return " ".join(
        lemma.lemmatize(lemma.lemmatize(word.lower(), pos="v"), pos="n")
        for word in word_tokenize(text)
        if clean(word)
    )
# Run clean_text over every row of the 'text' column through swifter, with
# dask explicitly allowed for string data. The column is overwritten in
# place, so re-running this cell cleans text that has already been cleaned.
data["text"] = (
    data["text"]
    .swifter
    .allow_dask_on_strings(enable=True)
    .apply(clean_text)
)

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

In [5]:
# Persist the cleaned frame for later use.
# NOTE(review): `data` already carries an 'index' column from the earlier
# reset_index(), so this reset_index() presumably stores the current index
# under 'level_0' and the drop only removes the older 'index' column -
# confirm that a 'level_0' column in the saved file is intended.
(
    data
    .reset_index()
    .drop(columns=['index'])
    .to_feather('../data/feather_files/data2019clean.feather')
)

In [6]:
# Show the cleaned frame as the cell output. display.max_colwidth was raised
# to 5000 in the first cell, so long 'text' values are rendered (almost) in
# full and the output of this cell can be very large.
data

Unnamed: 0,index,author,created_utc,num_comments,score,domain,locked,subreddit,class,text
0,1,villehockey13,2019-01-01,74,0,self.canada,False,canada,test,argument pipeline hard time find info pipeline hop get clarification hear people say thing like train pollute alberta carry canada economy year province owe ruin environment statement prove otherwise convince someone pipeline good environment
1,4,AutoModerator,2019-01-01,21,23,self.canada,True,canada,test,monthly photography contest reddit gold one suggestion want implement census photography competition contest run every month win image post reddit search bar next month contest end credit give sidebar also keep image future use undecided time whatever reason want retain copy image delete request understand incredibly rare pepes type photography look rule submission must take photograph must permission use photograph submit photograph creation leave original photograph untouched possible add text overlay image etc nothing nsfw allow good luck submission
2,8,xiic,2019-01-01,222,65,self.canada,False,canada,test,illegal specify gender job post get employment equity act http certain class receive special status correct condition disadvantage employment experience woman aboriginal people person disability member visible minority still illegal canadian human right act http discriminate someone base sex wonder first time ever see job post female requirement http
3,10,iphonexmas,2019-01-01,163,2350,self.canada,False,canada,test,grateful live beautiful country happy new year fellow canadian happy new year everyone wish family beautiful full smile positive vibe
4,13,7Trickster,2019-01-01,47,0,self.canada,False,canada,test,canadian need help make decision dear canadian amp firstly happy new year want start give context bear live france since forever work expatriate insurance company age since december time think move elsewhere paris life feel empty bore like go forward live good atmosphere simple man simply wish live peaceful place make family raise child better live condition better life part urge move due fact lose lot friend year various reason somehow want fresh new start see something new simply put life bore enjoy communicate english try look option first candidate due short distance despite felt like would somehow feel like paris worse decide live australia another candidate far family member live portugal spain france austria usa cabo verde compare pro con realize france decent perk better balance work life expensive health care good doctor good food gmos one issue eye direction head identify many european merkel silly decision leadership elite lead disastrous result country germany sweden even france goal get economic migrant big corporation pay le would person country doubt shady objective behind set eye canada never read see lot contradiction online really initial idea come ask guy directly think country overall pact france quebec save could allow specialize study accès etude québec http http roughly one year get chance start new life tempt point question need answer opinion know take decision health care system know suppose free read lot information queue surgery example thing capable go month year beyond bite scary understand private clinic sure many would get rid issue cost job know come cheap north america general tell amp people read lot conflict opinion many say canadian general welcome first everyone others say canadian good heart nice people another group say canadian keep hard make friend new country result kinda difficult guess right course every population positive negative individual want understand 
people general canada work everyday life amp work condition admit extensive research understand get really better balance least report french people live canada average wage people behave general office etc want know guy could give opinion amp buy apartment save money want buy apartment bedroom barely idea price canada seem quite high saw interest offer apartment look amaze unlike old use apartment find paris suburb anyone kind project mind far one cad store original plan buy france apartment eur truth tell extremely expensive area around paris end hate capital amp politics one simple feel politics general much corruption voice people hear unlike france many european country due parliament dictatorship brussels amp sorry post long want make background want know clear thank time great day happy new year
...,...,...,...,...,...,...,...,...,...,...
17831,8007,Cyclan,2019-12-31,1340,29625,self.tifu,False,tifu,nonpolitical,tifu thirteen year make hour ago discover conversation thirteen year ago work paralegal do law degree age firm solicitor work part big chain produce free will via workplace offer trade union deal people would fill form send would construct base preference form share office attractive funny smart young woman give appropriate train complete will time dump boyfriend several year find sleep someone else university home town several hundred mile away brief period single ask sort age gap people acceptable long conversation tell know couple year mention hear century ago older men often younger woman able take care say younger woman likely survive childbirth good mother mature husband income keep comfortable keep ask opinion ask would ever consider see woman think likely shrug avoid question throughout month work together often go work weekend group would dress provocatively occasion wear stock would let know previously confess particular several month get onto solicitor train programme leave keep touch meet occasion food coffee drink couple year meet someone new firm stop meet get marry baby bump hour ago court deliver paper represent client insist coffee chat tell reason ask big age gap relationship massive crush scar damage friendship anything hop would finally decide wasnt go happen leave course absolutely idea go gorgeous young woman interest beyond friendship honestly could cry make friend hot girl office notice crush
17832,8008,WhoopsieDaisy88,2019-12-31,22,56,self.tifu,False,tifu,nonpolitical,tifu give boyfriend blowjob actually happen last night still ashamed feel need punish share world warn tmi alert bottom sooo aunt flow town last night feel frisky also slightly intoxicate skip dinner yet still manage polish nearly half bota box malbec perhaps result also particularly give mood time boyfriend make way upstairs get bed get gnarly detail even end get toot horn first say oral skill tell fortunate enough experience typically pretty darn spectacular never let teeth get action always finish job yes boy finish right way sayin anyways case beau thoroughly enjoy minute otherwise would best blowjob life word mine impressive number ooh ahh mmhmm could tell get close finish line home stretch neck bite sore lip felt like fall straight face determine elicit climax epic proportion guy admit long overdue favor like one power faster deeper get point close edge really like go boyfriend usually get little fidgety start gently thrust gently fine mood set right normally favorite part routine second grand finale help keep perfect pace since work actually take nice load pun intend fatigue neck read thus far detail narrative might think geez get point already perhaps time open safari private mode navigate straight pornhub take care boner get latter true best ahead continue read rest assure soldier stand attention soon find shrivel dead position submissively knee look love everything look ecstasy face move hip bed fast mouth feel begin swell release hit hard gag reflex vomit everywhere remember wine lot lot wine yes vomit place manhood belly face throat drip chest sheet dear god sheet soak straight mattress horrific like scene straight walk dead someone walk run live call police report homicide fortunately one actually die believe say wish moment able handle fair share bedroom gracefully past absolutely grace last night say well end well though ultimately swap sheet clean bed boyfriend manage 
miraculously get back round night good old fashion missionary point little period blood gon stop get happy end say sure since pretty sure red ball still beat blue kind day week thank goodness skip dinner hey could worse chunky much yeah sorry sorry give boyfriend glorious head totally ruin finish vomitting private bed gross
17833,8009,[deleted],2019-12-31,27,30,self.tifu,False,tifu,nonpolitical,tifu shake man hand actually three month ago reddit back use fuckbuddy year younger cute bubbly great bed look like young jacqueline moore http basically always call make laugh hot little black bitch also international student travel sydney australia south africa study university father reasonably important person reverend involve one church familiar one basically organise live alone nice apartment study needle say parent religious expect good little girl marry begin parent travel australia south africa first ever visit since move ask come see one last time parent arrive last chance get fuck week well think pussy head decide follow suit think dick spite parent imminent arrival head quick fuck three hour later sex quick fuck obviously escalate escalate point parent way need help quickly hide evidence tryst instead leave stay help make bed throw garbage contain condom wrapper hide lube toy etc end result still parent arrive dress give hot day sweat suspicious parent knock door quickly throw exercise book pen coffee table open textbook kneel say coffee table chat parent amicably pretend study together lose track time smooth smooth maybe pursue career act play part well time leave politely say goodbye kiss mother cheek shake father hand hot little black bitch lead door say goodbye shut door saw father background sniff hand fuck yep hand smelt like pussy shake reverend hand daughter pussy finger wash hand clearly wash thoroughly enough fuckbuddy longer live alone live home run father church surround female roommate expressly forbid see bad influence also angry let cat bag parent little girl grow know sex anyone person fuck right tldr shake reverend hand smell daughter pussy finger allow see
17834,8011,FadCap,2019-12-31,19,131,self.tifu,False,tifu,nonpolitical,tifu wait ask girl want new year eve know girl always like personality always crush feel grow quite lot live another country originally visit summer holiday since parent live friend friend happen friend also reason see visit saw many time christmas never tell feel get master plan tell ask new year eve maybe ball want push date well sit friend new year party sad leave yesterday visit long time wait ask crush new year eve plan day never show leave back home country edit thank response supportive comment shit hit hard today side want send message side update make decision
