In [1]:
import pandas as pd
from nltk import sent_tokenize, word_tokenize, WordNetLemmatizer,pos_tag, PorterStemmer
from nltk.corpus import stopwords, wordnet
import swifter
pd.options.display.max_colwidth = 5000

political = ['liberal', 'conservative','politics']
nonpolitical = ["twoxchromosomes","showerthoughts",\
                'todayilearned',"tifu"]
test = ['canada']

subreddit_list = political + nonpolitical + test

# 2019

In [2]:

# Read the feather files
def clean(year):
    data = pd.DataFrame()
    for sub in subreddit_list:
        rf = pd.read_feather('../data/feather_files/RS_'+year+\
                             '_'+sub+'_df.feather')
        if sub in political:
            rf['tclass'] = 'political'
        elif sub in nonpolitical:
            rf['tclass'] = 'nonpolitical'
        elif sub == 'canada':
            rf['tclass'] = 'test'
        else:
            rf['tclass'] = 'Unkown'
        data = pd.concat((data,rf))
    
    # combining titles with the post body.
    data['text'] = data.title + " " + data.selftext
    # deleting the previous two columns.
    data.drop(columns=["title","selftext"],inplace=True)
    # change letters to lowercase.
    data.text = data.text.str.lower()
    # remove the entry of deleted posts.
    data.text = data.text.replace({'[deleted]':'', '[removed]':'', 'http':'', 'tifu':'', 'todayilearned':''})
    
    # Removing posts with less than 20 characters in the body.
    data = data[(data.text.astype(str).str.len()>20)].reset_index()
    print(data.groupby('tclass').describe(percentiles=[.5]))
    print(data.groupby('subreddit').describe(percentiles=[.5]))
    
    # functions for reducing words. Either stimming or lemmatizing. The latter is better.
    lemma = WordNetLemmatizer()
    stemmer = PorterStemmer()
    english_stopwords = stopwords.words("english")
    def lemmt(word):
        word = lemma.lemmatize(word,pos='v')
        word = lemma.lemmatize(word,pos='n')
        return lemma.lemmatize(word,pos='a')
    def stemm(word):
        return stemmer.stem(word)

    def clean(word):
        '''
        return False if word to be skipped
        '''
        word = lemma.lemmatize(word,pos='v')
        word = lemma.lemmatize(word,pos='n')
        word = lemma.lemmatize(word,pos='a')
        
        if not word.isalpha() or len(word) <= 3 or word in english_stopwords:
            return ''
        return word
    
    def clean_text(text):
        """
        Input : text
        output : cleaned text
        process:
            1.Remove non-alphabitical words
            2.remove words of less than 3 characters
            3.Remove stopwords
            4.Transform words to lower characters
            4.lemmatize the text - First verbs then nouns
            * Steps are performed in that order.
        """
        tokens = ' '.join(map(clean, text.split(' ')))
        #tokens = re.sub(r'[^\s]+', lambda m: clean(m.group(0)), s)
        #tokens = word_tokenize(text)
        #tokens = " ".join(
        #    [lemmt(word) for word in tokens if clean(word)])
        return tokens
    #--------------------------------------------------
    data.text = data.text.swifter.allow_dask_on_strings(enable=True).apply(clean_text)
    data.reset_index().drop(columns=['index'])\
        .to_feather('../data/feather_files/data'+year+'clean.feather')
    return data

In [3]:
data = clean('2019')

                 index                                                      \
                 count          mean           std  min      50%       max   
tclass                                                                       
nonpolitical   86327.0  15254.029388  11372.195569  0.0  12988.0   41455.0   
political     150970.0  64436.601013  42224.850546  0.0  63485.5  139084.0   
test           17207.0   8613.332655   4973.707695  0.0   8612.0   17226.0   

             num_comments                                               \
                    count        mean         std   min   50%      max   
tclass                                                                   
nonpolitical      86327.0  132.939335  372.078757  15.0  27.0  16639.0   
political        150970.0  153.443088  718.448717  15.0  40.0  52183.0   
test              17207.0  113.038938  249.240752  15.0  46.0  13216.0   

                 score                                                   
            

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

In [4]:
data2 = clean('2020')

                 index                                                      \
                 count          mean           std  min      50%       max   
tclass                                                                       
nonpolitical   74050.0  11635.609548   8181.401299  0.0   9376.5   28157.0   
political     195303.0  67597.977333  47786.584376  0.0  60539.0  158352.0   
test           15090.0   7554.462359   4361.808176  0.0   7553.5   15108.0   

             num_comments                                               \
                    count        mean         std   min   50%      max   
tclass                                                                   
nonpolitical      74050.0  144.617461  376.065479  15.0  31.0  10098.0   
political        195303.0  171.318940  919.181768  15.0  39.0  91645.0   
test              15090.0  120.690060  239.298333  15.0  51.0  12721.0   

                 score                                                   
            

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

In [5]:
data2

Unnamed: 0,index,author,created_utc,num_comments,score,domain,locked,subreddit,tclass,text
0,0,progress18,2020-01-01,38,300,abc11.com,False,Liberal,political,north carolina judge formally strike voter
1,1,thefirststoryteller,2020-01-02,21,214,politico.com,False,Liberal,political,julián castro presidential
2,2,spaceghoti,2020-01-02,20,12,time.com,False,Liberal,political,conservative civil right restrict abortion
3,3,[deleted],2020-01-03,18,439,self.Liberal,False,Liberal,political,veteran bill house help block mitch mcconnell
4,4,Looking_Light33,2020-01-03,34,13,theguardian.com,False,Liberal,political,democratic presidential candidate condemn kill iran general world news
...,...,...,...,...,...,...,...,...,...,...
284438,15104,TortuouslySly,2020-12-31,16,71,journaldequebec.com,False,canada,test,first canadian long term care centre receive vaccine struggle major resident staff
284439,15105,cdnflower,2020-12-31,23,0,toronto.ctvnews.ca,False,canada,test,mysterious monolith appear seemingly overnight along shoreline
284440,15106,PoutineExpert,2020-12-31,26,106,montreal.ctvnews.ca,False,canada,test,quebec liberal leader barbados home
284441,15107,69blazeit69chungus,2020-12-31,16,4,canada.ca,False,canada,test,news release detail covid test require incoming traveller
