In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Common Steps:
1. Remove Special Characters
2. Tokenize
3. Lemmatize/Stem
4. Remove Stop Words

In [2]:
subreddits = pd.read_csv('../data/subreddits_clean.csv')

In [3]:
subreddits.drop(columns = 'Unnamed: 0', inplace = True)

### Contractions

In [4]:
contractions = {
    "aren't": 'are not',
    "can't": 'cannot',
    "couldn't": "could not",
    "didn't": "did not",
    "doesnt": "does not",
    "don't": "do not",
    "hadn't": "had not", 
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "i would",
    "i'll": "i will",
    "i am": "i'm",
    "i've": "i have",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "she'd":"she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't":"should not",
    "there's": "there is",
    "they'd": "they had",
    "they'll":"they will",
    "they're": "they are", 
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "we've": "we have", 
    "weren't": "were not",
    "what's": "what is",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

In [5]:
contraction_indices = []

for index, post in enumerate(subreddits['total_text']):
    for word in post.split():
        if word.lower() in contractions.keys(): # Check if word is a contraction
            contraction_indices.append((index, word)) # Add the index and contraction to a list

for entry in contraction_indices:
    subreddits.loc[entry[0], 'total_text'] = subreddits.loc[entry[0], 'total_text'].replace(entry[1].lower(), contractions[entry[1].lower()])

### Create Tokens and Lemmatize

In [6]:
post_tokens = []

for post in subreddits['total_text']:
    post_tokens.append(word_tokenize(post.lower()))

In [7]:
lemmatizer = WordNetLemmatizer()

post_lemma_tokens = []

for token in post_tokens:
    running_lemma_tokens = []
    for word in token:
        running_lemma_tokens.append(lemmatizer.lemmatize(word))
    post_lemma_tokens.append(' '.join(running_lemma_tokens))
    
subreddits['lemma_text'] = post_lemma_tokens

In [8]:
subreddits.head()

Unnamed: 0,title,selftext,subreddit,author,num_comments,score,timestamp,total_text,post_length_char,post_length_words,lemma_text
0,Answers to why,,LifeProTips,AlienAgency,2,1,2020-07-17,Answers to why,15,3,answer to why
1,¿Quieres obtener juegos y premios gratis en tu...,,LifeProTips,GarbageMiserable0x0,2,1,2020-07-17,¿Quieres obtener juegos y premios gratis en tu...,60,10,¿quieres obtener juegos y premios gratis en tu...
2,Soothe digestion with lemongrass,,LifeProTips,thaigrrrrrrr,2,1,2020-07-17,Soothe digestion with lemongrass,33,4,soothe digestion with lemongrass
3,If your dog is panting a lot when you think th...,"They could have a number of problems going on,...",LifeProTips,CaptainJon720,7,1,2020-07-17,If your dog is panting a lot when you think th...,305,56,if your dog is panting a lot when you think th...
4,"If you're ever lonely, try writing. Whether it...",,LifeProTips,EJRose83,2,1,2020-07-17,"If you are ever lonely, try writing. Whether i...",162,33,"if you are ever lonely , try writing . whether..."


In [9]:
p_stemmer = PorterStemmer()

post_stemmer_tokens = []

for token in post_tokens:
    running_stemmer_tokens = []
    for word in token:
        running_stemmer_tokens.append(p_stemmer.stem(word))
    post_stemmer_tokens.append(' '.join(running_stemmer_tokens))

subreddits['stemmer_text'] = post_stemmer_tokens

In [11]:
subreddits.rename(columns = {'total_text':'original_text'})

Unnamed: 0,title,selftext,subreddit,author,num_comments,score,timestamp,original_text,post_length_char,post_length_words,lemma_text,stemmer_text
0,Answers to why,,LifeProTips,AlienAgency,2,1,2020-07-17,Answers to why,15,3,answer to why,answer to whi
1,¿Quieres obtener juegos y premios gratis en tu...,,LifeProTips,GarbageMiserable0x0,2,1,2020-07-17,¿Quieres obtener juegos y premios gratis en tu...,60,10,¿quieres obtener juegos y premios gratis en tu...,¿quier obten juego y premio grati en tu tiempo...
2,Soothe digestion with lemongrass,,LifeProTips,thaigrrrrrrr,2,1,2020-07-17,Soothe digestion with lemongrass,33,4,soothe digestion with lemongrass,sooth digest with lemongrass
3,If your dog is panting a lot when you think th...,"They could have a number of problems going on,...",LifeProTips,CaptainJon720,7,1,2020-07-17,If your dog is panting a lot when you think th...,305,56,if your dog is panting a lot when you think th...,if your dog is pant a lot when you think they ...
4,"If you're ever lonely, try writing. Whether it...",,LifeProTips,EJRose83,2,1,2020-07-17,"If you are ever lonely, try writing. Whether i...",162,33,"if you are ever lonely , try writing . whether...","if you are ever lone , tri write . whether it ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
5867,Want free cake?,,UnethicalLifeProTips,youdecidemyusername1,2,1,2019-10-14,Want free cake?,15,3,want free cake ?,want free cake ?
5868,"While doing online shopping, select the items...",,UnethicalLifeProTips,yesthatisfalse,0,3,2019-10-14,"While doing online shopping, select the items...",203,33,"while doing online shopping , select the item ...","while do onlin shop , select the item and then..."
5869,Lie about your income on credit card applicat...,Former credit analyst here. 9/10 they will not...,UnethicalLifeProTips,sadcthulu,2,4,2019-10-14,Lie about your income on credit card applicat...,255,42,lie about your income on credit card applicati...,lie about your incom on credit card applicatio...
5870,if you ever see someone you don't like lookin...,,UnethicalLifeProTips,IFoughtThereforeIWas,3,0,2019-10-14,if you ever see someone you do not like looki...,145,27,if you ever see someone you do not like lookin...,if you ever see someon you do not like look in...
