# Text Preprocessing

## Imports

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

## Read-in Data

In [2]:
# Load the cleaned subreddit posts produced by the previous pipeline step.
subreddits = pd.read_csv(
    filepath_or_buffer='../data/subreddits_clean.csv',
)

In [3]:
# Remove the 'Unnamed: 0' column (presumably the index saved by the upstream
# export -- TODO confirm). `errors='ignore'` keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
subreddits = subreddits.drop(columns='Unnamed: 0', errors='ignore')

## Expanding Contractions

In [4]:
# Lookup table mapping a lowercase contraction to its expanded form.
# Keys must be the contracted spelling (with apostrophe); values are the
# expansion. Ambiguous "'d" forms are consistently expanded to "would".
contractions = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what's": "what is",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
}

In [5]:
# Expand contractions in `total_text` in a single regex pass.
#
# Each word (optionally containing one straight or curly apostrophe) is looked
# up case-insensitively in `contractions`. Matching on whole words means:
#   * capitalized contractions ("Don't", "I'll") are expanded too,
#   * contractions followed by punctuation ("don't,") are expanded,
#   * longer words that merely contain a contraction ("bandit's") are left alone.


def _expand_contraction(match):
    """Return the expansion for a matched word, or the word unchanged.

    `match` is the regex match object for one word. The lookup key is the
    lowercased word with a curly apostrophe normalized to a straight one.
    """
    word = match.group(0)
    expansion = contractions.get(word.lower().replace('’', "'"))
    if expansion is None:
        return word
    # Preserve a leading capital so sentence-initial words stay capitalized.
    if word[0].isupper():
        return expansion[0].upper() + expansion[1:]
    return expansion


# `regex=True` is required for a callable replacement; missing values stay missing.
subreddits['total_text'] = subreddits['total_text'].str.replace(
    r"\w+(?:['’]\w+)?", _expand_contraction, regex=True
)

## Lemmatize or Stem

In [6]:
# Tokenize every post once (lowercased); the resulting token lists feed both
# the lemmatization and the stemming steps.
post_tokens = [
    word_tokenize(text.lower())
    for text in subreddits['total_text']
]

In [7]:
# Lemmatize each token of each post, then rebuild one space-joined string per post.

lemmatizer = WordNetLemmatizer()

post_lemma_tokens = [
    ' '.join(lemmatizer.lemmatize(word) for word in tokens)
    for tokens in post_tokens
]

# One lemmatized string per row, in the same order as the posts were tokenized.
subreddits['lemma_text'] = post_lemma_tokens

In [8]:
# Stem each token of each post, then rebuild one space-joined string per post.

p_stemmer = PorterStemmer()

post_stemmer_tokens = [
    ' '.join(p_stemmer.stem(word) for word in tokens)
    for tokens in post_tokens
]

# One stemmed string per row, in the same order as the posts were tokenized.
subreddits['stemmer_text'] = post_stemmer_tokens

## Rename `total_text` column to `original_text`

In [9]:
# Rename the source text column in place. Note that `total_text` has already
# had its contractions expanded earlier in this notebook, so `original_text`
# is the pre-lemmatization/pre-stemming text rather than the raw post.
subreddits.rename(
    {'total_text': 'original_text'},
    axis='columns',
    inplace=True,
)

## Export Preprocessed Text Data to CSV

In [10]:
# Write the preprocessed frame to disk. The DataFrame index is written as well
# (pandas default), so the file gains an unnamed leading column on read-back.
subreddits.to_csv(
    path_or_buf='../data/subreddits_preprocessed.csv',
)