# Step 3: Data Cleaning and Text Preprocessing

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Set Up spaCy
nlp = spacy.load('en_core_web_sm')
stop_words_spacy = set(nlp.Defaults.stop_words)

## Read-In Data

In [2]:
# Load the three scraped subreddit dumps; the saved 'Unnamed: 0' column is used as the index
health, obsgyn, pospar = [
    pd.read_csv(csv_path, index_col = 'Unnamed: 0')
    for csv_path in (
        '../data/womens_health.csv',
        '../data/fertility_and_pregnancy.csv',
        '../data/postpartum.csv',
    )
]

## Check Head and Shape of Each Dataset

### General Women's Health Data

In [3]:
health.head(2)

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Been to the clinic twice and they don’t know w...,So I’ve been having problems with discharge an...,WomensHealth,1596818251,thecrazedbunny,0,1,True,2020-08-07
1,Period going on for 14 days today. Help!!,Hi guys. I'm getting a bit worried about my pe...,WomensHealth,1596822599,Help-Me-Already,4,1,True,2020-08-07


In [4]:
health.shape

(31385, 9)

### Fertility and Pregnancy Data

In [5]:
obsgyn.head(2)

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Adding to the kitchen sink approach: I just bo...,This time I’m going to be using a menstrual cu...,TryingForABaby,1596839749,lastput1,7,1,True,2020-08-07
1,Has anyone used/or currently an app to track t...,\nMy husband and I are new to TTC.\n\nWe have ...,TryingForABaby,1596841178,ParkingFrosting4,8,1,True,2020-08-07


In [6]:
obsgyn.shape

(98138, 9)

### Postpartum Data

In [7]:
pospar.head(2)

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Question about bottles,I'm researching baby bottles and am trying to ...,BabyBumps,1596850138,All_Hail_CC,4,1,True,2020-08-07
1,Anyone else getting that generation gap judgem...,"Aunt- I don’t eat much meat, but when I was pr...",BabyBumps,1596850152,waterfallsummer,86,1,True,2020-08-07


In [8]:
pospar.shape

(51674, 9)

## Clean Data

### Combine `title` and `selftext` columns into a `total_text` column

The format of a Reddit post consists of a post title (`title`), and there is also an opportunity to write a body of text (`selftext`). For this project, I will be interested in analyzing all text in a post, so I will combine all of the text into a `total_text` column.

In [9]:
def create_total_text(df):
    df['total_text'] = df['title'] + ' ' + df['selftext']
    df.drop(columns = ['title', 'selftext'], inplace = True)
    return df.head(1)

In [10]:
create_total_text(health)
create_total_text(obsgyn)
create_total_text(pospar)

Unnamed: 0,subreddit,created_utc,author,num_comments,score,is_self,timestamp,total_text
0,BabyBumps,1596850138,All_Hail_CC,4,1,True,2020-08-07,Question about bottles I'm researching baby bo...


### Drop Unnecessary Columns

For this project, I am interested in using topic modeling to identify areas of concern for each domain of women's health. Therefore, the number of comments and score of each post will not be relevant and will be removed. Because the date is included under `timestamp`, the `created_utc` column will also be removed. The `is_self` column is also not needed and will be removed.

In [11]:
def clean_columns(df):
    '''Drop the metadata columns that are not needed for topic modeling.

    Removes `created_utc`, `num_comments`, `score` and `is_self` from `df`
    in place and returns the first row of the trimmed frame as a preview.
    '''
    unneeded_columns = ['created_utc', 'num_comments', 'score', 'is_self']
    df.drop(unneeded_columns, axis = 1, inplace = True)
    return df.head(1)

In [12]:
clean_columns(health)
clean_columns(obsgyn)
clean_columns(pospar)

Unnamed: 0,subreddit,author,timestamp,total_text
0,BabyBumps,All_Hail_CC,2020-08-07,Question about bottles I'm researching baby bo...


### Check for Null/Missing Values

In [13]:
def display_percent_null(df):
    '''Compute, per column, the percentage of rows whose value is null/missing.

    Returns a Series indexed by column name with values on a 0-100 scale.
    '''
    null_counts = df.isnull().sum()
    row_total = df.shape[0]
    return (null_counts / row_total) * 100

#### Women's Health Data

In [14]:
display_percent_null(health)

subreddit     0.000000
author        0.000000
timestamp     0.000000
total_text    2.450215
dtype: float64

Approximately 2.5% of the posts do not have text in the body of the post. 

#### Fertility and Pregnancy Data

In [15]:
display_percent_null(obsgyn)

subreddit     0.000000
author        0.000000
timestamp     0.000000
total_text    1.566162
dtype: float64

Approximately 1.6% of posts in the pregnancy and fertility data frame are null or missing.

#### Postpartum Data

In [16]:
display_percent_null(pospar)

subreddit     0.000000
author        0.000000
timestamp     0.000000
total_text    1.563649
dtype: float64

Approximately 1.6% of posts in the postpartum data frame are null or missing. 

### Drop Missing Data

Above, we saw that each dataframe is missing some data in the `total_text` column. Because we can not analyze posts without text, any rows with missing data will be removed.

In [17]:
health.dropna(inplace = True)
obsgyn.dropna(inplace = True)
pospar.dropna(inplace = True)

### Check Data Types

#### Functions

In [18]:
def convert_to_datetime(df, column):
    '''Parse `df[column]` into pandas datetimes, overwriting the column in place.

    Returns the dtypes of all columns of `df` after the conversion so the
    change can be inspected as the cell output.
    '''
    parsed = pd.to_datetime(df[column])
    df[column] = parsed
    return df.dtypes

#### Women's Health Data

In [19]:
health.dtypes

subreddit     object
author        object
timestamp     object
total_text    object
dtype: object

The `subreddit`, `author`, and `total_text` columns are the appropriate data type (strings), but the `timestamp` column should be converted to a datetime object.

In [20]:
convert_to_datetime(health, 'timestamp')

subreddit             object
author                object
timestamp     datetime64[ns]
total_text            object
dtype: object

#### Fertility and Pregnancy Data

In [21]:
obsgyn.dtypes

subreddit     object
author        object
timestamp     object
total_text    object
dtype: object

Similar to the Women's Health Data, the `timestamp` column should be converted to a datetime object.

In [22]:
convert_to_datetime(obsgyn, 'timestamp')

subreddit             object
author                object
timestamp     datetime64[ns]
total_text            object
dtype: object

#### Postpartum Data

In [23]:
pospar.dtypes

subreddit     object
author        object
timestamp     object
total_text    object
dtype: object

Similar to the Women's Health and Fertility and Pregnancy Data, the `timestamp` column should be converted to a datetime object.

In [24]:
convert_to_datetime(pospar, 'timestamp')

subreddit             object
author                object
timestamp     datetime64[ns]
total_text            object
dtype: object

### Remove Special Text

In [26]:
def remove_string(df, column, string, regex = True):
    '''Delete every occurrence of `string` from the text in `df[column]`, in place.

    Parameters
    ----------
    df : DataFrame whose column is rewritten.
    column : name of the text column.
    string : pattern to delete.
    regex : when True (default) `string` is interpreted as a regular
        expression; pass False to match it literally.

    `regex` is passed explicitly because the default of `Series.str.replace`
    changed from True to False in pandas 2.0; without it, escaped patterns
    such as '\\[removed\\]' silently stop matching on newer pandas.
    '''
    df[column] = df[column].str.replace(string, '', regex = regex)


def replace_string(df, column, string, replacement, regex = True):
    '''Replace every occurrence of `string` in `df[column]` with `replacement`, in place.

    Parameters
    ----------
    df : DataFrame whose column is rewritten.
    column : name of the text column.
    string : pattern to search for.
    replacement : text substituted for each match.
    regex : when True (default) `string` is interpreted as a regular
        expression; pass False to match it literally.

    `regex` is passed explicitly because the default of `Series.str.replace`
    changed from True to False in pandas 2.0, which would otherwise make the
    result depend on the installed pandas version.
    '''
    df[column] = df[column].str.replace(string, replacement, regex = regex)


#### [removed]
If moderators of the subreddit feel that the post violates one or more of the subreddit's rules, they will remove content of the post and replace it with a "[removed]" tag. Because the [removed] tags will not add any valuable information to the post, these will be removed from posts.

In [27]:
# Raw strings: '\[' is not a valid Python escape sequence, so a plain string literal
# only works by accident (and emits a warning on recent Python versions).
remove_string(health, 'total_text', r'\[removed\]')
remove_string(obsgyn, 'total_text', r'\[removed\]')
remove_string(pospar, 'total_text', r'\[removed\]')

#### \n
This is a symbol that indicates a new line. Because we are only interested in the text itself, this tag will be removed.

In [28]:
# Swap each newline for a space instead of deleting it, so the last word of one line
# and the first word of the next are not fused into a single token.
replace_string(health, 'total_text', '\n', ' ')
replace_string(obsgyn, 'total_text', '\n', ' ')
replace_string(pospar, 'total_text', '\n', ' ')

#### &amp ;

Sometimes, the '&' symbol is not displayed, and the HTML reference `&amp;` shows up instead. This text will be removed.

In [29]:
remove_string(health, 'total_text', '&amp;')
remove_string(obsgyn, 'total_text', '&amp;')
remove_string(pospar, 'total_text', '&amp;')

#### &gt ;

Sometimes, the greater-than symbol (>) is not properly displayed, and the HTML reference `&gt;` shows up in its place. I will remove this HTML reference.

In [30]:
remove_string(health, 'total_text', '&gt;')
remove_string(obsgyn, 'total_text', '&gt;')
remove_string(pospar, 'total_text', '&gt;')

#### &lt ;

Sometimes, the less-than symbol (<) is not properly displayed, and the HTML reference `&lt;` shows up in its place. I will remove this HTML reference.

In [31]:
remove_string(health, 'total_text', '&lt;')
remove_string(obsgyn, 'total_text', '&lt;')
remove_string(pospar, 'total_text', '&lt;')

#### TL;DR
"TL;DR" is an abbreviation that stands for "Too long; Didn't read" and is meant to provide a brief synopsis of the post. Because these letters will not provide any meaningful information, they will be removed.

In [32]:
remove_string(health, 'total_text', 'TL;DR')
remove_string(obsgyn, 'total_text', 'TL;DR')
remove_string(pospar, 'total_text', 'TL;DR')

### Remove Website URLs

Prior to being able to remove the urls, I will need to reset the indices so that they are in numerical order from 0 to len(data frame). This will allow me to use `.loc` to edit the specific strings in the data frame.

In [33]:
health.reset_index(drop = True, inplace = True)
obsgyn.reset_index(drop = True, inplace = True)
pospar.reset_index(drop = True, inplace = True)

#### Functions

In [34]:
def remove_urls(df, column):
    '''Strip website URLs from every string in `df[column]`, in place.

    The substitution is applied to the whole column at once instead of
    row by row through `.loc[i, ...]`, so it does not depend on the frame
    having a 0..n-1 index and avoids per-row indexing overhead.
    Missing values in the column are left untouched.
    '''
    # Compiled once; same pattern as before (scheme is optional, then '://' and URL characters)
    url_pattern = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b')
    df[column] = df[column].str.replace(url_pattern, '', regex = True)
        
# Regex Code by Lee Martin (Stack Overflow post)
# https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python/11332580        

In [35]:
def delete_empty_text(df, column):
    '''Remove, in place, the rows of `df` whose `column` text is empty.

    The previous version only rebound its local variable to a filtered copy,
    so the caller's frame was never changed. Rows are now dropped from `df`
    itself. Text consisting solely of whitespace is treated as empty too,
    since stripping tags/URLs from a post can leave only spaces behind.

    Rows are dropped by index label, so the index is expected to be unique
    (the notebook resets the index before this step).

    Returns the shape of `df` after the removal.
    '''
    is_blank = df[column].map(
        lambda text: isinstance(text, str) and text.strip() == ''
    ).astype(bool)
    df.drop(index = df.index[is_blank.to_numpy()], inplace = True)
    return df.shape

#### Remove URLs

In [36]:
remove_urls(health, 'total_text')
remove_urls(obsgyn, 'total_text')
remove_urls(pospar, 'total_text')

#### Check to see if removing URLs resulted in any null 'total_text'; If so, remove it.

In [37]:
delete_empty_text(health, 'total_text')
delete_empty_text(obsgyn, 'total_text')
delete_empty_text(pospar, 'total_text')

(50866, 4)

### Remove Posts by AutoModerator

During EDA, I discovered that several of the subreddits used for data collection have an "AutoModerator" user that makes several posts. Because this user does not represent a woman who is interested in asking a health-related question, I will remove all posts from this user.

In [38]:
def remove_AutoModerator(df, column):
    '''Return the rows of `df` whose `column` value is not 'AutoModerator'.

    `df` itself is not modified. The result is an explicit copy so that the
    column assignments performed on it later in the notebook operate on an
    independent frame and do not raise pandas' SettingWithCopyWarning.
    '''
    is_other_author = df[column] != 'AutoModerator'
    return df[is_other_author].copy()

In [39]:
health = remove_AutoModerator(health, 'author')
obsgyn = remove_AutoModerator(obsgyn, 'author')
pospar = remove_AutoModerator(pospar, 'author')

### Remove Digits

In [40]:
def remove_digits(df, column1):
    '''Strip every run of digits (0-9) from the strings in `df[column1]`, in place.'''
    without_digits = df[column1].str.replace('[0-9]+', '', regex = True)
    df[column1] = without_digits
    
# https://stackoverflow.com/questions/47010044/how-to-remove-numeric-characters-present-in-countvectorizer

dfs = [health, obsgyn, pospar]

for df in dfs:
    remove_digits(df, 'total_text')

### Remove [deleted] tags

In [41]:
def remove_phrase(df, column, phrase = '[deleted]'):
    '''Remove a literal tag from the text in `df[column]`, in place.

    By default this strips the '[deleted]' tag. The previous version removed
    the bare substring 'delete', which turned '[deleted]' into '[d]' and
    mangled ordinary words such as 'deleted'.

    Parameters
    ----------
    df : DataFrame whose column is rewritten.
    column : name of the text column.
    phrase : exact text to remove; matched literally, not as a regex.
    '''
    df[column] = df[column].str.replace(phrase, '', regex = False)

remove_phrase(health, 'total_text')
remove_phrase(obsgyn, 'total_text')
remove_phrase(pospar, 'total_text')

### Edit 'c-section' to be 'csection' in the obsgyn and pospar data frames

In [42]:
# Normalize both capitalizations in BOTH frames. The last call previously targeted
# obsgyn a second time, so 'C-section' was never rewritten in the postpartum data.
replace_string(obsgyn, 'total_text', 'c-section', 'csection')
replace_string(obsgyn, 'total_text', 'C-section', 'csection')
replace_string(pospar, 'total_text', 'c-section', 'csection')
replace_string(pospar, 'total_text', 'C-section', 'csection')

## Text Preprocessing

### Define Stop Words

#### Functions

In [43]:
def add_stop_words(word_list, list_stop_words):
    '''Add every word in `word_list` to the stop-word set `list_stop_words`, in place.'''
    list_stop_words.update(word_list)

In [44]:
def remove_stop_words(word_list, list_stop_words):
    '''Remove every word in `word_list` from `list_stop_words`, in place.

    Each word is removed with the container's `remove` method, so for a set
    a word that is not present raises KeyError.
    '''
    drop_word = list_stop_words.remove
    for unwanted in word_list:
        drop_word(unwanted)

#### Create Full Stop Words List from NLTK, spaCy, and custom Stop Words

In [45]:
# Gather the English stop words shipped with NLTK and with spaCy ...
stop_words_nltk = set(stopwords.words('english'))
stop_words_spacy = set(nlp.Defaults.stop_words)

# ... and merge them into one combined set
full_stop_words = stop_words_spacy | stop_words_nltk

In [46]:
words_to_add = ['like', 'know', 'want', 'feel', 'going', 'think', 'reddit', 'imgur', 'pron', 'officially']

add_stop_words(words_to_add, full_stop_words)

### Determine Word Counts for Each Post

#### Function

In [47]:
def get_word_count(df, column, new_column = 'word_count'):
    '''Store the number of whitespace-separated words of `df[column]` in `df[new_column]`.

    The frame is modified in place; the first row is returned as a preview.
    '''
    def count_words(text):
        return len(text.split())

    df[new_column] = df[column].apply(count_words)
    return df.head(1)

#### Add a Word Count column to each data frame

In [48]:
get_word_count(health, 'total_text')
get_word_count(obsgyn, 'total_text')
get_word_count(pospar, 'total_text')

Unnamed: 0,subreddit,author,timestamp,total_text,word_count
0,BabyBumps,All_Hail_CC,2020-08-07,Question about bottles I'm researching baby bo...,43


### Create a Lemmatized Text Column

In [49]:
health = health.reset_index(drop = True)
obsgyn = obsgyn.reset_index(drop = True)
pospar = pospar.reset_index(drop = True)

#### Function

In [50]:
def tokenize_and_lemma(df, column, stop_words = full_stop_words):
    '''Add a `lemma_text` column holding the lemmatized, stop-word-free text of `df[column]`.

    Each post is run through the notebook-level spaCy pipeline `nlp`; tokens
    whose text is a stop word are discarded and the lemmas of the remaining
    tokens are joined with single spaces. The frame is modified in place.

    Changes from the previous version:
    * the pipeline loaded in the imports cell is reused instead of loading
      the model again on every call;
    * posts are streamed through `nlp.pipe`, which batches the work;
    * stop words are also matched case-insensitively, so capitalized
      occurrences such as "The" or "My" are removed as well.

    Parameters
    ----------
    df : DataFrame to process.
    column : name of the column containing the raw text.
    stop_words : collection of words to discard.

    Returns the first five rows of the modified frame as a preview.
    '''
    lemma_posts = []

    for doc in nlp.pipe(df[column]):
        kept_lemmas = [
            token.lemma_
            for token in doc
            # exact match keeps the old behaviour; the lowercase check catches capitalized stop words
            if token.text not in stop_words and token.text.lower() not in stop_words
        ]
        lemma_posts.append(' '.join(kept_lemmas)) # join lemmas back together into one string

    df['lemma_text'] = lemma_posts

    return df.head()

In [51]:
dfs = [health, obsgyn, pospar]

for df in dfs:
    tokenize_and_lemma(df, 'total_text') 

### Sentiment Analysis of Posts

#### Functions

In [52]:
def analyze_sentiment(df, column, new_column = 'sentiment_score', score = 'compound'):
    '''Score each post in `df[column]` with NLTK's VADER SentimentIntensityAnalyzer.

    For every post the entry named by `score` is taken from the analyzer's
    `polarity_scores` result and stored in `df[new_column]` (in place).

    Returns the first five rows of the modified frame as a preview.
    '''
    analyzer = SentimentIntensityAnalyzer()
    post_scores = []
    for post in df[column]:
        post_scores.append(analyzer.polarity_scores(post)[score])
    df[new_column] = post_scores
    return df.head()

#### Create a Sentiment Composite Column in each Data Frame

In [53]:
for df in dfs:
    analyze_sentiment(df, 'total_text')

### Export Clean and Preprocessed Data to CSV

In [54]:
# health.to_csv('../data/womens_health_preprocessed.csv', index = False)
# obsgyn.to_csv('../data/fertility_and_pregnancy_preprocessed.csv', index = False)
# pospar.to_csv('../data/postpartum_preprocessed.csv', index = False)