# Data Text Cleaning 
In this notebook, we will execute the following: 
- Remove HTML artifacts
- Remove punctuation 
- Lowercase everything 
- Tokenize then Snowball all texts 
- Save cleaned df as csv

In [1]:
import nltk

In [2]:
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('./datasets/df.csv')

In [5]:
df.shape

(1891, 3)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,text
0,0,1,We know that September 10 was World Suicide Pr...
1,1,1,Welcome to /r/depression's check-in post - a p...
2,2,1,I’m going to the movies. I’m so nervous… I’m w...
3,3,1,Now i can save so i can get myself out of this...
4,4,1,I always do this. I’ll stay up until the wee h...


### Removing HTML artifacts 

In [7]:
example1 = BeautifulSoup(df['text'][0])
example1.get_text()

"We know that September 10 was World Suicide Prevention Day. And, we're all for activism. But not here, please and thank you. It takes focus away from our OPs in need of support and understanding.  Reminder: NO ACTIVISM is allowed here at any time."

In [8]:
#lets use beautiful soup to remove html language
df['text'] = [BeautifulSoup(text).get_text() for text in df['text'] ]

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,text
0,0,1,We know that September 10 was World Suicide Pr...
1,1,1,Welcome to /r/depression's check-in post - a p...
2,2,1,I’m going to the movies. I’m so nervous… I’m w...
3,3,1,Now i can save so i can get myself out of this...
4,4,1,I always do this. I’ll stay up until the wee h...


## Tokenizing and Snowballing

#### Let's Tokenize First

In [10]:
#instntiate tokenizer
tokenizer = RegexpTokenizer(r'\w+') 

In [11]:
#instntiate snowballstemmer
snow = SnowballStemmer(language = 'english')

In [12]:
#shoutout to stack overflow 
#this code will tokenize then stem the lower case of text
def snow_text(text):
    return [snow.stem(w.lower()) for w in tokenizer.tokenize(text.replace("'", ""))]  #add .replace("'", "")


df['snow_text'] = df['text'].apply(snow_text)

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,text,snow_text
0,0,1,We know that September 10 was World Suicide Pr...,"[we, know, that, septemb, 10, was, world, suic..."
1,1,1,Welcome to /r/depression's check-in post - a p...,"[welcom, to, r, depress, check, in, post, a, p..."
2,2,1,I’m going to the movies. I’m so nervous… I’m w...,"[i, m, go, to, the, movi, i, m, so, nervous, i..."
3,3,1,Now i can save so i can get myself out of this...,"[now, i, can, save, so, i, can, get, myself, o..."
4,4,1,I always do this. I’ll stay up until the wee h...,"[i, alway, do, this, i, ll, stay, up, until, t..."


In [14]:
df['text'][2].replace("'", "")

'I’m going to the movies. I’m so nervous… I’m willingly leaving my house for the first time in over 6 months'

### Checking if it got Snowballed

In [15]:
df['text'][2]

'I’m going to the movies. I’m so nervous… I’m willingly leaving my house for the first time in over 6 months'

In [16]:
df['snow_text'][2]

['i',
 'm',
 'go',
 'to',
 'the',
 'movi',
 'i',
 'm',
 'so',
 'nervous',
 'i',
 'm',
 'will',
 'leav',
 'my',
 'hous',
 'for',
 'the',
 'first',
 'time',
 'in',
 'over',
 '6',
 'month']

In [17]:
df.shape

(1891, 4)

In [18]:
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,text,snow_text
0,0,1,We know that September 10 was World Suicide Pr...,"[we, know, that, septemb, 10, was, world, suic..."
1,1,1,Welcome to /r/depression's check-in post - a p...,"[welcom, to, r, depress, check, in, post, a, p..."
2,2,1,I’m going to the movies. I’m so nervous… I’m w...,"[i, m, go, to, the, movi, i, m, so, nervous, i..."
3,3,1,Now i can save so i can get myself out of this...,"[now, i, can, save, so, i, can, get, myself, o..."
4,4,1,I always do this. I’ll stay up until the wee h...,"[i, alway, do, this, i, ll, stay, up, until, t..."


In [19]:
#lets join each word in snow_text by spaces 
df['texty'] = [[' '.join(i)] for i in df['snow_text']]

In [20]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,subreddit,text,snow_text,texty
0,0,1,We know that September 10 was World Suicide Pr...,"[we, know, that, septemb, 10, was, world, suic...",[we know that septemb 10 was world suicid prev...


In [21]:
#lets convert to string 
df['string'] = [str(i) for i in df['texty']]

In [22]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,subreddit,text,snow_text,texty,string
0,0,1,We know that September 10 was World Suicide Pr...,"[we, know, that, septemb, 10, was, world, suic...",[we know that septemb 10 was world suicid prev...,['we know that septemb 10 was world suicid pre...


In [23]:
#removing left bracket, right bracket, and apostrophe
df['stringOG'] = [i.replace('[', '').replace(']', '').replace("'", '') for i in df['string']]

In [24]:
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,text,snow_text,texty,string,stringOG
0,0,1,We know that September 10 was World Suicide Pr...,"[we, know, that, septemb, 10, was, world, suic...",[we know that septemb 10 was world suicid prev...,['we know that septemb 10 was world suicid pre...,we know that septemb 10 was world suicid preve...
1,1,1,Welcome to /r/depression's check-in post - a p...,"[welcom, to, r, depress, check, in, post, a, p...",[welcom to r depress check in post a place to ...,['welcom to r depress check in post a place to...,welcom to r depress check in post a place to t...
2,2,1,I’m going to the movies. I’m so nervous… I’m w...,"[i, m, go, to, the, movi, i, m, so, nervous, i...",[i m go to the movi i m so nervous i m will le...,['i m go to the movi i m so nervous i m will l...,i m go to the movi i m so nervous i m will lea...
3,3,1,Now i can save so i can get myself out of this...,"[now, i, can, save, so, i, can, get, myself, o...",[now i can save so i can get myself out of thi...,['now i can save so i can get myself out of th...,now i can save so i can get myself out of this...
4,4,1,I always do this. I’ll stay up until the wee h...,"[i, alway, do, this, i, ll, stay, up, until, t...",[i alway do this i ll stay up until the wee ho...,['i alway do this i ll stay up until the wee h...,i alway do this i ll stay up until the wee hou...


## Saving Snow Data 
- Called snow.csv

In [25]:
#remember i am only analyzing text, therefore iwant to keep the string and subreddit
features = ['subreddit', 'stringOG']
dataframe = df[features]

In [26]:
dataframe.head()

Unnamed: 0,subreddit,stringOG
0,1,we know that septemb 10 was world suicid preve...
1,1,welcom to r depress check in post a place to t...
2,1,i m go to the movi i m so nervous i m will lea...
3,1,now i can save so i can get myself out of this...
4,1,i alway do this i ll stay up until the wee hou...


In [27]:
dataframe.shape

(1891, 2)

In [28]:
#saving it to df
dataframe.to_csv('./datasets/snow.csv')