# 1. Preparation

In [5]:
# pip install -U textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
     |████████████████████████████████| 636 kB 3.0 MB/s            
[?25hCollecting nltk>=3.1
  Downloading nltk-3.6.5-py3-none-any.whl (1.5 MB)
     |████████████████████████████████| 1.5 MB 35.2 MB/s            
Collecting joblib
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
     |████████████████████████████████| 306 kB 21.9 MB/s            
[?25hCollecting regex>=2021.8.3
  Downloading regex-2021.10.23-cp37-cp37m-macosx_10_9_x86_64.whl (288 kB)
     |████████████████████████████████| 288 kB 22.9 MB/s            
Installing collected packages: regex, joblib, nltk, textblob
Successfully installed joblib-1.1.0 nltk-3.6.5 regex-2021.10.23 textblob-0.17.1
You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [31]:
import pandas as pd
import numpy as np
import string
import re
from textblob import TextBlob
import nltk
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/qintianzhang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
# Load the scraped Reddit posts, then discard every row that has a missing value.
df = pd.read_csv('data/reddit_data.csv')
df = df.dropna()

# Keep a handle on the post text and preview the first three entries.
posts = df['body']
posts.head(3)

0    Congrats /r/anxiety we've all made it to Wedne...
1    With both the subreddit and Discord continuing...
2    I went to get my haircut and the person cuttin...
Name: body, dtype: object

# 2. Clean Data

1. Remove punctuation
2. Tokenization - Converting a sentence into a list of words
3. Remove stopwords
4. Lemmatization/stemming - Transforming any form of a word to its root word

In [26]:
def remove_punct(text):
    """Return ``text`` with ASCII punctuation and the digits 0-9 removed.

    Every character listed in ``string.punctuation`` is dropped first, then
    each run of digits is deleted. Nothing is inserted in place of a removed
    character, so the surrounding text is joined together.
    """
    # Dropping apostrophes fuses contractions into non-words ("We've" -> "Weve").
    kept_chars = (ch for ch in text if ch not in string.punctuation)
    without_punct = "".join(kept_chars)
    return re.sub('[0-9]+', '', without_punct)

# Store a punctuation- and digit-free copy of every post body in a new column.
df['reddit_punct'] = df['body'].apply(remove_punct)
df.head(3)

Unnamed: 0,topic,title,score,id,subreddit,url,num_comments,body,created,reddit_punct
0,anxiety,Let your light shine!,17,qc0aqd,Anxiety,https://www.reddit.com/r/Anxiety/comments/qc0a...,23,Congrats /r/anxiety we've all made it to Wedne...,1634735000.0,Congrats ranxiety weve all made it to Wednesda...
1,anxiety,Looking for new mods! (subreddit and Discord),12,qb0ort,Anxiety,https://www.reddit.com/r/Anxiety/comments/qb0o...,0,With both the subreddit and Discord continuing...,1634606000.0,With both the subreddit and Discord continuing...
2,anxiety,fuck,159,qe7rl0,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe7r...,32,I went to get my haircut and the person cuttin...,1635005000.0,I went to get my haircut and the person cuttin...


In [27]:
def tokenization(text):
    """Split ``text`` into a list of word tokens.

    A token is a maximal run of regex "word" characters (letters, digits and
    underscore); everything else acts as a separator.

    Parameters
    ----------
    text : str
        The text to split. Case is left untouched, so callers lower-case first
        if they need case-insensitive tokens.

    Returns
    -------
    list of str
        The tokens in order of appearance. Text that starts or ends with a
        separator produces no empty-string tokens, and empty text gives ``[]``.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    # findall on word runs is the split on non-word runs minus the empty pieces.
    return re.findall(r'\w+', text)

# Lower-case each cleaned post, then split it into word tokens.
df['reddit_tokenized'] = df['reddit_punct'].apply(
    lambda post: tokenization(post.lower())
)
df.head(3)

Unnamed: 0,topic,title,score,id,subreddit,url,num_comments,body,created,reddit_punct,reddit_tokenized
0,anxiety,Let your light shine!,17,qc0aqd,Anxiety,https://www.reddit.com/r/Anxiety/comments/qc0a...,23,Congrats /r/anxiety we've all made it to Wedne...,1634735000.0,Congrats ranxiety weve all made it to Wednesda...,"[congrats, ranxiety, weve, all, made, it, to, ..."
1,anxiety,Looking for new mods! (subreddit and Discord),12,qb0ort,Anxiety,https://www.reddit.com/r/Anxiety/comments/qb0o...,0,With both the subreddit and Discord continuing...,1634606000.0,With both the subreddit and Discord continuing...,"[with, both, the, subreddit, and, discord, con..."
2,anxiety,fuck,159,qe7rl0,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe7r...,32,I went to get my haircut and the person cuttin...,1635005000.0,I went to get my haircut and the person cuttin...,"[i, went, to, get, my, haircut, and, the, pers..."


In [32]:
# English stopword list from NLTK; the 'stopwords' corpus must be available
# locally (fetch it once with nltk.download('stopwords')).
stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stopwords.

    Parameters
    ----------
    text : list of str
        Tokens to filter. They are compared as-is, so they should already be
        lower-cased.

    Returns
    -------
    list of str
        The tokens, in their original order, that match neither an entry of
        the module-level ``stopword`` list nor the punctuation-stripped form
        of an entry.
    """
    # NLTK spells contractions with an apostrophe ("don't"), but tokens arrive
    # here after remove_punct has deleted apostrophes ("dont"). Matching the
    # stripped spellings as well keeps those contractions from slipping through.
    # Caveat: a stripped form can coincide with a real word ("won't" -> "wont").
    strip_punct = str.maketrans('', '', string.punctuation)

    # Built on every call so it always reflects the current `stopword` list;
    # a set makes each membership test constant-time instead of a list scan.
    blocked = set(stopword)
    blocked.update(entry.translate(strip_punct) for entry in stopword)

    return [word for word in text if word not in blocked]
    
# Filter the stopwords out of every token list and keep the result in a new column.
df['reddit_nonstop'] = df['reddit_tokenized'].apply(remove_stopwords)
df.head(3)

Unnamed: 0,topic,title,score,id,subreddit,url,num_comments,body,created,reddit_punct,reddit_tokenized,reddit_nonstop
0,anxiety,Let your light shine!,17,qc0aqd,Anxiety,https://www.reddit.com/r/Anxiety/comments/qc0a...,23,Congrats /r/anxiety we've all made it to Wedne...,1634735000.0,Congrats ranxiety weve all made it to Wednesda...,"[congrats, ranxiety, weve, all, made, it, to, ...","[congrats, ranxiety, weve, made, wednesday, we..."
1,anxiety,Looking for new mods! (subreddit and Discord),12,qb0ort,Anxiety,https://www.reddit.com/r/Anxiety/comments/qb0o...,0,With both the subreddit and Discord continuing...,1634606000.0,With both the subreddit and Discord continuing...,"[with, both, the, subreddit, and, discord, con...","[subreddit, discord, continuing, grow, looking..."
2,anxiety,fuck,159,qe7rl0,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe7r...,32,I went to get my haircut and the person cuttin...,1635005000.0,I went to get my haircut and the person cuttin...,"[i, went, to, get, my, haircut, and, the, pers...","[went, get, haircut, person, cutting, cut, way..."


# 3. Build Model

In [None]:
def sentiment_analysis(data):
    """Score every post in ``data['body']`` with TextBlob and label its polarity.

    Three columns are added to ``data`` in place:

    * ``TextBlob_Subjectivity`` - TextBlob's subjectivity score for the post.
    * ``TextBlob_Polarity`` - TextBlob's polarity score for the post.
    * ``TextBlob_Analysis`` - ``'Negative'`` when the polarity is below zero,
      ``'Neutral'`` when it is exactly zero, ``'Positive'`` otherwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``body`` column holding the post text as strings.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the three columns above added.
    """
    def getAnalysis(score):
        # Map a polarity score onto a coarse sentiment label.
        if score < 0:
            return 'Negative'
        elif score == 0:
            return 'Neutral'
        else:
            return 'Positive'

    # Run TextBlob once per post and reuse the result for both scores rather
    # than analysing every post twice.
    sentiments = [TextBlob(text).sentiment for text in data['body']]

    # Plain lists are assigned positionally, so the scores line up with the
    # rows even when the index has gaps (e.g. after dropna()).
    data['TextBlob_Subjectivity'] = [s.subjectivity for s in sentiments]
    data['TextBlob_Polarity'] = [s.polarity for s in sentiments]
    data['TextBlob_Analysis'] = data['TextBlob_Polarity'].apply(getAnalysis)
    return data

# References:

- https://towardsdatascience.com/my-absolute-go-to-for-sentiment-analysis-textblob-3ac3a11d524
- https://towardsdatascience.com/cleaning-preprocessing-text-data-for-sentiment-analysis-382a41f150d6
