# 1. Preparation

In [None]:
# pip install -U textblob

In [3]:
import pandas as pd
import numpy as np
import string
import re
from textblob import TextBlob
import nltk
# nltk.download('stopwords')

In [4]:
# Read the scraped Reddit posts; the default dropna() removes every row that has
# a missing value in any column, not only rows whose 'body' is missing.
# NOTE(review): the frames displayed further down carry an 'Unnamed: 0' column,
# which usually means the CSV was saved together with its index - confirm
# whether index_col=0 should be passed here.
df = pd.read_csv('data/reddit_data.csv').dropna()
# Series of post texts; the last line displays its first three entries.
posts = df['body']
posts.head(3)

0    Congrats /r/anxiety we've all made it to Wedne...
1    With both the subreddit and Discord continuing...
2    I went to get my haircut and the person cuttin...
Name: body, dtype: object

# 2. Clean Data

1. Remove punctuation
2. Tokenization - Converting a sentence into list of words
3. Remove stopwords
4. Lemmatization/stemming - Transforming any form of a word to its root word

In [5]:
def remove_punct(text):
    """Strip ASCII punctuation and digit runs from ``text``.

    Every character listed in ``string.punctuation`` is deleted first, then
    each run of the digits 0-9 is deleted. Nothing is put in place of the
    removed characters, so contractions collapse into non-words
    (e.g. "We've" becomes "Weve") and whitespace is left untouched.
    """
    without_punct = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub('[0-9]+', '', without_punct)

# New column: each post body with punctuation and digits stripped out.
df['reddit_punct'] = df['body'].apply(remove_punct)
df.head(3)

Unnamed: 0,topic,title,score,id,subreddit,url,num_comments,body,created,reddit_punct
0,anxiety,Let your light shine!,18,qc0aqd,Anxiety,https://www.reddit.com/r/Anxiety/comments/qc0a...,23,Congrats /r/anxiety we've all made it to Wedne...,1634735000.0,Congrats ranxiety weve all made it to Wednesda...
1,anxiety,Looking for new mods! (subreddit and Discord),11,qb0ort,Anxiety,https://www.reddit.com/r/Anxiety/comments/qb0o...,0,With both the subreddit and Discord continuing...,1634606000.0,With both the subreddit and Discord continuing...
2,anxiety,fuck,239,qe7rl0,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe7r...,39,I went to get my haircut and the person cuttin...,1635005000.0,I went to get my haircut and the person cuttin...


In [6]:
def tokenization(text):
    """Split ``text`` into a list of word tokens.

    A token is a maximal run of word characters (regex ``\\w``: letters,
    digits and underscore). Everything else acts as a separator.

    Compared with splitting on ``\\W+``, collecting the ``\\w+`` matches never
    yields empty-string tokens: text that is empty, or that starts or ends
    with a separator, no longer produces ``''`` entries. The pattern is a raw
    string so the backslash is not treated as an (invalid) string escape.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance; ``[]`` when there are none.
    """
    return re.findall(r'\w+', text)

# New column: lower-case each cleaned post, then split it into word tokens.
df['reddit_tokenized'] = df['reddit_punct'].apply(str.lower).apply(tokenization)
df.head(3)

Unnamed: 0,topic,title,score,id,subreddit,url,num_comments,body,created,reddit_punct,reddit_tokenized
0,anxiety,Let your light shine!,18,qc0aqd,Anxiety,https://www.reddit.com/r/Anxiety/comments/qc0a...,23,Congrats /r/anxiety we've all made it to Wedne...,1634735000.0,Congrats ranxiety weve all made it to Wednesda...,"[congrats, ranxiety, weve, all, made, it, to, ..."
1,anxiety,Looking for new mods! (subreddit and Discord),11,qb0ort,Anxiety,https://www.reddit.com/r/Anxiety/comments/qb0o...,0,With both the subreddit and Discord continuing...,1634606000.0,With both the subreddit and Discord continuing...,"[with, both, the, subreddit, and, discord, con..."
2,anxiety,fuck,239,qe7rl0,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe7r...,39,I went to get my haircut and the person cuttin...,1635005000.0,I went to get my haircut and the person cuttin...,"[i, went, to, get, my, haircut, and, the, pers..."


In [7]:
stopword = nltk.corpus.stopwords.words('english')
# Frozen-set copy of the list above, so each membership test is a hash lookup
# instead of a scan through the whole list. Rebuild it if `stopword` is changed.
_stopword_lookup = frozenset(stopword)


def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter. They are compared as-is against NLTK's English
        stopword list, so no case folding happens here.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    return [word for word in text if word not in _stopword_lookup]
    
# New column: the token lists with English stopwords filtered out.
df['reddit_nonstop'] = df['reddit_tokenized'].apply(remove_stopwords)
df.head(3)

Unnamed: 0,topic,title,score,id,subreddit,url,num_comments,body,created,reddit_punct,reddit_tokenized,reddit_nonstop
0,anxiety,Let your light shine!,18,qc0aqd,Anxiety,https://www.reddit.com/r/Anxiety/comments/qc0a...,23,Congrats /r/anxiety we've all made it to Wedne...,1634735000.0,Congrats ranxiety weve all made it to Wednesda...,"[congrats, ranxiety, weve, all, made, it, to, ...","[congrats, ranxiety, weve, made, wednesday, we..."
1,anxiety,Looking for new mods! (subreddit and Discord),11,qb0ort,Anxiety,https://www.reddit.com/r/Anxiety/comments/qb0o...,0,With both the subreddit and Discord continuing...,1634606000.0,With both the subreddit and Discord continuing...,"[with, both, the, subreddit, and, discord, con...","[subreddit, discord, continuing, grow, looking..."
2,anxiety,fuck,239,qe7rl0,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe7r...,39,I went to get my haircut and the person cuttin...,1635005000.0,I went to get my haircut and the person cuttin...,"[i, went, to, get, my, haircut, and, the, pers...","[went, get, haircut, person, cutting, cut, way..."


# 3. Build Model

In [8]:
def sentiment_analysis(data, text_col='body'):
    """Score every text in ``data[text_col]`` with TextBlob and label its polarity.

    Three columns are written onto ``data`` itself (the frame is modified in
    place, not copied):

    - ``TextBlob_Subjectivity``: ``TextBlob(text).sentiment.subjectivity``
    - ``TextBlob_Polarity``: ``TextBlob(text).sentiment.polarity``
    - ``TextBlob_Analysis``: 'Negative' when the polarity is below 0,
      'Neutral' when it equals 0, otherwise 'Positive'

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the texts to score.
    text_col : str, default 'body'
        Name of the column whose values are passed to TextBlob.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the three columns added.
    """
    def getAnalysis(score):
        # Map a polarity score onto one of three labels.
        if score < 0:
            return 'Negative'
        elif score == 0:
            return 'Neutral'
        else:
            return 'Positive'

    # Build one TextBlob per row and keep its sentiment result, so both scores
    # below come from a single evaluation instead of two.
    sentiments = data[text_col].apply(lambda text: TextBlob(text).sentiment)

    data['TextBlob_Subjectivity'] = sentiments.apply(lambda result: result.subjectivity)
    data['TextBlob_Polarity'] = sentiments.apply(lambda result: result.polarity)
    data['TextBlob_Analysis'] = data['TextBlob_Polarity'].apply(getAnalysis)
    return data

In [9]:
# sentiment_analysis adds its TextBlob_* columns to the frame it receives and
# returns that same frame, so `sentiment` and `df` refer to one object here.
sentiment = sentiment_analysis(df)
sentiment.head(3)

Unnamed: 0,topic,title,score,id,subreddit,url,num_comments,body,created,reddit_punct,reddit_tokenized,reddit_nonstop,TextBlob_Subjectivity,TextBlob_Polarity,TextBlob_Analysis
0,anxiety,Let your light shine!,18,qc0aqd,Anxiety,https://www.reddit.com/r/Anxiety/comments/qc0a...,23,Congrats /r/anxiety we've all made it to Wedne...,1634735000.0,Congrats ranxiety weve all made it to Wednesda...,"[congrats, ranxiety, weve, all, made, it, to, ...","[congrats, ranxiety, weve, made, wednesday, we...",0.480967,0.223088,Positive
1,anxiety,Looking for new mods! (subreddit and Discord),11,qb0ort,Anxiety,https://www.reddit.com/r/Anxiety/comments/qb0o...,0,With both the subreddit and Discord continuing...,1634606000.0,With both the subreddit and Discord continuing...,"[with, both, the, subreddit, and, discord, con...","[subreddit, discord, continuing, grow, looking...",0.455455,0.003295,Positive
2,anxiety,fuck,239,qe7rl0,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe7r...,39,I went to get my haircut and the person cuttin...,1635005000.0,I went to get my haircut and the person cuttin...,"[i, went, to, get, my, haircut, and, the, pers...","[went, get, haircut, person, cutting, cut, way...",0.400183,-0.084753,Negative


In [17]:
# Keep the rows labelled 'Negative', then report their count and their share of all posts.
neg_sentiment = sentiment.loc[sentiment['TextBlob_Analysis'].eq('Negative')]
print('number of negative posts: ', neg_sentiment.shape[0])
print('percentage of negative posts: ', neg_sentiment.shape[0] / df.shape[0] * 100, '%')

number of negative posts:  1346
percentage of negative posts:  47.32770745428973 %


# References:

- https://towardsdatascience.com/my-absolute-go-to-for-sentiment-analysis-textblob-3ac3a11d524
- https://towardsdatascience.com/cleaning-preprocessing-text-data-for-sentiment-analysis-382a41f150d6
