# 1. Preparation

In [49]:
# pip install -U textblob

In [73]:
import pandas as pd
import numpy as np
import string
import re
from textblob import TextBlob
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sparshbohra/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [66]:
df = pd.read_csv('data/reddit_data_2.0.csv').dropna()
posts = df['body']
posts.head(3)

0    Congrats /r/anxiety we've all made it to Wedne...
1    With both the subreddit and Discord continuing...
2    I went to get my haircut and the person cuttin...
Name: body, dtype: object

In [67]:
df.size

17568

# 2. Clean Data

1. Remove punctuations
2. Tokenization - Converting a sentence into list of words
3. Remove stopwords

In [68]:
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])     # this changes contraction to non-words (e.g. "We've" to "weve")
    text = re.sub('[0-9]+', '', text)
    return text

df['reddit_punct'] = df['body'].apply(lambda x: remove_punct(x))
df.head(3)

Unnamed: 0,topic,title,score,id,subreddit,url,num_comments,body,created,reddit_punct
0,anxiety,Let your light shine!,19,qc0aqd,Anxiety,https://www.reddit.com/r/Anxiety/comments/qc0a...,25,Congrats /r/anxiety we've all made it to Wedne...,1634735000.0,Congrats ranxiety weve all made it to Wednesda...
1,anxiety,Looking for new mods! (subreddit and Discord),11,qb0ort,Anxiety,https://www.reddit.com/r/Anxiety/comments/qb0o...,0,With both the subreddit and Discord continuing...,1634606000.0,With both the subreddit and Discord continuing...
2,anxiety,fuck,409,qe7rl0,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe7r...,47,I went to get my haircut and the person cuttin...,1635005000.0,I went to get my haircut and the person cuttin...


In [69]:
stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text
    
df['reddit_nonstop'] = df['reddit_punct'].apply(lambda x: remove_stopwords(x))
df.head(3)

Unnamed: 0,topic,title,score,id,subreddit,url,num_comments,body,created,reddit_punct,reddit_nonstop
0,anxiety,Let your light shine!,19,qc0aqd,Anxiety,https://www.reddit.com/r/Anxiety/comments/qc0a...,25,Congrats /r/anxiety we've all made it to Wedne...,1634735000.0,Congrats ranxiety weve all made it to Wednesda...,"[C, n, g, r, , r, n, x, e, , w, e, v, e, , ..."
1,anxiety,Looking for new mods! (subreddit and Discord),11,qb0ort,Anxiety,https://www.reddit.com/r/Anxiety/comments/qb0o...,0,With both the subreddit and Discord continuing...,1634606000.0,With both the subreddit and Discord continuing...,"[W, h, , b, h, , h, e, , u, b, r, e, , n, ..."
2,anxiety,fuck,409,qe7rl0,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe7r...,47,I went to get my haircut and the person cuttin...,1635005000.0,I went to get my haircut and the person cuttin...,"[I, , w, e, n, , , g, e, , , h, r, c, u, ..."


In [70]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

def low_caps(text):
    text = text.lower()
    return text

df['reddit_punct'] = df['reddit_punct'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['reddit_punct'] = df['reddit_punct'].apply(lambda x: low_caps(x))
df.head()


Unnamed: 0,topic,title,score,id,subreddit,url,num_comments,body,created,reddit_punct,reddit_nonstop
0,anxiety,Let your light shine!,19,qc0aqd,Anxiety,https://www.reddit.com/r/Anxiety/comments/qc0a...,25,Congrats /r/anxiety we've all made it to Wedne...,1634735000.0,congrats ranxiety weve made wednesday this wee...,"[C, n, g, r, , r, n, x, e, , w, e, v, e, , ..."
1,anxiety,Looking for new mods! (subreddit and Discord),11,qb0ort,Anxiety,https://www.reddit.com/r/Anxiety/comments/qb0o...,0,With both the subreddit and Discord continuing...,1634606000.0,with subreddit discord continuing grow looking...,"[W, h, , b, h, , h, e, , u, b, r, e, , n, ..."
2,anxiety,fuck,409,qe7rl0,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe7r...,47,I went to get my haircut and the person cuttin...,1635005000.0,i went get haircut person cutting cut way shor...,"[I, , w, e, n, , , g, e, , , h, r, c, u, ..."
3,anxiety,Does anyone else feel bad for inanimate objects?,143,qeecds,Anxiety,https://www.reddit.com/r/Anxiety/comments/qeec...,27,"For example, I feel bad for a snowblower that ...",1635026000.0,for example i feel bad snowblower parents don’...,"[F, r, , e, x, p, l, e, , I, , f, e, e, l, ..."
4,anxiety,Does anybody have like a week of no anxiety th...,248,qe3u9l,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe3u...,26,I swear I have been in a cycle of going “I’ve ...,1634992000.0,i swear i cycle going “i’ve never felt better”...,"[I, , w, e, r, , I, , h, v, e, , b, e, e, ..."


In [71]:
df.to_csv('clean_data3.csv')

# 3. Calculate polarity & subjectivity

In [32]:
def sentiment_analysis(data):
    #Create a function to get the subjectivity
    def getSubjectivity(text):
        return TextBlob(text).sentiment.subjectivity
  
    #Create a function to get the polarity
    def getPolarity(text):
        return TextBlob(text).sentiment.polarity
    
    #Create two new columns ‘Subjectivity’ & ‘Polarity’
    data['TextBlob_Subjectivity'] = data['body'].apply(getSubjectivity)
    data['TextBlob_Polarity'] = data['body'].apply(getPolarity)
    def getAnalysis(score):
        if score < 0:
            return 'Negative'
        elif score == 0:
            return 'Neutral'
        else:
            return 'Positive'
    data['TextBlob_Analysis'] = data['TextBlob_Polarity'].apply(getAnalysis)
    return data

In [33]:
sentiment = sentiment_analysis(df)
sentiment.head(3)

Unnamed: 0,topic,title,score,id,subreddit,url,num_comments,body,created,reddit_punct,reddit_tokenized,reddit_nonstop,TextBlob_Subjectivity,TextBlob_Polarity,TextBlob_Analysis
0,anxiety,Let your light shine!,19,qc0aqd,Anxiety,https://www.reddit.com/r/Anxiety/comments/qc0a...,25,Congrats /r/anxiety we've all made it to Wedne...,1634735000.0,Congrats ranxiety weve all made it to Wednesda...,"[congrats, ranxiety, weve, all, made, it, to, ...","[congrats, ranxiety, weve, made, wednesday, we...",0.0,0.0,Neutral
1,anxiety,Looking for new mods! (subreddit and Discord),11,qb0ort,Anxiety,https://www.reddit.com/r/Anxiety/comments/qb0o...,0,With both the subreddit and Discord continuing...,1634606000.0,With both the subreddit and Discord continuing...,"[with, both, the, subreddit, and, discord, con...","[subreddit, discord, continuing, grow, looking...",0.0,0.0,Neutral
2,anxiety,fuck,409,qe7rl0,Anxiety,https://www.reddit.com/r/Anxiety/comments/qe7r...,47,I went to get my haircut and the person cuttin...,1635005000.0,I went to get my haircut and the person cuttin...,"[i, went, to, get, my, haircut, and, the, pers...","[went, get, haircut, person, cutting, cut, way...",0.0,0.0,Neutral


In [34]:
neg_sentiment = sentiment[sentiment['TextBlob_Analysis'] == 'Negative']  # range of polarity is [-1,1]
print('number of negative posts: ',len(neg_sentiment))
print('percentage of negative posts: ', len(neg_sentiment)/len(df)*100, '%')

number of negative posts:  35
percentage of negative posts:  1.7930327868852458 %


In [35]:
subj_sentiment = sentiment[sentiment['TextBlob_Subjectivity'] > 0.5] # range of subjectivity is [0,1]
print('number of subjective posts: ',len(subj_sentiment))
print('percentage of subjective posts: ', len(subj_sentiment)/len(df)*100, '%')

number of subjective posts:  73
percentage of subjective posts:  3.7397540983606556 %


In [36]:
new_data = sentiment[sentiment['TextBlob_Subjectivity'] > 0.5] 
new_data = new_data[new_data['TextBlob_Analysis']=='Negative']
print('number of negative & subjective posts: ',len(new_data))
print('percentage of negative & subjective posts: ', len(new_data)/len(df)*100, '%')

number of negative & subjective posts:  35
percentage of negative & subjective posts:  1.7930327868852458 %


## Classifier (incomplete)

In [40]:
# Creating a dataframe with 50%
# values of original dataframe
train = new_data.sample(frac = 0.7)
train_label = train['subreddit']
train_df = train['reddit_nonstop']
  
# Creating dataframe with 
# rest of the 50% values
test = new_data.drop(train.index)
test_label = test['subreddit']
test_df = test['reddit_nonstop']
print('size of training set: ',len(train))
print('size of testing set: ',len(test))

size of training set:  24
size of testing set:  11


In [34]:
import os
import numpy as np
import pandas as pd

import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
from tensorflow.python.keras.models import Model, Sequential
from tensorflow.python.keras.layers import Dense, Embedding, Input, Activation, Masking
from tensorflow.python.keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras import optimizers, initializers, layers

# import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

In [36]:
max_features = 20000
maxlen = 200
tokenizer = Tokenizer(num_words = max_features)

In [37]:
# Tokenize

tokenizer.fit_on_texts(train_df)
X_train_token = tokenizer.texts_to_sequences(train_df)

tokenizer.fit_on_texts(test_df)
X_test_token = tokenizer.texts_to_sequences(test_df)


In [38]:
# Pad

X_train = pad_sequences(X_train_token, maxlen = maxlen, padding = 'post')
X_test  = pad_sequences(X_test_token, maxlen = maxlen, padding = 'post')
print(X_train.shape, X_test.shape)

(416, 200) (179, 200)


In [42]:
targets = ['Anxiety','Depression']
targets

['Anxiety', 'Depression']

In [44]:
# Create the Model

model = Sequential([Input(shape=(maxlen, )),
                    Embedding(max_features, 128, mask_zero = True),
                    LSTM(64, return_sequences = True, dropout = 0.2),
                    GlobalMaxPool1D(),
                    Dropout(0.2),
                    Dense(64, activation = 'relu'),
                    Dropout(0.2),
                    Dense(2, activation = 'softmax')])

model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 128)          2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 200, 64)           49408     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                

In [47]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

monitor = EarlyStopping(monitor = 'val_loss', 
                        min_delta = 1e-3, 
                        patience = 5, verbose = 1, 
                        restore_best_weights = True)

history = model.fit(X_train, targets,
                    batch_size = 32,
                    epochs = 3, validation_split = 0.1,
                    callbacks = [monitor])

Using TensorFlow backend.


ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'str'>"})

In [46]:
conda install keras

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3

  added / updated specs:
    - keras


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    keras-2.3.1                |                0          12 KB
    keras-base-2.3.1           |           py37_0         501 KB
    ------------------------------------------------------------
                                           Total:         513 KB

The following NEW packages will be INSTALLED:

  keras              pkgs/main/osx-64::keras-2.3.1-0
  keras-base         pkgs/main/osx-64::keras-base-2.3.1-py37_0

The following packages will be SUPERSEDED by a higher-priority channel:

  ca-certificates    conda-forge::ca-certificates-2021.10.~ --> pkgs/main::ca-certificates-2021.9.30-hecd8cb5_1
  certifi            conda-forge::certifi-2021.10.8-py37hf~ --

# Backup

In [20]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
for sentence in train_data:
    vs = analyzer.polarity_scores(sentence)
    print("{:-<65} {}".format(sentence, str(vs)))

TypeError: unsupported format string passed to list.__format__

In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from sklearn.model_selection import train_test_split
import os

# !pip install bert-tensorflow
import bert
# from bert import run_classifier
from bert import optimization
from bert import tokenization

In [14]:
print("tensorflow version : ", tf.__version__)
print("tensorflow_hub version : ", hub.__version__)

tensorflow version :  2.0.0
tensorflow_hub version :  0.12.0


In [15]:
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.compat.v1.Session() as sess:
        vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


RuntimeError: The Session graph is empty.  Add operations to the graph before calling run().

In [None]:
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 1000

# Convert our train and validation features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_data, train_label, MAX_SEQ_LENGTH, tokenizer)

val_features = bert.run_classifier.convert_examples_to_features(val_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

# References:

- https://towardsdatascience.com/my-absolute-go-to-for-sentiment-analysis-textblob-3ac3a11d524
- https://towardsdatascience.com/cleaning-preprocessing-text-data-for-sentiment-analysis-382a41f150d6
- https://github.com/HenrySilvaCS/SentiMentalHealth/blob/main/src/models.py
