# Sentiment Analysis

Research Question: What linguistic patterns and sentiment markers are most indicative of suicidal intent in social media text?

## Import Relevant Modules

In [24]:
import os
import sys
DATA_PATH = os.path.abspath(os.path.join(os.path.dirname(os.curdir), 'data'))
ASSIGNMENT_1_PATH = os.path.abspath(
    os.path.join(
        os.path.dirname(os.curdir), 
        "..",
        'Assignment 1',
        "src"
        )
)
sys.path.append(DATA_PATH)
sys.path.append(ASSIGNMENT_1_PATH)
import zipfile
import shutil
import re
from collections import Counter
import numpy as np
import pandas as pd

In [28]:
stopwords = open(
    os.path.join(
        ASSIGNMENT_1_PATH, 
        "StopWords.txt"
    ), "r").read().splitlines()
print(f"Number of Stopwords in consideration: {len(stopwords)}")

Number of Stopwords in consideration: 779


## Fetch the Texts from the Datasets

In [30]:
depression = pd.read_json(
    os.path.join(
        DATA_PATH, 
        'Depression_Tweets', 
        'depression_json'
    )
)
reddit = pd.read_csv(
    os.path.join(
        DATA_PATH, 
        'Reddit_SuicideWatch', 
        'reddit_suicidewatch.csv'
    ), 
    encoding='utf-8'
)
social_media_sentiment_analysis = pd.read_csv(
    os.path.join(
        DATA_PATH, 
        'Social_Media_Sentiments_Analysis_Dataset', 
        'sentimentdataset_annotated_binary.csv'
    ), 
    encoding='utf-8'
)
twitter_suicidal_data = pd.read_csv(
    os.path.join(
        DATA_PATH, 
        'Twitter_Suicidal_Data', 
        'twitter-suicidal_data.csv'
    ), 
    encoding='utf-8'
)


In [32]:
depression.columns

Index(['content'], dtype='object')

In [35]:
reddit.columns

Index(['subreddit', 'selftext', 'author_fullname', 'title', 'hide_score',
       'name', 'upvote_ratio', 'ups', 'author_flair_template_id', 'score',
       'edited', 'author_flair_css_class', 'created', 'selftext_html',
       'no_follow', 'over_18', 'id', 'author', 'num_comments',
       'author_flair_text_color', 'permalink', 'url', 'created_utc'],
      dtype='object')

In [37]:
social_media_sentiment_analysis.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Text', 'Sentiment', 'Timestamp', 'User',
       'Platform', 'Hashtags', 'Retweets', 'Likes', 'Country', 'Year', 'Month',
       'Day', 'Hour', 'Annotation'],
      dtype='object')

In [40]:
twitter_suicidal_data.columns

Index(['tweet', 'intention'], dtype='object')

## N-Gram Analysis

In [29]:
def get_n_gram(n: int=2, corpus: str="", stopwords: list=[]):
    pattern = re.compile(r'\b\w+\b')
    # filtered corpus should have no punctuations AND stopwords
    filtered_corpus = [word for word in corpus if pattern.match(word) and word not in stopwords]
    # Compute bigrams from the filtered corpus
    ngrams = [tuple(filtered_corpus[i:i+n]) for i in range(len(filtered_corpus)-n+1)]
    return ngrams

In [None]:
N = 5
# Depression
ngrams_depression = get_n_gram(
    n=N,
    corpus=depression['content'][0].split(),
)
print(f"Number of {N}-gram from Depression JSON dataset: {len(ngrams_depression)}")
print(f"First 20 {N}-grams from Depression JSON dataset: {ngrams_depression[:20]}")

# Reddit
reddit['combined_text'] = reddit['title'] + " " + reddit['selftext']
ngrams_reddit = get_n_gram(
    n=N,
    corpus=reddit['combined_text'][0].split(),
)
print(f"Number of {N}-gram from Reddit SuicideWatch dataset: {len(ngrams_reddit)}")
print(f"First 20 {N}-grams from Reddit SuicideWatch dataset: {ngrams_reddit[:20]}")

# Social Media Sentiment Analysis
social_media_sentiment_analysis['combined_text'] = social_media_sentiment_analysis['Text'] + " " + social_media_sentiment_analysis['Hashtags']
ngrams_social_media_sentiment_analysis = get_n_gram(
    n=N,
    corpus=social_media_sentiment_analysis['combined_text'][0].split(),
)
print(f"Number of {N}-gram from Social Media Sentiment Analysis dataset: {len(ngrams_social_media_sentiment_analysis)}")
print(f"First 20 {N}-grams from Social Media Sentiment Analysis dataset: {ngrams_social_media_sentiment_analysis[:20]}")

# Twitter Suicidal Data
ngrams_twitter_suicidal_data = get_n_gram(
    n=N,
    corpus=twitter_suicidal_data['tweet'][0].split(),
)
print(f"Number of {N}-gram from Twitter Suicidal dataset: {len(ngrams_twitter_suicidal_data)}")
print(f"First 20 {N}-grams from Twitter Suicidal dataset: {ngrams_twitter_suicidal_data[:20]}")

Number of 5-gram from Depression JSON dataset: 13
First 20 5-grams from Depression JSON dataset: [('Yea', 'typically', 'crying', 'is', 'a'), ('typically', 'crying', 'is', 'a', 'sign'), ('crying', 'is', 'a', 'sign', 'of'), ('is', 'a', 'sign', 'of', 'uncontrolled'), ('a', 'sign', 'of', 'uncontrolled', 'depression'), ('sign', 'of', 'uncontrolled', 'depression', 'which'), ('of', 'uncontrolled', 'depression', 'which', 'he'), ('uncontrolled', 'depression', 'which', 'he', 'has'), ('depression', 'which', 'he', 'has', 'struggled'), ('which', 'he', 'has', 'struggled', 'with'), ('he', 'has', 'struggled', 'with', 'his'), ('has', 'struggled', 'with', 'his', 'whole'), ('struggled', 'with', 'his', 'whole', "life'")]
Number of 5-gram from Depression JSON dataset: 225
First 20 5-grams from Depression JSON dataset: [('Help?', 'I', 'reached', 'a', 'really'), ('I', 'reached', 'a', 'really', 'low'), ('reached', 'a', 'really', 'low', 'point'), ('a', 'really', 'low', 'point', 'today.'), ('really', 'low', 'po

## Sentiment Analysis with TextBlob