In [85]:
# imports
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os

In [86]:
# determine the current working directory to make it easier to find file paths
os.getcwd()

'/Users/haydenprescott/projects/red-flag-text-detection/data/sources/set2'

In [87]:
# read the csv file containing the second dataset used for training/evaluation, and convert it to a pandas dataframe
df = pd.read_csv("./DepressionDetectionNLP/final_training_data.txt", delimiter="\t")
# Rename column 'combined' to 'text' for better clarity
df.rename(columns={'combined':'text'}, inplace=True)

# Add columns for positive/negative sentiment and the label
df["label"] = ""
df["pos"] = ""
df["neg"] = ""

In [88]:
#Perform Sentiment Analysis on Training Data and Populate Pandas DataFrame
analyzer = SentimentIntensityAnalyzer()

# Thresholds to discard outliers (positive posts in suicide watch subreddit & negative posts in casual conversation subreddit)
NEG_DISCARD_THRESHOLD = 0.1
POS_DISCARD_THRESHOLD = 0.1

# Discard outliers (CC with negative sentiment and SW with positive sentiment)
for index, row in df.iterrows():
    subreddit = row.subreddit
    
    # Run sentiment anlysis for the post
    scores = analyzer.polarity_scores(row.text)
    row.pos = scores['pos']
    row.neg = scores['neg']

for index, row in df.iterrows():

    if (row.neg > NEG_DISCARD_THRESHOLD and subreddit == "cc") or (row.pos > POS_DISCARD_THRESHOLD and subreddit == "sw"):
        df.drop(index, inplace=True)

# Label rows based on subreddit (CC, SW)
# Conventions: depressive = 1, neutral = 0
for index, row in df.iterrows():
    subreddit = row.subreddit
    if subreddit == "cc":
        row.label = 0
    elif subreddit == "sw":
        row.label = 1
    else:
        row.label = np.nan
    
df.head(10)

Unnamed: 0,text,subreddit,label,pos,neg
6,Where should I read my college required readin...,cc,0,0.0,0.0
7,"Your and You’re An easy way to remember it, fo...",cc,0,0.078,0.066
8,"did i miss a day? oh poop. well, halfway to 5 ...",cc,0,0.353,0.028
9,My flight just got delayed for super long.. wh...,cc,0,0.065,0.032
11,a Thank you guys so much for responding. I rea...,sw,1,0.422,0.0
13,How’s it going Just wanted to talk nothing in ...,cc,0,0.0,0.0
17,The Hungarian language has more than 20 words ...,cc,0,0.082,0.043
19,Back again. I posted a couple times here about...,sw,1,0.0,0.098
20,just some observations funny how I know I’ll k...,sw,1,0.235,0.071
22,I didn’t kiss him goodbye like I usually do. I...,sw,1,0.29,0.0


In [89]:
# fill the id column of the dataframe with unique integers starting from 0, and counting up until all of the rows have a unique id
df["id"] = list(range(df.shape[0]))
df.head(100)

Unnamed: 0,text,subreddit,label,pos,neg,id
6,Where should I read my college required readin...,cc,0,0.0,0.0,0
7,"Your and You’re An easy way to remember it, fo...",cc,0,0.078,0.066,1
8,"did i miss a day? oh poop. well, halfway to 5 ...",cc,0,0.353,0.028,2
9,My flight just got delayed for super long.. wh...,cc,0,0.065,0.032,3
11,a Thank you guys so much for responding. I rea...,sw,1,0.422,0.0,4
...,...,...,...,...,...,...
225,Any trustworthy people out there joining in on...,cc,0,0.212,0.034,95
231,What is really the point? I'm beginning to ser...,sw,1,0.129,0.07,96
232,"I just stomped through downtown Tokyo, knockin...",cc,0,0.21,0.073,97
233,I have class at 8am Yea I regret choosing to h...,cc,0,0.047,0.047,98


In [90]:
# export the set 2 dataframe to a csv file
df.to_csv("./set2_processed.csv")