In [39]:
import pandas as pd
import numpy as np
import re
import datetime

import spacy
import en_core_web_sm
from spacy.lemmatizer import Lemmatizer

In [5]:
# read in the dataframe

df = pd.read_csv('../data/cleaned/combined.csv')

In [6]:
# check the shape and the first 5 rows

print(df.shape)
df.head()

(102031, 4)


Unnamed: 0,subreddit,author,date,post
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new..."
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...


### Feature Engineering and Preprocessing

In [7]:
## select posts only created in 2020

df['date'].dtype # object -> should be changed to datetime

dtype('O')

In [8]:
# changing date field to datetime datatype

df['date'] = pd.to_datetime(df['date'])

In [9]:
# create a function to create a list of 1s and 0s

def create_target(column):    
    
    labels = []
    
    # separate each row
    for text in column:
        
        # initiate 0 as label
        label = 0
        
        # iterate through words in relevant_words
        for relevant_word in relevant:
            
            # if relevant_word is found in text, assign 1 as label
            # stop comparing once assigned as 1, then append it to the list
            if relevant_word in text.lower():
                label = 1
                break
        labels.append(label)
        
    return labels

In [12]:
# check if the post is related to COVID

relevant = ['corona', 'virus', 'viral', 'covid', 'sars', 'influenza', 'pandemic', 'epidemic', 'quarantine', 'lockdown',
            'distancing', 'national emergency', 'flatten', 'infect', 'ventilator', 'mask', 'symptomatic', 'epidemiolog',
            'immun', 'incubation', 'transmission', 'vaccine', 'outbreak', 'epicenter', 'covid', 'contagi', 'transmit', 
            'transmission', 'community spread', 'shelter in place', 'immunocompromis', 'isolation', 'n95', 'incubation',
            'travel ban', 'wuhan', 'china virus', 'chinese virus', 'shutdown', 'confirmed case', 'stay at home', 'stay-at-home',
            'work from home', 'working from home', 'zoom meeting', 'stuck inside', 'stuck home', 'flu', 'toilet paper', 
            'toilette paper', 'asymptomatic']

In [13]:
# assign a column and fill it with labels

df['covid_related'] = create_target(df['post'])

df['covid_related'].value_counts(normalize = True)

0    0.886632
1    0.113368
Name: covid_related, dtype: float64

In [14]:
# check if the post indicates suicide suggestive

relevant = ['suicide', 'jump off a bridge', 'i want to overdose', 'i wanna overdose', "i’m a burden", "i’m such a burden", 
            'feel like a burden', 'shoot myself', 'shooting myself', 'slit my wrist', "don't wanna wake up", 'live any more',
            'i will overdose', 'thinking about overdose', 'kill myself', 'killing myself', 'hang myself', "can't do this anymore",
            'hanging myself', 'cut myself', 'cutting myself', 'hurt myself', 'hurting myself', 'want to die', 'wanna die',
            "don’t want to wake up", "don’t wake up", 'never want to wake up', "don’t want to be alive", 'want to be alive', 
            'wish it would all end', 'done with living', 'want it to end, it all ends tonight', 'live anymore', 
            'living anymore', 'life anymore', 'be dead', 'take it anymore', 'end my life', 'think about death', 'hopeless', 
            'hurt myself', 'no one will miss me', 'if i live or die', 'i hate my life', 'shoot me', 'kill me', 'suicide', 
            'no point', 'deserve to die', "i'm worthless", 'no one would care', 'excedrin', 'ibuprofen', 'acetaminophen', 
            '800 mg', '800mg', 'end my life', 'want it to stop', "don’t want to be here anymore", "don't wanna be here anymore"]

In [15]:
# assign a column and fill it with labels

df['suicidal'] = create_target(df['post'])

df['suicidal'].value_counts(normalize = True)

0    0.74889
1    0.25111
Name: suicidal, dtype: float64

In [16]:
# check if the post suggests alcohol overuse

relevant = ['drink too much', 'drinking too much', 'drink all day', 'drinking all day', 'drink so much', 'drinking so much', 
            'drink a lot', 'drinking a lot', 'drink everyday', 'drinking everyday', 'drink more', 'drinking more', 
            'beer', 'alcohol', 'vodka', 'whiskey', 'whisky', 'tequila', 'wine', 'gin', 'brandy', 'booze', 'scotch',
            'liquor', 'liqueur', 'liquer', 'rum', 'spirits', 'bourbon', 'drunk', 'hung over', 'not sober', 'never sober']

In [17]:
# assign a column and fill it with labels

df['alc_abuse'] = create_target(df['post'])

df['alc_abuse'].value_counts(normalize = True)

0    0.822407
1    0.177593
Name: alc_abuse, dtype: float64

In [18]:
# check if the post expresses ones' loneliness

relevant = ['alone', 'lonely', 'no one cares', "can’t see anyone", "can’t see my", 'i miss my', 'i want to see my', 
            'trapped', "i’m in a cage", 'feel ignored', 'ignoring me', 'rejected', 'avoid', 'avoiding me', 'am single', 
            'been single', 'quarantine', 'lockdown', 'isolat', 'self-isolat', 'disconnect', 'broke up', 'broken up', 
            'break up', 'breakup', 'divorce', 'loneliness', "can't talk to anyone", 'cannot talk to anyone', 
            "can't hang out", 'cannot hang out']

In [19]:
# assign a column and fill it with labels

df['loneliness'] = create_target(df['post'])

df['loneliness'].value_counts(normalize = True)

0    0.719272
1    0.280728
Name: loneliness, dtype: float64

In [20]:
# check if the post includes any stressors

relevant = ['furlough', 'laid off', 'fired', 'rent', 'make ends meet', 'bills', 'evict', 'enough money', 'cannot afford', 
            "can't afford", 'debt', 'no money', 'single mom', 'single dad', 'lost my job', 'mortgage', 'landlord',
            'single parent', 'divorce', 'domestic violence', 'abuse', 'unemploy', 'homeless', 'out on the street', 
            'food bank', 'slap', 'hit me', 'fight', 'lost income']

In [21]:
# assign a column and fill it with labels

df['stress'] = create_target(df['post'])

df['stress'].value_counts(normalize = True)

0    0.712186
1    0.287814
Name: stress, dtype: float64

In [22]:
df.shape

(102031, 9)

In [31]:
# instantiate spacy

nlp = en_core_web_sm.load()

In [34]:
def get_nums(column):
    
    # create empty lists to store numbers of words and sentences
    words = []
    sentences = []
    
    # iterate through every row in post column
    for i in range(len(column)):
        
        doc = nlp(column[i])
        
        num_words = len(doc)
        words.append(num_words)
        
        num_sents = len(list(doc.sents))
        sentences.append(num_sents)
        
    return words, sentences

In [35]:
# add number of words and sentences in the dataframe

df['n_words'], df['n_sentences'] = get_nums(df['post'])

In [36]:
# check the values

df['n_words'][:5]

0     24
1    577
2     64
3    111
4    136
Name: n_words, dtype: int64

In [37]:
df['n_sentences'][:5]

0     3
1    44
2     5
3    14
4    10
Name: n_sentences, dtype: int64

In [38]:
print(df.shape)
df.head()

(102031, 11)


Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,44
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,10


In [59]:
# stopwords will not be removed from the posts for LSTM in modeling process
# create a function to lemmatize

def lemmatize(column):
    
    lem = []
    
    for i in range(len(column)):
        
        doc = nlp(column[i])
        
        token_lem = ' '.join(token.lemma_ for token in doc if token.is_alpha == True)

        lem.append(token_lem)
        
    return lem

In [60]:
# store lemmatized posts in dataframe 

df['lemmatized'] = lemmatize(df['post'])

In [61]:
df[['post', 'lemmatized']]

Unnamed: 0,post,lemmatized
0,Day 1 of sobriety Feeling anxious and letting ...,day of sobriety feel anxious and let the feeli...
1,"Started the New Year with a bang. Hey, I'm new...",start the New Year with a bang hey -PRON- new ...
2,Why can't I get drunk anymore I've been a heav...,why can -PRON- get drunk anymore -PRON- be a h...
3,I am an Alcoholic. How do I quit? I have been ...,-PRON- be an Alcoholic how do -PRON- quit -PRO...
4,Funniest Thing about Alcoholism With every oth...,Funniest thing about Alcoholism with every oth...
...,...,...
102026,My worst fear came to life today. One of my fr...,-PRON- bad fear come to life today one of -PRO...
102027,***GROSS ANXIETY STORY*** help... This is real...,gross ANXIETY story help this be really gross ...
102028,Really need advice and help please The last mo...,really need advice and help please the last mo...
102029,Anxiety to live up in the tech industry I’m a ...,anxiety to live up in the tech industry -PRON-...


In [62]:
# check the dataframe

print(df.shape)
df.head()

(102031, 12)


Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day of sobriety feel anxious and let the feeli...
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,44,start the New Year with a bang hey -PRON- new ...
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why can -PRON- get drunk anymore -PRON- be a h...
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14,-PRON- be an Alcoholic how do -PRON- quit -PRO...
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,10,Funniest thing about Alcoholism with every oth...


In [63]:
# save the final file to csv

df.to_csv('../data/cleaned/final.csv', index = False)