In [1]:
# import libraries

# basic

import pandas as pd
import numpy as np
import datetime

# nlp

import re
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
import spacy

# warning

import warnings
warnings.filterwarnings('ignore')

In [2]:
# read in the dataframe

df = pd.read_csv('../data/cleaned/combined.csv')

In [3]:
# check the shape and the first 5 rows

print(df.shape)
df.head()

(102031, 4)


Unnamed: 0,subreddit,author,date,post
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new..."
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...


### Feature Engineering and Preprocessing

In [4]:
## select posts only created in 2020

df['date'].dtype # object -> should be changed to datetime

dtype('O')

In [5]:
# changing date field to datetime datatype

df['date'] = pd.to_datetime(df['date'])

In [6]:
# create a function to create a list of 1s and 0s

def create_target(column):    
    
    labels = []
    
    # separate each row
    for text in column:
        
        # initiate 0 as label
        label = 0
        
        # iterate through words in relevant_words
        for relevant_word in relevant:
            
            # if relevant_word is found in text, assign 1 as label
            # stop comparing once assigned as 1, then append it to the list
            if relevant_word in text.lower():
                label = 1
                break
        labels.append(label)
        
    return labels

In [7]:
# check if the post is related to COVID

relevant = ['corona', 'virus', 'viral', 'covid', 'sars', 'influenza', 'pandemic', 'epidemic', 'quarantine', 'lockdown',
            'distancing', 'national emergency', 'flatten', 'infect', 'ventilator', 'mask', 'symptomatic', 'epidemiolog',
            'immun', 'incubation', 'transmission', 'vaccine', 'outbreak', 'epicenter', 'covid', 'contagi', 'transmit', 
            'transmission', 'community spread', 'shelter in place', 'immunocompromis', 'isolation', 'n95', 'incubation',
            'travel ban', 'wuhan', 'china virus', 'chinese virus', 'shutdown', 'confirmed case', 'stay at home', 'stay-at-home',
            'work from home', 'working from home', 'zoom meeting', 'stuck inside', 'stuck home', 'flu', 'toilet paper', 
            'toilette paper', 'asymptomatic']

In [8]:
# assign a column and fill it with labels

df['covid_related'] = create_target(df['post'])

df['covid_related'].value_counts(normalize = True)

0    0.886632
1    0.113368
Name: covid_related, dtype: float64

In [9]:
# check if the post indicates suicide suggestive

relevant = ['suicide', 'jumped off a bridge', 'jump off a bridge', 'i want to overdose', 'i wanna overdose', "i’m a burden", 
            "i’m such a burden", 'feel like a burden', 'shoot myself', 'shooting myself', 'slit my wrist', "don't wanna wake up", 
            'live any more', 'i will overdose', 'thinking about overdose', 'kill myself', 'killing myself', 'hang myself', 
            "can't do this anymore", 'hanging myself', 'cut myself', 'cutting myself', 'hurt myself', 'hurting myself', 'want to die',
            'wanna die', "don’t want to wake up", "don’t wake up", 'never want to wake up', "don’t want to be alive", 
            'want to be alive',  'wish it would all end', 'done with living', 'want it to end, it all ends tonight', 'live anymore', 
            'self harm', 'i die', "don't want to live", 'living anymore', 'life anymore', 'be dead', 'take it anymore', 'end my life',
            'think about death', 'hopeless', 'hurt myself', 'no one will miss me', 'if i live or die', 'i hate my life', 'shoot me', 
            'kill me', 'suicide', 'no point', 'deserve to die', "i'm worthless", 'no one would care', 'excedrin', 'ibuprofen', 
            'acetaminophen', '800 mg', '800mg', 'end my life', 'want it to stop', "don’t want to be here anymore", 'cut my wrist',
            "don't wanna be here anymore", "can't keep going", "won't be alive", 'will not be alive', 'in the worst low', 'suicidal', 
            'self hatred', 'take this all away', 'end my life', 'run away from life', 'clock out', 'better without me', 'no reason to live',
            'put myself out', "can't take it anymore", 'die in my sleep', 'not worth living', 'ready to die', 'end it all', 
            'thinking about ending it']

In [10]:
# assign a column and fill it with labels

df['suicidal'] = create_target(df['post'])

df['suicidal'].value_counts(normalize = True)

0    0.68884
1    0.31116
Name: suicidal, dtype: float64

In [13]:
# check if the post suggests alcohol overuse

relevant = ['drink too much', 'drinking too much', 'drink all day', 'drinking all day', 'drink so much', 'drinking so much', 
            'drink a lot', 'drinking a lot', 'drink everyday', 'drinking everyday', 'drink more', 'drinking more', 
            'beer', 'alcohol', 'vodka', 'whiskey', 'whisky', 'tequila', 'wine', 'gin', 'brandy', 'booze', 'booz', 'scotch',
            'liquor', 'liqueur', 'liquer', 'rum', 'spirits', 'bourbon', 'drunk', 'hung over', 'not sober', 'never sober']

In [14]:
# assign a column and fill it with labels

df['alc_abuse'] = create_target(df['post'])

df['alc_abuse'].value_counts(normalize = True)

0    0.822358
1    0.177642
Name: alc_abuse, dtype: float64

In [15]:
# check if the post expresses ones' loneliness

relevant = ['alone', 'lonely', 'no one cares', "can’t see anyone", "can’t see my", 'i miss my', 'i want to see my', 
            'trapped', "i’m in a cage", 'feel ignored', 'ignoring me', 'rejected', 'avoid', 'avoiding me', 'am single', 
            'been single', 'quarantine', 'lockdown', 'isolat', 'self-isolat', 'disconnect', 'broke up', 'broken up', 
            'break up', 'breakup', 'divorce', 'loneliness', "can't talk to anyone", 'cannot talk to anyone', 
            "can't hang out", 'cannot hang out']

In [16]:
# assign a column and fill it with labels

df['loneliness'] = create_target(df['post'])

df['loneliness'].value_counts(normalize = True)

0    0.719272
1    0.280728
Name: loneliness, dtype: float64

In [17]:
# check if the post includes any stressors

relevant = ['furlough', 'laid off', 'fired', 'rent', 'make ends meet', 'bills', 'evict', 'enough money', 'cannot afford', 
            "can't afford", 'debt', 'no money', 'single mom', 'single dad', 'lost my job', 'mortgage', 'landlord',
            'single parent', 'divorce', 'domestic violence', 'abuse', 'unemploy', 'homeless', 'out on the street', 
            'food bank', 'slap', 'hit me', 'fight', 'lost income']

In [18]:
# assign a column and fill it with labels

df['stress'] = create_target(df['post'])

df['stress'].value_counts(normalize = True)

0    0.712186
1    0.287814
Name: stress, dtype: float64

In [19]:
df.shape

(102031, 9)

In [20]:
# instantiate spacy

nlp = spacy.load('en_core_web_md')

In [25]:
def get_nums(column):
    
    # create empty lists to store numbers of words and sentences
    words = []
    sentences = []
    
    # iterate through every row in post column
    for i in range(len(column)):
        
        doc = nlp(column[i])
        
        num_words = len(doc)
        words.append(num_words)
        
        num_sents = len(list(doc.sents))
        sentences.append(num_sents)
        
    return words, sentences

In [26]:
# add number of words and sentences in the dataframe

df['n_words'], df['n_sentences'] = get_nums(df['post'])

In [27]:
# check the values

df[['n_words', 'n_sentences']][:5]

Unnamed: 0,n_words,n_sentences
0,24,3
1,577,41
2,64,5
3,111,14
4,136,9


In [28]:
print(df.shape)
df.head()

(102031, 11)


Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,9


In [29]:
# check spacy's stopwords

print(len(nlp.Defaults.stop_words))
print(nlp.Defaults.stop_words)

326
{'afterwards', 'although', 'there', 'whereafter', 'can', 'please', 'before', 'made', 'does', 'my', 'now', 'never', 'me', 'must', 'third', 'upon', '’re', 'yours', 'further', 'however', 'being', 'besides', 'during', 'quite', 'about', 'former', 'very', 'themselves', 'herself', 'thru', 'cannot', 'is', 'perhaps', "'m", 'less', 'somehow', '’d', 'noone', 'own', 'via', 'at', 'above', 'either', 'namely', 'yet', 'do', 'nor', 'just', 'as', 'say', 'be', 'except', 'with', '‘ll', 'someone', 'much', 'really', 'somewhere', 'one', 'thereafter', 'same', 'others', 'seems', 'whole', 'whereupon', 'an', 'itself', 'empty', 'back', 'this', 'unless', 'could', 'when', 'thereupon', 'who', 'other', 'again', 'regarding', 'did', 'should', 'front', 'have', 'where', 'ever', 'i', 'since', 'myself', 'together', 'though', 'using', 'by', 'those', 'towards', 'more', 'these', 'them', 'off', 'sometime', 'might', 'himself', 'seeming', 'eight', 'along', 'between', 'the', 'part', 'due', 'make', 'call', 'few', 'else', 'used

In [30]:
# customize stop words - 
# remove several words from stopwords - will use bidirectional LSTM, and negative sentiments could be important

stop = nlp.Defaults.stop_words
stop -= {'not', 'no', 'nobody', 'noone', 'anyone', 'over', 'empty', 'nothing'}

# check if worked

len(stop)

318

In [40]:
# create a function to lemmatize

def lemmatize(column):
    
    lem = []
    vec = []
    
    for i in range(len(column)):      
        
        # unfold contraction words for lemmatization
        
        doc = re.sub(r"\'s", ' is ', column[i])
        doc = re.sub(r"\'ve", ' have ', doc)
        doc = re.sub(r"n't", ' not ', doc)
        doc = re.sub(r"I'm", ' I am ', doc)
        doc = re.sub(r"i'm", ' I am ', doc)
        doc = re.sub(r"\'re", ' are ', doc)
        doc = re.sub(r"\'d", ' would ', doc)
        doc = re.sub(r"\'ll", ' will ', doc)
        doc = re.sub('[^A-Za-z0-9]+', ' ', doc)
        
        # stem the words
        
        doc = nlp(doc) 
        text = ' '.join([token.lemma_ for token in doc if token not in stop])
        
        text = text.lower()
        vector = doc.vector
        
        lem.append(text)
        vec.append(vector)
        
    return lem, vec

In [None]:
# store lemmatized posts in dataframe 

df['lemmatized'], df['vectors'] = lemmatize(df['post'])

In [33]:
df[['post', 'lemmatized', 'vectors']]

Unnamed: 0,post,lemmatized,vectors
0,Day 1 of sobriety Feeling anxious and letting ...,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203, 0.2515704, -0.1566844, -0.00761..."
1,"Started the New Year with a bang. Hey, I'm new...",start the new year with a bang hey -pron- be n...,"[0.010248344, 0.18354495, -0.22630265, -0.0344..."
2,Why can't I get drunk anymore I've been a heav...,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739925, 0.18395872, -0.22718114, -0.1007..."
3,I am an Alcoholic. How do I quit? I have been ...,-pron- be an alcoholic how do -pron- quit -pro...,"[-0.0117096035, 0.18657142, -0.25766036, -0.09..."
4,Funniest Thing about Alcoholism With every oth...,funniest thing about alcoholism with every oth...,"[-0.04437424, 0.18121068, -0.1554362, -0.12354..."
...,...,...,...
102026,My worst fear came to life today. One of my fr...,-pron- bad fear come to life today one of -pro...,"[-0.054425444, 0.18733731, -0.2389673, -0.0360..."
102027,***GROSS ANXIETY STORY*** help... This is real...,gross anxiety story help this be really gros...,"[-0.049861066, 0.22880034, -0.24683814, -0.052..."
102028,Really need advice and help please The last mo...,really need advice and help please the last mo...,"[-0.032756463, 0.20075087, -0.23279792, -0.080..."
102029,Anxiety to live up in the tech industry I’m a ...,anxiety to live up in the tech industry -pron-...,"[-0.0065991883, 0.17816082, -0.22199208, -0.06..."


In [38]:
df['vectors'][0][0].dtype

dtype('float32')

In [39]:
df['vectors'].dtype

dtype('O')

In [34]:
# check the dataframe

print(df.shape)
df.head()

(102031, 13)


Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203, 0.2515704, -0.1566844, -0.00761..."
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.010248344, 0.18354495, -0.22630265, -0.0344..."
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739925, 0.18395872, -0.22718114, -0.1007..."
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14,-pron- be an alcoholic how do -pron- quit -pro...,"[-0.0117096035, 0.18657142, -0.25766036, -0.09..."
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,9,funniest thing about alcoholism with every oth...,"[-0.04437424, 0.18121068, -0.1554362, -0.12354..."


In [35]:
# save the final file to json to save vectors column in list format

df.to_json('../data/cleaned/final.json')

In [44]:
# save the final file to csv as well

df.to_csv('../data/cleaned/final.csv', index = False)

### clustering

subspace clustering

hierarchical clustering method with sklearn