# **Li**nguistic **F**eature **E**xtraction for **S**ystem**a**tic **V**al**e**nce **R**ecognition (LiFESaVeR)
CS 6501: Natural Language Processing Final Project

Param Damle (psd9vgc), Richard Wang (rxw2cxy), Kabir Menghrajani (km5qte)

## Setup

### Imports

In [None]:
# library imports
import random
import csv
!pip install snowballstemmer
import snowballstemmer
import numpy as np
from tqdm import tqdm
!pip install contractions
import contractions
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.nn import softmax
import matplotlib.pyplot as plt
from keras.initializers import RandomNormal



In [None]:
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop word corpus is available locally before reading it
nltk.download('stopwords')
# English stop words, kept as a set because tokenize() does a membership check per word
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\param\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Helper Functions

In [None]:
def magnitude(arr, axis=0):
  """Return the L2 norm of `arr` along `axis`, plus a tiny epsilon.

  The epsilon keeps the result strictly positive so callers can divide by it
  even when `arr` (or one of its rows/columns) is all zeros.

  Args:
    arr: array-like of numbers.
    axis: axis along which the norm is taken (passed to np.linalg.norm).

  Returns:
    The norm as a scalar or array, depending on `arr` and `axis`.
  """
  # NOTE: the epsilon must be written as 1e-8; `1**-8` is 1 raised to the -8th
  # power, i.e. 1.0, which would add a full unit to every norm.
  return np.linalg.norm(arr, axis=axis) + 1e-8

In [None]:
def adv_index(iterable, start, stop):
  """Slice `iterable[start:stop]`, padding positions that fall outside the list.

  Each index before the beginning contributes a "<bos>" token and each index
  past the end contributes an "<eos>" token. `iterable` is expected to be a
  list, since the slice is concatenated with list padding.
  """
  leading_pad = ["<bos>"] * max(0, -start)
  in_range = iterable[max(0, start):stop]
  trailing_pad = ["<eos>"] * (stop - len(iterable))  # a negative count yields no padding
  return leading_pad + in_range + trailing_pad

### Load Data

We load in our 4 datasets:
- **Anxiety**: https://www.kaggle.com/code/docxian/anxiety-and-depression-text-analytics/input

  6980 entries of text labeled as 1 to indicate anxiety/depression and 0 to indicate no anxiety/depression. There are 733 entries labeled as 1, and 6247 labeled as 0. The Excel file was converted to CSV beforehand.
- **Stress**: https://www.kaggle.com/datasets/kreeshrajani/human-stress-prediction

  2838 entries sourced from various mental health subreddits, each labeled 0 for no stress or 1 for stress. 21% of the entries come from the r/ptsd subreddit and 19% come from the r/relationships subreddit.
- **Depression**: https://www.kaggle.com/datasets/nidhiy07/student-depression-text

  7489 entries sourced from various social media platforms, with posts following English grammar from 15-17 year old students. The five columns of this dataset are text, labels, age, age category, and gender. The Excel file was converted to CSV beforehand.
- **Suicide**: https://www.kaggle.com/datasets/aunanya875/suicidal-tweet-detection-dataset

  1778 tweets, with about 37% being potential suicide posts and about 63% being non-suicide posts.

In [None]:
# Conditions we build a detector for; each name is also the stem of its "<condition>.csv" dataset file
conditions = ['anxiety', 'stress', 'depression', 'suicide']

In [None]:
# Largest n-gram size produced by tokenize(): 1 = unigrams only, 2 = unigrams and bigrams, ...
max_n_tokenize = 2  # up to this n, we will also tokenize n-grams

In [None]:
def tokenize(sentence):
  """Convert a raw sentence into a list of stemmed n-gram tuples.

  The text is lowercased, contractions are expanded (to standardize results),
  and every character that is neither a letter nor whitespace is dropped. The
  remaining words are stemmed with the English Snowball stemmer and emitted as
  n-grams for n = 1..max_n_tokenize: all unigrams first, then bigrams, etc.
  An n-gram is kept only if at least one of its words is not a stop word
  (stop words are matched against the unstemmed words).

  Example with max_n_tokenize = 2:
    "sample information" -> [("sampl",), ("inform",), ("sampl", "inform")]
  """
  normalized = contractions.fix(sentence.lower())
  letters_only = ''.join(ch for ch in normalized if ch.isalpha() or ch.isspace())
  words = letters_only.split()

  # positions (not words) are recorded because stemming may alter the word text
  stop_positions = {pos for pos, word in enumerate(words) if word in stop_words}
  stems = snowballstemmer.stemmer('english').stemWords(words)

  ngrams = []
  for size in range(1, max_n_tokenize + 1):
    # only windows that fit entirely inside the sentence; no <bos>/<eos> padding
    for begin in range(len(stems) + 1 - size):
      window = range(begin, begin + size)
      if any(pos not in stop_positions for pos in window):
        ngrams.append(tuple(stems[begin:begin + size]))
  return ngrams

def tokenize_corpus(corpus):
  """Tokenize every document in `corpus`, showing a tqdm progress bar.

  input: list of sentences, each sentence is a string
  output: list of sentences, each sentence is the token list from tokenize()
  """
  tokenized_documents = []
  for document in tqdm(corpus):
    tokenized_documents.append(tokenize(document))
  return tokenized_documents

In [None]:
# Quick sanity check of tokenize(): stemmed unigrams come first, then bigrams,
# and n-grams made up solely of stop words are left out
tokenize("here's a random string of words that I put together")

[('random',),
 ('string',),
 ('word',),
 ('put',),
 ('togeth',),
 ('a', 'random'),
 ('random', 'string'),
 ('string', 'of'),
 ('of', 'word'),
 ('word', 'that'),
 ('i', 'put'),
 ('put', 'togeth')]

In [None]:
def load_data(condition, file_name):
  """Load one condition's CSV into parallel lists of documents and labels.

  Args:
    condition: one of "anxiety", "stress", "depression", "suicide"; selects
      which CSV columns hold the text and the label.
    file_name: path to the UTF-8 encoded CSV file.

  Returns:
    (X, y): X is a list of raw document strings, y the matching list of
    integer labels (0 for condition not present, 1 for condition present).

  Raises:
    KeyError: if `condition` is not one of the known datasets.

  Rows that are blank, too short, or whose label cannot be interpreted
  (such as a header row) are skipped.
  """
  # (text column, label column) for each dataset's CSV layout
  indexes = {
      "anxiety": (0,1),
      "stress": (3,4),
      "depression": (0,1),
      "suicide": (0,1)
  }
  text_index, label_index = indexes[condition]
  min_columns = max(text_index, label_index) + 1

  X = []  # list of documents, each document is a raw string
  y = []  # list of labels, 0 for condition not present and 1 for condition present

  # newline="" lets the csv module do its own line-ending handling, which
  # matters for quoted fields that contain line breaks
  with open(file_name, "r", encoding="utf8", newline="") as csvfile:
    for row in csv.reader(csvfile):
      if len(row) < min_columns:
        continue  # blank or truncated line
      label = row[label_index].strip()
      if condition == "suicide":  # special dataset format: textual labels
        # "N..." marks a non-suicide post and "P..." a potential suicide post;
        # anything else (notably the header cell) is left as-is and is
        # rejected by the 0/1 check below instead of being counted as positive
        if label.startswith('N'):
          label = '0'
        elif label.startswith('P'):
          label = '1'
      if label in ('0', '1'):
        X.append(row[text_index])
        y.append(int(label))
  return (X, y)

In [None]:
# Read every condition's dataset from "<condition>.csv" in the working directory
condition_data = {}  # maps condition name to (corpus of documents, class labels)
for condition in conditions:
  print("Loading", condition, "dataset...")
  # load_data returns an (X, y) tuple, stored as-is so it can be unpacked into train()
  condition_data[condition] = load_data(condition, condition + '.csv')

Loading anxiety dataset...
Loading stress dataset...
Loading depression dataset...
Loading suicide dataset...


## Training

In [None]:
def Classifier(num_docs):
    """Build and compile the network that maps similarity scores to a prediction.

    Args:
        num_docs: width of the input vector, i.e. one similarity score per
            document in the training corpus.

    Returns:
        A compiled keras Sequential model with a single sigmoid output.
    """
    # fixed seed so weight initialization is repeatable between runs
    initializer = RandomNormal(mean=0.0, stddev=2.0, seed=420)
    model = Sequential([
        keras.Input(shape=(num_docs,)),
        Dense(256, activation='linear', kernel_initializer=initializer),
        Dropout(0.1),
        Dense(1, activation='sigmoid', kernel_initializer=initializer),
    ])

    # With its default threshold=None, FBetaScore marks the argmax of each
    # prediction row as positive; for a single sigmoid output that treats every
    # non-zero prediction as positive. An explicit 0.5 threshold makes the
    # metric measure the binary decision actually being learned.
    model.compile(
        loss=keras.losses.BinaryCrossentropy(from_logits=False),
        optimizer='nadam',
        metrics=["accuracy", keras.metrics.FBetaScore(beta=1.75, threshold=0.5)],
    )
    return model

In [None]:
class ThresholdCallback(tf.keras.callbacks.Callback):
    """Keras callback that ends training once the model is sufficiently trained.

    Training stops at the end of the first epoch where the monitored metric
    has reached `min_metric` and at least `min_epochs` epochs have completed.
    """

    def __init__(self, metric, min_metric, min_epochs):
        """Store the name of the metric to watch and the two thresholds.

        metric: key to look up in the per-epoch logs
        min_metric: value the metric must reach
        min_epochs: minimum number of completed epochs before stopping
        """
        super().__init__()
        self.metric = metric
        self.metric_threshold = min_metric
        self.epoch_threshold = min_epochs

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when both the metric and epoch thresholds are met."""
        current_value = logs[self.metric]
        epochs_completed = epoch + 1  # `epoch` is zero-based
        if current_value >= self.metric_threshold and epochs_completed >= self.epoch_threshold:
            self.model.stop_training = True

In [None]:
# Build TF-IDF Vectorizer given tokenized data

# Class for training, builds vocab mapping (unique words) as well as TFIDF matrix (# of words in vocab *  # sentences)
class DetectionModel:
    """Detector for one condition: TF-IDF document similarity fed to a neural net.

    train() builds a vocabulary and a normalized TF-IDF matrix from a labeled
    corpus, then fits a classifier on each document's similarity to every
    training document. score() maps a new sentence into the same similarity
    space and returns the classifier's output for it.

    Attributes set by train():
        num_documents: number of training documents
        pos_y / neg_y: boolean masks of positive / negative documents
        num_pos / num_neg: how many documents each mask selects
        doc_weights: per-document weights (all ones until update_doc_weights)
        vocab: maps token to integer id
        vocab_size: number of distinct tokens
        tfidf: normalized TF-IDF matrix, shape (vocab_size, num_documents)
        classifier: the fitted model returned by Classifier()
    """

    def format_tf(self, raw_tf_vec):
        """Transform raw occurrence counts into a logarithmic tf vector.

        Each positive count c becomes 1 + log10(c); zero counts stay 0.
        Returns a list the same length as `raw_tf_vec`.
        """
        return [(1 + np.log10(count) if count > 0 else 0) for count in raw_tf_vec]

    def train(self, corpus, labels):
        """Fit the vocabulary, TF-IDF matrix and classifier on a labeled corpus.

        Args:
            corpus: list of documents, where each document is a string.
            labels: list of corresponding integer class labels (0 or 1).

        Returns:
            Whatever self.classifier.fit returns (the training history).

        Raises:
            ValueError: if there are no (document, label) pairs to train on.
        """
        # Shuffle dataset for robustness
        print("Shuffling training data...")
        training_data = list(zip(corpus, labels))
        if not training_data:
            raise ValueError("cannot train on an empty corpus")
        random.shuffle(training_data)
        corpus, labels = zip(*training_data)

        # Create vocab from corpus
        print("Tokenizing corpus...")
        tokenized_corpus = tokenize_corpus(corpus)
        self.num_documents = len(tokenized_corpus)
        self.pos_y = np.asarray(labels, dtype=bool)
        self.neg_y = np.invert(self.pos_y)

        self.num_pos = np.sum(self.pos_y)
        self.num_neg = self.num_documents - self.num_pos

        self.doc_weights = np.ones(self.num_documents)  # used in case we want to assign documents different weights
        self.vocab = {}  # maps token to id

        # ids are handed out in order of first appearance
        for sentence in tokenized_corpus:
            for token in sentence:
                if token not in self.vocab:
                    self.vocab[token] = len(self.vocab)
        self.vocab_size = len(self.vocab)

        print("Calculating term frequencies...")
        tf_matrix = []  # one log-scaled tf row per document
        for sentence in tqdm(tokenized_corpus):
            sentence_tf = [0] * self.vocab_size
            for token in sentence:
                sentence_tf[self.vocab[token]] += 1
            tf_matrix.append(self.format_tf(sentence_tf))

        print("Calculating inverted document frequencies...")
        token_df = [0] * self.vocab_size  # number of documents containing each token
        for document in tokenized_corpus:
            # a set so that each token counts once per document
            for token_id in {self.vocab[token] for token in document}:
                token_df[token_id] += 1
        # every vocab token comes from the corpus, so each df is at least 1
        token_idf = [np.log10(self.num_documents / df) for df in token_df]

        print("Calculating tf-idf scores...")
        self.tfidf = np.multiply(tf_matrix, token_idf)
        self.tfidf /= magnitude(self.tfidf, axis=1)[:, np.newaxis]  # normalization so that dot product is cosine similarity
        self.tfidf = self.tfidf.T  # transposed so we can do row_sample*tfidf to get a row of outputs

        # instead of simply averaging the similarity scores across documents,
        # we develop a neural net that converts document similarity into a binary prediction
        print("Training neural network...")
        scaled_tf_matrix = np.asarray(tf_matrix, dtype=np.float64)
        scaled_tf_matrix /= magnitude(scaled_tf_matrix, axis=1)[:, np.newaxis]
        self.classifier = Classifier(self.num_documents)
        # one row per document tf vector; each column is its similarity score
        # with one training document's tfidf vector
        train_X = np.matmul(scaled_tf_matrix, self.tfidf)
        train_y = np.array(self.pos_y, dtype=np.float64)
        hist = self.classifier.fit(train_X, train_y, epochs=100, batch_size=32, shuffle=True, validation_split=0.15, callbacks=[ThresholdCallback(metric = "fbeta_score", min_metric = 0.97, min_epochs = 10)])
        return hist

    def update_doc_weights(self, new_doc_weights):
        """Replace the per-document weights, rescaled separately per class.

        After the call, the weights of the positive documents sum to num_pos
        and those of the negative documents sum to num_neg (unless a class's
        weights sum to 0, in which case they are left unscaled).
        """
        # NOTE(review): the minimum is ADDED to every weight; if the intent was
        # to shift the weights to start at zero this should be a subtraction - confirm
        self.doc_weights = np.array(new_doc_weights, dtype=np.float64) + np.min(new_doc_weights)
        # scale so that the magnitudes across positive and negative samples add up
        # to the number of positive and negative samples, respectively
        for (indexes, instances) in ((self.pos_y, self.num_pos), (self.neg_y, self.num_neg)):
            sum_weights = np.sum(self.doc_weights[indexes])
            if sum_weights == 0:  # protect division by 0
                sum_weights = 1
            self.doc_weights[indexes] *= instances / sum_weights

    def score(self, sentence):
        """Return the classifier's output for `sentence` as a Python scalar.

        Tokens that are not in the training vocabulary are ignored. The
        per-document weights (doc_weights) are not used by this scoring path.
        """
        tokenized_sentence = tokenize(sentence)

        sentence_tf_raw = [0] * self.vocab_size
        for token in tokenized_sentence:
            if token in self.vocab:
                sentence_tf_raw[self.vocab[token]] += 1

        sentence_tf = np.asarray(self.format_tf(sentence_tf_raw), dtype=np.float64)
        sentence_magnitude = magnitude(sentence_tf)
        if sentence_magnitude == 0:  # protect division by 0
            sentence_magnitude = 1
        sentence_tf /= sentence_magnitude  # normalization so that dot product is cosine similarity

        # cosine of angle between sentence given and document from training corpus
        similarity_per_doc = np.matmul(sentence_tf, self.tfidf)

        # the classifier expects a batch, so add a leading axis of length 1
        return self.classifier.predict(similarity_per_doc[np.newaxis, :], verbose=0).item()

In [None]:
# Train one independent DetectionModel per condition on that condition's dataset
detectors = {}  # maps condition to detector object
for condition in conditions:
  print("Building", condition, "detector...")
  detectors[condition] = DetectionModel()
  # condition_data holds (corpus, labels) tuples; the history returned by train() is discarded
  detectors[condition].train(*condition_data[condition])
  print()

Building anxiety detector...
Shuffling training data...
Tokenizing corpus...


100%|████████████████████████████████████████████████████████████████████████████| 6980/6980 [00:04<00:00, 1429.73it/s]


Calculating term frequencies...


100%|█████████████████████████████████████████████████████████████████████████████| 6980/6980 [00:11<00:00, 631.37it/s]


Calculating inverted document frequencies...
Calculating tf-idf scores...
Training neural network...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100

Building stress detector...
Shuffling training data...
Tokenizing corpus...


100%|█████████████████████████████████████████████████████████████████████████████| 2838/2838 [00:08<00:00, 344.38it/s]


Calculating term frequencies...


100%|█████████████████████████████████████████████████████████████████████████████| 2838/2838 [00:08<00:00, 328.46it/s]


Calculating inverted document frequencies...
Calculating tf-idf scores...
Training neural network...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/10

100%|████████████████████████████████████████████████████████████████████████████| 7486/7486 [00:03<00:00, 2181.49it/s]


Calculating term frequencies...


100%|█████████████████████████████████████████████████████████████████████████████| 7486/7486 [00:12<00:00, 592.32it/s]


Calculating inverted document frequencies...
Calculating tf-idf scores...
Training neural network...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100

Building suicide detector...
Shuffling training data...
Tokenizing corpus...


100%|████████████████████████████████████████████████████████████████████████████| 1788/1788 [00:01<00:00, 1496.57it/s]


Calculating term frequencies...


100%|████████████████████████████████████████████████████████████████████████████| 1788/1788 [00:01<00:00, 1271.20it/s]


Calculating inverted document frequencies...
Calculating tf-idf scores...
Training neural network...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/10

## Testing

In [None]:
def flag( sentence,
          threshold = None,
          find_red_flags = False ):
  '''
  Score a piece of text with every condition's detector.

  Inputs:
    sentence: text we want to analyze for red flags
    threshold: per-condition detector score at or above which a red flag is assigned
      to the sentence, indexed in the same order as `conditions`; defaults to 0.5
      for every condition
      a higher threshold favors precision, a lower threshold favors recall
    find_red_flags: when True, also search each flagged sentence for the 2- to 6-word
      excerpt that the detector scores highest
  Output:
    dictionary that maps condition name (str) to (numerical score, None or substring with highest red flag score)
  '''
  if threshold is None:
    # built per call so it always matches the current number of conditions
    threshold = np.full(len(conditions), .5)

  results = {}
  for i, condition in enumerate(conditions):
    detector = detectors[condition]
    score = detector.score(sentence)
    biggest_red_flag = None
    if find_red_flags and score >= threshold[i]: # red flag level meets threshold
      words = sentence.split()
      best_score = 0
      # the most concerning segment as (start index, end index + 1); falls back to the
      # whole sentence when it is too short for any n-gram or no n-gram scores above 0
      best_span = (0, len(words))
      for n in range(2,7): # n grams of varying sizes
        for ngram_start in range(0, len(words) + 1 - n):
          ngram_score = detector.score(" ".join(words[ngram_start:ngram_start+n]))
          if ngram_score > best_score:
            best_score = ngram_score
            best_span = (ngram_start, ngram_start+n)
      span_start, span_end = best_span
      # ellipses mark text that was cut off on either side of the excerpt
      biggest_red_flag = ("..." if span_start > 0 else "") + " ".join(words[span_start:span_end]) + ("..." if span_end < len(words) else "")

    results[condition] = (score, biggest_red_flag)
  return results

In [None]:
# Sanity check: in the recorded output every condition scores near 0, so nothing is flagged
flag("wow, I'm enjoying life. it's so much fun!", find_red_flags = True)  # notably positive valence

{'anxiety': (0.0, None),
 'stress': (1.0835822195076616e-06, None),
 'depression': (7.476426006202954e-34, None),
 'suicide': (4.4346108859755925e-17, None)}

In [None]:
# Sanity check: in the recorded output every condition scores near 0, so nothing is flagged
flag("hey, do you wanna play video games later?", find_red_flags = True)  # neutral/mild positive valence

{'anxiety': (0.0, None),
 'stress': (1.0866306672596723e-27, None),
 'depression': (0.0, None),
 'suicide': (9.635192412899177e-31, None)}

In [None]:
# In the recorded output only the stress and depression detectors flag this sentence
flag("I have been feeling unwell lately", find_red_flags = True)  # mild negative valence

{'anxiety': (3.914957247275197e-09, None),
 'stress': (1.0, '...feeling unwell lately'),
 'depression': (1.0, '...been feeling...'),
 'suicide': (0.011789314448833466, None)}

In [None]:
# In the recorded output stress, depression and suicide are flagged; anxiety is not
flag("everything is hopeless, nothing works and life is depressing", find_red_flags = True)  # strongly negative valence

{'anxiety': (0.0, None),
 'stress': (1.0, '...hopeless, nothing...'),
 'depression': (1.0, 'everything is...'),
 'suicide': (1.0, '...life is...')}