In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import re

# import tensorflow as tf
print(tf.__version__)


2.10.1


In [2]:
# Dataset Link (https://www.kaggle.com/datasets/nikhileswarkomati/suicide-watch/data)
csv_file_path = 'Suicide_Detection.csv'
df = pd.read_csv(csv_file_path)

# Display the DataFrame
df.drop(df.columns[0], axis=1, inplace=True)
df = df.head(30000)
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    30000 non-null  object
 1   class   30000 non-null  object
dtypes: object(2)
memory usage: 468.9+ KB
None


In [3]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [4]:
def pre_processing(data):
    #remove digit
    data = data.apply(lambda x: ''.join([char for char in x if not char.isdigit()]))

    #remove empty spaces
    data = data.apply(lambda x: " ".join(x.split()))

    #remove puncutations except ! bcos ! can show emotion
    data = data.apply(lambda x: re.sub(r'[^\w\s!]', '', x))

    #Remove emoji
    data = data.apply(remove_emoji)

    return data

df['text'] = pre_processing(df['text'])

print(df.head())

                                                text        class
0  Ex Wife Threatening SuicideRecently I left my ...      suicide
1  Am I weird I dont get affected by compliments ...  non-suicide
2  Finally is almost over So I can never hear  ha...  non-suicide
3          i need helpjust help me im crying so hard      suicide
4  Im so lostHello my name is Adam  and Ive been ...      suicide


In [5]:
#Map output to be binary 1 or 0
mapping = {'suicide': 1, 'non-suicide': 0}
df['class'] = df['class'].map(mapping)
print(df.head())

                                                text  class
0  Ex Wife Threatening SuicideRecently I left my ...      1
1  Am I weird I dont get affected by compliments ...      0
2  Finally is almost over So I can never hear  ha...      0
3          i need helpjust help me im crying so hard      1
4  Im so lostHello my name is Adam  and Ive been ...      1


In [6]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [7]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

vocab_size = 20000  # Only consider the top 20k words
maxlen = 50

X = df['text'].values
y = df['class'].values
y=np.array(y)


# # Tokenize the entire dataset
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to a fixed length
X = pad_sequences(sequences, maxlen=maxlen, padding="post")

print(X)
print(y)


[[    2    29     4 ...    10   171   402]
 [   49     2   451 ...     0     0     0]
 [  289    15   275 ...     0     0     0]
 ...
 [  393  1059  9316 ...    97    34   108]
 [   18 10053     9 ...    15   352     9]
 [  800     1    65 ...     0     0     0]]
[1 0 0 ... 0 1 0]


In [9]:
embed_dim = 32  # Embedding size for each token
num_heads = 8  # Number of attention heads
ff_dim = 128  # Hidden layer size in feed forward network inside transformer


inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.35)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])


In [10]:
# Define k-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
loss_total = 0
accuracy_total = 0

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    print(f"Fold {fold + 1}:")

    # Split the data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # # Tokenize the data
    # X_train_tokenized = Tokenizer(X_train, truncation=True, padding=True)
    # X_val_tokenized = Tokenizer(X_val, truncation=True, padding=True)

    # Train the model
    model.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_val, y_val))

    # Evaluate the model
    metrics = model.evaluate(X_val, y_val)
    print(f"Validation Metrics: {metrics}")
    loss_total += metrics[0]
    accuracy_total += metrics[1]

print("Average loss: ", loss_total/5)
print("Average accuracy: ", accuracy_total/5)

Fold 1:
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Metrics: [0.2809550166130066, 0.8948333263397217]
Fold 2:
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Metrics: [0.22827109694480896, 0.9271666407585144]
Fold 3:
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Metrics: [0.12907998263835907, 0.9549999833106995]
Fold 4:
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Metrics: [0.07715890556573868, 0.9739999771118164]
Fold 5:
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Metrics: [0.0438835434615612, 0.9823333621025085]
Average loss:  0.1518697090446949
Average accuracy:  0.9466666579246521


In [11]:
#Save Model
model.save('MentalAI', save_format='tf')



INFO:tensorflow:Assets written to: MentalAI\assets


INFO:tensorflow:Assets written to: MentalAI\assets


# Demo - 6 test texts

In [12]:
text_array=["Every time my partner gets angry for anything, she takes it out on me. Nothing I do is right, and once she's mad, she calls me all kinds of names and is verbally abusive. She says it isn't abuse, it's just angry verbal bashing, and that it's different. It gets worse each time. The names are very vulgar now.", "I am depressed and wanna kill myself, everything seems hopeless now", "Please don't stop me, there's no turning back anymore", "I think I am okay, but not really", "I want to rest, I am overwhelmed", "I ate some food just now"]
print(len(text_array))

def MentalAI(text):
    text = pre_processing(pd.Series(text))

    sequences = tokenizer.texts_to_sequences(text)

    text = pad_sequences(sequences, maxlen=maxlen, padding="post")
    predictions = model.predict(text)
    predicted_class = "non-suicidal" if predictions[0][0] > 0.5 else "suicidal"
    print(f"Predicted Class: {predicted_class}")

    return predictions

sum = 0

# Iterate through every text messages
for i in range(len(text_array)):
    result = MentalAI(text_array[i])
    sum += result[0, 1]

MentalAIOutput = sum/len(text_array)
%store MentalAIOutput


6
Predicted Class: suicidal
Predicted Class: suicidal
Predicted Class: suicidal
Predicted Class: suicidal
Predicted Class: suicidal
Predicted Class: non-suicidal
Stored 'MentalAIOutput' (float64)
