In [1]:
!pip install -q kaggle


In [2]:
from google.colab import files

# Upload kaggle.json. files.upload() returns a dict of {filename: raw bytes};
# leaving the call as the cell's last expression would echo that dict -- and
# therefore the API key inside kaggle.json -- into the saved notebook output.
# Bind the result to a name and report only the file names instead.
uploaded = files.upload()
print(f"Uploaded: {list(uploaded)}")


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [3]:
# Put the uploaded credentials where the Kaggle CLI looks for them.
# -p: do not fail if ~/.kaggle already exists
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# 600: readable/writable by the owner only, since the file holds an API key
!chmod 600 ~/.kaggle/kaggle.json


In [4]:
# Download the suicide-watch dataset archive into the working directory.
# --force: download again even if a local copy is already present
!kaggle datasets download -d nikhileswarkomati/suicide-watch --force


Dataset URL: https://www.kaggle.com/datasets/nikhileswarkomati/suicide-watch
License(s): CC-BY-SA-4.0
Downloading suicide-watch.zip to /content
 99% 60.0M/60.6M [00:04<00:00, 13.4MB/s]
100% 60.6M/60.6M [00:04<00:00, 12.8MB/s]


In [5]:
# Extract the archive; -o overwrites existing files without prompting,
# so the cell can be re-run safely.
!unzip -o suicide-watch.zip




Archive:  suicide-watch.zip
  inflating: Suicide_Detection.csv   


In [34]:
#Import required libraries
import pandas as pd
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
import spacy

In [50]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Suicide_Detection.csv')

# Drop the first column (row numbers), keeping the two remaining columns
df = df.iloc[:, 1:]

# Rename columns for clarity
df.columns = ['text', 'label']

# Check data
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()


                                                text        label
0  Ex Wife Threatening SuicideRecently I left my ...      suicide
1  Am I weird I don't get affected by compliments...  non-suicide
2  Finally 2020 is almost over... So I can never ...  non-suicide
3          i need helpjust help me im crying so hard      suicide
4  I’m so lostHello, my name is Adam (16) and I’v...      suicide
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    232074 non-null  object
 1   label   232074 non-null  object
dtypes: object(2)
memory usage: 3.5+ MB
None


In [51]:
# Count the rows per class to see whether the dataset is imbalanced
class_counts = df['label'].value_counts()
print(class_counts)

label
suicide        116037
non-suicide    116037
Name: count, dtype: int64


In [52]:
# Map the string labels to binary targets (suicide -> 1, non-suicide -> 0).
# Values that are not keys of the mapping (e.g. labels already converted to
# 0/1 by a previous run of this cell) pass through unchanged, so re-running
# the cell does not turn the whole column into NaN as a plain dict .map would.
label_to_id = {'suicide': 1, 'non-suicide': 0}
df['label'] = df['label'].map(lambda value: label_to_id.get(value, value))

# Fail loudly if anything other than the two known classes is present
assert df['label'].isin([0, 1]).all(), "unexpected label values in 'label' column"
print(df.head())

                                                text  label
0  Ex Wife Threatening SuicideRecently I left my ...      1
1  Am I weird I don't get affected by compliments...      0
2  Finally 2020 is almost over... So I can never ...      0
3          i need helpjust help me im crying so hard      1
4  I’m so lostHello, my name is Adam (16) and I’v...      1


In [53]:
# Normalise the text: lowercase it, then strip URLs and every character that
# is not an ASCII letter or whitespace (digits and punctuation included)
noise_pattern = re.compile(r'http\S+|www\S+|[^a-zA-Z\s]')
lowercased = df['text'].str.lower()
df['text'] = lowercased.apply(lambda post: noise_pattern.sub('', post))

# Keep the first occurrence of each cleaned text and renumber the rows from 0
df = df.drop_duplicates(subset=['text']).reset_index(drop=True)

print(df.head())

                                                text  label
0  ex wife threatening suiciderecently i left my ...      1
1  am i weird i dont get affected by compliments ...      0
2  finally  is almost over so i can never hear  h...      0
3          i need helpjust help me im crying so hard      1
4  im so losthello my name is adam  and ive been ...      1


In [54]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


In [55]:
# Load the tokenizer that matches the pre-trained checkpoint used below
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)


In [56]:
def tokenize(texts):
    """Encode a collection of strings with the notebook-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` that yields the raw strings
            (the notebook passes the train/test text columns).

    Returns:
        The tokenizer's output as TensorFlow tensors, with sequences
        truncated to at most 128 tokens and padding enabled.
    """
    encoding_options = {
        "padding": True,
        "truncation": True,
        "max_length": 128,
        "return_tensors": "tf",
    }
    return tokenizer(texts.tolist(), **encoding_options)

# Encode both splits up front
train_encodings = tokenize(train_texts)
test_encodings = tokenize(test_texts)


In [57]:
# Build tf.data datasets whose elements are (feature dict, label) pairs
train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

test_features = dict(test_encodings)
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))


In [58]:
# Creating the BERT model

class BERTForClassification(tf.keras.Model):
    """BERT encoder followed by a sigmoid-activated dense classification head.

    Args:
        bert_model: Encoder that is called on the tokenized inputs; element
            ``[1]`` of its output is used as the pooled representation.
        num_classes: Number of units in the output layer.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='sigmoid')  # Using sigmoid for binary classification

    def call(self, inputs, training=False):
        """Run the encoder and map its pooled output to class probabilities.

        Args:
            inputs: Tokenized inputs accepted by the wrapped encoder.
            training: Whether the call happens in training mode. Defaults to
                False, so existing ``model(inputs)`` calls keep working.

        Returns:
            The output of the sigmoid dense layer applied to the pooled output.
        """
        # Forward the training flag explicitly so the encoder's train-only
        # behaviour (e.g. dropout) follows the mode Keras is running in
        # instead of relying on implicit propagation.
        x = self.bert(inputs, training=training)[1]  # Get the pooled output
        return self.fc(x)

In [59]:
# Fetch the pre-trained BERT encoder that the classifier will wrap
encoder_checkpoint = "bert-base-uncased"
bert_model = TFAutoModel.from_pretrained(encoder_checkpoint)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.

In [60]:
# Wrap the encoder with a single-unit output head for binary classification
model = BERTForClassification(bert_model=bert_model, num_classes=1)

In [61]:
# Configure training: Adam with a 2e-5 learning rate, binary cross-entropy
# on the single sigmoid output, and accuracy as the reported metric
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [62]:
# Train for three epochs in batches of 36, scoring the test split after each epoch
train_batches = train_dataset.batch(36)
validation_batches = test_dataset.batch(36)
history = model.fit(train_batches, epochs=3, validation_data=validation_batches)

Epoch 1/3
5151/5151 ━━━━━━━━━━━━━━━━━━━━ 1957s 375ms/step - accuracy: 0.6450 - loss: 0.6427 - val_accuracy: 0.7591 - val_loss: 0.5496
Epoch 2/3
5151/5151 ━━━━━━━━━━━━━━━━━━━━ 1906s 370ms/step - accuracy: 0.7727 - loss: 0.5313 - val_accuracy: 0.8077 - val_loss: 0.4847
Epoch 3/3
5151/5151 ━━━━━━━━━━━━━━━━━━━━ 1901s 369ms/step - accuracy: 0.8138 - loss: 0.4726 - val_accuracy: 0.8320 - val_loss: 0.4420


In [63]:
# Score the trained model on the held-out split (batches of 32)
eval_batches = test_dataset.batch(32)
test_loss, test_acc = model.evaluate(eval_batches)
print(f"Test accuracy: {test_acc:.4f}")


1449/1449 ━━━━━━━━━━━━━━━━━━━━ 393s 267ms/step - accuracy: 0.8341 - loss: 0.4410
Test accuracy: 0.8320


In [64]:
# Run the model over the test split to obtain its raw outputs
predictions = model.predict(test_dataset.batch(32))
# Threshold at 0.5: outputs above it become label 1, everything else label 0
pred_labels = (predictions > 0.5).astype(int)

# Per-class precision / recall / F1 against the true labels
report = classification_report(test_labels, pred_labels)
print(report)


[1m1449/1449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m408s[0m 276ms/step
              precision    recall  f1-score   support

           0       0.84      0.82      0.83     23238
           1       0.82      0.84      0.83     23121

    accuracy                           0.83     46359
   macro avg       0.83      0.83      0.83     46359
weighted avg       0.83      0.83      0.83     46359



In [66]:
# Save the underlying BERT model
model.bert.save_pretrained("./suicide_detection_model")

# Save the tokenizer next to it so the directory can be loaded on its own
tokenizer.save_pretrained("./suicide_detection_model")

# The encoder checkpoint above does not contain the Dense classification head
# (model.fc), so also store the wrapper model's weights; without them the
# trained classifier cannot be restored. Keras 3 requires the file name to
# end in ".weights.h5".
model.save_weights("./suicide_detection_model/classifier.weights.h5")


In [68]:
# List of custom text inputs
sample_texts = [
    "I feel so hopeless and lost, life is meaningless.",
    "I am really happy and excited about my upcoming vacation!",
    "Nothing seems to work out for me. I don't know what to do anymore.",
    "Life is beautiful, and I am grateful for everything I have.",
    "I think Im doing really good these days",
    "I dont want to live anymore",
    "Nobody loves me"
]

# Apply the same normalisation that was applied to the training text
# (lowercase, then strip URLs and non-letter characters) so the model sees
# inputs in the form it was trained on. The originals are kept for display.
cleaned_sample_texts = [
    re.sub(r'http\S+|www\S+|[^a-zA-Z\s]', '', text.lower())
    for text in sample_texts
]

# Tokenize the inputs
sample_tokens = tokenizer(
    cleaned_sample_texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="tf"
)

# Convert the BatchEncoding object to a plain dictionary of TensorFlow tensors
# (the tokenizer already returned tensors, so no re-wrapping is needed)
sample_tokens = dict(sample_tokens)

# Predict using the trained model
predictions = model.predict(sample_tokens)

# Convert probabilities to binary labels (0 or 1)
predicted_labels = (predictions > 0.5).astype(int)

# Map labels to meaningful text
label_map = {0: "Non-Suicide", 1: "Suicide"}

# Display the results
for text, label in zip(sample_texts, predicted_labels):
    print(f"Text: {text}")
    print(f"Prediction: {label_map[label[0]]}\n")

1/1 ━━━━━━━━━━━━━━━━━━━━ 9s 9s/step
Text: I feel so hopeless and lost, life is meaningless.
Prediction: Non-Suicide

Text: I am really happy and excited about my upcoming vacation!
Prediction: Non-Suicide

Text: Nothing seems to work out for me. I don't know what to do anymore.
Prediction: Suicide

Text: Life is beautiful, and I am grateful for everything I have.
Prediction: Non-Suicide

Text: I think Im doing really good these days
Prediction: Non-Suicide

Text: I dont want to live anymore
Prediction: Non-Suicide

Text: Nobody loves me
Prediction: Non-Suicide

