In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

2025-07-06 04:40:39.792176: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751776839.985498      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751776840.042623      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load the raw Kaggle CSV, then clean it in one non-mutating chain:
#   - drop the leftover CSV index column when it is present,
#   - drop rows lacking either the text or its label,
#   - drop exact duplicate rows and renumber the index from 0.
df = pd.read_csv('/kaggle/input/sentiment-analysis-for-mental-health/Combined Data.csv')
df = (
    df.drop(columns='Unnamed: 0', errors='ignore')
      .dropna(subset=['statement', 'status'])
      .drop_duplicates()
      .reset_index(drop=True)
)

print(df.head())


                                           statement   status
0                                         oh my gosh  Anxiety
1  trouble sleeping, confused mind, restless hear...  Anxiety
2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3  I've shifted my focus to something else but I'...  Anxiety
4  I'm restless and restless, it's been a month n...  Anxiety


In [3]:
# Oversample (with replacement) every class that is smaller than the largest
# class, so that all classes end up with the same number of rows.
# NOTE(review): this runs before the train/val/test split, so copies of the
# same statement can land in several splits — confirm this is intended.
max_count = df['status'].value_counts().max()
balanced_parts = [
    grp if len(grp) >= max_count
    else resample(grp, replace=True, n_samples=max_count, random_state=42)
    for _, grp in df.groupby('status')
]
df_bal = pd.concat(balanced_parts).reset_index(drop=True)
print("After resampling:\n", df_bal['status'].value_counts())

df = df_bal


After resampling:
 status
Anxiety                 16040
Bipolar                 16040
Depression              16040
Normal                  16040
Personality disorder    16040
Stress                  16040
Suicidal                16040
Name: count, dtype: int64


In [4]:
# Give every class name an integer id (ids follow the sorted order of the
# names) and store that id for each row in a new 'label' column.
target_names = sorted(df['status'].unique())
num_classes = len(target_names)
label_map = dict(zip(target_names, range(num_classes)))
df['label'] = df['status'].map(label_map)

In [6]:
# Encode every statement with the uncased BERT tokenizer.
# Each sequence is padded or truncated to exactly MAX_LEN tokens and the
# result is returned as NumPy arrays.
MAX_LEN = 128
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
statements = df['statement'].astype(str).tolist()
enc = tokenizer(
    statements,
    max_length=MAX_LEN,
    truncation=True,
    padding='max_length',
    return_tensors='np',
)
input_ids, attention_mask = enc['input_ids'], enc['attention_mask']


In [7]:
# One-hot encode the integer labels and pair them with the tokenizer outputs
# in a tf.data dataset of (inputs-dict, label) elements.
labels = tf.keras.utils.to_categorical(df['label'], num_classes=num_classes)
model_inputs = dict(input_ids=input_ids, attention_mask=attention_mask)
dataset = tf.data.Dataset.from_tensor_slices((model_inputs, labels))

I0000 00:00:1751777210.792734      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1751777210.793417      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [8]:
# Train/val/test split (80% / 10% / remainder)
BATCH = 32
n = len(df)
n_train = int(0.8 * n)
n_val = int(0.1 * n)

# Shuffle exactly ONCE. Dataset.shuffle defaults to
# reshuffle_each_iteration=True, which draws a new permutation every time the
# dataset is iterated. With take()/skip() on top of it, each epoch would then
# carve out different rows for train/val/test, leaking validation and test
# examples into training. Freezing the permutation keeps the three subsets
# disjoint and stable.
dataset = dataset.shuffle(buffer_size=n, seed=42, reshuffle_each_iteration=False)

# Only the training slice is re-shuffled between epochs, so batch order still
# varies during training without rows crossing split boundaries.
train_ds = dataset.take(n_train).shuffle(buffer_size=n_train, seed=42).batch(BATCH)
val_ds = dataset.skip(n_train).take(n_val).batch(BATCH)
test_ds = dataset.skip(n_train + n_val).batch(BATCH)


In [9]:
# Build model
def build_model():
    """Assemble the BERT-based classifier (uncompiled).

    The pretrained ``bert-base-uncased`` encoder receives token ids and an
    attention mask, each of length ``MAX_LEN``. Its pooled output is passed
    through dropout (0.5), a 256-unit ReLU layer, dropout (0.3) and finally a
    softmax layer with ``num_classes`` units.

    Returns:
        tf.keras.Model taking the inputs ``input_ids`` and ``attention_mask``
        and producing per-class probabilities.
    """
    encoder = TFBertModel.from_pretrained('bert-base-uncased')

    token_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
    mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')

    # pooler_output is the encoder's pooled, sequence-level output.
    pooled = encoder(token_ids, attention_mask=mask).pooler_output

    hidden = tf.keras.layers.Dropout(0.5)(pooled)
    hidden = tf.keras.layers.Dense(256, activation='relu')(hidden)
    hidden = tf.keras.layers.Dropout(0.3)(hidden)
    class_probs = tf.keras.layers.Dense(num_classes, activation='softmax')(hidden)

    return tf.keras.Model(inputs=[token_ids, mask], outputs=class_probs)

# Instantiate the classifier and print its layer-by-layer summary.
model = build_model()
model.summary()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 )                           ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, 128, 7                                       

In [10]:
# Compile with Adam and categorical cross-entropy.
# Besides overall accuracy, track one precision and one recall metric per
# class; class_id restricts each metric to that class's output column.
optim = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-8)

metrics = ['accuracy']
for cls, idx in label_map.items():
    metrics.append(tf.keras.metrics.Precision(class_id=idx, name=f"prec_{cls}"))
    metrics.append(tf.keras.metrics.Recall(class_id=idx, name=f"rec_{cls}"))

model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=metrics)


In [11]:
# Fit for at most `epochs` epochs.
# Training stops early once val_loss has not improved for 3 epochs (the best
# weights are then restored), and the best model so far is written to
# best_model.h5.
epochs = 10
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'best_model.h5', monitor='val_loss', save_best_only=True
)
callbacks = [early_stopping, checkpoint]

hist = model.fit(train_ds, validation_data=val_ds, epochs=epochs, callbacks=callbacks)

Epoch 1/10


I0000 00:00:1751777267.390498     103 service.cc:148] XLA service 0x7e791039c390 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1751777267.391136     103 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1751777267.391161     103 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1751777267.504798     103 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1751777267.619936     103 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




  saving_api.save_model(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model

In [40]:
from sklearn.model_selection import train_test_split

# Stratified 80/10/10 split of df with scikit-learn; test_df supplies the rows
# that are scored below.
# NOTE(review): this is NOT the partition the model was trained on. Training
# used the tf.data shuffle/take/skip split, which selects different rows, so
# test_df can contain rows the model saw during fit() and any score computed on
# it is likely optimistic. TODO: derive the evaluation rows from the same split
# that fed model.fit().
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
val_df, test_df   = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])



In [41]:
# Encode the test_df statements with the same tokenizer settings used earlier
# (pad/truncate to MAX_LEN, NumPy output).
test_statements = test_df['statement'].astype(str).tolist()
enc_test = tokenizer(
    test_statements,
    max_length=MAX_LEN,
    truncation=True,
    padding='max_length',
    return_tensors='np',
)


In [42]:
# Score the encoded test statements with the model currently in memory and
# keep, for each row, the index of the highest-probability class.
test_inputs = dict(
    input_ids=enc_test['input_ids'],
    attention_mask=enc_test['attention_mask'],
)
probs = model.predict(test_inputs, batch_size=BATCH)
y_pred_idx = np.argmax(probs, axis=1)




In [43]:
# Invert label_map (id -> class name) and translate each predicted id.
inv_label_map = dict(zip(label_map.values(), label_map.keys()))
bert_preds = [inv_label_map[class_id] for class_id in y_pred_idx]

In [44]:
# Put text, true label and predicted label side by side, one row per test
# statement, with a fresh 0..n-1 index.
df_bert = (
    test_df[['statement', 'status']]
    .rename(columns={'statement': 'text', 'status': 'true_label'})
    .reset_index(drop=True)
    .assign(bert_pred=bert_preds)
)

In [45]:
# Save the predictions to CSV and show a preview.
# The path is kept in one variable so the confirmation message always names
# the file that was actually written (the message previously reported
# 'predictions_bert.csv' while the data went to 'predictions_bert3.csv').
output_path = 'predictions_bert3.csv'
df_bert.to_csv(output_path, index=False)
print(f"✅ {output_path} saved with", len(df_bert), "rows")
print(df_bert.head())


✅ predictions_bert.csv saved with 11228 rows
                                                text  true_label   bert_pred
0  I need information. As emotional as I am right...      Stress      Stress
1  I'm BURNED OUT... What should I do? So, let me...      Stress      Stress
2                boy, it's chilly outside, isn't it?      Normal      Normal
3                                     I'm goosebumps      Normal      Normal
4  Originally posted on the r/suboxone, I thought...  Depression  Depression
