# Hybrid Approach (CNN-BiLSTM)

In [12]:
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.layers import (Input, Embedding, SpatialDropout1D, Conv1D,
                                     BatchNormalization, Dropout, GlobalMaxPooling1D,
                                     Bidirectional, LSTM, Dense, concatenate)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from gensim.downloader import load as gensim_load
from sklearn.metrics import confusion_matrix, classification_report
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# Load the dataset
df = pd.read_csv("/kaggle/input/dataset3/D3.csv")

In [14]:
# Drop the unnecessary column if it exists
if 'Unnamed: 0' in df.columns:
    df.drop('Unnamed: 0', axis=1, inplace=True)


In [15]:
# Drop rows with missing values and show new shape
df.dropna(inplace=True)
print("\nShape after dropping missing values:", df.shape)


Shape after dropping missing values: (28484, 2)


In [16]:
# Check distribution of status before resampling
print("\nStatus Value Counts before resampling:")
print(df['status'].value_counts())


Status Value Counts before resampling:
status
EDAnonymous      6947
adhd             5110
autism           4576
ptsd             4483
schizophrenia    4281
alcoholism       3087
Name: count, dtype: int64


In [17]:
# Data Resampling for Class Balance
def resample_data(df):
    max_count = df['status'].value_counts().max()
    df_resampled = pd.DataFrame()
    for status in df['status'].unique():
        df_class = df[df['status'] == status]
        if len(df_class) < max_count:
            df_class_resampled = resample(df_class, replace=True, n_samples=max_count, random_state=42)
            df_resampled = pd.concat([df_resampled, df_class_resampled])
        else:
            df_resampled = pd.concat([df_resampled, df_class])
    return df_resampled

df = resample_data(df)
print("Value counts after resampling:")
print(df['status'].value_counts())

Value counts after resampling:
status
adhd             6947
alcoholism       6947
autism           6947
EDAnonymous      6947
ptsd             6947
schizophrenia    6947
Name: count, dtype: int64


In [18]:
# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df['post'] = df['post'].apply(preprocess_text)
print("\nSample preprocessed text:")
print(df['post'].head())



Sample preprocessed text:
860     found psych misdiagnosed adhd child psych clin...
3772    started strattera experience starting stratter...
3092    kvk tech dextroamphetamine quality concern hey...
466     mydayis nausea week one 70mg vyvanse everyday ...
4426    diagnosed add 28 history emotional abuse child...
Name: post, dtype: object


In [19]:
# Splitting Data and Encoding Labels
X = df['post'].values
y = df['status'].values

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (33345,) (33345, 6)
Testing set shape: (8337,) (8337, 6)


In [20]:
# Tokenization and Padding
max_words = 50000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_sequence_length = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post', truncating='post')
print("Padded Training Data Shape:", X_train_padded.shape)
print("Padded Testing Data Shape:", X_test_padded.shape)

Padded Training Data Shape: (33345, 100)
Padded Testing Data Shape: (8337, 100)


In [21]:
# Load Pre-trained Word2Vec and Build Embedding Matrix
print("Downloading the pre-trained Google News Word2Vec model. This may take a while...")
word2vec_model = gensim_load("word2vec-google-news-300")
embedding_dim = word2vec_model.vector_size  
print("Download complete. Embedding dimension:", embedding_dim)

word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
found_count = 0

for word, i in word_index.items():
    if i >= max_words:
        continue
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]
        found_count += 1
    else:
        # Initialize missing words with random vectors
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))

print(f"Found embeddings for {found_count} out of {num_words} words.")


Downloading the pre-trained Google News Word2Vec model. This may take a while...
Download complete. Embedding dimension: 300
Found embeddings for 26232 out of 44876 words.


In [22]:
# Building the Enhanced Hybrid CNN + BiLSTM Model
# Using the Functional API for a parallel architecture:
input_layer = Input(shape=(max_sequence_length,), name='input_text')
embedding_layer = Embedding(input_dim=num_words,
                            output_dim=embedding_dim,
                            input_length=max_sequence_length,
                            weights=[embedding_matrix],
                            trainable=True,
                            name='embedding')(input_layer)
drop_embedding = SpatialDropout1D(0.3, name='spatial_dropout')(embedding_layer)

# CNN Branch: apply several 1D convolutions and global pooling
conv1 = Conv1D(filters=256, kernel_size=3, padding='same', activation='relu', name='conv1_3')(drop_embedding)
bn1   = BatchNormalization(name='bn1')(conv1)
conv2 = Conv1D(filters=256, kernel_size=4, padding='same', activation='relu', name='conv2_4')(bn1)
bn2   = BatchNormalization(name='bn2')(conv2)
conv3 = Conv1D(filters=256, kernel_size=5, padding='same', activation='relu', name='conv3_5')(bn2)
bn3   = BatchNormalization(name='bn3')(conv3)
pool  = GlobalMaxPooling1D(name='global_max_pool')(bn3)

# BiLSTM Branch: capture sequential context
bilstm = Bidirectional(LSTM(units=128, dropout=0.3, recurrent_dropout=0.3), name='bilstm')(drop_embedding)
drop_bilstm = Dropout(0.5, name='drop_bilstm')(bilstm)

# Merge both branches
merged = concatenate([pool, drop_bilstm], name='concatenate')

# Fully connected layers after merging
fc1 = Dense(128, activation='relu', name='fc1')(merged)
bn_fc1 = BatchNormalization(name='bn_fc1')(fc1)
drop_fc1 = Dropout(0.5, name='drop_fc1')(bn_fc1)

# Output layer for multi-class classification
output_layer = Dense(y_categorical.shape[1], activation='softmax', name='output')(drop_fc1)

# Build and compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

I0000 00:00:1751982591.044624      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1751982591.045323      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [23]:
# Model Training with Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)

history = model.fit(X_train_padded, y_train,
                    epochs=25,
                    batch_size=128,
                    validation_data=(X_test_padded, y_test),
                    callbacks=[early_stop, reduce_lr])


Epoch 1/25


I0000 00:00:1751982614.268770     118 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 472ms/step - accuracy: 0.5361 - loss: 1.3236 - val_accuracy: 0.8702 - val_loss: 0.4655 - learning_rate: 0.0010
Epoch 2/25
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 465ms/step - accuracy: 0.8909 - loss: 0.3364 - val_accuracy: 0.9226 - val_loss: 0.2456 - learning_rate: 0.0010
Epoch 3/25
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 463ms/step - accuracy: 0.9454 - loss: 0.1759 - val_accuracy: 0.9470 - val_loss: 0.1796 - learning_rate: 0.0010
Epoch 4/25
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 464ms/step - accuracy: 0.9643 - loss: 0.1132 - val_accuracy: 0.9500 - val_loss: 0.1874 - learning_rate: 0.0010
Epoch 5/25
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 464ms/step - accuracy: 0.9793 - loss: 0.0686 - val_accuracy: 0.9587 - val_loss: 0.1723 - learning_rate: 0.0010
Epoch 6/25
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [24]:
# Evaluating the Model
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test)
print("\nTest Accuracy: {:.2f}%".format(test_accuracy * 100))

[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 84ms/step - accuracy: 0.9665 - loss: 0.1507

Test Accuracy: 96.35%


In [25]:
# Predict on test set
y_pred_nn_probs = model.predict(X_test_padded)
y_pred_nn = np.argmax(y_pred_nn_probs, axis=1)

[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 88ms/step


In [26]:
# Get original label strings
# Re-decode one-hot true labels to integers
y_true_nn = np.argmax(y_test, axis=1)

# Use the same label encoder to decode class indices back to strings
true_labels = label_encoder.inverse_transform(y_true_nn)
pred_labels = label_encoder.inverse_transform(y_pred_nn)

In [28]:
#  Save to CSV
df_nn.to_csv('predictions_nn.csv', index=False)
print("✅ Saved predictions_nn.csv with", len(df_nn), "rows")
print(df_nn.head())

✅ Saved predictions_nn.csv with 8337 rows
                                                text     true_label  \
0  new tradition getting 4th july tbi ptsd httpst...           ptsd   
1  disbelief existence schizo anyone else constan...  schizophrenia   
2  whim last night threw scale measuring tape del...    EDAnonymous   
3  yes well harder im sitting debating one drink ...     alcoholism   
4  20 drink day past 10 year wondering like side ...     alcoholism   

         nn_pred  
0           ptsd  
1  schizophrenia  
2    EDAnonymous  
3     alcoholism  
4     alcoholism  
