# Experiment: Train Sequential LSTM Model

## Confirm Environment

In [1]:
# Shell escape: print conda's description of the current installation and active
# environment, so the run can be tied to a concrete setup.
# NOTE(review): the captured output contains local filesystem paths — consider
# clearing it before sharing the notebook.
!conda info


     active environment : northeastern
    active env location : /home/curtis/anaconda3/envs/northeastern
            shell level : 2
       user config file : /home/curtis/.condarc
 populated config files : /home/curtis/anaconda3/.condarc
          conda version : 24.9.2
    conda-build version : 24.9.0
         python version : 3.12.7.final.0
                 solver : libmamba (default)
       virtual packages : __archspec=1=skylake
                          __conda=24.9.2=0
                          __glibc=2.39=0
                          __linux=6.6.87.2=0
                          __unix=0=0
       base environment : /home/curtis/anaconda3  (writable)
      conda av data dir : /home/curtis/anaconda3/etc/conda
  conda av metadata url : None
           channel URLs : https://repo.anaconda.com/pkgs/main/linux-64
                          https://repo.anaconda.com/pkgs/main/noarch
                          https://repo.anaconda.com/pkgs/r/linux-64
                          https://r

## Setup and Imports

In [2]:
from emolex.preprocessing import load_mental_health_sentiment_dataset, clean_text, encode_sentiment_labels, split_data, dl_text_vectorization
from emolex.dl_models import lstm_model 
from emolex.train import train_dl_model
from emolex.evaluation import plot_training_history, generate_confusion_matrix, generate_classification_report
from emolex.utils import detect_and_set_device

2025-07-02 14:59:08.796305: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-02 14:59:08.829937: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751482748.846507   44457 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751482748.854458   44457 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751482748.872073   44457 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

## Device Setup

In [3]:
# Detect and set up GPU or use CPU
# detect_and_set_device is the project helper imported from emolex.utils; its
# return value is only used here to report which device was selected.
device_used = detect_and_set_device()
print(f"TensorFlow is configured to use: {device_used}")

No GPU devices found despite TensorFlow being built with CUDA. Using CPU.
TensorFlow is configured to use: CPU


2025-07-02 14:59:21.400267: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


## Load Data

In [4]:
# Load the dataset through the project loader and take a first look at it.
df = load_mental_health_sentiment_dataset()
# Prints the column names, non-null counts and dtypes.
df.info()
# Last expression of the cell, so the first rows are rendered as the cell output.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51093 entries, 0 to 51092
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    51093 non-null  object
 1   label   51093 non-null  object
dtypes: object(2)
memory usage: 798.5+ KB


Unnamed: 0,text,label
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


## Clean Data

In [5]:
# Run the project's clean_text helper over every entry of the `text` column and
# store the result in a new `clean_text` column next to the original.
print("\n--- Cleaning Text ---")  # plain string: nothing to interpolate, so no f-prefix
df["clean_text"] = df["text"].apply(clean_text)
print("Text cleaning complete. Sample cleaned text:")
# Show a few random rows side by side as a quick sanity check of the cleaning step.
print("\n", df[["text", "clean_text"]].sample(5))


--- Cleaning Text ---
Text cleaning complete. Sample cleaned text:

                                                     text  \
44391                       soluna is slower than accord   
39194  i am the definition of a failure i have mental...   
27552  Hey guys, I was watching Seth Meyers and he ma...   
24362  Hi all, in my short 19 years on this earth I h...   
48129  Happy | Energetic Music For Peace music to des...   

                                              clean_text  
44391                               soluna slower accord  
39194  definition failure mental diagnosis job educat...  
27552  hey guy watching seth meyers made joke longest...  
24362  hi short year earth many trial triumph life se...  
48129   happy energetic music peace music destress enjoy  


## Encode Labels

In [6]:
# Encode the string labels as integers with the project helper. It hands back the
# frame (which carries a `label_encoded` column afterwards) together with the
# encoder object; `encoder.classes_` is reused later to size the model and to
# label the evaluation outputs.
print("\n--- Encoding Labels ---")  # plain string: nothing to interpolate, so no f-prefix
df, encoder = encode_sentiment_labels(df)
print("Label encoding complete. Sample encoded labels:")
# Show a few random rows to eyeball the label -> integer mapping.
print("\n", df[["label", "label_encoded"]].sample(5))


--- Encoding Labels ---
Label Encoding Map: {'Anxiety': 0, 'Bipolar': 1, 'Depression': 2, 'Normal': 3, 'Personality disorder': 4, 'Stress': 5, 'Suicidal': 6}
Label encoding complete. Sample encoded labels:

             label  label_encoded
23822  Depression              2
42563      Normal              3
6119       Normal              3
18732  Depression              2
34396     Anxiety              0


## Train-Test Split

In [7]:
# Split the frame into train and test parts via the project's split_data helper.
# NOTE(review): the split ratio, stratification and random seed are not visible
# from this cell — check split_data's defaults if the split must be reproducible.
print("\n--- Perform Train-Test Split ---")
X_train_raw, X_test_raw, y_train, y_test = split_data(df) 
# Report how many samples landed on each side of the split.
print(f"Train set size: {len(X_train_raw)} samples")
print(f"Test set size: {len(X_test_raw)} samples")


--- Perform Train-Test Split ---
Train set size: 40874 samples
Test set size: 10219 samples


## Vectorization

In [8]:
# Vectorize the raw train/test inputs with the project's dl_text_vectorization helper.
# The helper also returns the labels: its log shows fewer rows coming out than
# went in, so the *_filtered labels must always be paired with the *_filtered
# inputs from here on.
print("\n--- Performing Text Vectorization ---")
X_train_pad_filtered, X_test_pad_filtered, y_train_filtered, y_test_filtered = dl_text_vectorization(X_train_raw, X_test_raw, y_train, y_test)
print("Vectorization complete.")


--- Performing Text Vectoriation ---
Original X_train shape: (40874,), Filtered X_train_pad shape: (40807, 100)
Original X_test shape: (10219,), Filtered X_test_pad shape: (10205, 100)
Vecorization complete.


## Build Model

In [9]:
print("\n--- Build Model ---")
# The number of output classes comes from the label encoder. The sequence length
# is read from the vectorized training data (second dimension of the padded
# array) instead of repeating the literal 100, so the model input cannot drift
# out of sync with what the vectorization step produced.
# NOTE(review): vocab_size=10000 presumably has to match the vocabulary limit
# used inside dl_text_vectorization — confirm against that helper.
model = lstm_model(
    num_classes=len(encoder.classes_),
    vocab_size=10000,
    max_len=X_train_pad_filtered.shape[1],
)


--- Build Model ---


## Train Model

In [None]:
print("\n--- Training Model ---")
# Train via the project helper; it returns the model and a history object that
# the plotting cell consumes.
# NOTE(review): the test split is passed as the trailing arguments and the
# training log reports val_accuracy/val_loss, so the test set appears to double
# as validation data — confirm, and consider a separate validation split so the
# final test metrics stay unbiased.
model, history = train_dl_model(model, X_train_pad_filtered, y_train_filtered, X_test_pad_filtered, y_test_filtered)


--- Training Model ---
Starting model training for 10 epochs with batch size 32...
Epoch 1/10
[1m1276/1276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 143ms/step - accuracy: 0.4012 - loss: 1.5309 - val_accuracy: 0.5707 - val_loss: 1.1520
Epoch 2/10
[1m1276/1276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 159ms/step - accuracy: 0.5627 - loss: 1.1602 - val_accuracy: 0.6275 - val_loss: 0.8421
Epoch 3/10
[1m1276/1276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 159ms/step - accuracy: 0.6522 - loss: 0.8169 - val_accuracy: 0.7051 - val_loss: 0.7406
Epoch 4/10
[1m1276/1276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 169ms/step - accuracy: 0.7357 - loss: 0.6673 - val_accuracy: 0.7258 - val_loss: 0.6930
Epoch 5/10
[1m1276/1276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 165ms/step - accuracy: 0.7769 - loss: 0.5736 - val_accuracy: 0.7251 - val_loss: 0.6976
Epoch 6/10
[1m1276/1276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s

## Evaluate Model

In [None]:
print("\n--- Plot Training History ---")
# Visualize the history object returned by train_dl_model using the project's
# plotting helper.
plot_training_history(history)

In [None]:
print("\n--- Predict Test Classes ---")
# presumably yields one row of per-class scores per test sample — TODO confirm
# against the model's output layer.
y_pred = model.predict(X_test_pad_filtered)
# Collapse each row to the index of its largest value, giving one predicted
# class index per sample.
y_pred_classes = y_pred.argmax(axis=1)

In [None]:
print("\n--- Generate Confusion Matrix ---")
# Compare the filtered test labels with the predicted class indices;
# encoder.classes_ is passed as the class labels. The helper's result is
# unpacked into a (fig, ax) pair.
fig, ax = generate_confusion_matrix(y_test_filtered, y_pred_classes, class_labels=encoder.classes_)

In [None]:
print("\n--- Generate Classification Report ---")
# Same inputs as the confusion matrix: filtered test labels, predicted class
# indices, and encoder.classes_ as the class labels.
generate_classification_report(y_test_filtered, y_pred_classes, class_labels=encoder.classes_)