# Experiment: Train/Tune BERT Model

## Confirm Environment

In [1]:
# Shell escape: print conda's configuration summary (active environment,
# conda/python versions, channels) so the run records its environment.
!conda info


     active environment : base
    active env location : /shared/EL9/explorer/anaconda3/2024.06
            shell level : 1
       user config file : /home/neiderer.c/.condarc
 populated config files : 
          conda version : 24.5.0
    conda-build version : 24.5.1
         python version : 3.12.4.final.0
                 solver : libmamba (default)
       virtual packages : __archspec=1=broadwell
                          __conda=24.5.0=0
                          __cuda=12.3=0
                          __glibc=2.34=0
                          __linux=5.14.0=0
                          __unix=0=0
       base environment : /shared/EL9/explorer/anaconda3/2024.06  (read only)
      conda av data dir : /shared/EL9/explorer/anaconda3/2024.06/etc/conda
  conda av metadata url : None
           channel URLs : https://repo.anaconda.com/pkgs/main/linux-64
                          https://repo.anaconda.com/pkgs/main/noarch
                          https://repo.anaconda.com/pkgs/r/linux-64

## Setup and Imports

In [None]:
import numpy as np
import pandas as pd

# NOTE(review): hf_vectorization and train_hf_model are called further down
# but were never imported, so a fresh-kernel run failed with NameError.
# They are assumed to live next to dl_text_vectorization / train_bert_model;
# confirm the module paths against the emolex package.
from emolex.preprocessing import (
    load_mental_health_sentiment_dataset,
    clean_text,
    encode_sentiment_labels,
    split_data,
    dl_text_vectorization,
    hf_vectorization,
)
from emolex.train import train_bert_model, train_hf_model
from emolex.evaluation import (
    plot_training_history,
    generate_confusion_matrix,
    generate_classification_report,
)
from emolex.utils import detect_and_set_device

## Device Setup

In [None]:
# Let the project helper pick the compute device (GPU if it sets one up,
# CPU otherwise) and echo its choice into the notebook output.
device_used = detect_and_set_device()
device_message = f"TensorFlow is configured to use: {device_used}"
print(device_message)

## Load Data

In [None]:
# Load the dataset through the project loader.
# NOTE(review): assumes the loader returns a pandas DataFrame, since
# info()/head() are called on the result — confirm.
df = load_mental_health_sentiment_dataset()
df.info()  # called for its printed summary; return value is discarded
df.head()  # last expression of the cell, so it is rendered as the output

## Clean Data

In [None]:
# Apply clean_text to every entry of the "text" column and store the result
# in a separate "clean_text" column, leaving the original text untouched.
print("\n--- Cleaning Text ---")
df["clean_text"] = df["text"].apply(clean_text)
print("Text cleaning complete. Sample cleaned text:")
# A fixed random_state keeps the preview rows identical across
# Restart & Run All, so the saved output does not churn between runs.
print("\n", df[["text", "clean_text"]].sample(5, random_state=42))

## Encode Labels

In [None]:
# Run the project label encoder. It hands back the frame plus an encoder
# object whose classes_ attribute is used later for the class count and names.
print("\n--- Encoding Labels ---")
df, encoder = encode_sentiment_labels(df)
print("Label encoding complete. Sample encoded labels:")
# A fixed random_state keeps the preview rows identical across
# Restart & Run All, so the saved output does not churn between runs.
print("\n", df[["label", "label_encoded"]].sample(5, random_state=42))

## Train-Test Split

In [None]:
# Split df into train/test parts with split_data, then report how many
# samples ended up in each part.
print("\n--- Perform Train-Test Split ---")
X_train_raw, X_test_raw, y_train, y_test = split_data(df)
for split_name, split_inputs in (("Train", X_train_raw), ("Test", X_test_raw)):
    print(f"{split_name} set size: {len(split_inputs)} samples")

## Vectorization

In [None]:
# Tokenize the raw train/test inputs for the "bert" model.
# NOTE(review): the labels are passed in as well, so the returned objects
# presumably bundle inputs and labels together — confirm against
# hf_vectorization.
print("\n--- Performing Text Vectorization ---")
X_train_tokenized, X_test_tokenized = hf_vectorization(
    "bert", X_train_raw, X_test_raw, y_train, y_test
)

## Train Model

In [None]:
# Train the "bert" model for a single epoch, with one output class per
# label known to the encoder.
# NOTE(review): the test split is handed to the training call — presumably
# as the evaluation set; confirm it is not used for model selection.
trainer, results = train_hf_model(
    "bert",
    X_train_tokenized,
    X_test_tokenized,
    num_classes=len(encoder.classes_),
    num_train_epochs=1,
)

## Evaluate Model

In [None]:
# Turn the trainer's logged history into a table; the bare expression on the
# last line renders it as the cell output. pandas comes from the imports cell.
history = pd.DataFrame(trainer.state.log_history)
history

In [None]:
print("\n--- Predict Test Classes ---")
# Run the trained model on the test split and take, for each row, the index
# of the largest score along axis 1 as the predicted class.
# NOTE(review): assumes predictions is (n_samples, n_classes) — confirm.
# numpy comes from the imports cell.
y_pred = trainer.predict(X_test_tokenized).predictions
y_pred_classes = np.argmax(y_pred, axis=1)

In [None]:
# Compare the true test labels with the predicted classes; the helper
# returns a (fig, ax) pair, labelled with the encoder's class names.
print("\n--- Generate Confusion Matrix ---")
fig, ax = generate_confusion_matrix(
    y_test,
    y_pred_classes,
    class_labels=encoder.classes_,
)

In [None]:
# Produce the classification report for the test split, using the encoder's
# class names as labels. The call is the cell's last expression, so any
# return value is shown as the cell output.
print("\n--- Generate Classification Report ---")
generate_classification_report(
    y_test,
    y_pred_classes,
    class_labels=encoder.classes_,
)