In [8]:
# Necessary Libraries
!pip install pandas tensorflow transformers scikit-learn numpy matplotlib seaborn imbalanced-learn openpyxl nlpaug
from google.colab import drive
drive.mount('/content/drive')
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_scheduler
from tqdm import tqdm
import torch.nn as nn
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
import nlpaug.augmenter.word as naw

# Set Random Seed
def set_seed(seed=42):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), and sets the cuDNN ``deterministic`` flag
    while turning its ``benchmark`` flag off.

    :param seed: Seed value passed to every generator.
    """
    # The cuDNN flags are independent of the seeding calls below.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(42)

# Dataset Class
class ERDataset(Dataset):
    """
    Dataset that tokenizes one text per item and pairs it with its label vector.

    :param texts: Sequence of input texts (list, tuple, pandas Series, ...).
        It is copied into a plain list so items are always looked up by position.
    :param labels: Indexable collection of label vectors aligned with ``texts``.
    :param tokenizer: Callable invoked as
        ``tokenizer(text, max_length=..., padding=..., truncation=..., return_tensors="pt")``
        that returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
    :param max_length: Length every sequence is padded / truncated to.
    :raises ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # A pandas Series coming out of train_test_split keeps its shuffled
        # index, so ``series[idx]`` would be a label lookup rather than a
        # positional one. Materializing to a list makes indexing positional.
        self.texts = list(texts)
        if len(self.texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(labels)}"
            )
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Tokenize the sample at position ``idx``.

        :param idx: Positional index of the sample.
        :return: Dict with ``input_ids`` and ``attention_mask`` (leading
            dimension squeezed away) and ``labels`` as a float tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            # The tokenizer output carries a leading dimension; drop it so the
            # DataLoader can add its own batch dimension.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            # Float labels, as required by BCE-style losses.
            "labels": torch.tensor(label, dtype=torch.float),
        }

# Load Dataset
file_path = "/content/drive/MyDrive/Dataset500.xlsx"
df = pd.read_excel(file_path)

# Keep only rows that have both a strategy annotation and a text to classify.
# A missing `text_cleaned` value would otherwise reach the tokenizer as NaN.
# `.copy()` gives an independent frame so the column assignments below do not
# write into a slice of the raw frame (SettingWithCopyWarning).
df = df.dropna(subset=['er_strat', 'text_cleaned']).copy()

# Process Labels
# Multi-label binarization for `er_strat` (6 categories)
# Split on commas and strip whitespace so "A, B", "A,B" and "A , B" all yield
# the same label set; empty fragments (e.g. a trailing comma) are discarded.
mlb = MultiLabelBinarizer()
df['er_labels'] = df['er_strat'].apply(
    lambda x: [part.strip() for part in str(x).split(",") if part.strip()]
)
er_labels = mlb.fit_transform(df['er_labels'])

# Process `adaptive` as two binary labels: adaptive (1, 0) and maladaptive (0, 1).
# Any other value (including missing ones) is encoded as (0, 0).
adaptive_encoding = {"adaptive": [1, 0], "maladaptive": [0, 1]}
df['adaptive_binary'] = df['adaptive'].apply(
    lambda x: list(adaptive_encoding.get(x, [0, 0]))
)
adaptive_labels = np.array(df['adaptive_binary'].tolist())

# Combine all labels: 6 `er_strat` + 2 `adaptive/maladaptive`
combined_labels = np.hstack([er_labels, adaptive_labels])

# Train-Test Split (70/30, fixed seed). The split happens BEFORE augmentation
# so augmented copies of a text can never end up in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df['text_cleaned'], combined_labels, test_size=0.3, random_state=42
)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Data Augmentation for Weak Classes
def augment_text(text, augmenter, num_augments=2):
    """
    Augment a given text using the specified augmenter.

    ``augmenter.augment`` may return either a single string or a list of
    strings depending on the augmenter implementation/version. Both shapes are
    normalized here so the result is always a flat list of strings rather than
    a list of one-element lists.

    :param text: Input text to augment.
    :param augmenter: Augmentation method (e.g., synonym replacement); must
        expose an ``augment(text)`` method.
    :param num_augments: Number of times the augmenter is invoked.
    :return: Flat list of augmented texts (may be shorter than
        ``num_augments`` if the augmenter returns nothing for a call).
    """
    augmented_texts = []
    for _ in range(num_augments):
        result = augmenter.augment(text)
        if not result:
            # Nothing produced for this call (None / empty list / empty string).
            continue
        if isinstance(result, str):
            augmented_texts.append(result)
        else:
            augmented_texts.extend(result)
    return augmented_texts

# Identify weak classes
weak_classes = ["Situation Modification", "Response Modulation", "Situation Selection"]

# Resolve each weak class name to its label column once, instead of calling
# `mlb.classes_.tolist().index(...)` for every sample and every class.
class_to_column = {name: col for col, name in enumerate(mlb.classes_)}
missing_weak_classes = [cls for cls in weak_classes if cls not in class_to_column]
if missing_weak_classes:
    print(f"Warning: weak classes not found in the labels, skipped: {missing_weak_classes}")
weak_class_columns = [class_to_column[cls] for cls in weak_classes if cls in class_to_column]

# Get indices of samples belonging to weak classes
# (rows with at least one weak-class column set, in ascending order)
weak_class_indices = np.flatnonzero(y_train[:, weak_class_columns].any(axis=1)).tolist()

# Augment samples for weak classes
augmented_texts = []
augmented_labels = []

# Synonym Replacement Augmenter
synonym_augmenter = naw.SynonymAug(aug_src='wordnet')

# Positional view of the training texts; works whether X_train is still the
# pandas Series from the split or already a plain list.
train_texts = list(X_train)

for idx in weak_class_indices:
    text = train_texts[idx]
    label = y_train[idx]

    # Generate augmented texts
    new_texts = augment_text(text, synonym_augmenter, num_augments=5)

    # Add augmented texts and their corresponding labels
    # (one copy of the label per text actually produced)
    augmented_texts.extend(new_texts)
    augmented_labels.extend([label] * len(new_texts))

# Add augmented data to the training set
# NOTE: this rebinds X_train / y_train, so re-running this cell without
# re-running the split would augment the already-augmented data again.
X_train = train_texts + augmented_texts
if augmented_labels:
    # np.vstack cannot stack an empty list onto a 2-D array, so only stack
    # when augmentation actually produced something.
    y_train = np.vstack([y_train, np.asarray(augmented_labels)])

# Dataset and DataLoader
max_length = 128
batch_size = 16

train_dataset = ERDataset(X_train, y_train, tokenizer, max_length)
test_dataset = ERDataset(X_test.tolist(), y_test, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model
num_labels = combined_labels.shape[1]  # Total number of labels (6 `er_strat` + 2 `adaptive`)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=num_labels, problem_type="multi_label_classification"
)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Loss Function with Class Weights
# pos_weight per label = (#negatives / #positives), computed on the augmented
# training labels so it reflects what the model actually sees.
class_counts = np.sum(y_train, axis=0)
epsilon = 1e-5  # To avoid division by zero
pos_weights = torch.tensor(
    (len(y_train) - class_counts) / (class_counts + epsilon), dtype=torch.float
).to(device)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weights)

# Optimizer and Scheduler
epochs = 10
# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favour
# of it. eps=1e-6 keeps the epsilon the transformers implementation defaulted to.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, eps=1e-6)
num_training_steps = len(train_loader) * epochs
num_warmup_steps = int(0.1 * num_training_steps)  # 10% linear warm-up
scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)

# Training Function
def train_model(model, data_loader, optimizer, loss_fn, scheduler, device):
    """
    Train ``model`` for one pass over ``data_loader``.

    For every batch: gradients are reset, the model is run on ``input_ids``
    and ``attention_mask``, the loss is computed from the logits and the
    batch labels, gradients are clipped to a norm of 1.0, and then both the
    optimizer and the scheduler are stepped.

    :param model: Model returning an object with a ``logits`` attribute.
    :param data_loader: Iterable of batches with ``input_ids``,
        ``attention_mask`` and ``labels`` entries.
    :param optimizer: Optimizer updating the model parameters.
    :param loss_fn: Callable ``loss_fn(logits, labels)``.
    :param scheduler: Learning-rate scheduler, stepped once per batch.
    :param device: Device the batch tensors are moved to.
    :return: Sum of the batch losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []

    for batch in tqdm(data_loader):
        optimizer.zero_grad()

        targets = batch['labels'].to(device)
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits

        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()
        # Clip before the optimizer step so the update uses the clipped gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(data_loader)

# Evaluation Function
def evaluate_model(model, data_loader, device, threshold=0.5):
    """
    Run ``model`` over ``data_loader`` and binarize its sigmoid outputs.

    :param model: Model returning an object with a ``logits`` attribute.
    :param data_loader: Iterable of batches with ``input_ids``,
        ``attention_mask`` and ``labels`` entries.
    :param device: Device the input tensors are moved to.
    :param threshold: A probability strictly greater than this counts as 1.
    :return: Tuple ``(binary_predictions, true_labels)`` of NumPy arrays.
    """
    model.eval()
    probability_rows = []
    label_rows = []

    with torch.no_grad():
        for batch in data_loader:
            output = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            batch_probabilities = torch.sigmoid(output.logits).cpu().numpy()
            batch_labels = batch['labels'].cpu().numpy()

            # In-place list concatenation appends one row per sample.
            probability_rows += list(batch_probabilities)
            label_rows += list(batch_labels)

    probabilities = np.array(probability_rows)
    binary_predictions = (probabilities > threshold).astype(int)
    return binary_predictions, np.array(label_rows)

# Training Loop
# NOTE(review): the checkpoint is selected on `test_loader`, the same data the
# final report below is computed on, so the reported scores are optimistic.
# A separate validation split would be needed for an unbiased estimate.
#
# Start below any reachable F1 so the first epoch always writes a checkpoint;
# with a start value of 0 and a strict `>`, a run whose macro F1 stays at 0
# would never save and the load below would fail or pick up a stale file.
best_f1 = -1.0
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss = train_model(model, train_loader, optimizer, loss_fn, scheduler, device)
    print(f"Train Loss: {train_loss:.4f}")

    # Evaluate on validation data
    predictions, true_labels = evaluate_model(model, test_loader, device)
    # zero_division=0 scores labels with no predicted/true positives as 0
    # explicitly instead of relying on the warning default.
    f1 = f1_score(true_labels, predictions, average="macro", zero_division=0)
    print(f"Validation F1 Score: {f1:.4f}")

    # Save best model
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), "best_model_augmented.pth")

# Final Evaluation
# map_location keeps the load working when the checkpoint was written on a
# different device; weights_only=True restricts unpickling to tensor data.
model.load_state_dict(
    torch.load("best_model_augmented.pth", map_location=device, weights_only=True)
)
predictions, true_labels = evaluate_model(model, test_loader, device)

# Classification Report
print("Final Evaluation on Test Set (After Data Augmentation)")
print("Classification Report:")
# Column order matches the label layout: binarizer classes, then the two
# adaptive/maladaptive columns.
target_names = list(mlb.classes_) + ["adaptive", "maladaptive"]
print(classification_report(true_labels, predictions, target_names=target_names, zero_division=0))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10


100%|██████████| 56/56 [00:21<00:00,  2.60it/s]


Train Loss: 0.9994
Validation F1 Score: 0.4460
Epoch 2/10


100%|██████████| 56/56 [00:21<00:00,  2.65it/s]


Train Loss: 0.8709
Validation F1 Score: 0.4989
Epoch 3/10


100%|██████████| 56/56 [00:20<00:00,  2.70it/s]


Train Loss: 0.7006
Validation F1 Score: 0.5076
Epoch 4/10


100%|██████████| 56/56 [00:21<00:00,  2.62it/s]


Train Loss: 0.5561
Validation F1 Score: 0.4941
Epoch 5/10


100%|██████████| 56/56 [00:21<00:00,  2.66it/s]


Train Loss: 0.4248
Validation F1 Score: 0.4864
Epoch 6/10


100%|██████████| 56/56 [00:20<00:00,  2.69it/s]


Train Loss: 0.3328
Validation F1 Score: 0.4860
Epoch 7/10


100%|██████████| 56/56 [00:20<00:00,  2.68it/s]


Train Loss: 0.2733
Validation F1 Score: 0.4792
Epoch 8/10


100%|██████████| 56/56 [00:20<00:00,  2.69it/s]


Train Loss: 0.2309
Validation F1 Score: 0.4739
Epoch 9/10


100%|██████████| 56/56 [00:21<00:00,  2.66it/s]


Train Loss: 0.2086
Validation F1 Score: 0.4926
Epoch 10/10


100%|██████████| 56/56 [00:20<00:00,  2.69it/s]


Train Loss: 0.1968
Validation F1 Score: 0.4831


  model.load_state_dict(torch.load("best_model_augmented.pth"))


Final Evaluation on Test Set (After Data Augmentation)
Classification Report:
                        precision    recall  f1-score   support

Attentional Deployment       0.42      0.63      0.50        51
      Cognitive Change       0.49      0.75      0.59        56
   Response Modulation       0.20      0.70      0.31        27
Situation Modification       0.27      0.46      0.34        13
   Situation Selection       0.16      0.57      0.25        14
                  none       0.68      0.79      0.73        19
              adaptive       0.74      0.87      0.80        92
           maladaptive       0.44      0.67      0.53        27

             micro avg       0.44      0.74      0.55       299
             macro avg       0.43      0.68      0.51       299
          weighted avg       0.51      0.74      0.59       299
           samples avg       0.44      0.74      0.54       299



In [9]:
# Report how many augmented samples (and matching labels) were generated
augmented_text_count = len(augmented_texts)
augmented_label_count = len(augmented_labels)
print(f"Number of augmented texts: {augmented_text_count}")
print(f"Number of augmented labels: {augmented_label_count}")

# Show up to the first five augmented texts next to their labels
separator = "-" * 50
for number, sample_text in enumerate(augmented_texts[:5], start=1):
    sample_label = augmented_labels[number - 1]
    print(f"Augmented Text {number}: {sample_text}")
    print(f"Corresponding Label: {sample_label}")
    print(separator)

Number of augmented texts: 560
Number of augmented labels: 560
Augmented Text 1: ['Right now i feel so low. Apr 4. I donâ € ™ t quite now why. All of it i guess. Iâ € ™ m not excited just about much of anything, even the thought of home decor. Oregon the possibility of a new patio and hosting, with a few unintentional moments of feeling left out from [Named Entity] and [Named Entity] and [Named Entity] and [Named Entity ]. Single want to include everyone but that takes vim. I donâ € ™ t have much. Thankfully God has been sending me Orr meetings. . Iâ € ™ m working with [Named Entity] of them this week. It gets me out of my head. Iâ € ™ m trying to get excited about planning VBS with [Named Entity ], about virtual bible study with [Named Entity] and [Named Entity ], about spelling. About exercise. About anything. I want uncluttered time with God to communicate and hear from him and he told me that that isnâ € ™ t always possible, that life IS cluttered, and Iâ € ™ m to find him and talk

In [10]:
# Compare the training-set size without and with the augmented samples.
# X_train already contains the augmented texts at this point, so the original
# size is recovered by subtracting them.
size_after_augmentation = len(X_train)
size_before_augmentation = size_after_augmentation - len(augmented_texts)
print(f"Training set size before augmentation: {size_before_augmentation}")
print(f"Training set size after augmentation: {size_after_augmentation}")

Training set size before augmentation: 336
Training set size after augmentation: 896


In [13]:
# The test split is never augmented (only training samples are), so the label
# must not claim an "after augmentation" size.
print(f"Test set size (not augmented): {len(X_test)}")

Training set size after augmentation: 144


In [11]:
# Count samples for weak classes before and after augmentation
weak_classes = ["Situation Modification", "Response Modulation", "Situation Selection"]

# Look up each weak class's label column once instead of inside the loops.
all_class_names = mlb.classes_.tolist()
weak_class_columns = {cls: all_class_names.index(cls) for cls in weak_classes}


def count_weak_class_samples(label_rows):
    """
    Count, per weak class, how many rows have that class set to 1.

    :param label_rows: Iterable of label vectors.
    :return: Dict mapping each weak class name to its count.
    """
    return {
        cls: sum(1 for row in label_rows if row[column] == 1)
        for cls, column in weak_class_columns.items()
    }


# Before augmentation: the augmented rows were appended at the end of y_train,
# so the original rows are everything before them.
original_row_count = len(y_train) - len(augmented_labels)
weak_class_counts_before = count_weak_class_samples(y_train[:original_row_count])

# After augmentation
weak_class_counts_after = count_weak_class_samples(y_train)

print("Weak class counts before augmentation:", weak_class_counts_before)
print("Weak class counts after augmentation:", weak_class_counts_after)

Weak class counts before augmentation: {'Situation Modification': 22, 'Response Modulation': 68, 'Situation Selection': 29}
Weak class counts after augmentation: {'Situation Modification': 132, 'Response Modulation': 408, 'Situation Selection': 174}
