In [None]:
# !pip install optuna

In [None]:
import os
import random
import numpy as np
import pandas as pd
from lxml import etree
import xml.etree.ElementTree as ET
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
import re

import torch
import torch.nn as nn
from transformers import BertModel, DistilBertModel, RobertaModel
from torch.autograd import Function
import torch.optim as optim
import csv

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, RobertaTokenizer
import argparse
import math
import matplotlib.pyplot as plt

import optuna
from optuna.trial import TrialState
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score

# **Seed**

In [None]:
# Seed handed to set_seed() at the start of every Optuna trial
# (Python `random` and torch RNG state used during training).
train_seed = 168
# Seed used as `random_state` for the train/test splits and the K-fold split.
seed = 168

# **Utility Functions**

In [None]:
class EarlyStopper:
    """Signal when the validation loss has stopped improving.

    Args:
        patience: number of non-improving checks (loss above the best loss
            by more than ``min_delta``) tolerated before stopping.
        min_delta: margin above the best loss that is still tolerated
            without counting against ``patience``.
    """

    def __init__(self, patience=2, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        """Record one validation loss; return True once patience is exhausted."""
        best_so_far = self.min_validation_loss

        # New best: remember it and forget previous bad checks.
        if validation_loss < best_so_far:
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        # Clearly worse than the best: one more strike.
        if validation_loss > best_so_far + self.min_delta:
            self.counter += 1
            return self.counter >= self.patience

        # Within the tolerated margin: neither a strike nor a reset.
        return False

class InputFeatures:
    """Plain record holding one encoded example.

    Attributes are stored exactly as given: the token ids, the matching
    attention mask, and the example's label id.
    """

    def __init__(self, input_ids, input_mask, label_id):
        self.input_ids, self.input_mask, self.label_id = input_ids, input_mask, label_id

def CSV2Array(path):
    """Load a latin-encoded CSV and return its two columns as Python lists.

    Args:
        path: location of a CSV file that has `reviews` and `labels` columns.

    Returns:
        A ``(reviews, labels)`` tuple of lists, in file order.
    """
    frame = pd.read_csv(path, encoding='latin')
    review_list = frame.reviews.values.tolist()
    label_list = frame.labels.values.tolist()
    return review_list, label_list

def make_cuda(tensor):
    """Return `tensor` moved to the GPU when CUDA is available, else unchanged."""
    return tensor.cuda() if torch.cuda.is_available() else tensor

def init_model(net, restore=None):
    """Place `net` on the GPU when one is available and return it.

    Args:
        net: the module to prepare.
        restore: accepted for call compatibility; this function does not use it.

    Returns:
        The same `net` object.
    """
    if not torch.cuda.is_available():
        return net

    # Turn on the cuDNN benchmark mode before moving the parameters to the GPU.
    cudnn.benchmark = True
    net.cuda()
    return net

def text2features(reviews, labels, max_seq_length, tokenizer,
                  cls_token='[CLS]', sep_token='[SEP]',
                  pad_token=0):
    """Encode (review, label) pairs into fixed-length `InputFeatures`.

    Each review is tokenized, truncated so that the two special tokens still
    fit, wrapped in `cls_token` / `sep_token`, converted to ids and padded on
    the right with `pad_token` up to `max_seq_length`. The attention mask is
    1 for real tokens and 0 for padding.

    Args:
        reviews: iterable of texts; paired with `labels` via zip().
        labels: iterable of label ids.
        max_seq_length: final length of every id / mask list.
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
        cls_token, sep_token: special tokens placed around the review.
        pad_token: id used for padding positions.

    Returns:
        A list with one `InputFeatures` per (review, label) pair.
    """
    body_budget = max_seq_length - 2  # room left once both special tokens are added
    features = []

    for review, label in zip(reviews, labels):
        wordpieces = tokenizer.tokenize(review)
        if len(wordpieces) > body_budget:
            wordpieces = wordpieces[:body_budget]

        token_ids = tokenizer.convert_tokens_to_ids([cls_token] + wordpieces + [sep_token])
        n_real = len(token_ids)
        n_pad = max_seq_length - n_real

        input_ids = token_ids + [pad_token] * n_pad
        input_mask = [1] * n_real + [0] * n_pad

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length

        features.append(
            InputFeatures(input_ids=input_ids, input_mask=input_mask, label_id=label)
        )

    return features

def get_data_loader(features, batch_size, for_training = True):
    """Wrap encoded features in a `DataLoader`.

    Args:
        features: sequence of objects exposing `input_ids`, `input_mask`
            and `label_id`.
        batch_size: number of examples per batch.
        for_training: shuffle the examples when truthy; keep the given
            order otherwise.

    Returns:
        A `DataLoader` yielding `(input_ids, input_mask, label_ids)` batches
        of `torch.long` tensors.
    """
    columns = (
        [item.input_ids for item in features],
        [item.input_mask for item in features],
        [item.label_id for item in features],
    )
    tensors = [torch.tensor(column, dtype=torch.long) for column in columns]
    return DataLoader(
        TensorDataset(*tensors),
        batch_size=batch_size,
        shuffle=bool(for_training),
    )

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global RNG, torch's CPU RNG and, when
    CUDA devices are present, every GPU RNG.

    Args:
        seed: integer seed.
    """
    random.seed(seed)
    # NumPy only accepts seeds in [0, 2**32); wrap so any int accepted by the
    # other generators keeps working here as well.
    np.random.seed(seed % (2 ** 32))
    torch.manual_seed(seed)
    if torch.cuda.device_count() > 0:
        torch.cuda.manual_seed_all(seed)

# **Model Definition**

In [None]:
# Gradient Reversal Layer
class GradientReversalLayer(Function):
    """Autograd function that is the identity going forward and flips the gradient going back.

    forward(x, alpha) returns a view of `x`; backward scales the incoming
    gradient by `-alpha`. No gradient is produced for `alpha` itself.
    """

    @staticmethod
    def forward(ctx, x, alpha):
        # Keep the scaling factor around for the backward pass.
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        reversed_grad = -ctx.alpha * grad_output
        return reversed_grad, None

# BERT encoder as Feature Extractor
class FeatureExtractor(nn.Module):
    """Pretrained `bert-base-uncased` encoder used as the shared feature extractor."""

    def __init__(self):
        super().__init__()
        self.encoder = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, x, mask=None):
        """Encode token ids `x` (with optional attention `mask`).

        Returns element 1 of the encoder output; for Hugging Face's BertModel
        that is the pooled sequence representation (confirm if the encoder
        class is ever swapped).
        """
        return self.encoder(x, attention_mask=mask)[1]

# Label Classifier for Sentiment Analysis
class LabelClassifier(nn.Module):
    """Sentiment head: dropout followed by one linear layer.

    Args:
        hidden_size: size of the incoming feature vector.
        num_labels: number of output logits.
        dropout: dropout probability applied to the features.
    """

    def __init__(self, hidden_size=768, num_labels=3, dropout=0.5):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.apply(self.initialize_weights)

    def forward(self, x):
        """Return the class logits for features `x`."""
        return self.classifier(self.dropout(x))

    def initialize_weights(self, module):
        """Initialise linear layers: N(0, 0.02) weights and zero bias."""
        if not isinstance(module, nn.Linear):
            return
        module.weight.data.normal_(mean=0.0, std=0.02)
        if module.bias is not None:
            module.bias.data.zero_()

# Domain Classifier as Discriminator
class DomainClassifier(nn.Module):
    """Domain discriminator: dropout, gradient reversal, then a 2-way linear layer.

    Args:
        hidden_size: size of the incoming feature vector.
        dropout: dropout probability applied to the features.
    """

    def __init__(self, hidden_size=768, dropout=0.5):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.classifier = nn.Linear(hidden_size, 2)
        self.apply(self.initialize_weights)

    def forward(self, x, alpha):
        """Return the two domain logits; `alpha` scales the reversed gradient."""
        dropped = self.dropout(x)
        reversed_feat = GradientReversalLayer.apply(dropped, alpha)
        return self.classifier(reversed_feat)

    def initialize_weights(self, module):
        """Initialise linear layers: N(0, 0.02) weights and zero bias."""
        if not isinstance(module, nn.Linear):
            return
        module.weight.data.normal_(mean=0.0, std=0.02)
        if module.bias is not None:
            module.bias.data.zero_()

# **Evaluation loop**

In [None]:
def evaluate(encoder, classifier, data_loader, get_confusion_metric = False):
    """Score `encoder` + `classifier` on every batch of `data_loader`.

    Both modules are switched to eval mode and the forward pass runs without
    gradient tracking.

    Args:
        encoder: feature extractor called as `encoder(reviews, mask)`.
        classifier: head mapping the features to class logits.
        data_loader: yields `(reviews, mask, labels)` batches.
        get_confusion_metric: also return the confusion matrix and the
            weighted F1 score when True.

    Returns:
        `(avg_loss, avg_acc)` or, with `get_confusion_metric`,
        `(avg_loss, avg_acc, conf_matrix, f1)`. `avg_loss` is the mean of
        the per-batch cross-entropy values; `avg_acc` is the number of
        correct predictions divided by the dataset size.
    """
    encoder.eval()
    classifier.eval()

    criterion = nn.CrossEntropyLoss()
    loss_sum, n_correct = 0, 0
    all_preds, all_labels = [], []

    for batch in data_loader:
        reviews, mask, labels = (make_cuda(part) for part in batch)

        with torch.no_grad():
            preds = classifier(encoder(reviews, mask))

        loss_sum += criterion(preds, labels).item()

        # Index of the highest logit is the predicted class.
        _, pred_cls = preds.data.max(1)
        all_preds.extend(pred_cls.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        n_correct += pred_cls.eq(labels.data).cpu().sum().item()

    avg_loss = loss_sum / len(data_loader)
    avg_acc = n_correct / len(data_loader.dataset)

    if not get_confusion_metric:
        return avg_loss, avg_acc

    conf_matrix = confusion_matrix(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    return avg_loss, avg_acc, conf_matrix, f1

# **Optuna Hyperparameter Tuning**

In [None]:
def _train_and_validate_fold(src_features, tgt_train_features, tgt_val_features,
                             dropout, learning_rate, alpha, weight_decay_constant,
                             batch_size=128, num_epochs=10):
    """Train one domain-adversarial model and validate it on a held-out target fold.

    Args:
        src_features: encoded source-domain training examples.
        tgt_train_features: encoded target-domain training examples of this fold.
        tgt_val_features: encoded target-domain validation examples of this fold.
        dropout: dropout probability for both classifier heads.
        learning_rate: AdamW learning rate.
        alpha: gradient-reversal scaling factor passed to the domain classifier.
        weight_decay_constant: AdamW weight decay.
        batch_size: source (and validation) batch size.
        num_epochs: maximum number of epochs; early stopping may end sooner.

    Returns:
        The highest target validation accuracy seen over the epochs that ran.
    """
    # Scale the target batch size by the dataset-size ratio so both loaders
    # produce a similar number of batches; zip() stops at the shorter one.
    src_size = len(src_features)
    tgt_size = len(tgt_train_features)
    tgt_data_loader_batch_size = math.ceil((tgt_size / src_size) * batch_size)

    src_data_loader = get_data_loader(src_features, batch_size, for_training=True)
    tgt_data_loader = get_data_loader(tgt_train_features, tgt_data_loader_batch_size, for_training=True)
    tgt_data_loader_val = get_data_loader(tgt_val_features, batch_size, for_training=False)

    # Fresh models for every fold
    encoder = FeatureExtractor()
    cls_classifier = LabelClassifier(dropout=dropout)
    dom_classifier = DomainClassifier(dropout=dropout)
    encoder = init_model(encoder)
    cls_classifier = init_model(cls_classifier)
    dom_classifier = init_model(dom_classifier)

    # Setup optimizer over all three modules
    optimizer = optim.AdamW(
        list(encoder.parameters()) + list(cls_classifier.parameters()) + list(dom_classifier.parameters()),
        lr=learning_rate,
        weight_decay=weight_decay_constant,
    )
    CELoss = nn.CrossEntropyLoss()
    early_stopper = EarlyStopper()

    best_val_acc = 0.0
    for epoch in range(num_epochs):
        encoder.train()
        cls_classifier.train()
        dom_classifier.train()

        for (src_reviews, src_mask, src_labels), (tgt_reviews, tgt_mask, tgt_labels) in zip(src_data_loader, tgt_data_loader):
            src_reviews = make_cuda(src_reviews)
            src_mask = make_cuda(src_mask)
            src_labels = make_cuda(src_labels)
            tgt_reviews = make_cuda(tgt_reviews)
            tgt_mask = make_cuda(tgt_mask)
            tgt_labels = make_cuda(tgt_labels)

            optimizer.zero_grad()

            # Extract features; the domain head sees source and target together
            src_feat = encoder(src_reviews, src_mask)
            tgt_feat = encoder(tgt_reviews, tgt_mask)
            feat_concat = torch.cat((src_feat, tgt_feat), 0)
            src_preds = cls_classifier(src_feat)
            dom_preds = dom_classifier(feat_concat, alpha=alpha)

            # Domain labels: 1 is source, 0 is target
            label_src = make_cuda(torch.ones(src_feat.size(0)))
            label_tgt = make_cuda(torch.zeros(tgt_feat.size(0)))
            label_concat = torch.cat((label_src, label_tgt), 0).long()

            # Source sentiment + domain confusion + target sentiment
            loss_cls = CELoss(src_preds, src_labels)
            loss_dom = CELoss(dom_preds, label_concat)
            loss_tgt_cls = CELoss(cls_classifier(tgt_feat), tgt_labels)
            loss = loss_cls + loss_dom + loss_tgt_cls

            loss.backward()
            optimizer.step()

        # Evaluate current fold, current epoch on the target validation split
        eval_tgt_loss, eval_tgt_acc = evaluate(encoder, cls_classifier, tgt_data_loader_val)
        print(f"Epoch: {epoch} Evaluation Accuracy: {eval_tgt_acc}")

        # Keep the best epoch: the epoch that triggers early stopping is by
        # construction a degraded one and must not be the reported score.
        best_val_acc = max(best_val_acc, eval_tgt_acc)

        if early_stopper.early_stop(eval_tgt_loss):
            print("Early stopping triggered!")
            break

    return best_val_acc


def objective(trial):
    """Optuna objective: mean of the per-fold best target validation accuracy.

    Samples dropout, learning rate, gradient-reversal alpha and weight decay,
    then runs 4-fold cross-validation on the target training split while
    training on the source training split plus the fold's target training part.

    NOTE: each fold now contributes its maximum validation accuracy (as the
    accumulation was always described). Values stored by earlier runs used
    the last epoch's accuracy, so they are not directly comparable; use a
    fresh study name when comparing trials.

    Args:
        trial: the `optuna` trial used to sample hyperparameters.

    Returns:
        The average accuracy over the folds, or NaN when the trial raised.
        Optuna records a NaN value as a failed trial and keeps the study
        running, so a crash no longer shows up as a completed trial with
        accuracy 0.0.
    """
    try:
        set_seed(train_seed)  # Set Seed
        num_folds = 4  # Number of cross-validation folds
        max_seq_length = 75  # Token length every review is truncated/padded to

        # Sample hyperparameters
        dropout = trial.suggest_categorical('dropout', [0.1, 0.2, 0.4])
        learning_rate = trial.suggest_categorical('learning_rate', [2e-5, 3e-5, 5e-5])
        alpha = trial.suggest_categorical('alpha', [1e-6, 1e-3, 1e-2])
        weight_decay_constant = trial.suggest_categorical('weight_decay_constant', [1e-4, 1e-3, 1e-2])

        # Load BERT Tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Load reddit data as source domain; the held-out 20% is not needed
        # during tuning, so it is not tokenized here.
        src_x, src_y = CSV2Array('./preprocessed_reddit.csv')
        src_x, _, src_y, _ = train_test_split(src_x, src_y, test_size=0.2, stratify=src_y, random_state=seed)
        src_features = text2features(src_x, src_y, max_seq_length, tokenizer)  # Source Features Train + Validation

        # Load financial data as target domain; the held-out 25% stays untouched
        tgt_x, tgt_y = CSV2Array('./preprocessed_financial.csv')
        tgt_train_x, _, tgt_train_y, _ = train_test_split(tgt_x, tgt_y, test_size=0.25, stratify=tgt_y, random_state=seed)
        tgt_features = text2features(tgt_train_x, tgt_train_y, max_seq_length, tokenizer)  # Target Features Train + Validation

        # Cross-validation on target data
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

        fold_accuracies = []
        for fold, (train_index, val_index) in enumerate(kf.split(tgt_features), start=1):
            print(f"Fold: {fold}")
            tgt_train_features = [tgt_features[i] for i in train_index]  # Target Feature Training
            tgt_val_features = [tgt_features[i] for i in val_index]  # Target Feature Validation

            fold_accuracies.append(
                _train_and_validate_fold(
                    src_features, tgt_train_features, tgt_val_features,
                    dropout=dropout,
                    learning_rate=learning_rate,
                    alpha=alpha,
                    weight_decay_constant=weight_decay_constant,
                )
            )

        # Average of the per-fold best validation accuracy
        return sum(fold_accuracies) / num_folds

    except Exception:
        # Best effort: keep the study alive, but show the full traceback and
        # let Optuna mark the trial as failed instead of scoring it 0.0.
        import traceback
        traceback.print_exc()
        return float("nan")


# SQLite file that persists the study between notebook sessions
db_url = "sqlite:///optuna_study.db"

# Reuse the stored study when it already exists, then run one more trial
study = optuna.create_study(
    study_name="bert_hyperparameter_study",
    storage=db_url,
    direction="maximize",
    load_if_exists=True,
)
study.optimize(objective, n_trials=1)

[I 2024-11-11 11:11:32,595] Using an existing study with name 'bert_hyperparameter_study' instead of creating a new one.


Fold: 1
Epoch: 0 Evaluation Accuracy: 0.834983498349835
Epoch: 1 Evaluation Accuracy: 0.8173817381738174
Epoch: 2 Evaluation Accuracy: 0.845984598459846
Early stopping triggered!
Fold: 2
Epoch: 0 Evaluation Accuracy: 0.8502202643171806
Epoch: 1 Evaluation Accuracy: 0.8072687224669604
Epoch: 2 Evaluation Accuracy: 0.816079295154185
Early stopping triggered!
Fold: 3
Epoch: 0 Evaluation Accuracy: 0.8491189427312775
Epoch: 1 Evaluation Accuracy: 0.8546255506607929
Epoch: 2 Evaluation Accuracy: 0.8414096916299559
Epoch: 3 Evaluation Accuracy: 0.8314977973568282
Early stopping triggered!
Fold: 4
Epoch: 0 Evaluation Accuracy: 0.8348017621145375
Epoch: 1 Evaluation Accuracy: 0.8689427312775331
Epoch: 2 Evaluation Accuracy: 0.8458149779735683
Epoch: 3 Evaluation Accuracy: 0.8381057268722467
Early stopping triggered!


[I 2024-11-11 11:40:13,412] Trial 16 finished with value: 0.8329168544607766 and parameters: {'dropout': 0.1, 'learning_rate': 2e-05, 'alpha': 0.001, 'weight_decay_constant': 0.001}. Best is trial 11 with value: 0.8458613207135691.


In [None]:
# One summary line per trial stored in the study
for trial in study.trials:
    print(
        f"Trial {trial.number}: "
        f"State = {trial.state}, "
        f"Params = {trial.params}, "
        f"Value = {trial.value}"
    )

Trial 0: State = 1, Params = {'dropout': 0.1, 'learning_rate': 2e-05, 'alpha': 1e-06, 'weight_decay_constant': 0.01}, Value = 0.8362235452620151
Trial 1: State = 1, Params = {'dropout': 0.4, 'learning_rate': 5e-05, 'alpha': 0.01, 'weight_decay_constant': 0.0001}, Value = 0.8370510509200699
Trial 2: State = 1, Params = {'dropout': 0.1, 'learning_rate': 2e-05, 'alpha': 1e-06, 'weight_decay_constant': 0.0001}, Value = 0.8453966453914115
Trial 3: State = 1, Params = {'dropout': 0.1, 'learning_rate': 5e-05, 'alpha': 1e-06, 'weight_decay_constant': 0.01}, Value = 0.8304473619168085
Trial 4: State = 1, Params = {'dropout': 0.4, 'learning_rate': 5e-05, 'alpha': 1e-06, 'weight_decay_constant': 0.001}, Value = 0.8210806763495733
Trial 5: State = 1, Params = {'dropout': 0.1, 'learning_rate': 2e-05, 'alpha': 1e-06, 'weight_decay_constant': 0.01}, Value = 0.8362235452620151
Trial 6: State = 1, Params = {'dropout': 0.4, 'learning_rate': 3e-05, 'alpha': 1e-06, 'weight_decay_constant': 0.001}, Value =

In [None]:
# Show the hyperparameters of the best trial recorded in the study
print("Best hyperparameters:", study.best_trial.params)

Best hyperparameters: {'dropout': 0.1, 'learning_rate': 2e-05, 'alpha': 0.001, 'weight_decay_constant': 0.0001}
