## Scope3 Encoder Model

In [None]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
from tqdm import tqdm
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

In [None]:
# Variables
# Hugging Face checkpoint name used for both the tokenizer and the encoder.
model_name = 'intfloat/e5-large-v2'
# Label string -> class index for the two annotation columns.
vagueness_class_mapping = {"specific":1, "ambiguous":2, "generic":3, "notESG":0 }
scope3_class_mapping = {"yes":1, "no":0}
train_file_path = 'train_data.csv'
test_file_path = 'test_data.csv'
# Prefer the first GPU, but fall back to CPU so the notebook still runs on a
# machine without CUDA instead of failing when tensors are moved to `device`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 8
lr = 1e-5
epochs = 30
# Fraction of the training rows held out as the validation split.
train_test_split = 0.1

In [None]:
def tokenize_and_format(sentences, max_sentence_length=200):
    """Tokenize every sentence and stack the results into two tensors.

    Each sentence is encoded on its own with special tokens added, then
    truncated and padded to ``max_sentence_length``. The tokenizer is loaded
    from the module-level ``model_name`` on every call.

    Returns:
        ``(input_ids, attention_masks)``: the per-sentence ``'pt'`` tensors
        concatenated along dimension 0.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True, use_fast=False)

    # Options shared by every encode call.
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_sentence_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encodings = [tokenizer.encode_plus(sentence, **encode_options) for sentence in sentences]

    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return input_ids, attention_masks

In [None]:
class MultiTaskDataset(Dataset):
    """Dataset yielding ``(input_ids, attention_mask, scope3_label)`` per item.

    Reads the ``text``, ``scope3`` and ``vague`` columns of ``df`` and turns
    the two label columns into integers through the supplied mappings. The
    vagueness labels are stored on the instance but are neither augmented nor
    returned by ``__getitem__``.

    With ``augment=True`` the texts are extended with a back-translated
    (en -> de -> en) copy and a WordNet-synonym copy, and the scope3 labels
    are repeated three times to stay aligned with the texts.
    """

    def __init__(self, df, scope3_class_mapping, vagueness_class_mapping, augment=False):
        self.df = df
        self.texts = df['text'].tolist()
        self.scope3 = [scope3_class_mapping[label] for label in df['scope3']]
        self.vagueness = [vagueness_class_mapping[label] for label in df['vague']]

        if augment:
            back_translator = naw.BackTranslationAug(
                from_model_name='facebook/wmt19-en-de',
                to_model_name='facebook/wmt19-de-en',
                device='cuda',
                batch_size=16,
                verbose=True,
            )
            synonym_swapper = naw.SynonymAug(aug_src='wordnet')

            back_translated = back_translator.augment(self.texts)
            synonym_swapped = synonym_swapper.augment(self.texts)

            # Original texts first, then each augmented copy; labels follow the same layout.
            self.texts = np.concatenate([self.texts, back_translated, synonym_swapped])
            self.scope3 = np.concatenate([self.scope3] * 3)

        self.input_ids, self.attention_masks = tokenize_and_format(self.texts)
        self.scope3 = torch.tensor(self.scope3)

    def __len__(self):
        """Return the number of texts, including any augmented copies."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the token ids, attention mask and scope3 label at ``idx``."""
        ids_row = self.input_ids[idx]
        mask_row = self.attention_masks[idx]
        return ids_row, mask_row, self.scope3[idx]

In [None]:
# Load both CSV files, then carve a validation split off the shuffled training rows.
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

train_len, test_len = len(train_df), len(test_df)

# `train_test_split` is the fraction of training rows used for validation.
num_val = int(train_test_split * train_len)

# Fixed seed so the shuffle (and therefore the split) is reproducible.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
# The first `num_val` shuffled rows become validation; the rest stay as training.
validation_df, train_df = train_df.iloc[:num_val], train_df.iloc[num_val:]


In [None]:
# Row counts of the train / validation / test frames, shown as the cell output.
tuple(len(frame) for frame in (train_df, validation_df, test_df))

In [None]:
# Print the scope3 label distribution of every split.
for split_title, split_df in (
    ("Train set statistics", train_df),
    ("\nValidation set statistics", validation_df),
    ("\nTest set statistics", test_df),
):
    print(split_title)
    print(split_df['scope3'].value_counts())

In [None]:
# Tokenized datasets for each split (augmentation is off for all of them).
train_dataset = MultiTaskDataset(
    df=train_df,
    scope3_class_mapping=scope3_class_mapping,
    vagueness_class_mapping=vagueness_class_mapping,
    augment=False,
)
valid_dataset = MultiTaskDataset(
    df=validation_df,
    scope3_class_mapping=scope3_class_mapping,
    vagueness_class_mapping=vagueness_class_mapping,
)
test_dataset = MultiTaskDataset(
    df=test_df,
    scope3_class_mapping=scope3_class_mapping,
    vagueness_class_mapping=vagueness_class_mapping,
)

# Only the training loader shuffles; evaluation loaders keep row order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Number of items in the training dataset, shown as the cell output.
len(train_dataset)

In [None]:
# Re-assigns the same checkpoint name as the variables cell
# (presumably so the model cells can be run without re-running that cell).
model_name = 'intfloat/e5-large-v2'

In [None]:
class BERTMultiTask(torch.nn.Module):
    """Pretrained encoder followed by a small MLP and a 2-way scope3 head.

    The first token's final hidden state goes through two ReLU-activated
    linear layers (both 256 units wide) and a final linear layer that
    produces two logits. Only the scope3 head is built; the vagueness head
    is not part of this model.
    """

    def __init__(self, encoder_model='bert-base-uncased'):
        super().__init__()

        self.encoder = AutoModel.from_pretrained(encoder_model)
        encoder_width = self.encoder.config.hidden_size
        self.linear1 = torch.nn.Linear(encoder_width, 256)
        self.linear2 = torch.nn.Linear(256, 256)
        self.scope3_out = torch.nn.Linear(256, 2)
        self.relu = torch.nn.ReLU()

    def forward(self, input_ids, mask):
        """Return the scope3 logits for a batch of token ids and attention masks."""
        encoded = self.encoder(input_ids, attention_mask=mask)

        # Use the hidden state at sequence position 0 as the sentence representation.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        hidden = self.relu(self.linear1(first_token_state))
        hidden = self.relu(self.linear2(hidden))
        return self.scope3_out(hidden)

In [None]:
# Build the classifier on top of the pretrained checkpoint named by `model_name`.
model = BERTMultiTask(encoder_model=model_name)

In [None]:
# Move the model to the configured device; as the cell's last expression,
# the returned module is also displayed.
model.to(device)

In [None]:
# Total element count over all model parameters, reported in millions.
model_param_size = sum(weights.nelement() for weights in model.parameters())
print(f"Model parameters: {model_param_size/1e6}M")

In [None]:
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = {'bias', 'LayerNorm.weight'}

# The pretrained encoder is tuned gently; the freshly initialised layers learn faster.
base_learning_rate = 1e-5
new_learning_rate = 1e-4

encoder_params = list(model.encoder.named_parameters())
new_layer_params = [
    named_param
    for layer in (model.scope3_out, model.linear1, model.linear2)
    for named_param in layer.named_parameters()
]

# Four groups, in order: encoder (decay / no decay), then new layers (decay / no decay).
optimizer_grouped_parameters = []
for _named_params, _group_lr in (
    (encoder_params, base_learning_rate),
    (new_layer_params, new_learning_rate),
):
    _with_decay, _without_decay = [], []
    for _name, _param in _named_params:
        if any(nd in _name for nd in no_decay):
            _without_decay.append(_param)
        else:
            _with_decay.append(_param)
    optimizer_grouped_parameters.append({'params': _with_decay, 'weight_decay': 0.01, 'lr': _group_lr})
    optimizer_grouped_parameters.append({'params': _without_decay, 'weight_decay': 0.0, 'lr': _group_lr})

optimizer = AdamW(optimizer_grouped_parameters, lr=base_learning_rate, eps=1e-8)
# Multiply every learning rate by 0.33 once the monitored value has stopped improving (patience=2).
scheduler = ReduceLROnPlateau(optimizer, factor=0.33, patience=2, verbose=True)
ce_loss = torch.nn.CrossEntropyLoss()


In [None]:
# for param in model.encoder.parameters():
#     param.requires_grad = False

In [None]:
def calculate_metrics(y_pred, y_true, class_mapping):
    """Format one-vs-rest accuracy, precision and recall for every class.

    Args:
        y_pred: Per-class scores; the predicted class is the argmax over the
            last dimension.
        y_true: Class indices, compared element-wise with the predictions.
        class_mapping: Dict of class name -> class index. One segment is
            produced per class index, in the mapping's iteration order.

    Returns:
        A single string made of the per-class segments
        ``Accuracy_<name>: … | Precision_<name>: … | Recall_<name>: …|``
        joined by spaces, each value formatted to four decimals.

    Any ratio whose denominator is zero is reported as 0. This includes
    accuracy when there are no samples at all, which previously raised
    ``ZeroDivisionError``.
    """
    y_pred_class = torch.argmax(y_pred, dim=-1)
    reverse_class_mapping = {v: k for k, v in class_mapping.items()}

    metrics = []
    for class_id, class_name in reverse_class_mapping.items():
        # One-vs-rest masks for this class, computed once and reused below.
        predicted = y_pred_class == class_id
        actual = y_true == class_id

        true_positives = torch.sum(predicted & actual).item()
        true_negatives = torch.sum(~predicted & ~actual).item()
        false_positives = torch.sum(predicted & ~actual).item()
        false_negatives = torch.sum(~predicted & actual).item()

        total = true_positives + true_negatives + false_positives + false_negatives
        predicted_positive = true_positives + false_positives
        actual_positive = true_positives + false_negatives

        accuracy = (true_positives + true_negatives) / total if total != 0 else 0
        precision = true_positives / predicted_positive if predicted_positive != 0 else 0
        recall = true_positives / actual_positive if actual_positive != 0 else 0

        metrics.append(f'Accuracy_{class_name}: {accuracy:.4f} | Precision_{class_name}: {precision:.4f} | Recall_{class_name}: {recall:.4f}|')

    return " ".join(metrics)

In [None]:
# Start from +inf rather than a magic number, so the first finite validation
# loss always becomes the best one regardless of the loss scale.
best_val_loss = float('inf')
# -1 means no epoch has been recorded as best yet.
best_val_epoch = -1

In [None]:
# Fine-tune for `epochs` passes over the training data. After every pass the
# model is evaluated on the validation set, the plateau scheduler is stepped
# on the mean validation loss, and the weights are checkpointed whenever that
# loss improves. Only the scope3 task is trained.
for epoch_i in range(epochs):

    print(f'======== Epoch {epoch_i + 1} / {epochs} ========')

    total_train_loss = 0

    model.train()

    # Labels and logits seen so far in this epoch, used for the running metrics readout.
    running_scope3_tensor = torch.tensor([]).to(device)
    running_scope3_pred_tensor = torch.tensor([]).to(device)

    num_batches = len(train_dataloader)

    for i, data in enumerate(train_dataloader):

        # Each batch is (input_ids, attention_mask, scope3_label).
        input_id_tensors = data[0].to(device)
        input_mask_tensors = data[1].to(device)
        scope3_tensors = data[2].to(device)

        model.zero_grad()

        outputs = model(input_id_tensors, mask=input_mask_tensors)

        scope3_loss = ce_loss(outputs, scope3_tensors)
        final_loss = scope3_loss

        total_train_loss += final_loss.item()

        final_loss.backward()
        optimizer.step()

        running_scope3_tensor = torch.cat([running_scope3_tensor, scope3_tensors])
        # Detach before accumulating: concatenating the raw logits would keep
        # every batch's autograd graph reachable for the whole epoch.
        running_scope3_pred_tensor = torch.cat([running_scope3_pred_tensor, outputs.detach()])

        # Metrics over everything seen so far in this epoch.
        scope3_metrics = calculate_metrics(running_scope3_pred_tensor, running_scope3_tensor, scope3_class_mapping)

        average_train_loss = total_train_loss / (i+1)

        # '\r' with end='' rewrites the same console line as a live progress readout.
        print(f'\rBatch [{i+1}/{num_batches}], Average Train Loss: {average_train_loss:.4f}', scope3_metrics, end='')

    print("")
    model.eval()
    with torch.no_grad():

        total_valid_loss = 0
        running_scope3_tensor = torch.tensor([]).to(device)
        running_scope3_pred_tensor = torch.tensor([]).to(device)

        for data in valid_dataloader:

            input_id_tensors = data[0].to(device)
            input_mask_tensors = data[1].to(device)
            scope3_tensors = data[2].to(device)

            outputs = model(input_id_tensors, mask=input_mask_tensors)

            scope3_loss = ce_loss(outputs, scope3_tensors)
            final_loss = scope3_loss

            total_valid_loss += final_loss.item()

            running_scope3_tensor = torch.cat([running_scope3_tensor, scope3_tensors])
            running_scope3_pred_tensor = torch.cat([running_scope3_pred_tensor, outputs])

        average_valid_loss = total_valid_loss / len(valid_dataloader)
        # The scheduler lowers the learning rates when this value stops improving.
        scheduler.step(average_valid_loss)

        # Keep only the weights with the lowest validation loss so far.
        if average_valid_loss < best_val_loss:
            best_val_loss = average_valid_loss
            best_val_epoch = epoch_i
            torch.save(model.state_dict(), 'best_model_scope3.pth')

        scope3_metrics = calculate_metrics(running_scope3_pred_tensor, running_scope3_tensor, scope3_class_mapping)

        print(f'Avg Validation Loss: {average_valid_loss:.4f} | Best Validation Loss: {best_val_loss:.4f} | Best Epoch: {best_val_epoch}',scope3_metrics)

print("\nTraining complete!")

In [None]:
class InferenceDataset(Dataset):
    """Label-free dataset that tokenizes ``df['text']``.

    Items are ``(input_ids, attention_mask)`` pairs; the length is the number
    of rows in ``df``.
    """

    def __init__(self, df):
        self.df = df
        self.texts = df['text'].values

        token_ids, masks = tokenize_and_format(self.texts)
        self.input_ids = token_ids
        self.attention_masks = masks

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the token ids and attention mask at ``idx``."""
        ids_row = self.input_ids[idx]
        mask_row = self.attention_masks[idx]
        return ids_row, mask_row

In [None]:
# Reload the best checkpoint and evaluate it on the test set.
model = BERTMultiTask(encoder_model=model_name)
# map_location places the saved tensors on the configured device regardless of
# which device they were saved from.
model.load_state_dict(torch.load('best_model_scope3.pth', map_location=device))
model.to(device)

model.eval()

# The labelled dataset class is used (not a label-free one) because the test
# labels are needed for the metrics below.
inference_dataset = MultiTaskDataset(test_df, scope3_class_mapping, vagueness_class_mapping, augment=False)
inference_dataloader = DataLoader(inference_dataset, batch_size=1, shuffle=False)

# Predicted label names, in test-set order.
# NOTE: the vagueness names below are defined but not filled in; only scope3
# predictions are produced by this cell.
vagueness_inference = []
scope3_inference = []

reverse_vagueness_class_mapping = {v:k for k,v in vagueness_class_mapping.items()}
reverse_scope3_class_mapping = {v:k for k,v in scope3_class_mapping.items()}
with torch.no_grad():
    # Labels and logits for the whole test set, used for the final metrics.
    running_scope3_tensor = torch.tensor([]).to(device)
    running_scope3_pred_tensor = torch.tensor([]).to(device)
    for data in inference_dataloader:

        input_id_tensors = data[0].to(device)
        input_mask_tensors = data[1].to(device)
        scope3_tensors = data[2].to(device)

        outputs = model(input_id_tensors, mask=input_mask_tensors)

        running_scope3_tensor = torch.cat([running_scope3_tensor, scope3_tensors])
        running_scope3_pred_tensor = torch.cat([running_scope3_pred_tensor, outputs])

        # Convert the predicted class indices back to label names.
        scope3_pred = torch.argmax(outputs, dim=-1).to('cpu').tolist()
        scope3_inference.extend([reverse_scope3_class_mapping[x] for x in scope3_pred])

    scope3_metrics = calculate_metrics(running_scope3_pred_tensor, running_scope3_tensor, scope3_class_mapping)

# Attach the predictions to the test frame (adds a column to `test_df` in place).
test_df['scope3_pred'] = scope3_inference
print(scope3_metrics)