## MultiTask Encoder Model

In [1]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
from tqdm import tqdm
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

In [2]:
# Variables
# Hugging Face checkpoint name; passed to both the tokenizer and the encoder.
model_name = 'intfloat/e5-large-v2'
# Label string -> integer class id for each of the two tasks.
vagueness_class_mapping = {"specific":1, "ambiguous":2, "generic":3, "notESG":0 }
scope3_class_mapping = {"yes":1, "no":0}
train_file_path = 'train_data.csv'
test_file_path = 'test_data.csv'
# NOTE(review): pinned to the first CUDA device; there is no CPU fallback.
device = torch.device("cuda:0")
batch_size = 8
# NOTE(review): `lr` is not referenced by the optimizer cell visible in this
# notebook, which defines its own learning rates — confirm it is still needed.
lr = 1e-5
epochs = 30
# Fraction of the training CSV held out as the validation split (despite the name).
train_test_split = 0.1

In [3]:
def tokenize_and_format(sentences, max_sentence_length=200):
    """Tokenize each sentence to a fixed length and stack the results.

    The tokenizer for the module-level ``model_name`` is loaded on every call.
    Each sentence is encoded on its own, padded or truncated to
    ``max_sentence_length`` tokens, and returned as PyTorch tensors.

    Args:
        sentences: Iterable of text strings.
        max_sentence_length: Target token length used for padding/truncation.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-sentence tensors
        concatenated along dimension 0, in input order.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True, use_fast=False)

    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_sentence_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for text in sentences
    ]

    stacked_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    stacked_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return stacked_ids, stacked_masks

In [4]:
class MultiTaskDataset(Dataset):
    """Eagerly tokenized dataset yielding inputs plus both task labels.

    Label strings in the ``scope3`` and ``vague`` columns are converted to
    integer ids through the supplied mappings (an unknown label raises
    ``KeyError``). With ``augment=True`` the texts are extended with an
    en->de->en back-translated copy and a WordNet synonym-substituted copy,
    and the labels are repeated three times to match.

    Each item is ``(input_ids, attention_mask, vagueness_id, scope3_id)``.
    """

    def __init__(self, df, scope3_class_mapping, vagueness_class_mapping, augment=False):
        self.df = df
        self.texts = self.df['text'].tolist()
        self.scope3 = [scope3_class_mapping[label] for label in self.df['scope3']]
        self.vagueness = [vagueness_class_mapping[label] for label in self.df['vague']]

        if augment:
            # Back-translation runs on the GPU in batches of 16 sentences.
            back_translator = naw.BackTranslationAug(
                from_model_name='facebook/wmt19-en-de',
                to_model_name='facebook/wmt19-de-en',
                device='cuda',
                batch_size=16,
                verbose=True,
            )
            synonym_swapper = naw.SynonymAug(aug_src='wordnet')

            back_translated = back_translator.augment(self.texts)
            synonym_swapped = synonym_swapper.augment(self.texts)

            # Original texts first, then each augmented copy; labels repeat in the same order.
            self.texts = np.concatenate([self.texts, back_translated, synonym_swapped])
            self.scope3 = np.tile(self.scope3, 3)
            self.vagueness = np.tile(self.vagueness, 3)

        self.input_ids, self.attention_masks = tokenize_and_format(self.texts)
        self.scope3 = torch.tensor(self.scope3)
        self.vagueness = torch.tensor(self.vagueness)

    def __len__(self):
        """Number of (possibly augmented) texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and both labels stored at ``idx``."""
        return (
            self.input_ids[idx],
            self.attention_masks[idx],
            self.vagueness[idx],
            self.scope3[idx],
        )

In [5]:
# Read both CSVs, then carve a validation split off a reproducibly shuffled
# copy of the training data (seed 42).
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

train_len, test_len = len(train_df), len(test_df)

# First `num_val` shuffled rows are validation, the remainder is training.
num_val = int(train_test_split * train_len)

train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
validation_df, train_df = train_df.iloc[:num_val], train_df.iloc[num_val:]


In [6]:
len(train_df), len(validation_df), len(test_df)

(8776, 975, 1083)

In [7]:
# Class balance of both label columns, one split at a time.
for header, split_df in (
    ("Train set statistics", train_df),
    ("\nValidation set statistics", validation_df),
    ("\nTest set statistics", test_df),
):
    print(header)
    print(split_df['scope3'].value_counts())
    print(split_df['vague'].value_counts())

Train set statistics
scope3
no     8272
yes     504
Name: count, dtype: int64
vague
notESG       4151
specific     1858
ambiguous    1590
generic      1177
Name: count, dtype: int64

Validation set statistics
scope3
no     910
yes     65
Name: count, dtype: int64
vague
notESG       432
specific     205
ambiguous    191
generic      147
Name: count, dtype: int64

Test set statistics
scope3
no     1024
yes      59
Name: count, dtype: int64
vague
notESG       490
specific     222
ambiguous    222
generic      149
Name: count, dtype: int64


In [8]:
# Only the training split is augmented; validation and test stay untouched.
train_dataset = MultiTaskDataset(
    df=train_df,
    scope3_class_mapping=scope3_class_mapping,
    vagueness_class_mapping=vagueness_class_mapping,
    augment=True,
)
valid_dataset = MultiTaskDataset(
    df=validation_df,
    scope3_class_mapping=scope3_class_mapping,
    vagueness_class_mapping=vagueness_class_mapping,
)
test_dataset = MultiTaskDataset(
    df=test_df,
    scope3_class_mapping=scope3_class_mapping,
    vagueness_class_mapping=vagueness_class_mapping,
)

# Shuffle only while training so evaluation order matches the data frames.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-en-de and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [9]:
len(train_dataset)

26328

In [9]:
class BERTMultiTask(torch.nn.Module):
    """Pretrained encoder shared by a vagueness head and a scope3 head.

    The hidden state at sequence position 0 goes through one 256-unit ReLU
    layer; a 2-way scope3 head and a 4-way vagueness head both read that
    shared representation.
    """

    def __init__(self, encoder_model='bert-base-uncased'):
        super().__init__()

        self.encoder = AutoModel.from_pretrained(encoder_model)
        encoder_width = self.encoder.config.hidden_size
        self.linear1 = torch.nn.Linear(encoder_width, 256)
        self.scope3_out = torch.nn.Linear(256, 2)
        self.vagueness_out = torch.nn.Linear(256, 4)
        self.relu = torch.nn.ReLU()

    def forward(self, input_ids, mask):
        """Return ``(vagueness_logits, scope3_logits)`` for one batch."""
        encoded = self.encoder(input_ids, attention_mask=mask)

        first_token_state = encoded.last_hidden_state[:, 0, :]
        shared = self.relu(self.linear1(first_token_state))
        scope3_logits = self.scope3_out(shared)
        vagueness_logits = self.vagueness_out(shared)
        return vagueness_logits, scope3_logits

In [10]:
model = BERTMultiTask(encoder_model=model_name)

In [12]:
model.to(device)

BERTMultiTask(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12,

In [13]:
# Total scalar parameter count of the whole model, reported in millions.
model_param_size = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {model_param_size/1e6}M")

Model parameters: 335.40583M


In [14]:
# Two learning rates (small for the pretrained encoder, larger for the fresh
# head layers), each split into a weight-decayed and a non-decayed group.
encoder_params = list(model.encoder.named_parameters())
new_layer_params = [
    named_param
    for head in (model.scope3_out, model.vagueness_out, model.linear1)
    for named_param in head.named_parameters()
]
no_decay = {'bias', 'LayerNorm.weight'}

base_learning_rate = 5e-7
new_learning_rate = 1e-5


def _is_decayed(param_name):
    """True unless the parameter name contains one of the ``no_decay`` markers."""
    return not any(nd in param_name for nd in no_decay)


# Group order: encoder decayed, encoder non-decayed, heads decayed, heads non-decayed.
optimizer_grouped_parameters = []
for named_params, group_lr in ((encoder_params, base_learning_rate), (new_layer_params, new_learning_rate)):
    optimizer_grouped_parameters.append(
        {'params': [p for n, p in named_params if _is_decayed(n)], 'weight_decay': 0.01, 'lr': group_lr}
    )
    optimizer_grouped_parameters.append(
        {'params': [p for n, p in named_params if not _is_decayed(n)], 'weight_decay': 0.0, 'lr': group_lr}
    )

# Alternative kept for reference: a single group over model.parameters().
optimizer = AdamW(optimizer_grouped_parameters, lr=base_learning_rate, eps=1e-8)
# The scheduler multiplies every group's lr by 0.33 once the monitored value
# has not improved for more than 2 epochs.
scheduler = ReduceLROnPlateau(optimizer, factor=0.33, patience=2, verbose=True)
ce_loss = torch.nn.CrossEntropyLoss()


In [15]:
def calculate_metrics(y_pred, y_true, class_mapping):
    """Format one-vs-rest accuracy, precision and recall for every class.

    Args:
        y_pred: Score tensor whose last dimension indexes the classes; the
            predicted class is the argmax over that dimension.
        y_true: Tensor of true class ids, compared element-wise with the
            predicted ids.
        class_mapping: Dict mapping class name -> integer class id.

    Returns:
        One string made of an
        ``Accuracy_<name>: … | Precision_<name>: … | Recall_<name>: …|``
        segment per class, joined by single spaces, in ``class_mapping``
        order. Any ratio with a zero denominator is reported as 0; this now
        includes accuracy on empty input, which previously raised
        ``ZeroDivisionError``.
    """
    y_pred_class = torch.argmax(y_pred, dim=-1)
    reverse_class_mapping = {v: k for k, v in class_mapping.items()}

    metrics = []
    for class_id, class_name in reverse_class_mapping.items():
        predicted = y_pred_class == class_id
        actual = y_true == class_id

        true_positives = torch.sum(predicted & actual).item()
        true_negatives = torch.sum(~predicted & ~actual).item()
        false_positives = torch.sum(predicted & ~actual).item()
        false_negatives = torch.sum(~predicted & actual).item()

        total = true_positives + true_negatives + false_positives + false_negatives
        # Guard every denominator the same way so empty batches cannot crash.
        accuracy = (true_positives + true_negatives) / total if total != 0 else 0
        precision = true_positives / (true_positives + false_positives) if true_positives + false_positives != 0 else 0
        recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives != 0 else 0

        metrics.append(f'Accuracy_{class_name}: {accuracy:.4f} | Precision_{class_name}: {precision:.4f} | Recall_{class_name}: {recall:.4f}|')

    return " ".join(metrics)

In [16]:
# Large sentinel: the training loop replaces it as soon as a validation loss below it is seen.
best_val_loss = 10000
# Epoch index of the lowest validation loss so far; -1 means none recorded yet.
best_val_epoch = -1

In [18]:
# Train for `epochs` passes. After each pass: evaluate on the validation set,
# step the plateau scheduler on the mean validation loss, and checkpoint the
# model whenever that loss improves on the best seen so far.
for epoch_i in range(0, epochs):

    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    total_train_loss = 0

    model.train()

    # Epoch-long accumulators of labels and logits, used for the running metrics.
    running_vagueness_tensor = torch.tensor([]).to(device)
    running_scope3_tensor = torch.tensor([]).to(device)
    running_vagueness_pred_tensor = torch.tensor([]).to(device)
    running_scope3_pred_tensor = torch.tensor([]).to(device)

    num_batches = len(train_dataloader)

    for i, data in enumerate(train_dataloader):

        input_id_tensors = data[0].to(device)
        input_mask_tensors = data[1].to(device)
        vagueness_tensors = data[2].to(device)
        scope3_tensors = data[3].to(device)

        model.zero_grad()

        # outputs[0] = vagueness logits, outputs[1] = scope3 logits
        outputs = model(input_id_tensors, mask=input_mask_tensors)

        vagueness_loss = ce_loss(outputs[0], vagueness_tensors)
        scope3_loss = ce_loss(outputs[1], scope3_tensors)

        # Both tasks contribute equally to the optimized loss.
        final_loss = vagueness_loss + scope3_loss

        total_train_loss += final_loss.item()

        final_loss.backward()
        optimizer.step()

        # Detach the logits before accumulating them: the metrics need values
        # only, and keeping grad-tracking tensors would tie the accumulator to
        # the autograd graph of every batch seen this epoch.
        running_vagueness_tensor = torch.cat([running_vagueness_tensor, vagueness_tensors])
        running_scope3_tensor = torch.cat([running_scope3_tensor, scope3_tensors])
        running_vagueness_pred_tensor = torch.cat([running_vagueness_pred_tensor, outputs[0].detach()])
        running_scope3_pred_tensor = torch.cat([running_scope3_pred_tensor, outputs[1].detach()])

        # NOTE(review): metrics are recomputed over the whole running tensor on
        # every batch; kept to preserve the live progress line.
        vagueness_metrics = calculate_metrics(running_vagueness_pred_tensor, running_vagueness_tensor, vagueness_class_mapping)
        scope3_metrics = calculate_metrics(running_scope3_pred_tensor, running_scope3_tensor, scope3_class_mapping)

        average_train_loss = total_train_loss / (i+1)

        print(f'\rBatch [{i+1}/{num_batches}], Average Train Loss: {average_train_loss:.4f}', vagueness_metrics, scope3_metrics, end='')

    print("")
    model.eval()
    with torch.no_grad():

        total_valid_loss = 0
        running_vagueness_tensor = torch.tensor([]).to(device)
        running_scope3_tensor = torch.tensor([]).to(device)
        running_vagueness_pred_tensor = torch.tensor([]).to(device)
        running_scope3_pred_tensor = torch.tensor([]).to(device)

        for data in valid_dataloader:

            input_id_tensors = data[0].to(device)
            input_mask_tensors = data[1].to(device)
            vagueness_tensors = data[2].to(device)
            scope3_tensors = data[3].to(device)

            outputs = model(input_id_tensors, mask=input_mask_tensors)

            vagueness_loss = ce_loss(outputs[0], vagueness_tensors)
            scope3_loss = ce_loss(outputs[1], scope3_tensors)

            final_loss = vagueness_loss + scope3_loss

            total_valid_loss += final_loss.item()

            running_vagueness_tensor = torch.cat([running_vagueness_tensor, vagueness_tensors])
            running_scope3_tensor = torch.cat([running_scope3_tensor, scope3_tensors])
            running_vagueness_pred_tensor = torch.cat([running_vagueness_pred_tensor, outputs[0]])
            running_scope3_pred_tensor = torch.cat([running_scope3_pred_tensor, outputs[1]])

        average_valid_loss = total_valid_loss / len(valid_dataloader)
        scheduler.step(average_valid_loss)

        # Keep only the weights of the best epoch on disk.
        if average_valid_loss < best_val_loss:
            best_val_loss = average_valid_loss
            best_val_epoch = epoch_i
            torch.save(model.state_dict(), 'best_model.pth')

        vagueness_metrics = calculate_metrics(running_vagueness_pred_tensor, running_vagueness_tensor, vagueness_class_mapping)
        scope3_metrics = calculate_metrics(running_scope3_pred_tensor, running_scope3_tensor, scope3_class_mapping)

        print(f'Avg Validation Loss: {average_valid_loss:.4f} | Best Validation Loss: {best_val_loss:.4f} | Best Epoch: {best_val_epoch}',vagueness_metrics, scope3_metrics)

print("\nTraining complete!")

Batch [3291/3291], Average Train Loss: 1.2059 Accuracy_specific: 0.7999 | Precision_specific: 0.5440 | Recall_specific: 0.3384| Accuracy_ambiguous: 0.8145 | Precision_ambiguous: 0.4788 | Recall_ambiguous: 0.2671| Accuracy_generic: 0.8640 | Precision_generic: 0.4708 | Recall_generic: 0.1141| Accuracy_notESG: 0.6679 | Precision_notESG: 0.5959 | Recall_notESG: 0.9256| Accuracy_yes: 0.9434 | Precision_yes: 0.6842 | Recall_yes: 0.0258| Accuracy_no: 0.9434 | Precision_no: 0.9439 | Recall_no: 0.9993|
Avg Validation Loss: 1.0805 | Best Validation Loss: 1.0805 | Best Epoch: 0 Accuracy_specific: 0.8297 | Precision_specific: 0.5990 | Recall_specific: 0.5756| Accuracy_ambiguous: 0.8082 | Precision_ambiguous: 0.5118 | Recall_ambiguous: 0.4555| Accuracy_generic: 0.8472 | Precision_generic: 0.4912 | Recall_generic: 0.3810| Accuracy_notESG: 0.7805 | Precision_notESG: 0.7206 | Recall_notESG: 0.8241| Accuracy_yes: 0.9344 | Precision_yes: 0.5128 | Recall_yes: 0.3077| Accuracy_no: 0.9344 | Precision_no: 0

KeyboardInterrupt: 

In [11]:
class InferenceDataset(Dataset):
    """Label-free dataset: tokenizes the ``text`` column once at construction.

    Each item is ``(input_ids, attention_mask)`` for one row of the frame.
    """

    def __init__(self, df):
        self.df = df
        self.texts = df['text'].values

        tokenized = tokenize_and_format(self.texts)
        self.input_ids, self.attention_masks = tokenized

    def __len__(self):
        """Row count of the wrapped data frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the tokenized text stored at ``idx``."""
        return (self.input_ids[idx], self.attention_masks[idx])

In [12]:
# Load the best checkpoint. map_location makes the weights land on `device`
# regardless of which device they were saved from.
model = BERTMultiTask(encoder_model=model_name)
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.to(device)

model.eval()


# Every input is already padded to the same length by tokenize_and_format, so
# the configured batch size can be used instead of one sentence per step.
inference_dataset = InferenceDataset(test_df)
inference_dataloader = DataLoader(inference_dataset, batch_size=batch_size, shuffle=False)

# Predicted label strings, in test_df row order (the loader does not shuffle).
vagueness_inference = []
scope3_inference = []

# class id -> label string, to turn argmax indices back into labels
reverse_vagueness_class_mapping = {v:k for k,v in vagueness_class_mapping.items()}
reverse_scope3_class_mapping = {v:k for k,v in scope3_class_mapping.items()}
with torch.no_grad():
    for data in inference_dataloader:

        input_id_tensors = data[0].to(device)
        input_mask_tensors = data[1].to(device)

        # outputs[0] = vagueness logits, outputs[1] = scope3 logits
        outputs = model(input_id_tensors, mask=input_mask_tensors)

        vagueness_pred = torch.argmax(outputs[0], dim=-1).to('cpu').tolist()
        scope3_pred = torch.argmax(outputs[1], dim=-1).to('cpu').tolist()

        vagueness_inference.extend([reverse_vagueness_class_mapping[x] for x in vagueness_pred])
        scope3_inference.extend([reverse_scope3_class_mapping[x] for x in scope3_pred])

# Attach the predictions next to the gold labels for the confusion tables.
test_df['vagueness_pred'] = vagueness_inference
test_df['scope3_pred'] = scope3_inference

In [13]:
test_df.pivot_table(index='vague', columns='vagueness_pred', values='text', aggfunc='count', fill_value=0)

vagueness_pred,ambiguous,generic,notESG,specific
vague,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ambiguous,108,22,47,45
generic,36,48,49,16
notESG,22,16,394,58
specific,33,4,47,138


In [14]:
test_df.pivot_table(index='scope3', columns='scope3_pred', values='text', aggfunc='count', fill_value=0)

scope3_pred,no,yes
scope3,Unnamed: 1_level_1,Unnamed: 2_level_1
no,995,29
yes,27,32


# Scope 3 Model

In [1]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [2]:
# Variables
# Hugging Face checkpoint name; passed to both the tokenizer and the encoder.
model_name = 'Alibaba-NLP/gte-large-en-v1.5'
# Label string -> integer class id.
scope3_class_mapping = {"yes":1, "no":0}
train_file_path = 'final_annotated_data.csv'
test_file_path = 'NYSE_DE_2022_results.csv'
# NOTE(review): pinned to the first CUDA device; there is no CPU fallback.
device = torch.device("cuda:0")
batch_size = 8
# NOTE(review): `lr` is not referenced by the optimizer cell visible in this
# notebook, which defines its own learning rates — confirm it is still needed.
lr = 1e-5
epochs = 30
# Fraction of the training CSV held out as the validation split (despite the name).
train_test_split = 0.1

In [3]:
def tokenize_and_format(sentences, max_sentence_length=200):
    """Tokenize each sentence to a fixed length and stack the results.

    The tokenizer for the module-level ``model_name`` is loaded on every call.
    Each sentence is encoded on its own, padded or truncated to
    ``max_sentence_length`` tokens, and returned as PyTorch tensors.

    Args:
        sentences: Iterable of text strings.
        max_sentence_length: Target token length used for padding/truncation.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-sentence tensors
        concatenated along dimension 0, in input order.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True, use_fast=False)

    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_sentence_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for text in sentences
    ]

    stacked_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    stacked_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return stacked_ids, stacked_masks

In [4]:
class MultiTaskDataset(Dataset):
    """Eagerly tokenized dataset yielding inputs plus the scope3 label.

    Label strings in the ``scope3`` column are converted to integer ids via
    ``scope3_class_mapping`` (an unknown label raises ``KeyError``).
    Each item is ``(input_ids, attention_mask, scope3_id)``.
    """

    def __init__(self, df, scope3_class_mapping):
        self.df = df
        self.texts = df['text'].values
        label_ids = df['scope3'].apply(lambda label: scope3_class_mapping[label]).values

        tokenized = tokenize_and_format(self.texts)
        self.input_ids, self.attention_masks = tokenized
        self.scope3 = torch.tensor(label_ids)

    def __len__(self):
        """Row count of the wrapped data frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the tokenized text and label stored at ``idx``."""
        return (self.input_ids[idx], self.attention_masks[idx], self.scope3[idx])

In [5]:
# Read both CSVs, then carve a validation split off a reproducibly shuffled
# copy of the training data (seed 42).
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

train_len = len(train_df)
test_len = len(test_df)

# Number of shuffled rows reserved for validation.
num_val = int(train_test_split * train_len)

train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
# .copy() gives each split its own frame. validation_df gets new/overwritten
# columns later in the notebook; writing to a bare .iloc slice would raise
# pandas' SettingWithCopyWarning.
validation_df = train_df.iloc[:num_val].copy()
train_df = train_df.iloc[num_val:].copy()


In [6]:
len(train_df), len(validation_df), len(test_df)

(9751, 1083, 791)

In [7]:
# Class balance of the scope3 label, one split at a time.
for header, split_df in (
    ("Train set statistics", train_df),
    ("\nValidation set statistics", validation_df),
    ("\nTest set statistics", test_df),
):
    print(header)
    print(split_df['scope3'].value_counts())


Train set statistics
scope3
no     9182
yes     569
Name: count, dtype: int64

Validation set statistics
scope3
no     1024
yes      59
Name: count, dtype: int64

Test set statistics
scope3
no     762
yes     29
Name: count, dtype: int64


In [8]:
# One eagerly tokenized dataset per split.
train_dataset = MultiTaskDataset(df=train_df, scope3_class_mapping=scope3_class_mapping)
valid_dataset = MultiTaskDataset(df=validation_df, scope3_class_mapping=scope3_class_mapping)
test_dataset = MultiTaskDataset(df=test_df, scope3_class_mapping=scope3_class_mapping)

# Shuffle only while training so evaluation order matches the data frames.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

In [22]:
class BERTMultiTask(torch.nn.Module):
    """Pretrained encoder followed by a two-layer MLP and a 2-way scope3 head.

    The hidden state at sequence position 0 passes through two 256-unit ReLU
    layers before the output layer. A dropout module is constructed but is
    not applied in ``forward``.
    """

    def __init__(self, encoder_model='bert-base-uncased'):
        super().__init__()

        self.encoder = AutoModel.from_pretrained(encoder_model)
        encoder_width = self.encoder.config.hidden_size
        self.linear1 = torch.nn.Linear(encoder_width, 256)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear2 = torch.nn.Linear(256, 256)
        self.scope3_out = torch.nn.Linear(256, 2)
        self.relu = torch.nn.ReLU()

    def forward(self, input_ids, mask):
        """Return the scope3 logits for one batch."""
        encoded = self.encoder(input_ids, attention_mask=mask)

        first_token_state = encoded.last_hidden_state[:, 0, :]
        hidden = self.relu(self.linear1(first_token_state))
        # Dropout between the two hidden layers is currently switched off.
        hidden = self.relu(self.linear2(hidden))
        return self.scope3_out(hidden)

In [24]:
model = BERTMultiTask(encoder_model=model_name)

In [25]:
model.to(device)

BERTMultiTask(
  (encoder): NewModel(
    (embeddings): NewEmbeddings(
      (word_embeddings): Embedding(30528, 1024, padding_idx=0)
      (rotary_emb): NTKScalingRotaryEmbedding()
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): NewEncoder(
      (layer): ModuleList(
        (0-23): 24 x NewLayer(
          (attention): NewAttention(
            (qkv_proj): Linear(in_features=1024, out_features=3072, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (o_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (mlp): NewGatedMLP(
            (up_gate_proj): Linear(in_features=1024, out_features=8192, bias=False)
            (down_proj): Linear(in_features=4096, out_features=1024, bias=True)
            (act_fn): GELUActivation()
            (hidden_dropout): Dropout(p=0.1, inplace=False)
     

In [26]:
# Total scalar parameter count of the whole model, reported in millions.
model_param_size = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {model_param_size/1e6}M")

Model parameters: 434.467842M


In [27]:
encoder_params = list(model.encoder.named_parameters())
# Every trainable layer outside the encoder. linear2 was missing from this
# list, so the grouped-parameter optimizer below would have left it untrained
# if re-enabled.
new_layer_params = (
    list(model.scope3_out.named_parameters())
    + list(model.linear1.named_parameters())
    + list(model.linear2.named_parameters())
)
# Parameters whose name contains one of these markers get no weight decay.
no_decay = {'bias', 'LayerNorm.weight'}

base_learning_rate = 5e-7
new_learning_rate = 1e-4

# Group order: encoder decayed, encoder non-decayed, heads decayed, heads non-decayed.
optimizer_grouped_parameters = [
    {'params': [p for n, p in encoder_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01, 'lr': base_learning_rate},
    {'params': [p for n, p in encoder_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, 'lr': base_learning_rate},
    {'params': [p for n, p in new_layer_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01, 'lr': new_learning_rate},
    {'params': [p for n, p in new_layer_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, 'lr': new_learning_rate}
]

# The active optimizer uses one group over all parameters at new_learning_rate;
# the grouped variant is kept as a ready-to-use alternative.
# optimizer = AdamW(optimizer_grouped_parameters, lr=base_learning_rate, eps=1e-8)
optimizer = AdamW(model.parameters(), lr=new_learning_rate, eps=1e-6, weight_decay=0.01)
scheduler = ReduceLROnPlateau(optimizer, factor=0.33, patience=2, verbose=True)
# Class weights [1.0, 10.0]: errors on class id 1 ("yes") cost ten times more
# than errors on class id 0 ("no").
ce_loss = torch.nn.CrossEntropyLoss(weight=torch.tensor([1.0, 10.0]).to(device))


In [28]:
def calculate_metrics(y_pred, y_true, class_mapping):
    """Format one-vs-rest accuracy, precision and recall for every class.

    Args:
        y_pred: Score tensor whose last dimension indexes the classes; the
            predicted class is the argmax over that dimension.
        y_true: Tensor of true class ids, compared element-wise with the
            predicted ids.
        class_mapping: Dict mapping class name -> integer class id.

    Returns:
        One string made of an
        ``Accuracy_<name>: … | Precision_<name>: … | Recall_<name>: …|``
        segment per class, joined by single spaces, in ``class_mapping``
        order. Any ratio with a zero denominator is reported as 0; this now
        includes accuracy on empty input, which previously raised
        ``ZeroDivisionError``.
    """
    y_pred_class = torch.argmax(y_pred, dim=-1)
    reverse_class_mapping = {v: k for k, v in class_mapping.items()}

    metrics = []
    for class_id, class_name in reverse_class_mapping.items():
        predicted = y_pred_class == class_id
        actual = y_true == class_id

        true_positives = torch.sum(predicted & actual).item()
        true_negatives = torch.sum(~predicted & ~actual).item()
        false_positives = torch.sum(predicted & ~actual).item()
        false_negatives = torch.sum(~predicted & actual).item()

        total = true_positives + true_negatives + false_positives + false_negatives
        # Guard every denominator the same way so empty batches cannot crash.
        accuracy = (true_positives + true_negatives) / total if total != 0 else 0
        precision = true_positives / (true_positives + false_positives) if true_positives + false_positives != 0 else 0
        recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives != 0 else 0

        metrics.append(f'Accuracy_{class_name}: {accuracy:.4f} | Precision_{class_name}: {precision:.4f} | Recall_{class_name}: {recall:.4f}|')

    return " ".join(metrics)

In [29]:
# Large sentinel: the training loop replaces it as soon as a validation loss below it is seen.
best_val_loss = 10000
# Epoch index of the lowest validation loss so far; -1 means none recorded yet.
best_val_epoch = -1

In [30]:
next(iter(train_dataloader))

[tensor([[  101,  2949,  1996,  ...,     0,     0,     0],
         [  101,  3891,  2241,  ...,     0,     0,     0],
         [  101,  1996,  4254,  ...,     0,     0,     0],
         ...,
         [  101,  1999,  2047,  ...,     0,     0,     0],
         [  101, 18368,  2038,  ...,     0,     0,     0],
         [  101,  4031,  3737,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 0, 1, 0, 0, 0, 0, 0])]

In [31]:
# Smoke-test the forward pass on ONE batch. The loader shuffles, so calling
# next(iter(...)) twice would pair the input ids of one random batch with the
# attention masks of a different one.
sample_batch = next(iter(train_dataloader))
model(sample_batch[0].to(device), sample_batch[1].to(device))

tensor([[ 0.0128, -0.2004],
        [ 0.0771, -0.1036],
        [ 0.0242, -0.1750],
        [ 0.1064, -0.1178],
        [ 0.0180, -0.1614],
        [ 0.1002, -0.0969],
        [ 0.1553, -0.1944],
        [ 0.0689, -0.1395]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [32]:
# Scratch check of the loss: `b` is a hand-entered 8 x 2 score matrix.
# NOTE(review): presumably pasted from an earlier model output — confirm.
b = torch.tensor([[-0.2923, -0.1847],
        [-0.1975, -0.1444],
        [-0.1531, -0.1217],
        [-0.2194, -0.1925],
        [-0.1369, -0.1668],
        [-0.2941, -0.3014],
        [-0.1777, -0.1966],
        [-0.2073, -0.1600]]).to(device)
# Labels come from a freshly drawn batch of the shuffling train loader, so they
# are not tied to whatever inputs produced the scores in `b`.
c = next(iter(train_dataloader))[2].to(device)

In [33]:
ce_loss(b, c)

tensor(0.7066, device='cuda:0')

In [34]:
# Freeze every encoder parameter so only the layers outside the encoder receive gradients.
# NOTE(review): this runs after the optimizer was built over model.parameters();
# presumably the optimizer just skips the frozen weights — confirm.
for param in model.encoder.parameters():
    param.requires_grad = False

In [35]:
# Train for `epochs` passes. After each pass: evaluate on the validation set,
# step the plateau scheduler on the mean validation loss, and checkpoint the
# model whenever that loss improves on the best seen so far.
for epoch_i in range(0, epochs):

    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    total_train_loss = 0

    model.train()

    # Epoch-long accumulators of labels and logits, used for the running metrics.
    running_scope3_tensor = torch.tensor([]).to(device)
    running_scope3_pred_tensor = torch.tensor([]).to(device)

    num_batches = len(train_dataloader)

    for i, data in enumerate(train_dataloader):

        input_id_tensors = data[0].to(device)
        input_mask_tensors = data[1].to(device)
        scope3_tensors = data[2].to(device)

        model.zero_grad()

        outputs = model(input_id_tensors, mask=input_mask_tensors)

        scope3_loss = ce_loss(outputs, scope3_tensors)

        final_loss = scope3_loss

        total_train_loss += final_loss.item()

        final_loss.backward()
        optimizer.step()

        # Detach the logits before accumulating them: the metrics need values
        # only, and keeping grad-tracking tensors would tie the accumulator to
        # the autograd graph of every batch seen this epoch.
        running_scope3_tensor = torch.cat([running_scope3_tensor, scope3_tensors])
        running_scope3_pred_tensor = torch.cat([running_scope3_pred_tensor, outputs.detach()])

        # NOTE(review): metrics are recomputed over the whole running tensor on
        # every batch; kept to preserve the live progress line.
        scope3_metrics = calculate_metrics(running_scope3_pred_tensor, running_scope3_tensor, scope3_class_mapping)

        average_train_loss = total_train_loss / (i+1)

        print(f'\rBatch [{i+1}/{num_batches}], Average Train Loss: {average_train_loss:.4f}', scope3_metrics, end='')

    print("")
    model.eval()
    with torch.no_grad():

        total_valid_loss = 0
        running_scope3_tensor = torch.tensor([]).to(device)
        running_scope3_pred_tensor = torch.tensor([]).to(device)

        for data in valid_dataloader:

            input_id_tensors = data[0].to(device)
            input_mask_tensors = data[1].to(device)
            scope3_tensors = data[2].to(device)

            outputs = model(input_id_tensors, mask=input_mask_tensors)

            scope3_loss = ce_loss(outputs, scope3_tensors)

            final_loss = scope3_loss

            total_valid_loss += final_loss.item()

            running_scope3_tensor = torch.cat([running_scope3_tensor, scope3_tensors])
            running_scope3_pred_tensor = torch.cat([running_scope3_pred_tensor, outputs])

        average_valid_loss = total_valid_loss / len(valid_dataloader)
        scheduler.step(average_valid_loss)

        # Keep only the weights of the best epoch on disk.
        if average_valid_loss < best_val_loss:
            best_val_loss = average_valid_loss
            best_val_epoch = epoch_i
            torch.save(model.state_dict(), 'best_model.pth')

        scope3_metrics = calculate_metrics(running_scope3_pred_tensor, running_scope3_tensor, scope3_class_mapping)

        print(f'Avg Validation Loss: {average_valid_loss:.4f} | Best Validation Loss: {best_val_loss:.4f} | Best Epoch: {best_val_epoch}',scope3_metrics)

print("\nTraining complete!")

Batch [1219/1219], Average Train Loss: 0.3206 Accuracy_yes: 0.9153 | Precision_yes: 0.3657 | Recall_yes: 0.6151| Accuracy_no: 0.9153 | Precision_no: 0.9751 | Recall_no: 0.9339|
Avg Validation Loss: 0.2734 | Best Validation Loss: 0.2734 | Best Epoch: 0 Accuracy_yes: 0.9391 | Precision_yes: 0.4578 | Recall_yes: 0.6441| Accuracy_no: 0.9391 | Precision_no: 0.9790 | Recall_no: 0.9561|
Batch [1219/1219], Average Train Loss: 0.2468 Accuracy_yes: 0.9189 | Precision_yes: 0.3980 | Recall_yes: 0.7610| Accuracy_no: 0.9189 | Precision_no: 0.9843 | Recall_no: 0.9287|
Avg Validation Loss: 0.3062 | Best Validation Loss: 0.2734 | Best Epoch: 0 Accuracy_yes: 0.9372 | Precision_yes: 0.4444 | Recall_yes: 0.6102| Accuracy_no: 0.9372 | Precision_no: 0.9770 | Recall_no: 0.9561|
Batch [1219/1219], Average Train Loss: 0.2178 Accuracy_yes: 0.9281 | Precision_yes: 0.4367 | Recall_yes: 0.7996| Accuracy_no: 0.9281 | Precision_no: 0.9869 | Recall_no: 0.9361|
Avg Validation Loss: 0.2830 | Best Validation Loss: 0.273

KeyboardInterrupt: 

In [37]:
class InferenceDataset(Dataset):
    """Label-free dataset: tokenizes the ``text`` column once at construction.

    Each item is ``(input_ids, attention_mask)`` for one row of the frame.
    """

    def __init__(self, df):
        self.df = df
        self.texts = df['text'].values

        tokenized = tokenize_and_format(self.texts)
        self.input_ids, self.attention_masks = tokenized

    def __len__(self):
        """Row count of the wrapped data frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the tokenized text stored at ``idx``."""
        return (self.input_ids[idx], self.attention_masks[idx])

In [40]:
# Load the best checkpoint. map_location makes the weights land on `device`
# regardless of which device they were saved from.
model = BERTMultiTask(encoder_model=model_name)
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.to(device)

model.eval()

# inference
# NOTE(review): vagueness_inference is never filled in this cell; kept in case
# another cell still reads it.
vagueness_inference = []
scope3_inference = []

# class id -> label string, to turn argmax indices back into labels
reverse_scope3_class_mapping = {v:k for k,v in scope3_class_mapping.items()}
with torch.no_grad():
    # valid_dataloader does not shuffle, so predictions follow validation_df row order.
    for data in valid_dataloader:

        input_id_tensors = data[0].to(device)
        input_mask_tensors = data[1].to(device)

        outputs = model(input_id_tensors, mask=input_mask_tensors)

        scope3_pred = torch.argmax(outputs, dim=-1).to('cpu').tolist()

        scope3_inference.extend([reverse_scope3_class_mapping[x] for x in scope3_pred])

# result_df = pd.DataFrame({'text':inference_data, 'scope3':scope3_inference})
# print(result_df)
# validation_df was produced by slicing the shuffled frame; take an explicit
# copy before adding a column so pandas does not raise SettingWithCopyWarning.
validation_df = validation_df.copy()
validation_df['prediction'] = scope3_inference


In [43]:
validation_df['text'] = validation_df['text'].apply(lambda x: x.replace('\n', ' '))

In [44]:
validation_df.to_csv('validation_results.csv', index=False)

: 

In [56]:
validation_df[validation_df['scope3'] == 'yes']

Unnamed: 0,text,gpt_responses,scope3,vague,file_name
43,"At NIKE, we have a long history of working tow...","{'scope3': 'yes', 'vague': 'generic'}",yes,generic,NYSE_NKE_2022_results.csv
52,SECTION SCOPE: In this section our climate emi...,"{'scope3': 'yes', 'vague': 'ambiguous'}",yes,ambiguous,NYSE_KO_2022_results.csv
98,Progress: We are partnering in the collection ...,"{'scope3': 'yes', 'vague': 'ambiguous'}",yes,ambiguous,NYSE_MCD_2022_results.csv
111,The Company’s goal is to provide its customers...,"{'scope3': 'yes', 'vague': 'specific'}",yes,specific,NYSE_TTE_2022_results.csv
112,How a company implements policies and procedur...,"{'scope3': 'yes', 'vague': 'ambiguous'}",yes,ambiguous,NYSE_KO_2022_results.csv
125,These emissions are broken out into Purchased ...,"{'scope3': 'yes', 'vague': 'ambiguous'}",yes,ambiguous,NASDAQ_BKNG_2022_results.csv
137,Loss of riders and Uber Eats users: Consumer p...,"{'scope3': 'yes', 'vague': 'ambiguous'}",yes,ambiguous,NYSE_UBER_2022_results.csv
160,Transparent processes and systems to help ensu...,"{'scope3': 'yes', 'vague': 'specific'}",yes,specific,NYSE_NKE_2022_results.csv
167,"Acquired 1,500 EV charge points in Singapore E...","{'scope3': 'yes', 'vague': 'ambiguous'}",yes,ambiguous,NYSE_TTE_2022_results.csv
176,Because approximately 85% of our total carbon ...,"{'scope3': 'yes', 'vague': 'specific'}",yes,specific,NYSE_KO_2022_results.csv


In [None]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Variables
# Hugging Face checkpoint name used for both the tokenizer and the encoder.
model_name = 'intfloat/e5-base-v2'
# Label string -> integer class id for the scope3 task.
scope3_class_mapping = {"yes":1, "no":0}
train_file_path = 'final_annotated_data.csv'
test_file_path = 'NYSE_DE_2022_results.csv'
# Use the first GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing on `.to(device)`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 8
lr = 1e-5
epochs = 30
# Fraction of the training file that is held out as the validation split.
train_test_split = 0.1

def tokenize_and_format(sentences, max_sentence_length=200):
    """Tokenize every sentence and stack the results into two tensors.

    Args:
        sentences: Iterable of texts; each one is encoded separately.
        max_sentence_length: Length every sequence is padded / truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-sentence tensors
        concatenated along dim 0 (presumably one row per sentence of width
        ``max_sentence_length`` -- TODO confirm against the tokenizer output).
    """
    # The tokenizer is loaded from the module-level `model_name` on every call.
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True, use_fast=False)

    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_sentence_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for text in sentences
    ]

    stacked_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    stacked_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return stacked_ids, stacked_masks
class MultiTaskDataset(Dataset):
    """Dataset yielding ``(input_ids, attention_mask, scope3_label)`` per row.

    The frame must provide a 'text' column (tokenized once, up front) and a
    'scope3' column whose values are keys of ``scope3_class_mapping``.
    """

    def __init__(self, df, scope3_class_mapping):
        self.df = df
        self.texts = df['text'].values

        # Translate label strings to class ids; an unknown label raises KeyError.
        label_ids = df['scope3'].apply(lambda label: scope3_class_mapping[label]).values

        self.input_ids, self.attention_masks = tokenize_and_format(self.texts)
        self.scope3 = torch.tensor(label_ids)

    def __len__(self):
        # One sample per row of the backing frame.
        return self.df.shape[0]

    def __getitem__(self, idx):
        token_ids = self.input_ids[idx]
        token_mask = self.attention_masks[idx]
        label = self.scope3[idx]
        return token_ids, token_mask, label
# Load the training file and the separate test file.
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

train_len = len(train_df)
test_len = len(test_df)

# Number of training rows set aside for validation.
num_val = int(train_test_split * train_len)

# Shuffle with a fixed seed, then carve the validation rows off the front.
# `.copy()` makes each split an independent frame, so adding or overwriting
# columns on a split later does not raise SettingWithCopyWarning or act on a
# slice of the shuffled frame.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
validation_df = train_df.iloc[:num_val].copy()
train_df = train_df.iloc[num_val:].copy()

# A bare expression in the middle of a cell is never displayed, so print the
# split sizes explicitly.
print(len(train_df), len(validation_df), len(test_df))
print("Train set statistics")
print(train_df['scope3'].value_counts())

print("\nValidation set statistics")
print(validation_df['scope3'].value_counts())

print("\nTest set statistics")
print(test_df['scope3'].value_counts())

# Tokenize each split once, in train / validation / test order.
train_dataset, valid_dataset, test_dataset = (
    MultiTaskDataset(split_df, scope3_class_mapping)
    for split_df in (train_df, validation_df, test_df)
)

# Only the training loader shuffles; the evaluation loaders keep row order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_dataloader, test_dataloader = (
    DataLoader(eval_dataset, shuffle=False, batch_size=batch_size)
    for eval_dataset in (valid_dataset, test_dataset)
)
class BERTMultiTask(torch.nn.Module):
    """Pretrained encoder followed by a two-layer head producing 2 scope3 logits.

    Args:
        encoder_model: Checkpoint name passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, encoder_model='bert-base-uncased'):
        super().__init__()

        self.encoder = AutoModel.from_pretrained(encoder_model)
        # Head: encoder hidden size -> 256 -> 2 classes.
        self.linear1 = torch.nn.Linear(self.encoder.config.hidden_size, 256)
        self.scope3_out = torch.nn.Linear(256, 2)
        self.relu = torch.nn.ReLU()

    def forward(self, input_ids, mask):
        """Return the scope3 logits for a batch of token ids and attention masks."""
        encoded = self.encoder(input_ids, attention_mask=mask)

        # Only the hidden state at sequence position 0 feeds the head.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        hidden = self.relu(self.linear1(first_token_state))
        return self.scope3_out(hidden)
# Build the classifier on the configured checkpoint and move it to the target device.
model = BERTMultiTask(encoder_model=model_name)
model.to(device)

# Report the total parameter count in millions.
model_param_size = sum(param.numel() for param in model.parameters())
print(f"Model parameters: {model_param_size/1e6}M")

# Named-parameter lists for the encoder and the head, plus a set of parameter
# name fragments. None of the three is consumed in this cell: the optimizer
# below is built from model.parameters() as a single group.
encoder_params = [*model.encoder.named_parameters()]
new_layer_params = [*model.scope3_out.named_parameters(), *model.linear1.named_parameters()]
no_decay = {'LayerNorm.weight', 'bias'}


# Loss, optimizer and scheduler. The scheduler is stepped with the validation
# loss and multiplies the learning rate by 0.33 when that loss plateaus.
ce_loss = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), eps=1e-8, lr=lr)
scheduler = ReduceLROnPlateau(optimizer, patience=2, factor=0.33, verbose=True)

def calculate_metrics(y_pred, y_true, class_mapping):
    """Format one-vs-rest accuracy, precision and recall for every class.

    Args:
        y_pred: Tensor of per-class scores; the predicted class is the argmax
            over the last dimension.
        y_true: Tensor of class ids, compared element-wise with the predictions.
        class_mapping: Dict mapping class name -> integer class id.

    Returns:
        One string containing, for each class id in mapping order,
        ``Accuracy_<name>: ... | Precision_<name>: ... | Recall_<name>: ...|``,
        with the per-class parts joined by single spaces. Any ratio whose
        denominator is zero (including accuracy on empty input) is reported as 0.
    """
    y_pred_class = torch.argmax(y_pred, dim=-1)
    reverse_class_mapping = {v: k for k, v in class_mapping.items()}

    metrics = []
    for class_id, class_name in reverse_class_mapping.items():
        predicted_here = y_pred_class == class_id
        actual_here = y_true == class_id

        true_positives = torch.sum(predicted_here & actual_here).item()
        true_negatives = torch.sum(~predicted_here & ~actual_here).item()
        false_positives = torch.sum(predicted_here & ~actual_here).item()
        false_negatives = torch.sum(~predicted_here & actual_here).item()

        total = true_positives + true_negatives + false_positives + false_negatives

        # Guard every ratio the same way so empty input does not raise ZeroDivisionError.
        accuracy = (true_positives + true_negatives) / total if total != 0 else 0
        precision = true_positives / (true_positives + false_positives) if true_positives + false_positives != 0 else 0
        recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives != 0 else 0

        metrics.append(f'Accuracy_{class_name}: {accuracy:.4f} | Precision_{class_name}: {precision:.4f} | Recall_{class_name}: {recall:.4f}|')

    metrics = " ".join(metrics)
    return metrics
# Train for `epochs` epochs. After every epoch the model is evaluated on the
# validation loader, the LR scheduler is stepped with the validation loss, and
# the weights are saved whenever that loss improves.
best_val_loss = 10000
best_val_epoch = -1  # zero-based index of the best epoch (the header below prints epoch_i + 1)

for epoch_i in range(0, epochs):

    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    total_train_loss = 0

    model.train()

    # Labels and logits seen so far in this epoch, used for the running metrics.
    running_scope3_tensor = torch.tensor([]).to(device)
    running_scope3_pred_tensor = torch.tensor([]).to(device)

    num_batches = len(train_dataloader)

    for i, data in enumerate(train_dataloader):
        
        input_id_tensors = data[0].to(device)
        input_mask_tensors = data[1].to(device)
        scope3_tensors = data[2].to(device)

        model.zero_grad()

        outputs = model(input_id_tensors, mask=input_mask_tensors)

        scope3_loss = ce_loss(outputs, scope3_tensors)

        final_loss = scope3_loss

        total_train_loss += final_loss.item()

        final_loss.backward()
        optimizer.step()

        running_scope3_tensor = torch.cat([running_scope3_tensor, scope3_tensors])
        # Detach before accumulating: the logits are only needed for metrics, and
        # keeping them attached would make the running tensor hold every batch's
        # autograd history for the whole epoch.
        running_scope3_pred_tensor = torch.cat([running_scope3_pred_tensor, outputs.detach()])

        scope3_metrics = calculate_metrics(running_scope3_pred_tensor, running_scope3_tensor, scope3_class_mapping)

        average_train_loss = total_train_loss / (i+1)
    
        # '\r' with end='' rewrites the same console line for every batch.
        print(f'\rBatch [{i+1}/{num_batches}], Average Train Loss: {average_train_loss:.4f}', scope3_metrics, end='')

    print("")
    model.eval()
    # No gradients are tracked here, so the validation logits need no detach.
    with torch.no_grad():
        
        total_valid_loss = 0
        running_scope3_tensor = torch.tensor([]).to(device)
        running_scope3_pred_tensor = torch.tensor([]).to(device)

        for data in valid_dataloader:
            
            input_id_tensors = data[0].to(device)
            input_mask_tensors = data[1].to(device)
            scope3_tensors = data[2].to(device)

            outputs = model(input_id_tensors, mask=input_mask_tensors)

            scope3_loss = ce_loss(outputs, scope3_tensors)

            final_loss = scope3_loss
            
            total_valid_loss += final_loss.item()

            running_scope3_tensor = torch.cat([running_scope3_tensor, scope3_tensors])
            running_scope3_pred_tensor = torch.cat([running_scope3_pred_tensor, outputs])
        
        average_valid_loss = total_valid_loss / len(valid_dataloader)
        # The plateau scheduler is driven by the mean validation loss.
        scheduler.step(average_valid_loss)

        # Checkpoint only when the validation loss improves.
        if average_valid_loss < best_val_loss:
            best_val_loss = average_valid_loss
            best_val_epoch = epoch_i
            torch.save(model.state_dict(), 'best_model.pth')

        scope3_metrics = calculate_metrics(running_scope3_pred_tensor, running_scope3_tensor, scope3_class_mapping)

        

        print(f'Avg Validation Loss: {average_valid_loss:.4f} | Best Validation Loss: {best_val_loss:.4f} | Best Epoch: {best_val_epoch}',scope3_metrics)
        
print("\nTraining complete!")