# Torch the DL Project

To train these models, configure the experiments using the config files in the `/configs` directory of this repository.

Acknowledgements: This notebook is derived from the Kaggle notebook "Pre-trained RoBERTa Solution in PyTorch": <https://www.kaggle.com/andretugan/pre-trained-roberta-solution-in-pytorch>

In [None]:
# Mount your google drive if you want
# This cell requires the `google.colab` package and mounts Drive at /content/drive.
# NOTE(review): the data/config paths used later start with 'drive/MyDrive/...';
# presumably they rely on this mount and on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install some libraries in your environment
# NOTE(review): versions are not pinned, so a fresh run may pull a different
# release than the one this notebook was developed with. The import cell uses
# `from transformers import AdamW`; confirm the installed release still provides
# it and consider pinning the versions here.
!pip install transformers
!pip install sentencepiece



In [None]:
import os
import math
import random
import time
import json

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold

import gc
gc.enable()

# Class & Function Definitions

In [None]:
def set_random_seed(random_seed):
    """Seeds every random number generator used by the notebook.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices),
    exports the seed as PYTHONHASHSEED, and switches cuDNN to its
    deterministic mode.

    Args:
        random_seed: Integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(random_seed)

    torch.backends.cudnn.deterministic = True

In [None]:
def get_data(data_path='drive/MyDrive/ColabNotebooks/7643project/Data'):
    """Loads the train, test and sample-submission CSV files from |data_path|.

    Rows of the training set whose `target` and `standard_error` are both 0
    are treated as incomplete and removed; the training index is then reset
    to 0..n-1.

    Args:
        data_path: Directory containing train.csv, test.csv and
            sample_submission.csv.

    Returns:
        A tuple (train_df, test_df, submission_df) of pandas DataFrames.
    """
    def read(filename):
        return pd.read_csv(os.path.join(data_path, filename))

    raw_train = read("train.csv")
    # Remove incomplete entries if any.
    incomplete = (raw_train.target == 0) & (raw_train.standard_error == 0)
    train_df = raw_train[~incomplete].reset_index(drop=True)

    test_df = read("test.csv")
    submission_df = read("sample_submission.csv")
    return train_df, test_df, submission_df

In [None]:
class LitDataset(Dataset):
    """Dataset of tokenized excerpts, optionally paired with regression targets.

    All excerpts are tokenized once in the constructor (padded/truncated to
    |max_len|); `__getitem__` only converts the stored ids to tensors.

    Args:
        df: DataFrame with an `excerpt` column and, unless |inference_only|
            is set, a `target` column.
        inference_only: When True, no targets are read and items are
            (input_ids, attention_mask) pairs.
        max_len: Length every sequence is padded or truncated to.
        tokenizer: Object exposing `batch_encode_plus`. When omitted, the
            module-level `tokenizer` variable is used, which keeps existing
            `LitDataset(df)` calls working.

    Raises:
        NameError: If no tokenizer is passed and no module-level `tokenizer`
            exists.
    """

    def __init__(self, df, inference_only=False, max_len=256, tokenizer=None):
        super().__init__()

        if tokenizer is None:
            # Backward-compatible fallback to the notebook-level global. The
            # parameter shadows that name inside this method, so it has to be
            # looked up through globals().
            try:
                tokenizer = globals()["tokenizer"]
            except KeyError:
                raise NameError(
                    "name 'tokenizer' is not defined; pass tokenizer=... or "
                    "define a module-level `tokenizer` before building the dataset"
                ) from None

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        if not self.inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=max_len,
            truncation=True,
            return_attention_mask=True
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Returns (input_ids, attention_mask[, target]) for position |index|."""
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])

        if self.inference_only:
            return (input_ids, attention_mask)

        return (input_ids, attention_mask, self.target[index])

In [None]:
class LitModel(nn.Module):
    """Pre-trained transformer encoder with an attention-pooling regression head.

    The encoder is loaded from |model_path| with hidden-state outputs enabled
    and hidden dropout set to 0. The last layer's token states are pooled into
    one context vector by a learned attention and mapped to a single score.

    All head widths follow the encoder's `config.hidden_size`, so the class
    also works with base models whose hidden size is not 768.

    Args:
        model_path: Name or path accepted by `AutoConfig.from_pretrained` /
            `AutoModel.from_pretrained`.
    """

    def __init__(self, model_path): #ROBERTA_PATH
        super().__init__()

        config = AutoConfig.from_pretrained(model_path)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})

        self.roberta = AutoModel.from_pretrained(model_path, config=config)

        # Width of each token's hidden state (768 for roberta-base).
        hidden_size = config.hidden_size

        # Scores every token, then normalizes the scores over the sequence
        # dimension (dim=1) so they sum to 1 per example.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 1024),
            nn.Tanh(),
            nn.Linear(1024, 1),
            nn.Softmax(dim=1)
        )

        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 1)
        )

        # The modules below are only referenced by the disabled pooling
        # variants at the end of forward(). They are still registered, so they
        # appear in named_parameters() and in the saved state dict; keep their
        # names and order stable.
        self.regressor2 = nn.Sequential(
            nn.Linear(hidden_size * 2, 1)
        )
        self.regressor4 = nn.Sequential(
            nn.Linear(hidden_size * 4, 1)
        )

        # some additional layers
        self.cnn1 = nn.Conv1d(hidden_size, 256, kernel_size=2, padding=1)
        self.cnn2 = nn.Conv1d(256, 1, kernel_size=2, padding=1)

        self.relu = nn.Sequential(
            nn.ReLU()
        )

    def forward(self, input_ids, attention_mask):
        """Returns one predicted score per example, shape BATCH_SIZE x 1.

        Args:
            input_ids: Token ids, BATCH_SIZE x MAX_LEN.
            attention_mask: Mask passed to the encoder, BATCH_SIZE x MAX_LEN.
        """
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)

        # Original model configurations #

        # hidden_states holds one entry for the embedding layer plus one per
        # encoder layer (13 in total for roberta-base).
        # We take the hidden states from the last encoder layer.
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # To condense the hidden states of all MAX_LEN positions into a single
        # context vector, we compute a weighted average of them; the weight of
        # each position comes from the attention network.
        # NOTE(review): attention_mask is not applied here, so padding
        # positions also receive a non-zero softmax weight.
        weights = self.attention(last_layer_hidden_states)

        # weights.shape is BATCH_SIZE x MAX_LEN x 1
        # last_layer_hidden_states.shape is BATCH_SIZE x MAX_LEN x hidden_size
        # context_vector.shape is BATCH_SIZE x hidden_size
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)

        # Now we reduce the context vector to the prediction score.
        return self.regressor(context_vector)

        # ------------------------------------------------------------------
        # Alternative pooling heads, kept for reference (disabled). Each one
        # is a replacement for the attention pooling above.
        # ------------------------------------------------------------------

        # # MEAN POOLING #
        # last_hidden_state = roberta_output[0]
        # input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # sum_mask = input_mask_expanded.sum(1)
        # sum_mask = torch.clamp(sum_mask, min=1e-9)
        # mean_embeddings = sum_embeddings / sum_mask
        # logits = self.regressor(mean_embeddings)
        # return logits



        # # MAX POOLING #
        # last_hidden_state = roberta_output[0]  
        # input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # last_hidden_state[input_mask_expanded == 0] = -1e9  # Set padding tokens to large negative value
        # max_embeddings = torch.max(last_hidden_state, 1)[0]
        # logits = self.regressor(max_embeddings)
        # return logits



        # # MEAN-MAX POOLING #
        # last_hidden_state = roberta_output[0]
        # mean_pooling_embeddings = torch.mean(last_hidden_state, 1)
        # max_pooling_embeddings = torch.max(last_hidden_state, 1)[0]
        # mean_max_embeddings = torch.cat((mean_pooling_embeddings, max_pooling_embeddings), 1)
        # logits = self.regressor2(mean_max_embeddings)
        # return logits



        # # CONV-1D POOLING #
        # last_hidden_state = last_hidden_state.permute(0, 2, 1)
        # cnn_embeddings = self.relu(self.cnn1(last_hidden_state))
        # cnn_embeddings = self.cnn2(cnn_embeddings)
        # logits, _ = torch.max(cnn_embeddings, 2)
        # return logits



        # # CONCATENATE POOLING #
        # all_hidden_states = torch.stack(roberta_output[2])
        # concatenate_pooling = torch.cat(
        #     (all_hidden_states[-1], all_hidden_states[-2], all_hidden_states[-3], all_hidden_states[-4]),-1
        # )
        # concatenate_pooling = concatenate_pooling[:, 0]

        # logits = self.regressor4(concatenate_pooling) # regression head
        # return logits

In [None]:
def eval_mse(model, data_loader):
    """Computes the mean squared error of |model| over all of |data_loader|.

    The model is switched to eval mode and run without gradient tracking.
    Squared errors are summed per batch and divided by the dataset size at
    the end, so a smaller final batch is weighted correctly.
    """
    model.eval()
    criterion = nn.MSELoss(reduction="sum")
    squared_error_total = 0

    with torch.no_grad():
        for input_ids, attention_mask, target in data_loader:
            pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_error = criterion(pred.flatten(), target.to(DEVICE))
            squared_error_total += batch_error.item()

    return squared_error_total / len(data_loader.dataset)

In [None]:
def predict(model, data_loader):
    """Runs |model| over |data_loader| and returns its predictions as an np.array.

    The model is put in eval mode and evaluated without gradient tracking.
    The result has one entry per dataset item, in loader order.
    """
    model.eval()

    outputs = np.zeros(len(data_loader.dataset))
    cursor = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_size = batch_pred.shape[0]

            # Write this batch's scores into the next free slice of the result.
            outputs[cursor : cursor + batch_size] = batch_pred.flatten().to("cpu")
            cursor += batch_size

    return outputs

In [None]:
def train(model, model_path, train_loader, val_loader,
          optimizer, scheduler=None, num_epochs=3, val_schedule=None):
    """Trains |model| and checkpoints the weights with the best validation RMSE.

    The model is evaluated on |val_loader| every `eval_period` optimizer steps.
    Whenever the validation RMSE improves, the state dict is saved to
    |model_path|.

    Args:
        model: Module called as `model(input_ids, attention_mask)`.
        model_path: File the best state dict is written to.
        train_loader: Yields (input_ids, attention_mask, target) batches.
        val_loader: Loader passed to `eval_mse`.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional LR scheduler stepped once per batch.
        num_epochs: Number of passes over |train_loader|.
        val_schedule: Sequence of (rmse_threshold, eval_period) pairs. The
            first pair's period is the initial one; after every evaluation the
            period of the first pair whose threshold is <= the current RMSE is
            used. When None or empty, the model is evaluated every
            `len(train_loader)` steps.

    Returns:
        The best validation RMSE observed.
    """
    if not val_schedule:
        # The None default used to crash on val_schedule[0]; fall back to a
        # fixed period of one epoch's worth of steps.
        val_schedule = [(float("-inf"), max(1, len(train_loader)))]

    criterion = nn.MSELoss(reduction="mean")

    best_val_rmse = None
    best_epoch = 0
    step = 0
    last_eval_step = 0
    eval_period = val_schedule[0][1]

    start = time.time()

    for epoch in range(num_epochs):
        for batch_num, (input_ids, attention_mask, target) in enumerate(train_loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)

            optimizer.zero_grad()

            # eval_mse() leaves the model in eval mode, so switch back on
            # every step.
            model.train()

            pred = model(input_ids, attention_mask)

            mse = criterion(pred.flatten(), target)

            mse.backward()

            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            if step >= last_eval_step + eval_period:
                # Evaluate the model on val_loader.
                elapsed_seconds = time.time() - start
                num_steps = step - last_eval_step
                print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                last_eval_step = step

                val_rmse = math.sqrt(eval_mse(model, val_loader))

                print(f"Epoch: {epoch} batch_num: {batch_num}",
                      f"val_rmse: {val_rmse:0.4}")

                # Pick the evaluation period for the current RMSE level.
                for rmse, period in val_schedule:
                    if val_rmse >= rmse:
                        eval_period = period
                        break

                # `is None` rather than truthiness: an RMSE of exactly 0.0
                # must not be mistaken for "no result yet".
                if best_val_rmse is None or val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    best_epoch = epoch
                    torch.save(model.state_dict(), model_path)
                    print(f"New best_val_rmse: {best_val_rmse:0.4}")
                else:
                    print(f"Still best_val_rmse: {best_val_rmse:0.4}",
                          f"(from epoch {best_epoch})")

                # Restart the timer so evaluation time is not counted as
                # training time.
                start = time.time()

            step += 1

    if best_val_rmse is None:
        # Training ended before the first scheduled evaluation. Evaluate once
        # so that a checkpoint exists and a number (not None) is returned.
        best_val_rmse = math.sqrt(eval_mse(model, val_loader))
        torch.save(model.state_dict(), model_path)
        print(f"New best_val_rmse: {best_val_rmse:0.4}")

    return best_val_rmse

In [None]:
def create_optimizer(model):
    """Builds an AdamW optimizer with layer-wise learning rates for |model|.

    Parameter groups are derived from parameter names instead of fixed
    positions in `named_parameters()`, so the function does not depend on the
    exact parameter count of roberta-base:

    * `attention.*` parameters form one group and all remaining head
      parameters form another; neither sets `lr` or `weight_decay`, so the
      optimizer defaults apply.
    * `roberta.pooler.*` parameters are not optimized.
    * Every other `roberta.*` parameter gets its own group. Weight decay is
      0.0 for names containing "bias" and 0.01 otherwise. The learning rate
      is 2e-5 for the embeddings and the lower third of the encoder layers,
      5e-5 for the middle third and 1e-4 for the upper third.

    For a 12-layer encoder the tiers start at layers 4 and 8.
    NOTE(review): this assumes encoder parameters are named
    `...layer.<N>....`, the Hugging Face convention — confirm for other
    backbones.

    Args:
        model: Module whose encoder is registered as `roberta` and whose
            attention head is registered as `attention`.

    Returns:
        An `AdamW` optimizer over the groups described above.
    """
    import re

    layer_index_pattern = re.compile(r"\.layer\.(\d+)\.")

    def layer_of(name):
        # Encoder layer number of a parameter, or -1 for non-layer parameters
        # such as the embeddings.
        match = layer_index_pattern.search(name)
        return int(match.group(1)) if match else -1

    backbone_parameters = []
    attention_group = []
    regressor_group = []

    for name, params in model.named_parameters():
        if name.startswith("roberta."):
            if name.startswith("roberta.pooler."):
                # The pooler output is not used by the model's forward pass.
                continue
            backbone_parameters.append((name, params))
        elif name.startswith("attention."):
            attention_group.append(params)
        else:
            regressor_group.append(params)

    num_layers = max((layer_of(name) for name, _ in backbone_parameters),
                     default=-1) + 1
    middle_start = num_layers // 3
    upper_start = (2 * num_layers) // 3

    parameters = []
    parameters.append({"params": attention_group})
    parameters.append({"params": regressor_group})

    for name, params in backbone_parameters:
        weight_decay = 0.0 if "bias" in name else 0.01

        layer_num = layer_of(name)
        if layer_num >= upper_start:
            lr = 1e-4
        elif layer_num >= middle_start:
            lr = 5e-5
        else:
            lr = 2e-5

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return AdamW(parameters)

# Training

In [None]:
# Run configuration: device, experiment config file and its sections.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
config_path = 'drive/MyDrive/ColabNotebooks/7643project/configs/roberta-base.json'
with open(config_path) as f:
    config = json.load(f)

training_config = config["training"]
dataset_config = config["dataset"]
model_config = config["model"]
evaluation_config = config["evaluation"]

gc.collect()
tokenizer = AutoTokenizer.from_pretrained(model_config["base_model"])
list_val_rmse = []

num_folds = training_config["num_folds"]
kfold = KFold(n_splits=num_folds, random_state=training_config["seed"], shuffle=True)
train_df, test_df, sample_submission = get_data(dataset_config["data_path"])

# K-fold cross-validation: one model is trained per fold and its best
# checkpoint is written to model_<fold>.pth.
for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):
    print(f"\nFold {fold + 1}/{num_folds}")
    model_path = f"model_{fold + 1}.pth"

    set_random_seed(training_config["seed"] + fold)

    # KFold.split yields positional indices, so select rows with .iloc; .loc
    # only gives the same rows while the index happens to be 0..n-1.
    train_dataset = LitDataset(train_df.iloc[train_indices])
    val_dataset = LitDataset(train_df.iloc[val_indices])

    train_loader = DataLoader(train_dataset, batch_size=dataset_config["batch_size"],
                              drop_last=True, shuffle=True, num_workers=dataset_config["num_workers"])
    val_loader = DataLoader(val_dataset, batch_size=dataset_config["batch_size"],
                            drop_last=False, shuffle=False, num_workers=dataset_config["num_workers"])

    # Re-seed right before the model is built so the head initialization
    # depends only on the fold's seed.
    set_random_seed(training_config["seed"] + fold)

    model = LitModel(model_config["base_model"]).to(DEVICE)

    optimizer = create_optimizer(model)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=training_config["num_epochs"] * len(train_loader),
        num_warmup_steps=50)

    list_val_rmse.append(train(model, model_path, train_loader,
                               val_loader, optimizer, scheduler=scheduler,
                               num_epochs=training_config["num_epochs"],
                               val_schedule=training_config["val_schedule"]))

    # The optimizer (and the scheduler through it) also references the model
    # parameters; drop all three so this fold's weights can be freed before
    # the next model is created.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    print("\nPerformance estimates:")
    print(list_val_rmse)
    print("Mean:", np.array(list_val_rmse).mean())


Fold 1/5


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



16 steps took 7.18 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9055
New best_val_rmse: 0.9055

16 steps took 6.52 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7035
New best_val_rmse: 0.7035

16 steps took 6.54 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6519
New best_val_rmse: 0.6519

16 steps took 6.54 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.613
New best_val_rmse: 0.613

16 steps took 6.55 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5872
New best_val_rmse: 0.5872

16 steps took 6.55 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6914
Still best_val_rmse: 0.5872 (from epoch 0)

16 steps took 6.53 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7822
Still best_val_rmse: 0.5872 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6247
Still best_val_rmse: 0.5872 (from epoch 0)

16 steps took 6.82 seconds
Epoch: 1 batch_num: 3 val_rmse: 0.6231
Still best_val_rmse: 0.5872 (from epoch 0)

16 steps took 6.53 seconds
Epoch: 1 batch_num: 19 val_rmse: 0.5494
New best_val_rmse: 

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



16 steps took 7.16 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.13
New best_val_rmse: 1.13

16 steps took 6.51 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9548
New best_val_rmse: 0.9548

16 steps took 6.51 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.684
New best_val_rmse: 0.684

16 steps took 6.53 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6209
New best_val_rmse: 0.6209

16 steps took 6.51 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5579
New best_val_rmse: 0.5579

16 steps took 6.51 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5453
New best_val_rmse: 0.5453

16 steps took 6.55 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.538
New best_val_rmse: 0.538

16 steps took 6.51 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5332
New best_val_rmse: 0.5332

16 steps took 6.84 seconds
Epoch: 1 batch_num: 3 val_rmse: 0.5567
Still best_val_rmse: 0.5332 (from epoch 0)

16 steps took 6.5 seconds
Epoch: 1 batch_num: 19 val_rmse: 0.5297
New best_val_rmse: 0.5297

16 steps took 6.54 seconds
Epoch: 1 batch_num: 35 

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



16 steps took 7.21 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.22
New best_val_rmse: 1.22

16 steps took 6.55 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9518
New best_val_rmse: 0.9518

16 steps took 6.56 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7003
New best_val_rmse: 0.7003

16 steps took 6.56 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6381
New best_val_rmse: 0.6381

16 steps took 6.56 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6306
New best_val_rmse: 0.6306

16 steps took 6.56 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6332
Still best_val_rmse: 0.6306 (from epoch 0)

16 steps took 6.55 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5775
New best_val_rmse: 0.5775

16 steps took 6.57 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5401
New best_val_rmse: 0.5401

16 steps took 6.88 seconds
Epoch: 1 batch_num: 3 val_rmse: 0.5843
Still best_val_rmse: 0.5401 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 1 batch_num: 19 val_rmse: 0.5249
New best_val_rmse: 0.5249

16 steps took 6.54 seconds
E

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



16 steps took 7.21 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9139
New best_val_rmse: 0.9139

16 steps took 6.55 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6786
New best_val_rmse: 0.6786

16 steps took 6.54 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6368
New best_val_rmse: 0.6368

16 steps took 6.54 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5899
New best_val_rmse: 0.5899

16 steps took 6.52 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5999
Still best_val_rmse: 0.5899 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6035
Still best_val_rmse: 0.5899 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6227
Still best_val_rmse: 0.5899 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5555
New best_val_rmse: 0.5555

16 steps took 6.85 seconds
Epoch: 1 batch_num: 3 val_rmse: 0.5431
New best_val_rmse: 0.5431

16 steps took 6.53 seconds
Epoch: 1 batch_num: 19 val_rmse: 0.5287
New best_val_rmse: 0.5287

16 step

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



16 steps took 7.22 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9178
New best_val_rmse: 0.9178

16 steps took 6.54 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7791
New best_val_rmse: 0.7791

16 steps took 6.55 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6109
New best_val_rmse: 0.6109

16 steps took 6.55 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7195
Still best_val_rmse: 0.6109 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5772
New best_val_rmse: 0.5772

16 steps took 6.54 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5917
Still best_val_rmse: 0.5772 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6578
Still best_val_rmse: 0.5772 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5138
New best_val_rmse: 0.5138

16 steps took 6.88 seconds
Epoch: 1 batch_num: 3 val_rmse: 0.5219
Still best_val_rmse: 0.5138 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 1 batch_num: 19 val_rmse: 0.5111
New best_val_rmse

# Evaluation

In [None]:
# Predict the test set with every fold's best checkpoint; row i of
# all_predictions holds the predictions of model_<i+1>.pth.
all_predictions = np.zeros((len(list_val_rmse), len(test_df)))

# Tokenize the test set once and reuse it for all fold models.
test_dataset = LitDataset(test_df, inference_only=True)
test_loader = DataLoader(test_dataset, batch_size=dataset_config["batch_size"],
                         drop_last=False, shuffle=False, num_workers=dataset_config["num_workers"])

for index in range(len(list_val_rmse)):
    model_path = f"model_{index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel(model_config["base_model"])
    # map_location lets a checkpoint saved from a GPU run load on a
    # CPU-only runtime as well.
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    all_predictions[index] = predict(model, test_loader)

    del model
    gc.collect()



Using model_1.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Using model_2.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Using model_3.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Using model_4.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Using model_5.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Generate Submission
# Work on a copy so re-running this cell never depends on (or changes) the
# sample submission frame loaded earlier.
submission_df = sample_submission.copy()

# Ensemble: average the predictions of all fold models per test row.
predictions = all_predictions.mean(axis=0)

# Item assignment always writes the column; attribute assignment would only
# set a Python attribute if the column did not already exist.
submission_df["target"] = predictions
print(submission_df)
submission_df.to_csv(evaluation_config["submission_path"], index=False)


          id    target
0  c0f722661 -0.525537
1  f0953f0a5 -0.493696
2  0df072751 -0.341860
3  04caf4e0c -2.563208
4  0e63f8bea -1.846457
5  12537fe78 -1.380682
6  965e592c0  0.255189
