# Common Readability Prize using a regression model

We need to predict the readability score of a text. I'm going to treat this problem as a regression problem.

In [4]:
!nvidia-smi
!nvcc -V

# Verify pytorch is using the GPU
import torch
print(f'Torch: {torch.__version__}')

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Thu Jun  3 20:38:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    26W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Dataset

In [5]:
import pandas as pd

data_dir = '../input/commonlitreadabilityprize/'
train_filepath = f'{data_dir}train.csv'
test_filepath = f'{data_dir}test.csv'
unigrams_filepath = '../input/gwordlist/frequency-all.txt'
bigrams_filepath = '../input/peter-norvig-count-2wtxt/count_2w.txt'

train_data = pd.read_csv(train_filepath)


# License and URL
from urllib.parse import urlsplit

def get_base_url(url):
    url = str(url)
    split_url = urlsplit(url)
    return split_url.netloc

#license_dict = {x: index for index, x in enumerate(train_data['license'].unique())}
license_dict = {
    'CC BY 4.0': 0,
    'CC BY-SA 3.0 and GFDL': 1,
    'CC BY-SA 3.0': 2,
    'CC BY-NC-SA 2.0': 3,
    'CC BY 3.0': 4
}
train_data['license'] = train_data['license'].replace(license_dict)
train_data['url_legal'] = train_data['url_legal'].apply(get_base_url)
#url_dict = {x: index for index, x in enumerate(train_data['url_legal'].unique())}
url_dict = {
    '': 0,
    'simple.wikipedia.org': 1,
    'kids.frontiersin.org': 2,
    'en.wikipedia.org': 3,
    'www.africanstorybook.org': 4,
    'www.commonlit.org': 5,
    'www.digitallibrary.io': 6,
    'freekidsbooks.org': 7,
    'en.wikibooks.org': 8,
    'static.ehe.osu.edu': 9
}
train_data['url_legal'] = train_data['url_legal'].replace(url_dict)

test_data = pd.read_csv(test_filepath)
test_data['license'] = test_data['license'].map(license_dict)
test_data['url_legal'] = test_data['url_legal'].map(url_dict)

# Removing 1% of high loss labels after training
high_loss_labels = [
    "04ade0eb2", "bcd734621", "76f92b721", "03b761fd9", "afeb324bd", "9cbc92ce1",
    "83f9c17b9", "62dceff46", "78006971c", "23ff6b3c9", "6ee4f1df3", "15e2e9e7a",
    "99a602911", "dd54ca86d", "f28a4261d", "47e98a5c8", "4cf4a2fa3", "f04e03fd8",
    "02817cbd1", "dc05f5cbd", "f317805ab", "9ea0d2788", "322e67244", "8cc328cc3",
    "060fc57c6", "c913c40e9", "6923a71bd", "3c1674b21", "04fe69def", "551e0fc0b"
]
#train_data = train_data[~train_data['id'].isin(high_loss_labels)]

# Convert datasets to list of dictionaries
train_data = train_data.to_dict('records')
test_data = test_data.to_dict('records')

In [6]:
# I'm using the unigrams available at https://github.com/hackerb9/gwordlist
raw_unigrams = pd.read_csv(unigrams_filepath, sep='\t', )
raw_unigrams.columns = ['gold_content', 'count', 'percent', 'cumulative']
unigrams = pd.DataFrame(
    raw_unigrams.gold_content.str.split(' ', 1).tolist(),
    columns = ['ranking','unigram'])
unigrams['unigram'] = unigrams['unigram'].str.strip()
unigrams['ranking'] = pd.to_numeric(unigrams["ranking"])
unigrams_dict = dict(zip(unigrams['unigram'], unigrams['ranking']))
del unigrams
del raw_unigrams

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# I'm using the bigrams available at https://norvig.com/ngrams/count_2w.txt
raw_bigrams = pd.read_csv(bigrams_filepath, sep='\t', header=None)
raw_bigrams.columns = ['bigram', 'frequency']
raw_bigrams.sort_values(by='frequency', inplace=True, ignore_index=True, ascending=False)
bigrams_dict = {x['bigram']: index + 1 for index, x in enumerate(raw_bigrams.to_dict('records'))}
del raw_bigrams

# Features

## Manual Features

In [8]:
import spacy
import numpy as np
from nltk.util import ngrams
import nltk
nlp = spacy.load("en_core_web_lg")

def get_manual_features(text, license, url):
    # Inspiration: https://arxiv.org/pdf/2103.04083v1.pdf
    
    if url not in url_dict:
        url = len(url_dict)

    if license not in license_dict:
        license = len(license_dict)

    doc = nlp(text)
    difficulty_embedding = []
    unigram_embedding_size = 256
    bigram_embedding_size = 128
    NUM_UNIGRAMS = 7919828

    num_words = len(doc)
    num_sentences = len(list(doc.sents))
    num_characters = len(text)
    num_pronouns = 0
    num_verbs = 0
    num_adjectives = 0
    num_adverbs = 0
    num_nouns = 0
    num_determinants = 0
    num_punctuation = 0
    num_long_words = 0
    num_letters = 0
    num_unigrams_without_difficulty = 0
    words = {}

    for index, token in enumerate(doc):
        word_difficulty = unigrams_dict.get(token.text, None)
        if word_difficulty:
            difficulty_embedding.append(word_difficulty/NUM_UNIGRAMS)
        else:
            num_unigrams_without_difficulty += 1
        if token.pos_ == 'PRON':
            num_pronouns += 1
        if token.pos_ == 'VERB':
            num_verbs += 1
        if token.pos_ == 'ADJ':
            num_adjectives += 1
        if token.pos_ == 'ADV':
            num_adverbs += 1
        if token.pos_ in ['NOUN', 'PROPN']:
            num_nouns += 1
        if token.pos_ == 'DET':
            num_determinants += 1
        if token.pos_ == 'PUNCT':
            num_punctuation += 1
        if len(token.text) > 6:
            num_long_words += 1
        if token.is_alpha:
            num_letters += len(token.text)
        words[token.text] = 1
    
    # Difficulty of bigrams
    num_bigrams_without_difficulty = 0
    lower_text = text
    lower_text = lower_text.lower()
    tokens = nltk.word_tokenize(lower_text)
    bigrams = [' '.join(x) for x in ngrams(tokens, 2)]
    bigram_difficulty_embedding = []
    NUM_BIGRAMS = len(bigrams_dict)

    for bigram in bigrams:
        bigram_difficulty = bigrams_dict.get(bigram, None)
        if bigram_difficulty:
            bigram_difficulty_embedding.append(bigram_difficulty / NUM_BIGRAMS)
        else:
            num_bigrams_without_difficulty += 1

    # Document level features
    words_by_sentence = num_words / num_sentences
    characters_by_word = num_characters / num_words
    letters_by_word = num_letters / num_words
    long_words_by_word = num_long_words / num_words
    unique_words_by_word = len(words) / num_words
    avg_difficulty = sum(difficulty_embedding) / len(difficulty_embedding)
    avg_bigram_difficulty = sum(bigram_difficulty_embedding) / len(bigram_difficulty_embedding)

    # Difficulty features
    #difficulty_embedding = list(set(difficulty_embedding))
    difficulty_embedding.sort(reverse=True)
    difficulty_embedding = difficulty_embedding[:unigram_embedding_size] 
    difficulty_embedding = difficulty_embedding + [0] * (unigram_embedding_size - len(difficulty_embedding))
    #bigram_difficulty_embedding = list(set(bigram_difficulty_embedding))
    bigram_difficulty_embedding.sort(reverse=True)
    bigram_difficulty_embedding = bigram_difficulty_embedding[:bigram_embedding_size]
    bigram_difficulty_embedding = bigram_difficulty_embedding + [0] * (bigram_embedding_size - len(bigram_difficulty_embedding))

    manual_embedding = [
        license, url, num_words, num_sentences, num_characters, num_pronouns,
        num_verbs, num_adjectives, num_adverbs, num_nouns,
        num_determinants, num_punctuation,
        num_long_words, num_letters, words_by_sentence, characters_by_word,
        letters_by_word, long_words_by_word, unique_words_by_word,
        unique_words_by_word, avg_difficulty, avg_bigram_difficulty,
        num_unigrams_without_difficulty, num_bigrams_without_difficulty
    ]
    manual_embedding.extend(difficulty_embedding)
    manual_embedding.extend(bigram_difficulty_embedding)
    return np.array(manual_embedding)
text_sample = '''
While I was hailing the brig, I spied a tract of water lying between us, where no great waves came, but which yet boiled white all over and bristled in the moon with rings and bubbles. Sometimes the whole tract swung to one side, like the tail of a live serpent; sometimes, for a glimpse, it would all disappear and then boil up again. What it was I had no guess, which for the time increased my fear of it; but I now know it must have been the roost or tide-race, which had carried me away so fast and tumbled me about so cruelly, and at last, as if tired of that play, had flung out me and the spare yard upon its landward margin.
I now lay quite becalmed, and began to feel that a man can die of cold as well as of drowning. The shores of Earraid were close in; I could see in the moonlight the dots of heather and the sparkling of the mica in the rocks.
'''
features = get_manual_features(text_sample, 0, 0)
print(features.shape)
print(features)

(408,)
[5.00000000e+00 1.00000000e+01 1.98000000e+02 7.00000000e+00
 8.59000000e+02 1.50000000e+01 2.40000000e+01 1.00000000e+01
 2.00000000e+01 3.10000000e+01 2.50000000e+01 2.10000000e+01
 2.00000000e+01 6.64000000e+02 2.82857143e+01 4.33838384e+00
 3.35353535e+00 1.01010101e-01 6.11111111e-01 6.11111111e-01
 7.94848684e-04 1.64017255e-01 1.30000000e+01 9.40000000e+01
 7.65414350e-02 8.20169327e-03 6.04230294e-03 5.68774978e-03
 5.34557064e-03 4.12925634e-03 3.89010973e-03 2.56886892e-03
 2.55535853e-03 2.28578196e-03 2.21974518e-03 2.11393480e-03
 1.81897890e-03 1.61556539e-03 1.55760958e-03 1.54296280e-03
 1.38285831e-03 1.32894300e-03 1.24245122e-03 1.19447038e-03
 1.16669200e-03 1.16454549e-03 9.81586974e-04 8.16179341e-04
 7.59233660e-04 7.52667861e-04 6.74383333e-04 5.53168579e-04
 5.53168579e-04 5.51400864e-04 5.34355039e-04 5.01778574e-04
 5.00389655e-04 4.64404025e-04 3.68189814e-04 3.43568067e-04
 3.42684210e-04 3.40411433e-04 2.25636213e-04 1.68816798e-04
 1.44826378e-04 1

In [9]:
for index in range(len(train_data)):
    train_data[index]['manual_embedding'] = get_manual_features(
        train_data[index]['excerpt'], train_data[index]['license'], train_data[index]['url_legal'])

for index in range(len(test_data)):
    test_data[index]['manual_embedding'] = get_manual_features(
        test_data[index]['excerpt'], test_data[index]['license'], test_data[index]['url_legal'])

# Free some memory
del unigrams_dict
del bigrams_dict

## Text embeddings using Language Models: ROBERTA

ROBERTA produces useful sentence embeddings for the `<s>` token (equivalent to the [CLS] token in BERT). However, we can use average pooling to increase the semantics of embeddings.

In [10]:
# Mean Pooling - Take attention mask into account for correct averaging
# Source: https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens
def mean_pooling(token_embeddings, attention_mask):
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

def get_lm_embeddings(text, tokenizer, language_model, mask_words=True):
    tokens = tokenizer(
        text, return_tensors='pt', padding='max_length', max_length=350
    )
    # Source: https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/159505
    if mask_words:
        input_ids = np.array(tokens['input_ids'])
        uniform_dist = np.random.uniform(0, 1, input_ids.shape[0])
        # Do not mask <s> and </s> tokens
        uniform_dist[0] = 1
        uniform_dist[input_ids.shape[0] - 1] = 1
        # Replace 5% random tokens by <mask> token
        input_ids[uniform_dist<0.05] = 50264
        tokens['input_ids'] = torch.LongTensor(input_ids)
    tokens.to(device)
    outputs = language_model(output_hidden_states=True, **tokens)
    token_embeddings = outputs.last_hidden_state
    hidden_states = outputs.hidden_states

    # Concatenate the average pooling of each of the 13 layers
    attention_mask = tokens['attention_mask'].to(device)
    avg_embedding = None
    for layer_id in range(13):
        hidden_embeddings = hidden_states[layer_id]
        layer_avg_embedding = mean_pooling(hidden_embeddings, tokens['attention_mask'].to(device)).detach().to('cpu').numpy()
        if avg_embedding is None:
            avg_embedding = layer_avg_embedding[0]
        else:
            avg_embedding = np.concatenate((layer_avg_embedding[0], avg_embedding), axis=0)
    return avg_embedding

## Data Generators

In [11]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import numpy as np

class CommonLitReadabilityDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, index):
        data = self.dataset[index]
        target = 0.0
        if 'target' in data:
            target = data['target']

        response = {
            'lm_embedding': data['lm_embedding'],
            'manual_embedding': data['manual_embedding'],
            'target': np.array([target])
        }

        return response

def custom_collate(batch):
    """Get features and targets"""
    # Features
    lm_embedding = torch.from_numpy(np.array([item['lm_embedding'] for item in batch]))
    lm_embedding = torch.squeeze(lm_embedding)
    manual_embedding = torch.from_numpy(np.array([item['manual_embedding'] for item in batch]))
    manual_embedding = torch.squeeze(manual_embedding)

    # Targets
    target = torch.FloatTensor([item['target'] for item in batch])

    return {
        'lm_embedding': lm_embedding,
        'manual_embedding': manual_embedding,
        'target': target
    }

## Get Data Loaders

In [12]:
def load_embeddings_for_dataset(dataset, tokenizer, language_model, mask_words=True):
    """Get LM embeddings for a dataset"""
    for index in range(len(dataset)):
        dataset[index]['lm_embedding'] = get_lm_embeddings(
            dataset[index]['excerpt'], tokenizer, language_model, mask_words)
    return dataset

def get_data_loader(data, indices, shuffle=True, batch_size=32):
    """Get a Data Loader for a given dataset"""
    dataset = [data[idx].copy() for idx in indices]
    ds = CommonLitReadabilityDataset(dataset)
    loader = DataLoader(
        ds, batch_size=batch_size, num_workers=0, collate_fn=custom_collate,
        pin_memory=True, shuffle=shuffle)
    return loader

# Model

In [13]:
import torch.nn as nn
from torch.nn import MSELoss
import torch

class CommonLitReadabilityModel(nn.Module):
    def __init__(self, EMBEDDING_SIZE=768, hidden_size=768):
        super().__init__()
        DROPOUT_RATE = 0.0
        NUM_MANUAL_FEATURES = 408

        self.dense = nn.Linear(EMBEDDING_SIZE * 13 + NUM_MANUAL_FEATURES, hidden_size)
        self.dropout = nn.Dropout(DROPOUT_RATE)
        self.target_proj = nn.Linear(hidden_size, 1)
    
    def forward(
        self, manual_embedding, lm_embedding, target=None):
        x = torch.cat((manual_embedding, lm_embedding), dim=-1)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)

        # Computing the logits
        logits_target = self.target_proj(x)

        # Computing the loss
        loss = None
        if target is not None:
            loss_fct = MSELoss()
            loss = loss_fct(logits_target.view(-1, 1), target)
            loss = torch.sqrt(loss)

        return logits_target, loss

In [14]:
from copy import deepcopy
from fastprogress import master_bar, progress_bar
from sklearn.metrics import mean_squared_error
from transformers import (
    AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
)
import torch.optim.lr_scheduler as lr_scheduler
import torch.nn as nn
import torch
import os


def get_trained_model(config, train_indices=None, val_loader=None, embedding_size=768):
    """Train a model"""
    train_loader = get_data_loader(train_data, train_indices, batch_size=config['batch_size'])

    model = CommonLitReadabilityModel(embedding_size, config['hidden_size'])
    model.to(device)

    # Karphaty LR = 3e-4
    optimizer = AdamW(
        model.parameters(),
        lr=config['lr'], # args.learning_rate - default is 5e-5.
        weight_decay=config['weight_decay']
    )

    mb = master_bar(range(config['num_epochs']))

    # Total number of training steps
    TOTAL_STEPS = len(train_loader) * config['num_epochs']

    # Create the learning rate scheduler.
    # Source: https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-fit
    if config['scheduler_name'] == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(config['warmup_steps'] * TOTAL_STEPS),
            num_training_steps = TOTAL_STEPS)
    elif config['scheduler_name'] == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=TOTAL_STEPS
        )
    elif config['scheduler_name'] == 'cosine_warmup':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(config['warmup_steps'] * TOTAL_STEPS),
            num_training_steps=TOTAL_STEPS
        )

    # Define variables to keep track of models
    best_model_state = None
    best_rmse = 3.0
    best_loss = []
    best_optimizer = None
    
    for epoch in mb:
        # Reset the total loss for this epoch in training phase
        total_train_loss = 0
        num_train_samples = 0

        # Training phase
        model.train()
        for index, batch in enumerate(progress_bar(train_loader, parent=mb)):
            # Extract features
            b_manual_embedding = batch['manual_embedding'].to(device).float()
            b_lm_embedding = batch['lm_embedding'].to(device).float()
            b_target = batch['target'].to(device).float()

            # Reset the optimizer: Don't reuse info about the last batches
            # It seems it is safer to zero_grad() the model instead of the optimizer
            # Source: https://discuss.pytorch.org/t/model-zero-grad-or-optimizer-zero-grad/28426/6
            model.zero_grad()

            # Calculate the forwards pass and the loss
            _, loss = model(b_manual_embedding, b_lm_embedding, b_target)

            # backward + optimize + schedule only if the model is in training phase
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Accumulate the training loss
            total_train_loss += loss.item()
            num_train_samples += list(batch['target'].size())[0]
            
            if index % 10 == 0:
                # Reset the total loss for this epoch in validation phase.
                total_val_loss = 0
                num_val_samples = 0

                # Tracking variables
                val_labels = []
                val_predictions = []
                local_loss = []

                # Validation phase
                model.eval()
                for batch in progress_bar(val_loader, parent=mb):
                    # Extract features
                    b_manual_embedding = batch['manual_embedding'].to(device).float()
                    b_lm_embedding = batch['lm_embedding'].to(device).float()
                    b_target = batch['target'].to(device).float()

                    # There is no need to compute the graph for the forward pass
                    # because we only need it for backprop (training)
                    with torch.no_grad():
                        # Calculate the forward pass and the loss
                        logits, loss = model(b_manual_embedding, b_lm_embedding, b_target)

                    # Accumulate the validation loss
                    total_val_loss += loss.item()
                    local_loss.append(loss.item())
                    num_val_samples += list(batch['target'].size())[0]

                    # Move labels and logits to CPU
                    labels = b_target.to('cpu').numpy()
                    val_labels.extend(labels.tolist())
                    predictions = logits.to('cpu').numpy()
                    val_predictions.extend(predictions.tolist())
                # Report validation metrics
                val_rmse = mean_squared_error(val_labels, val_predictions, squared=False)

                # Save best model
                if val_rmse < best_rmse:
                    best_rmse = val_rmse
                    best_model_state = deepcopy(model.state_dict())
                    best_loss = local_loss
                    best_optimizer = deepcopy(optimizer.state_dict())
                model.train()

    # Load the best model
    best_model = CommonLitReadabilityModel(embedding_size, config['hidden_size'])
    best_model.load_state_dict(best_model_state)
    best_model.to(device)
    return best_model, best_rmse, best_loss

In [15]:
def generate_predictions_from_model(model, test_loader):
    """Get predictions from a model"""
    test_predictions = []
    model.eval()
    for batch in test_loader:
        # Extract features
        b_manual_embedding = batch['manual_embedding'].to(device).float()
        b_lm_embedding = batch['lm_embedding'].to(device).float()
        b_target = batch['target'].to(device).float()

        # There is no need to compute the graph for the forward pass
        # because we only need it for backprop (training)
        with torch.no_grad():
            # Calculate the forward pass and the loss
            logits, _ = model(b_manual_embedding, b_lm_embedding)

        # Move logits to CPU
        predictions = logits.to('cpu').numpy()
        test_predictions.extend(predictions.tolist())
    return np.array(test_predictions)

# Utils

In [16]:
def get_loss_df(loss_array, val_idx):
    return pd.DataFrame({
        'id': [train_data[index]['id'] for index in val_idx],
        'loss': loss_array
    })

# Training

In [17]:
from sklearn.model_selection import KFold
from transformers import RobertaTokenizer, RobertaModel
from scipy.stats import loguniform


def get_predictions(model_name, embedding_size=768, num_folds=5):
    global train_data
    global test_data

    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    language_model = RobertaModel.from_pretrained(model_name)
    language_model.to(device)
    
    # Calculating LM embeddings for a dataset
    train_data = load_embeddings_for_dataset(
        train_data, tokenizer, language_model)
    test_data = load_embeddings_for_dataset(
        test_data, tokenizer, language_model, mask_words=False)
    
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    predictions = np.zeros(len(test_data))
    loss_df = None
    sum_rmse = 0

    for index, (train_idx, val_idx) in enumerate(kf.split(train_data)):
        print(f'----------------- KFold = {index} ----------------------')
        # Create Data Loaders
        val_loader = get_data_loader(
            train_data, val_idx, shuffle=False, batch_size=1)
        test_loader = get_data_loader(
            test_data, range(len(test_data)), shuffle=False)

        # Train the model with hyperparameters
        best_model = None
        best_loss_array = []
        best_rmse = 3.0
        num_hyperparameter_samples = 10
        for index in range(num_hyperparameter_samples):
            config = {
                'batch_size': int(np.random.randint(4, 10)),
                'hidden_size': np.random.choice([768, 896, 1024, 1280, 1536, 1792, 2048]),
                'lr': loguniform(8e-6, 7e-5).rvs(1)[0],
                'weight_decay': np.random.uniform(0, 5e-2),
                'num_epochs': np.random.randint(6, 14),
                'scheduler_name': np.random.choice(['linear', 'cosine', 'cosine_warmup']),
                'warmup_steps': np.random.uniform(0, 0.3)
            }
            model, rmse, loss_array = get_trained_model(
                config, train_idx, val_loader, embedding_size=embedding_size)
            
            # Print metrics
            print(f'Hyperparameters: {config}')
            print(f'RMSE: {rmse}')
            print('-------------')
            
            # Getting the model with the best hyperparameters
            if rmse < best_rmse:
                best_model = model
                best_loss_array = loss_array
                best_rmse = rmse

        print(f'Best RMSE in this fold: {best_rmse}')

        sum_rmse += best_rmse
        if index == 0:
            loss_df = get_loss_df(best_loss_array, val_idx)
        else:
            loss_df = pd.concat([loss_df, get_loss_df(best_loss_array, val_idx)])

        # Get predictions
        local_predictions = generate_predictions_from_model(best_model, test_loader)
        local_predictions = local_predictions.reshape(-1)
        predictions += local_predictions
    print('--------------------------------------------')
    print(f'CV RMSE: {sum_rmse / num_folds}')
    return (predictions / num_folds), loss_df

# Generating the outputs

In [18]:
predictions, loss_df = get_predictions('../input/robertalarge', embedding_size=1024)
test_ids = [x['id'] for x in test_data]
submission = pd.DataFrame({'id': test_ids,'target': predictions})
submission.to_csv('submission.csv',index=False)

# Ordering samples by loss
# Source: https://twitter.com/karpathy/status/1311884485676294151
loss_df.sort_values(by='loss', ascending=False, inplace=True)
loss_df.to_csv('loss.csv', index=False)
loss_df

----------------- KFold = 0 ----------------------


Hyperparameters: {'batch_size': 4, 'hidden_size': 2048, 'lr': 8.748689697732768e-06, 'weight_decay': 0.004245788207908829, 'num_epochs': 9, 'scheduler_name': 'cosine', 'warmup_steps': 0.10505472954335983}
RMSE: 0.5066691802865106
-------------


Hyperparameters: {'batch_size': 4, 'hidden_size': 1024, 'lr': 4.86147908735894e-05, 'weight_decay': 0.03304807365607703, 'num_epochs': 11, 'scheduler_name': 'cosine', 'warmup_steps': 0.24823091272585526}
RMSE: 0.4979806868149192
-------------


Hyperparameters: {'batch_size': 9, 'hidden_size': 2048, 'lr': 3.50505580927901e-05, 'weight_decay': 0.03913888570350431, 'num_epochs': 11, 'scheduler_name': 'linear', 'warmup_steps': 0.17109979761006525}
RMSE: 0.49857411734320517
-------------


Hyperparameters: {'batch_size': 6, 'hidden_size': 1536, 'lr': 1.8393148242755756e-05, 'weight_decay': 0.046867982030249694, 'num_epochs': 11, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.1757376538627818}
RMSE: 0.499610433362183
-------------


Hyperparameters: {'batch_size': 6, 'hidden_size': 1536, 'lr': 9.453732619537727e-06, 'weight_decay': 0.03696072905751617, 'num_epochs': 11, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.01992667071380484}
RMSE: 0.5051983595465287
-------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 1024, 'lr': 1.9396630498151293e-05, 'weight_decay': 0.014514432548442702, 'num_epochs': 10, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.17655301861633826}
RMSE: 0.5036248891106291
-------------


Hyperparameters: {'batch_size': 6, 'hidden_size': 1536, 'lr': 3.169240293362089e-05, 'weight_decay': 0.046662754539125675, 'num_epochs': 8, 'scheduler_name': 'cosine', 'warmup_steps': 0.20157921143778298}
RMSE: 0.502835181587306
-------------


Hyperparameters: {'batch_size': 7, 'hidden_size': 1536, 'lr': 3.895895882361935e-05, 'weight_decay': 0.04819304716472199, 'num_epochs': 9, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.029598888261904887}
RMSE: 0.5008730864843179
-------------


Hyperparameters: {'batch_size': 6, 'hidden_size': 768, 'lr': 4.551821463147036e-05, 'weight_decay': 0.00735471528780069, 'num_epochs': 7, 'scheduler_name': 'cosine', 'warmup_steps': 0.23678793671504653}
RMSE: 0.502290502429344
-------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 1792, 'lr': 3.8578517958472054e-05, 'weight_decay': 0.018229248292004403, 'num_epochs': 9, 'scheduler_name': 'cosine', 'warmup_steps': 0.18438404602820765}
RMSE: 0.5012800942160024
-------------
Best RMSE in this fold: 0.4979806868149192
----------------- KFold = 1 ----------------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 768, 'lr': 3.5477549297560426e-05, 'weight_decay': 0.0004948255422831838, 'num_epochs': 6, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.06263851104736624}
RMSE: 0.5162361538913575
-------------


Hyperparameters: {'batch_size': 7, 'hidden_size': 1024, 'lr': 3.9999843850448205e-05, 'weight_decay': 0.021433679084563326, 'num_epochs': 10, 'scheduler_name': 'cosine', 'warmup_steps': 0.0958546409463325}
RMSE: 0.5074048550510822
-------------


Hyperparameters: {'batch_size': 6, 'hidden_size': 1792, 'lr': 1.3924325146914747e-05, 'weight_decay': 0.013496314910723612, 'num_epochs': 8, 'scheduler_name': 'linear', 'warmup_steps': 0.13022211676073428}
RMSE: 0.5106800626019734
-------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 1536, 'lr': 1.5082806136707312e-05, 'weight_decay': 0.0137664329684909, 'num_epochs': 13, 'scheduler_name': 'cosine', 'warmup_steps': 0.2530133618752249}
RMSE: 0.509098218786536
-------------


Hyperparameters: {'batch_size': 9, 'hidden_size': 1024, 'lr': 1.4722864024058411e-05, 'weight_decay': 0.03524908144259702, 'num_epochs': 6, 'scheduler_name': 'cosine', 'warmup_steps': 0.26671178037777893}
RMSE: 0.5234145114650307
-------------


Hyperparameters: {'batch_size': 4, 'hidden_size': 1024, 'lr': 3.2402803103667186e-05, 'weight_decay': 0.029409438837385362, 'num_epochs': 6, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.10909099847267352}
RMSE: 0.5157627610822956
-------------


Hyperparameters: {'batch_size': 5, 'hidden_size': 768, 'lr': 4.403581868506273e-05, 'weight_decay': 0.04497062569111342, 'num_epochs': 6, 'scheduler_name': 'cosine', 'warmup_steps': 0.07499738402305879}
RMSE: 0.5206544261528179
-------------


Hyperparameters: {'batch_size': 4, 'hidden_size': 896, 'lr': 1.2583231380129073e-05, 'weight_decay': 0.046249791139150884, 'num_epochs': 12, 'scheduler_name': 'linear', 'warmup_steps': 0.2721467610647034}
RMSE: 0.5063282994205504
-------------


Hyperparameters: {'batch_size': 4, 'hidden_size': 896, 'lr': 8.915979439538967e-06, 'weight_decay': 0.00140078878210278, 'num_epochs': 9, 'scheduler_name': 'linear', 'warmup_steps': 0.24679667061815394}
RMSE: 0.5140121299612722
-------------


Hyperparameters: {'batch_size': 7, 'hidden_size': 1280, 'lr': 2.2549937333726784e-05, 'weight_decay': 0.03628806588544357, 'num_epochs': 6, 'scheduler_name': 'cosine', 'warmup_steps': 0.049526019645682325}
RMSE: 0.5229443838633221
-------------
Best RMSE in this fold: 0.5063282994205504
----------------- KFold = 2 ----------------------


Hyperparameters: {'batch_size': 9, 'hidden_size': 1280, 'lr': 8.30380973333873e-06, 'weight_decay': 0.012384542787557535, 'num_epochs': 6, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.12854772584008414}
RMSE: 0.5197040503547814
-------------


Hyperparameters: {'batch_size': 7, 'hidden_size': 2048, 'lr': 5.94426575642029e-05, 'weight_decay': 0.03737005537206673, 'num_epochs': 10, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.22735931742235616}
RMSE: 0.4981763361723056
-------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 896, 'lr': 8.600425421016075e-06, 'weight_decay': 0.04270199591151267, 'num_epochs': 8, 'scheduler_name': 'cosine', 'warmup_steps': 0.15990707917831776}
RMSE: 0.5115493454961639
-------------


Hyperparameters: {'batch_size': 5, 'hidden_size': 768, 'lr': 6.345827191501508e-05, 'weight_decay': 0.013286383350653292, 'num_epochs': 13, 'scheduler_name': 'cosine', 'warmup_steps': 0.157498632963248}
RMSE: 0.4967094074487955
-------------


Hyperparameters: {'batch_size': 6, 'hidden_size': 896, 'lr': 1.0249753788753402e-05, 'weight_decay': 0.04110117774968526, 'num_epochs': 11, 'scheduler_name': 'cosine', 'warmup_steps': 0.1814086541328904}
RMSE: 0.5016072924455439
-------------


Hyperparameters: {'batch_size': 6, 'hidden_size': 1280, 'lr': 1.9055718638400564e-05, 'weight_decay': 0.00768206363301981, 'num_epochs': 11, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.2737426017433533}
RMSE: 0.4968877179611011
-------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 1792, 'lr': 5.160404676911979e-05, 'weight_decay': 8.171588245448614e-05, 'num_epochs': 6, 'scheduler_name': 'linear', 'warmup_steps': 0.13820679169706077}
RMSE: 0.4999441770525927
-------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 1024, 'lr': 6.129337316294255e-05, 'weight_decay': 0.018889324980580338, 'num_epochs': 13, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.15122501953546796}
RMSE: 0.4977932745258283
-------------


Hyperparameters: {'batch_size': 9, 'hidden_size': 1024, 'lr': 1.3443538880908077e-05, 'weight_decay': 0.04200517328017774, 'num_epochs': 10, 'scheduler_name': 'cosine', 'warmup_steps': 0.06348614061686582}
RMSE: 0.49926185589276545
-------------


Hyperparameters: {'batch_size': 9, 'hidden_size': 1280, 'lr': 1.736619249185331e-05, 'weight_decay': 0.01484904883830106, 'num_epochs': 7, 'scheduler_name': 'linear', 'warmup_steps': 0.1992322018252112}
RMSE: 0.5032434785362141
-------------
Best RMSE in this fold: 0.4967094074487955
----------------- KFold = 3 ----------------------


Hyperparameters: {'batch_size': 5, 'hidden_size': 1536, 'lr': 2.1758974513877916e-05, 'weight_decay': 0.003345761141172954, 'num_epochs': 11, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.13911050720361692}
RMSE: 0.5168468686717327
-------------


Hyperparameters: {'batch_size': 7, 'hidden_size': 1792, 'lr': 4.61869723825839e-05, 'weight_decay': 0.03994917050733861, 'num_epochs': 12, 'scheduler_name': 'linear', 'warmup_steps': 0.24846212097882378}
RMSE: 0.5144203655663465
-------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 1792, 'lr': 1.265128332500921e-05, 'weight_decay': 0.0396060273583875, 'num_epochs': 8, 'scheduler_name': 'cosine', 'warmup_steps': 0.20315723323584658}
RMSE: 0.5232528260616728
-------------


Hyperparameters: {'batch_size': 5, 'hidden_size': 768, 'lr': 3.8547586679010496e-05, 'weight_decay': 0.028985615527994748, 'num_epochs': 6, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.12205467176545304}
RMSE: 0.5220968633337245
-------------


Hyperparameters: {'batch_size': 4, 'hidden_size': 768, 'lr': 1.767519419942351e-05, 'weight_decay': 0.02894133463653958, 'num_epochs': 11, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.15309907331954814}
RMSE: 0.5159306031475913
-------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 1792, 'lr': 9.267751706192318e-06, 'weight_decay': 0.004318358832349978, 'num_epochs': 8, 'scheduler_name': 'cosine', 'warmup_steps': 0.10551949746662818}
RMSE: 0.5273700922068304
-------------


Hyperparameters: {'batch_size': 5, 'hidden_size': 768, 'lr': 1.1046079231264008e-05, 'weight_decay': 0.0452930799575003, 'num_epochs': 13, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.03310971456854917}
RMSE: 0.5150867274822962
-------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 1536, 'lr': 1.5568253552614374e-05, 'weight_decay': 0.0158080696654828, 'num_epochs': 11, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.2595951792834051}
RMSE: 0.5156468691670459
-------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 768, 'lr': 2.724276886872198e-05, 'weight_decay': 0.013445295202269931, 'num_epochs': 6, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.2856203235346349}
RMSE: 0.5201253876992531
-------------


Hyperparameters: {'batch_size': 9, 'hidden_size': 896, 'lr': 5.841468444320575e-05, 'weight_decay': 0.004901881272604808, 'num_epochs': 9, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.24770474019515062}
RMSE: 0.5154642458631632
-------------
Best RMSE in this fold: 0.5144203655663465
----------------- KFold = 4 ----------------------


Hyperparameters: {'batch_size': 7, 'hidden_size': 1536, 'lr': 6.752259723554071e-05, 'weight_decay': 0.03449615422727958, 'num_epochs': 6, 'scheduler_name': 'cosine', 'warmup_steps': 0.014263622722490587}
RMSE: 0.5240104043395085
-------------


Hyperparameters: {'batch_size': 9, 'hidden_size': 1024, 'lr': 1.2440844520563466e-05, 'weight_decay': 0.017633578526945755, 'num_epochs': 13, 'scheduler_name': 'linear', 'warmup_steps': 0.21036119884228602}
RMSE: 0.5187535934496875
-------------


Hyperparameters: {'batch_size': 5, 'hidden_size': 1536, 'lr': 1.2852983174754012e-05, 'weight_decay': 0.045545918202767564, 'num_epochs': 10, 'scheduler_name': 'cosine', 'warmup_steps': 0.09195042281844595}
RMSE: 0.5180052634813648
-------------


Hyperparameters: {'batch_size': 8, 'hidden_size': 1024, 'lr': 2.029848825066884e-05, 'weight_decay': 0.007177684912383015, 'num_epochs': 11, 'scheduler_name': 'cosine', 'warmup_steps': 0.2509877646721011}
RMSE: 0.5179403465578645
-------------


Hyperparameters: {'batch_size': 6, 'hidden_size': 1280, 'lr': 1.9147402583610943e-05, 'weight_decay': 0.02813516155396439, 'num_epochs': 8, 'scheduler_name': 'linear', 'warmup_steps': 0.13869784383705938}
RMSE: 0.5196396997878777
-------------


Hyperparameters: {'batch_size': 5, 'hidden_size': 1024, 'lr': 1.8141786847665872e-05, 'weight_decay': 0.01695064066791705, 'num_epochs': 12, 'scheduler_name': 'cosine', 'warmup_steps': 0.22136253256341412}
RMSE: 0.518635572920111
-------------


Hyperparameters: {'batch_size': 5, 'hidden_size': 1024, 'lr': 1.2252701959714999e-05, 'weight_decay': 0.028373042098492774, 'num_epochs': 8, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.0922344284477524}
RMSE: 0.5200845445903509
-------------


Hyperparameters: {'batch_size': 4, 'hidden_size': 1536, 'lr': 1.2132093459294584e-05, 'weight_decay': 0.004253160790899435, 'num_epochs': 6, 'scheduler_name': 'cosine', 'warmup_steps': 0.17539112207886923}
RMSE: 0.5245911143066301
-------------


Hyperparameters: {'batch_size': 4, 'hidden_size': 896, 'lr': 1.0401291696853708e-05, 'weight_decay': 0.04511981099300953, 'num_epochs': 13, 'scheduler_name': 'cosine_warmup', 'warmup_steps': 0.14703105146791415}
RMSE: 0.5200062053541403
-------------


Hyperparameters: {'batch_size': 4, 'hidden_size': 1536, 'lr': 1.3272207946288699e-05, 'weight_decay': 0.030757515329971744, 'num_epochs': 7, 'scheduler_name': 'linear', 'warmup_steps': 0.17985072578004174}
RMSE: 0.5239829100884746
-------------
Best RMSE in this fold: 0.5179403465578645
--------------------------------------------
CV RMSE: 0.5066758211616952


Unnamed: 0,id,loss
529,bcd734621,1.976578
215,03b761fd9,1.975299
526,04ade0eb2,1.945604
377,76f92b721,1.920907
175,4cf4a2fa3,1.651317
...,...,...
346,a21bfa111,0.000364
439,a852fb41d,0.000306
135,e882f463d,0.000082
19,3723e3a8f,0.000052
