# Prediction of readability score using text

We verify that a GPU is available for training with PyTorch.

In [1]:
!nvidia-smi
# Verify CUDA version is 11.0+
!nvcc -V

# Verify pytorch is using the GPU
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
gpu_found = torch.cuda.is_available()
device = torch.device("cuda" if gpu_found else "cpu")

if not gpu_found:
    # Fall back to the CPU and say so.
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and which one (index 0) is named.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

Thu May 13 21:51:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 466.27       Driver Version: 466.27       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   43C    P8    N/A /  N/A |     37MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Preprocessing

First, we need to load all the datasets that we are going to use to train the model and generate the submissions.

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd

# data_dir = '../input/commonlitreadabilityprize/'
data_dir = 'inputs/'
train_filepath = data_dir + 'train.csv'
test_filepath = data_dir + 'test.csv'
output_filepath = 'outputs/submission.csv'

# Hold out 20% of the labelled rows for validation. The seed is fixed so the
# partition is the same on every run.
train_frame, val_frame = train_test_split(
    pd.read_csv(train_filepath), test_size=0.2, random_state=42)
test_frame = pd.read_csv(test_filepath)

# Convert datasets to list of dictionaries (one dict per row)
train_data, val_data, test_data = (
    frame.to_dict('records') for frame in (train_frame, val_frame, test_frame))

Additionally, we build a dictionary of unigram frequency rankings from an external source.

In [3]:
# I'm using the ngrams available in https://github.com/hackerb9/gwordlist
unigram_url = 'https://raw.githubusercontent.com/hackerb9/gwordlist/master/frequency-alpha-gcide.txt'
raw_unigrams = pd.read_csv(unigram_url, sep='\t')
raw_unigrams.columns = ['gold_content', 'count', 'percent', 'cumulative']

# The first column is split once on the first space into a ranking part and a
# unigram part. `n` must be passed by keyword: recent pandas versions reject
# positional arguments other than `pat` in `Series.str.split`.
unigrams = pd.DataFrame(raw_unigrams.gold_content.str.split(' ', n=1).tolist(),
                                 columns = ['ranking','unigram'])
unigrams['unigram'] = unigrams['unigram'].str.strip()
unigrams['ranking'] = pd.to_numeric(unigrams["ranking"])

# Lookup table: unigram -> frequency ranking.
unigrams_dict = dict(zip(unigrams['unigram'], unigrams['ranking']))
unigrams

Unnamed: 0,ranking,unigram
0,1,the
1,2,of
2,3,and
3,4,to
4,5,in
...,...,...
65532,65533,nitrifier
65533,65534,fordo
65534,65535,elusory
65535,65536,foamless


## Feature Engineering

References:

https://www.kaggle.com/abhilashreddyy/a-deeper-eda-on-pos-tags-topic-modelling-more

https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline


## Pooling Layers

RoBERTa produces a useful sentence embedding at the `<s>` token (equivalent to the [CLS] token in BERT). However, we can also apply max and average pooling over all token embeddings to capture more of the sentence's semantics.

In [4]:
# Mean Pooling - Take attention mask into account for correct averaging
# Source: https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens
def mean_pooling(token_embeddings, attention_mask):
    """Average the token embeddings along dim 1, weighted by the attention mask.

    Args:
        token_embeddings: tensor of per-token embeddings; the reduction runs
            over dim 1 (presumably laid out as (batch, seq_len, hidden)).
        attention_mask: tensor with one entry per token; it is broadcast over
            the last dimension of `token_embeddings`.

    Returns:
        The masked sum over dim 1 divided by the mask total. The divisor is
        clamped to at least 1e-9 so an all-zero mask yields zeros, not NaN.
    """
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_total = (token_embeddings * mask).sum(1)
    mask_total = mask.sum(1).clamp(min=1e-9)
    return weighted_total / mask_total

# Max Pooling - Take attention mask into account for correct max
# Source: https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens
def max_pooling(token_embeddings, attention_mask):
    """Take the element-wise maximum of the token embeddings along dim 1.

    Positions whose mask entry is 0 are pushed to a large negative value
    before the reduction, so they can never win the max. Zeroing them instead
    would return 0 for any feature whose real values are all negative.

    Args:
        token_embeddings: tensor of per-token embeddings; the reduction runs
            over dim 1 (presumably laid out as (batch, seq_len, hidden)).
        attention_mask: 0/1 tensor with one entry per token; it is broadcast
            over the last dimension of `token_embeddings`.

    Returns:
        The per-feature maximum over the positions whose mask entry is 1.
    """
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Kept positions pass through unchanged; masked positions become -1e9.
    masked_embeddings = token_embeddings * input_mask_expanded + (1.0 - input_mask_expanded) * -1e9
    max_embeddings, _ = torch.max(masked_embeddings, 1)
    return max_embeddings

## Data Generator

I started in Deep Learning with TensorFlow, so I refer to Datasets as data generators.

In [25]:
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaModel

# Checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = 'roberta-base'

# Load the matching tokenizer, then the encoder, and place the encoder on the
# selected device (`Module.to` returns the module itself).
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
roberta_model = RobertaModel.from_pretrained(MODEL_NAME).to(device)

class CommonLitReadabilityDataset(Dataset):
    """Dataset that turns each record into RoBERTa-derived feature arrays.

    Every item is encoded on the fly with the module-level `tokenizer` and
    `roberta_model`; nothing is cached between calls.

    Args:
        dataset: indexable collection of dicts with the keys 'excerpt',
            'target' and 'standard_error'.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        """Encode one excerpt and return its features and targets as numpy arrays."""
        data = self.dataset[index]

        # Truncate over-long excerpts so the encoder never receives more
        # positions than it supports.
        tokens = tokenizer(data['excerpt'], return_tensors='pt', truncation=True)
        tokens = tokens.to(device)

        # The encoder is only a feature extractor here: skip autograd
        # bookkeeping so no graph is kept per sample.
        with torch.no_grad():
            outputs = roberta_model(**tokens)
            token_embeddings = outputs.last_hidden_state
            attention_mask = tokens['attention_mask']

            # Create the ROBERTA features for the model
            s_token = token_embeddings[:, 0, :]  # take <s> token (equiv. to [CLS])
            avg_embedding = mean_pooling(token_embeddings, attention_mask)
            max_embedding = max_pooling(token_embeddings, attention_mask)

        # Prepare the features for fine-tuning and additional features
        response = {
            'token_embeddings': token_embeddings.detach().to('cpu').numpy(),
            's_token_embedding': s_token.detach().to('cpu').numpy(),
            'max_embedding': max_embedding.detach().to('cpu').numpy(),
            'avg_embedding': avg_embedding.detach().to('cpu').numpy(),
            'target': np.array([data['target']]),
            'standard_error': np.array([data['standard_error']])
        }

        return response

Get the data loaders for training and validation.

In [40]:
from torch.utils.data import DataLoader
import numpy as np


def custom_collate(batch):
    """Collate dataset items into batched feature and target tensors.

    Args:
        batch: list of dicts holding the numpy arrays 's_token_embedding',
            'max_embedding', 'avg_embedding', 'target' and 'standard_error'.

    Returns:
        A dict with the same keys as the items. The three embedding entries
        are stacked along a new leading batch axis, with the per-item
        singleton axis removed. 'target' and 'standard_error' are float32
        tensors stacked the same way. 'token_embeddings' is always an empty
        placeholder tensor; the per-token embeddings are not batched.
    """
    def _stack_embeddings(key):
        stacked = torch.from_numpy(np.array([item[key] for item in batch]))
        # Remove only the per-item singleton axis. A bare squeeze() would
        # also drop the batch axis when the batch holds a single item.
        if stacked.dim() == 3:
            stacked = stacked.squeeze(1)
        return stacked

    def _stack_targets(key):
        # One ndarray -> tensor conversion instead of a Python list of arrays.
        return torch.from_numpy(np.array([item[key] for item in batch])).float()

    return {
        'token_embeddings': torch.from_numpy(np.array([])),
        # Features
        's_token_embedding': _stack_embeddings('s_token_embedding'),
        'max_embedding': _stack_embeddings('max_embedding'),
        'avg_embedding': _stack_embeddings('avg_embedding'),
        # Targets
        'target': _stack_targets('target'),
        'standard_error': _stack_targets('standard_error')
    }

BATCH_SIZE = 16

# Options common to both loaders: same batch size and collate function,
# loading in the main process, pinned host memory.
shared_loader_options = dict(
    batch_size=BATCH_SIZE, num_workers=0, collate_fn=custom_collate,
    pin_memory=True)

# Training batches are reshuffled every epoch.
ds_train = CommonLitReadabilityDataset(train_data)
training_loader = DataLoader(ds_train, shuffle=True, **shared_loader_options)

# Validation batches keep the dataset order.
ds_val = CommonLitReadabilityDataset(val_data)
validation_loader = DataLoader(ds_val, **shared_loader_options)

## Model

In [50]:
import torch.nn as nn
from torch.nn import MSELoss
import torch
# Inspiration: https://github.com/huggingface/transformers/blob/c40c7e213bdd0479bdca69df0c500004a7294d39/src/transformers/models/roberta/modeling_roberta.py#L1384

class CommonLitReadabilityModel(nn.Module):
    """Regression head over three concatenated sentence embeddings.

    The `<s>`, max-pooled and mean-pooled embeddings are concatenated and
    passed through dropout -> linear -> tanh -> dropout, then projected to a
    single score.

    Args:
        embedding_size: width of each of the three input embeddings.
        hidden_size: width of the hidden dense layer.
        dropout_rate: dropout probability applied before and after the dense
            layer.
    """

    def __init__(self, embedding_size=768, hidden_size=32, dropout_rate=0.1):
        super().__init__()
        self.dense = nn.Linear(embedding_size * 3, hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.target_proj = nn.Linear(hidden_size, 1)
        # Second head for the standard error. It is not used by the current
        # loss, but is kept so the module's parameters stay the same.
        self.std_dev_proj = nn.Linear(hidden_size, 1)

    def forward(self, token_embeddings, s_token_embedding, max_embedding, avg_embedding, target=None, standard_error=None):
        """Predict the score and, when `target` is given, the MSE loss.

        Args:
            token_embeddings: accepted for interface compatibility; unused.
            s_token_embedding, max_embedding, avg_embedding: 2-D tensors that
                are concatenated along dim 1.
            target: optional tensor of true scores; any shape holding one
                value per row is accepted.
            standard_error: accepted for interface compatibility; unused.

        Returns:
            Tuple `(logits_target, loss)`. `logits_target` has one column and
            `loss` is `None` when no target is supplied.
        """
        x = torch.cat((s_token_embedding, max_embedding, avg_embedding), dim=1)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)

        # Computing the logits
        logits_target = self.target_proj(x)

        # Computing the loss
        loss = None
        if target is not None:
            # Force both operands to (batch, 1). A (batch,) target would
            # otherwise broadcast to (batch, batch) inside MSELoss and give a
            # silently wrong loss.
            loss = MSELoss()(logits_target.view(-1, 1), target.reshape(-1, 1))

        return logits_target, loss

## Training settings

In [51]:
from fastprogress import master_bar, progress_bar
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import torch

# Build the regression head and place it on the selected device.
model = CommonLitReadabilityModel().to(device)

optimizer = AdamW(
    model.parameters(),
    lr = 3e-4, # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
)

NUM_EPOCHS = 4

# Total number of training steps: one optimizer step per batch, every epoch
TOTAL_STEPS = NUM_EPOCHS * len(training_loader)

# Progress bar that iterates over the epochs.
mb = master_bar(range(NUM_EPOCHS))

# Create the learning rate scheduler: linear schedule, no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = TOTAL_STEPS)

## Training

In [None]:
from sklearn.metrics import mean_squared_error

def _batch_to_device(batch):
    """Move a collated batch to `device` as float32 tensors, in the model's argument order."""
    keys = ('token_embeddings', 's_token_embedding', 'max_embedding',
            'avg_embedding', 'target', 'standard_error')
    return tuple(batch[key].to(device).float() for key in keys)


for epoch in mb:
    print(f'======================= Epoch {epoch + 1} / {NUM_EPOCHS} =========================')

    # Reset the total loss for this epoch in training phase
    total_train_loss = 0

    # Training phase
    model.train()
    for batch in progress_bar(training_loader, parent=mb):
        # Extract features
        *b_features, b_target, b_standard_error = _batch_to_device(batch)

        # Reset the optimizer: Don't reuse info about the last batches
        # It seems it is safer to zero_grad() the model instead of the optimizer
        # Source: https://discuss.pytorch.org/t/model-zero-grad-or-optimizer-zero-grad/28426/6
        model.zero_grad()

        # Calculate the forwards pass and the loss
        _, loss = model(*b_features, b_target, b_standard_error)

        # backward + optimize + schedule only if the model is in training phase
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Accumulate the training loss
        total_train_loss += loss.item()
    # Each accumulated value is already a per-batch mean, so the epoch average
    # divides by the number of batches, not the number of samples.
    print(f"Training loss: {total_train_loss / len(training_loader)}")

    # Reset the total loss for this epoch in validation phase.
    total_val_loss = 0

    # Tracking variables
    val_labels = []
    val_predictions = []

    # Validation phase
    model.eval()
    for batch in progress_bar(validation_loader, parent=mb):
        # Extract features
        *b_features, b_target, b_standard_error = _batch_to_device(batch)

        # There is no need to compute the graph for the forward pass
        # because we only need it for backprop (training)
        with torch.no_grad():
            # Calculate the forward pass and the loss
            logits, loss = model(*b_features, b_target, b_standard_error)

        # Accumulate the validation loss
        total_val_loss += loss.item()

        # Move labels and logits to CPU
        val_labels.extend(b_target.to('cpu').numpy().tolist())
        val_predictions.extend(logits.to('cpu').numpy().tolist())
    # Report validation metrics
    print(f'Validation loss = {total_val_loss / len(validation_loader)}')
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error is not available in every scikit-learn release.
    print(f'Validation RMSE = {mean_squared_error(val_labels, val_predictions) ** 0.5}')

## Error Analysis

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

# Predictions against true targets, with the identity line as the reference
# for a perfect fit.
fig, ax = plt.subplots()
ax.scatter(val_labels, val_predictions, s=2, c='g', alpha=0.5)
ax.plot(val_labels, val_labels)
ax.set_xlabel("Target")
ax.set_ylabel("Predictions")
ax.set_title(f'R^2: {r2_score(val_labels, val_predictions)}')
plt.show()