# Predict readability score using text

We first verify that a GPU is available for training with PyTorch.

## Ideas

* Cross validation K-Folds
* Ensembles of 6 models

In [None]:
!nvidia-smi
# Verify CUDA version is 11.0+
!nvcc -V

# Verify pytorch is using the GPU
import torch
print(f'Torch: {torch.__version__}')

# Select the compute device once; later cells move the models and tensors onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

## Preprocessing
First, we need to load all the datasets that we are going to use to train the model and generate the submissions.

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Locations of the competition data and of the external unigram-frequency list.
data_dir = '../input/commonlitreadabilityprize/'
train_filepath = f'{data_dir}train.csv'
test_filepath = f'{data_dir}test.csv'
unigrams_filepath = '../input/gwordlist/frequency-all.txt'

train_data = pd.read_csv(train_filepath)
# Floor the standard error at 0.4: any smaller value (including 0) is raised to 0.4.
train_data.standard_error = np.maximum(train_data.standard_error, 0.4)
# Disabled alternative: drop rows whose target or standard error is exactly 0.
# train_data = train_data[((train_data['target'] != 0) & (train_data['standard_error'] != 0))]
# Hold out 20% of the rows for validation. The fixed random_state makes the
# split reproducible; the two target splits returned by the call are discarded.
train_data, val_data, _, _ = train_test_split(
    train_data, train_data['target'], test_size=0.2, random_state=18)
# Round the training frame to one decimal place. This is applied to the
# training split only; val_data keeps its unrounded values.
train_data = train_data.round(1)
#train_data.sort_values(by='target', inplace=True)
test_data = pd.read_csv(test_filepath)
# Placeholder labels so test records carry the same keys as train/val records.
test_data['target'] = 0
test_data['standard_error'] = 0

# Convert datasets to list of dictionaries (one dict per row)
train_data = train_data.to_dict('records')
val_data = val_data.to_dict('records')
test_data = test_data.to_dict('records')

We need an external file with unigram frequencies: https://github.com/hackerb9/gwordlist/blob/master/frequency-alpha-gcide.txt

In [None]:
# Load the tab-separated unigram frequency list and name its four columns.
raw_unigrams = pd.read_csv(unigrams_filepath, sep='\t')
raw_unigrams.columns = ['gold_content', 'count', 'percent', 'cumulative']

# The first column holds "<ranking> <unigram>"; split it on the first space only.
# `n` is passed by keyword because recent pandas releases no longer accept it
# positionally in `Series.str.split`.
unigrams = pd.DataFrame(
    raw_unigrams.gold_content.str.split(' ', n=1).tolist(),
    columns=['ranking', 'unigram'])
unigrams['unigram'] = unigrams['unigram'].str.strip()
unigrams['ranking'] = pd.to_numeric(unigrams["ranking"])

# Lookup table word -> frequency ranking, used by the manual-feature extractor.
unigrams_dict = dict(zip(unigrams['unigram'], unigrams['ranking']))
unigrams

## Oversampling

Oversample the 500 samples that lie closest to the quadratic function that best fits standard error as a function of target.

In [None]:
"""
subset = pd.DataFrame(train_data)
subset_1 = subset[((subset['target'] >= -0.5) & (subset['target'] < 0))]
subset_2 = subset[subset['target'] < -2.5]
subset_3 = subset[((subset['target'] >= -1.5) & (subset['target'] < -1.0))]
train_data.extend(subset_1.to_dict('records'))
train_data.extend(subset_2.to_dict('records'))
train_data.extend(subset_2.to_dict('records'))
train_data.extend(subset_3.to_dict('records'))

oversampled_data = pd.DataFrame(train_data)
oversampled_data['residuals'] = 0.489 + 0.0377 * oversampled_data['target'] + 0.0197 * oversampled_data['target'] * oversampled_data['target'] - oversampled_data['standard_error']
oversampled_data['residuals'] = oversampled_data['residuals'].abs()
oversampled_data.sort_values(by='residuals', inplace=True)
oversampled_data = oversampled_data.to_dict('records')
train_data.extend(oversampled_data[:500])
"""

## Manual features

In [None]:
import spacy
nlp = spacy.load("en_core_web_lg")

def get_manual_features(text):
    """Compute hand-crafted readability features for one excerpt.

    Inspiration: https://arxiv.org/pdf/2103.04083v1.pdf

    The text is parsed with the module-level spaCy pipeline `nlp`, and word
    rankings are looked up in the module-level `unigrams_dict`.

    Args:
        text: The excerpt as a string.

    Returns:
        A list of exactly 45 numbers:

        * 13 document-level features, in order: token count, sentence count,
          character count, pronoun count, long-word count (more than 6
          characters), letter count (characters of alphabetic tokens),
          tokens per sentence, characters per token, letters per token,
          long-word ratio, unique-token ratio, unique-token ratio (repeated;
          kept so the vector length stays at the 45 entries the model
          expects), and mean inverse ranking of the known words.
        * 32 difficulty features: the smallest distinct inverse rankings
          (i.e. the rarest known words) in ascending order, padded with
          0.0 at the end when the text has fewer than 32 distinct values.

        Ratios whose denominator is zero (e.g. empty text, or no token found
        in `unigrams_dict`) are reported as 0.0 instead of raising.
    """
    doc = nlp(text)
    embedding_size = 32

    num_words = len(doc)
    num_sentences = len(list(doc.sents))
    num_characters = len(text)
    num_pronouns = 0
    num_long_words = 0
    num_letters = 0
    unique_words = set()
    difficulty_embedding = []

    for token in doc:
        word_difficulty = unigrams_dict.get(token.text, None)
        # Tokens that are missing from the list (or have a falsy ranking) are skipped.
        if word_difficulty:
            difficulty_embedding.append(1 / word_difficulty)
        if token.pos_ == 'PRON':
            num_pronouns += 1
        if len(token.text) > 6:
            num_long_words += 1
        if token.is_alpha:
            num_letters += len(token.text)
        unique_words.add(token.text)

    def _ratio(numerator, denominator):
        # Guarded division: a zero denominator yields 0.0 instead of an exception.
        return numerator / denominator if denominator else 0.0

    # Document level features
    words_by_sentence = _ratio(num_words, num_sentences)
    characters_by_word = _ratio(num_characters, num_words)
    letters_by_word = _ratio(num_letters, num_words)
    long_words_by_word = _ratio(num_long_words, num_words)
    unique_words_by_word = _ratio(len(unique_words), num_words)
    avg_difficulty = _ratio(sum(difficulty_embedding), len(difficulty_embedding))

    # Difficulty features: distinct inverse rankings, smallest (rarest) first,
    # truncated or zero-padded to a fixed length so every excerpt yields a
    # vector of the same size.
    rarest = sorted(set(difficulty_embedding))[:embedding_size]
    rarest.extend([0.0] * (embedding_size - len(rarest)))

    manual_embedding = [
        num_words, num_sentences, num_characters, num_pronouns,
        num_long_words, num_letters, words_by_sentence, characters_by_word,
        letters_by_word, long_words_by_word, unique_words_by_word,
        unique_words_by_word, avg_difficulty
    ]
    manual_embedding.extend(rarest)
    return manual_embedding
# Smoke test: run the feature extractor on one short passage and display the result.
text_sample = '''
More people came to the bus stop just before 9am. Half an hour later they are all still waiting. Sam is worried. "Maybe the bus broke down," he thinks. "Maybe we won't go to town today. Maybe I won't get my new school uniform." At 9:45am some people give up and go home. Sam starts to cry. "We will wait a bit longer," says his mother. Suddenly, they hear a noise. The bus is coming! The bus arrives at the stop at 10 o'clock. "Get in! Get in!" calls the driver. "We are very late today!" People get on the bus and sit down. The bus leaves the stop at 10:10am. "What time is the return bus this afternoon?" asks Sam's mother. "The blue bus leaves town at 2:30pm," replies the driver. Sam thinks, "We will get to town at 11 o'clock." "How much time will we have in town before the return bus?" wonders Sam.
'''
features = get_manual_features(text_sample)
features

In [None]:
# Attach the hand-crafted feature vector to every record of each split, in place.
for split_records in (train_data, val_data, test_data):
    for record in split_records:
        record['manual_embedding'] = get_manual_features(record['excerpt'])

## Pooling Layers

RoBERTa produces a useful sentence embedding at the `<s>` token (equivalent to the [CLS] token in BERT). In addition, we apply max and mean pooling over the token embeddings to obtain richer sentence-level representations.

In [None]:
# Mean Pooling - Take attention mask into account for correct averaging
# Source: https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens
def mean_pooling(token_embeddings, attention_mask):
    """Average the token embeddings over the sequence axis, ignoring masked tokens.

    Args:
        token_embeddings: Tensor indexed as (batch, sequence, hidden).
        attention_mask: Tensor indexed as (batch, sequence); non-zero marks
            tokens that take part in the average.

    Returns:
        Tensor indexed as (batch, hidden) holding the masked mean. The divisor
        is clamped to 1e-9 so a fully masked row does not divide by zero.
    """
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts

# Max Pooling - Take attention mask into account for correct max
# Source: https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens
def max_pooling(token_embeddings, attention_mask):
    """Take the element-wise maximum over the sequence axis, ignoring masked tokens.

    Masked (padding) positions are replaced by the lowest finite value of the
    embedding dtype before the maximum is taken. Multiplying by the mask
    instead would turn padding into 0, which wrongly wins the maximum
    whenever every real token is negative in some dimension.

    Args:
        token_embeddings: Floating-point tensor indexed as (batch, sequence, hidden).
        attention_mask: Tensor indexed as (batch, sequence); non-zero marks
            real tokens, zero marks padding.

    Returns:
        Tensor indexed as (batch, hidden) with the per-dimension maximum over
        the unmasked tokens. The input tensor is not modified.
    """
    keep = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).bool()
    lowest = torch.finfo(token_embeddings.dtype).min
    # masked_fill is out-of-place, so the caller's tensor stays untouched.
    masked_embeddings = token_embeddings.masked_fill(~keep, lowest)
    max_embeddings, _ = torch.max(masked_embeddings, 1)
    return max_embeddings

## Data Generator

I started in deep learning with TensorFlow, so I refer to PyTorch Datasets as "data generators".

In [None]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel

# Local directory holding the pretrained checkpoint; the tokenizer and the
# encoder are both loaded from it.
MODEL_NAME = '../input/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
roberta_model = AutoModel.from_pretrained(MODEL_NAME)
# The encoder is only used to compute embeddings inside the Dataset below;
# it is not passed to the optimizer anywhere in this notebook.
roberta_model.to(device)

class CommonLitReadabilityDataset(Dataset):
    """Dataset that turns excerpt records into encoder embeddings on the fly.

    Each record is a dict with at least the keys 'excerpt',
    'manual_embedding', 'target' and 'standard_error'. The module-level
    `tokenizer` and `roberta_model` are used to embed the excerpt every time
    an item is requested.

    Args:
        dataset: Sequence of record dicts.
        max_length: Number of tokens every excerpt is padded or truncated to.
    """

    def __init__(self, dataset, max_length=330):
        self.dataset = dataset
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Embed one record and return its features and labels as NumPy arrays.

        Returns:
            A dict with the keys 'token_embeddings', 's_token_embedding',
            'max_embedding', 'avg_embedding', 'manual_embedding', 'target'
            and 'standard_error'. Every array has a leading axis of size 1.
        """
        data = self.dataset[index]

        # Pad AND truncate to a fixed length so all items have the same shape;
        # without truncation a longer excerpt would yield a longer sequence
        # and the batch could no longer be stacked.
        tokens = tokenizer(
            data['excerpt'], return_tensors='pt', padding='max_length',
            truncation=True, max_length=self.max_length
        )
        tokens = tokens.to(device)
        attention_mask = tokens['attention_mask']

        # The encoder is a frozen feature extractor here: skip autograd
        # bookkeeping to save memory and time.
        with torch.no_grad():
            outputs = roberta_model(**tokens)
            token_embeddings = outputs.last_hidden_state

            # Create the ROBERTA features for the model
            s_token = token_embeddings[:, 0, :]  # take <s> token (equiv. to [CLS])
            max_embedding = max_pooling(token_embeddings, attention_mask)
            avg_embedding = mean_pooling(token_embeddings, attention_mask)

        # Prepare the features for fine-tuning and additional features
        return {
            'token_embeddings': token_embeddings.cpu().numpy(),
            's_token_embedding': s_token.cpu().numpy(),
            'max_embedding': max_embedding.cpu().numpy(),
            'avg_embedding': avg_embedding.cpu().numpy(),
            'manual_embedding': np.array([data['manual_embedding']]),
            'target': np.array([data['target']]),
            'standard_error': np.array([data['standard_error']])
        }

Build the data loaders for training, validation, and test.

In [None]:
from torch.utils.data import DataLoader
import numpy as np


def custom_collate(batch):
    """Stack per-item feature dicts into one dict of batched tensors.

    Every feature array of an item carries a leading axis of size 1. After
    stacking, that axis sits at position 1 and is removed explicitly with
    `squeeze(dim=1)`. A bare `squeeze()` would also drop the batch axis when
    the batch contains a single item, which breaks the model's `permute`
    and `cat` calls.

    Args:
        batch: List of dicts with the keys 'token_embeddings',
            's_token_embedding', 'max_embedding', 'avg_embedding',
            'manual_embedding', 'target' and 'standard_error'.

    Returns:
        Dict with the same keys. Feature tensors keep the dtype of the input
        arrays; 'target' and 'standard_error' are float32 tensors.
    """
    def _stack_feature(key):
        # (batch, 1, ...) -> (batch, ...): drop only the per-item axis.
        stacked = torch.from_numpy(np.array([item[key] for item in batch]))
        return torch.squeeze(stacked, dim=1)

    def _stack_label(key):
        # Go through one NumPy array rather than a list of arrays.
        return torch.as_tensor(
            np.array([item[key] for item in batch]), dtype=torch.float32)

    return {
        # Features
        'token_embeddings': _stack_feature('token_embeddings'),
        's_token_embedding': _stack_feature('s_token_embedding'),
        'max_embedding': _stack_feature('max_embedding'),
        'avg_embedding': _stack_feature('avg_embedding'),
        'manual_embedding': _stack_feature('manual_embedding'),
        # Targets
        'target': _stack_label('target'),
        'standard_error': _stack_label('standard_error')
    }

# Number of excerpts per batch, shared by all three loaders.
BATCH_SIZE = 24

# Training loader: the only one that shuffles.
# num_workers=0 keeps item loading in the main process.
ds_train = CommonLitReadabilityDataset(train_data)
training_loader = DataLoader(
    ds_train, batch_size=BATCH_SIZE, num_workers=0, collate_fn=custom_collate,
    pin_memory=True, shuffle=True)

# Validation loader: no shuffling, so predictions come out in record order.
ds_val = CommonLitReadabilityDataset(val_data)
validation_loader = DataLoader(
    ds_val, batch_size=BATCH_SIZE, num_workers=0, collate_fn=custom_collate,
    pin_memory=True)

# Test loader: no shuffling, so predictions line up with the rows of test.csv.
ds_test = CommonLitReadabilityDataset(test_data)
test_loader = DataLoader(
    ds_test, batch_size=BATCH_SIZE, num_workers=0, collate_fn=custom_collate,
    pin_memory=True)

## Model

In [None]:
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import MSELoss
from torch.distributions import Normal
import torch
# Inspiration: https://github.com/huggingface/transformers/blob/c40c7e213bdd0479bdca69df0c500004a7294d39/src/transformers/models/roberta/modeling_roberta.py#L1384

class CommonLitReadabilityModel(nn.Module):
    """Regression head on top of precomputed encoder embeddings.

    The model concatenates (a) max-pooled 1-D convolution features computed
    over the token embeddings, (b) the `<s>` token embedding, (c) the max- and
    mean-pooled sentence embeddings and (d) the manual features, and feeds
    them through one hidden layer to two linear outputs: the readability
    target and its standard error.

    Args:
        embedding_size: Size of each token/sentence embedding.
        hidden_size: Width of the hidden dense layer.
        dropout_rate: Dropout probability applied before and after the dense layer.
        num_manual_features: Length of the manual feature vector.
        filter_sizes: Kernel sizes of the parallel Conv1d layers (n-gram widths).
            The input sequence must be at least as long as the largest one.
        num_filters: Number of output channels of each Conv1d layer.
        std_dev_loss_weight: Weight of the standard-error MSE in the total loss.
    """

    def __init__(self, embedding_size=768, hidden_size=256, dropout_rate=0.1,
                 num_manual_features=45,
                 filter_sizes=(2, 3, 4, 5, 6, 7, 8, 9, 10),
                 num_filters=128, std_dev_loss_weight=0.65):
        super().__init__()
        self.std_dev_loss_weight = std_dev_loss_weight

        # One convolution per n-gram width (2-grams up to 10-grams by default).
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(
                in_channels=embedding_size,
                out_channels=num_filters,
                kernel_size=kernel_size
            )
            for kernel_size in filter_sizes
        ])

        # Three sentence-level embeddings + all pooled conv outputs + manual features.
        dense_in_features = (
            embedding_size * 3
            + num_filters * len(filter_sizes)
            + num_manual_features
        )
        self.dense = nn.Linear(dense_in_features, hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.target_proj = nn.Linear(hidden_size, 1)
        self.std_dev_proj = nn.Linear(hidden_size, 1)

    def forward(
        self, token_embeddings, s_token_embedding, max_embedding, avg_embedding,
        manual_embedding, target=None, standard_error=None):
        """Predict the readability target and, when labels are given, the loss.

        Args:
            token_embeddings: (batch, sequence, embedding_size) tensor.
            s_token_embedding, max_embedding, avg_embedding:
                (batch, embedding_size) tensors.
            manual_embedding: (batch, num_manual_features) tensor.
            target: Optional labels, shaped (batch,) or (batch, 1).
            standard_error: Optional labels, shaped (batch,) or (batch, 1).

        Returns:
            Tuple `(logits_target, loss)`. `logits_target` is (batch, 1).
            `loss` is None without `target`; the target MSE when only
            `target` is given; and target MSE plus `std_dev_loss_weight`
            times the standard-error MSE when both labels are given.
        """
        # Permute the token_embedding to match the input shape for
        # nn.Conv1d: (batch_size, embedding_size, max_sequence_length)
        embedding_reshaped = token_embeddings.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (batch_size, num_filters, L_out)
        x_conv_list = [F.relu(conv1d(embedding_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling over the whole sequence. Output shape: (batch_size, num_filters, 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]

        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (batch_size, num_filters * len(filter_sizes))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)

        x = torch.cat((x_fc, s_token_embedding, max_embedding, avg_embedding, manual_embedding), dim=1)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)

        # Computing the logits
        logits_target = self.target_proj(x)
        logits_std_dev = self.std_dev_proj(x)

        # Computing the loss
        loss = None
        if target is not None:
            loss_fct = MSELoss()
            # Reshape labels to (batch, 1) so a 1-D label tensor cannot
            # silently broadcast against the (batch, 1) predictions.
            loss = loss_fct(logits_target.view(-1, 1), target.reshape(-1, 1))
            if standard_error is not None:
                loss_std_dev = loss_fct(
                    logits_std_dev.view(-1, 1), standard_error.reshape(-1, 1))
                loss = loss + loss_std_dev * self.std_dev_loss_weight
            # Alternatives that were considered but are not used:
            # https://www.kaggle.com/c/commonlitreadabilityprize/discussion/239421
            # * GaussianNLLLoss(input=logits_target, target=target,
            #   var=standard_error ** 2) -- not available for torch 1.7.
            # * Mean KL divergence between Normal(logits_target, logits_std_dev)
            #   and Normal(target, standard_error).

        return logits_target, loss

## Training settings

In [None]:
from fastprogress import master_bar, progress_bar
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import torch

# Instantiate the regression model and move it to the selected device.
model = CommonLitReadabilityModel()
model.to(device)

# Karpathy LR = 3e-4 (kept as a reference; the value used below is 5e-5)
optimizer = AdamW(
    model.parameters(),
    lr = 5e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
)

NUM_EPOCHS = 8
# Progress bar over epochs; the per-batch bars in the training cell attach to it.
mb = master_bar(range(NUM_EPOCHS))

# Total number of training steps (batches per epoch times number of epochs)
TOTAL_STEPS = len(training_loader) * NUM_EPOCHS

# Create the learning rate scheduler: no warmup steps, then a linear schedule
# spanning TOTAL_STEPS optimizer steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = TOTAL_STEPS)

## Training

In [None]:
from sklearn.metrics import mean_squared_error
from copy import deepcopy

best_model_state = None
# Start from +inf so the first epoch always produces a checkpoint. A finite
# starting value could leave best_model_state as None if no epoch beats it.
best_rmse = float('inf')


def _batch_to_device(batch):
    """Return the collated batch with every tensor moved to `device` as float32.

    The keys of a collated batch match the keyword arguments of the model's
    forward pass, so the result can be passed as `model(**inputs)`.
    """
    return {name: tensor.to(device).float() for name, tensor in batch.items()}


for epoch in mb:
    print(f'======================= Epoch {epoch + 1} / {NUM_EPOCHS} =========================')

    # Reset the total loss for this epoch in training phase
    total_train_loss = 0

    # Training phase
    model.train()
    for batch in progress_bar(training_loader, parent=mb):
        inputs = _batch_to_device(batch)

        # Reset the optimizer: Don't reuse info about the last batches
        # It seems it is safer to zero_grad() the model instead of the optimizer
        # Source: https://discuss.pytorch.org/t/model-zero-grad-or-optimizer-zero-grad/28426/6
        model.zero_grad()

        # Calculate the forward pass and the loss
        _, loss = model(**inputs)

        # backward + optimize + schedule only if the model is in training phase
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Accumulate the training loss
        total_train_loss += loss.item()
    # Each accumulated value is already a per-batch mean, so average over the
    # number of batches, not the number of samples.
    print(f"Training loss: {total_train_loss / len(training_loader)}")

    # Reset the total loss for this epoch in validation phase.
    total_val_loss = 0

    # Tracking variables
    val_labels = []
    val_predictions = []

    # Validation phase
    model.eval()
    for batch in progress_bar(validation_loader, parent=mb):
        inputs = _batch_to_device(batch)

        # There is no need to compute the graph for the forward pass
        # because we only need it for backprop (training)
        with torch.no_grad():
            # Calculate the forward pass and the loss
            logits, loss = model(**inputs)

        # Accumulate the validation loss
        total_val_loss += loss.item()

        # Move labels and logits to CPU
        val_labels.extend(inputs['target'].to('cpu').numpy().tolist())
        val_predictions.extend(logits.to('cpu').numpy().tolist())

    # Report validation metrics. The square root is taken explicitly because
    # the `squared=False` argument is not accepted by recent scikit-learn.
    val_rmse = float(np.sqrt(mean_squared_error(val_labels, val_predictions)))
    print(f'Validation loss = {total_val_loss / len(validation_loader)}')
    print(f'Validation RSME = {val_rmse}')

    # Save best model (deepcopy: state_dict() returns references to live tensors)
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        best_model_state = deepcopy(model.state_dict())

## Error Analysis

In [None]:
# Rebuild the model from the best checkpoint kept during training.
best_model = CommonLitReadabilityModel()
best_model.load_state_dict(best_model_state)
best_model.to(device)
best_model.eval()

# Recompute the validation predictions with the best checkpoint.
val_predictions = []
for batch in validation_loader:
    # Only the five feature tensors are needed; no labels are passed,
    # so the model returns no loss.
    features = [
        batch[name].to(device).float()
        for name in ('token_embeddings', 's_token_embedding', 'max_embedding',
                     'avg_embedding', 'manual_embedding')
    ]

    # Inference only: no autograd graph is required.
    with torch.no_grad():
        logits, _ = best_model(*features)

    # Move logits to CPU and collect them as nested Python lists.
    val_predictions.extend(logits.to('cpu').numpy().tolist())

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

# Scatter of predictions against true targets on the validation split.
plt.scatter(val_labels, val_predictions, s=2, c='g', alpha=0.5)
# Identity line (prediction == target) for reference.
plt.plot(val_labels, val_labels)
plt.xlabel("Target")
plt.ylabel("Predictions")
# The title reports the coefficient of determination of the predictions.
plt.title(f'R^2: {r2_score(val_labels, val_predictions)}')
plt.show()

In [None]:
# Map from readability scores to a "class"

def map_readability_scores(target_list):
    """Bucket readability scores into 12 ordinal classes of width 0.5.

    Class 0 holds scores below -2.5, classes 1..10 hold the half-open
    intervals [-2.5, -2.0), ..., [2.0, 2.5), and class 11 holds scores of
    2.5 and above. A score that satisfies none of the comparisons (e.g. NaN)
    maps to None.

    Args:
        target_list: Iterable of sequences; the score is element 0 of each.

    Returns:
        List with one class index (or None) per input entry.
    """
    upper_edges = (-2.5, -2.0, -1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5)

    def to_class(score):
        # The first edge strictly above the score decides the class.
        for class_id, upper in enumerate(upper_edges):
            if score < upper:
                return class_id
        # Past the last edge: top class, unless no comparison holds at all.
        return len(upper_edges) if score >= upper_edges[-1] else None

    return [to_class(target[0]) for target in target_list]

# One display label per class index 0..11 produced by map_readability_scores.
label_names = ['-2.5', '-2.0', '-1.5', '-1.0', '-0.5', '0.0', '0.5', '1.0', '1.5', '2.0', '2.5', '3.0']
# Bucket both the true targets and the predictions into the same 12 classes.
class_labels = map_readability_scores(val_labels)
class_predictions = map_readability_scores(val_predictions)

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# labels=range(12) fixes the matrix to 12x12 even if some class never occurs.
cm = confusion_matrix(class_labels, class_predictions, labels=range(12))
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=label_names)
disp.plot(cmap=plt.cm.Blues)

## Generating outputs

Predict the outputs

In [None]:

# Predict the readability target of every test excerpt with the best model.
test_predictions = []
best_model.eval()
for batch in test_loader:
    # Only the five feature tensors are needed; no labels are passed,
    # so the model returns no loss.
    features = [
        batch[name].to(device).float()
        for name in ('token_embeddings', 's_token_embedding', 'max_embedding',
                     'avg_embedding', 'manual_embedding')
    ]

    # Inference only: no autograd graph is required.
    with torch.no_grad():
        logits, _ = best_model(*features)

    # Move logits to CPU and collect them as nested Python lists.
    test_predictions.extend(logits.to('cpu').numpy().tolist())

Create a pandas dataframe and save it as csv

In [None]:
import pandas as pd

# Start from the sample submission so the id column and row order are correct.
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
# test_predictions is a list of one-element lists; flatten it to a 1-D array
# so a plain column of N values is assigned instead of an (N, 1) matrix.
submission['target'] = np.asarray(test_predictions, dtype=float).reshape(-1)
submission.to_csv("submission.csv", index=False)
submission