In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/roberta-base/config.json
/kaggle/input/roberta-base/merges.txt
/kaggle/input/roberta-base/vocab.json
/kaggle/input/roberta-base/pytorch_model.bin
/kaggle/input/commonlitreadabilityprize/sample_submission.csv
/kaggle/input/commonlitreadabilityprize/train.csv
/kaggle/input/commonlitreadabilityprize/test.csv
/kaggle/input/robertalarge/config.json
/kaggle/input/robertalarge/merges.txt
/kaggle/input/robertalarge/vocab.json
/kaggle/input/robertalarge/pytorch_model.bin
/kaggle/input/robertalarge/modelcard.json
/kaggle/input/glove-vec/glove.6B.100d.word2vec


In [11]:
import pandas
import numpy


# Fixed seed so that "Restart & Run All" always produces the same
# train/validation partition (the original shuffle was unseeded).
SPLIT_SEED = 42

df = pandas.read_csv("../input/commonlitreadabilityprize/train.csv")
num_records = len(df)

# Shuffle the row positions 0..num_records-1 with a dedicated, seeded generator
# instead of numpy's global random state.
rng = numpy.random.default_rng(SPLIT_SEED)
ids = rng.permutation(num_records)

# Fraction of the rows that goes to the training file; the rest is validation.
train_size = 0.8
partition = int(num_records * train_size)

train_ids, valid_ids = ids[:partition], ids[partition:]

# `ids` are row positions, so select with .iloc; .loc would look the numbers up
# as index labels and only works by accident on a default RangeIndex.
df.iloc[train_ids].to_csv("./processed_train.csv", index=False)
df.iloc[valid_ids].to_csv("./processed_valid.csv", index=False)


In [12]:
from typing import Any, Dict, Iterable, MutableMapping, Optional
from urllib.parse import urlparse

from allennlp.data import DatasetReader
from allennlp.data import Tokenizer
from allennlp.data.fields.field import Field
from allennlp.data.fields import ArrayField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.token_indexers.token_indexer import TokenIndexer
from allennlp.data.tokenizers.token_class import Token
import pandas
import numpy
from overrides import overrides


@DatasetReader.register("commonlit_reader")
class CommonlitDatasetReader(DatasetReader):
    """Dataset reader that turns rows of a CommonLit CSV file into instances.

    Each instance has an ``excerpt`` text field (tokenized with the configured
    tokenizer), a single-token ``hostname`` text field derived from the
    ``url_legal`` column, and, when a target value is available, a float32
    ``target`` array field.
    """

    # Placeholder hostname used when a row has no URL or the URL has no
    # parseable host part.
    EMPTY_HOSTNAME = "EMPTY_HOSTNAME"

    def __init__(
        self,
        tokenizer: Tokenizer,
        excerpt_token_indexers: Optional[Dict[str, TokenIndexer]] = None,
        hostname_token_indexers: Optional[Dict[str, TokenIndexer]] = None,
    ) -> None:
        """Store the tokenizer and the token indexers for both text fields.

        Args:
            tokenizer: Tokenizer applied to the excerpt text.
            excerpt_token_indexers: Indexers for the ``excerpt`` field; defaults
                to a single ``SingleIdTokenIndexer`` under the key ``"tokens"``.
            hostname_token_indexers: Indexers for the ``hostname`` field; same
                default as above.
        """
        super().__init__()

        self.tokenizer = tokenizer
        self.excerpt_token_indexers: Dict[str, TokenIndexer] = excerpt_token_indexers or {
            "tokens": SingleIdTokenIndexer(),
        }
        self.hostname_token_indexers: Dict[str, TokenIndexer] = hostname_token_indexers or {
            "tokens": SingleIdTokenIndexer(),
        }

    @staticmethod
    def _extract_hostname(url: Any) -> str:
        """Return the host part of ``url`` or the placeholder if there is none.

        Non-string values (e.g. the NaN pandas uses for a missing cell) and
        strings for which ``urlparse`` finds no hostname (such as scheme-less
        URLs, where ``hostname`` is ``None``) both map to ``EMPTY_HOSTNAME``.
        """
        if not isinstance(url, str):
            return CommonlitDatasetReader.EMPTY_HOSTNAME
        return urlparse(url).hostname or CommonlitDatasetReader.EMPTY_HOSTNAME

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one instance per row of the CSV file at ``file_path``.

        The file must have ``excerpt`` and ``url_legal`` columns; a ``target``
        column is optional and, when absent, instances carry no target field.
        """
        dataframe = pandas.read_csv(file_path)
        hostnames = dataframe["url_legal"].apply(self._extract_hostname)

        # Column presence is the same for every row, so check it once.
        if "target" in dataframe.columns:
            targets = dataframe["target"]
        else:
            targets = [None] * len(dataframe)

        for excerpt, hostname, target in zip(dataframe["excerpt"], hostnames, targets):
            yield self.text_to_instance(excerpt, hostname, target)

    @overrides
    def text_to_instance(self, excerpt: str, hostname: str, target: Optional[float] = None) -> Instance:
        """Build a single instance from raw values.

        Args:
            excerpt: Text passage to tokenize.
            hostname: Hostname used as one whole token. A missing or empty
                value is replaced by ``EMPTY_HOSTNAME`` so that a token is
                never created with ``None`` as its text.
            target: Optional regression target; omitted from the instance when
                ``None``.

        Returns:
            An ``Instance`` with ``excerpt`` and ``hostname`` fields and, if a
            target was given, a ``target`` field. Token indexers are attached
            later by ``apply_token_indexers``.
        """
        if not isinstance(hostname, str) or not hostname:
            hostname = self.EMPTY_HOSTNAME

        excerpt_tokens = self.tokenizer.tokenize(excerpt)
        hostname_tokens = [Token(text=hostname)]
        fields: MutableMapping[str, Field[Any]] = {
            "excerpt": TextField(excerpt_tokens),
            "hostname": TextField(hostname_tokens),
        }
        if target is not None:
            fields["target"] = ArrayField(numpy.asarray(target, dtype=numpy.float32))
        return Instance(fields=fields)

    def apply_token_indexers(self, instance: Instance) -> None:
        """Attach the configured token indexers to both text fields in place."""
        assert isinstance(instance.fields["excerpt"], TextField)
        instance.fields["excerpt"].token_indexers = self.excerpt_token_indexers
        assert isinstance(instance.fields["hostname"], TextField)
        instance.fields["hostname"].token_indexers = self.hostname_token_indexers


from typing import Dict, Optional
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder
from allennlp.modules import Seq2VecEncoder
from allennlp.nn.util import get_text_field_mask
from allennlp.data.fields.text_field import TextFieldTensors
from overrides.overrides import overrides
from torch import FloatTensor
from torch.functional import Tensor
from torch.nn.functional import mse_loss
from torch import cat
from torch import sqrt
from torch.nn import Linear


# Small constant added to the mean squared error before the square root is taken.
EPS = 1e-8


@Model.register("naive")
class NaiveRegressor(Model):
    """Regressor with a single linear layer on top of an encoded excerpt.

    The excerpt is embedded and encoded into one vector; if a hostname embedder
    is configured, the hostname embedding is concatenated to that vector before
    the linear layer produces one output value per example.
    """

    def __init__(
        self,
        vocab: Vocabulary,
        excerpt_embedder: TextFieldEmbedder,
        excerpt_encoder: Seq2VecEncoder,
        hostname_embedder: Optional[TextFieldEmbedder] = None,
    ) -> None:
        """Register the sub-modules and size the output layer to their widths."""
        super().__init__(vocab)

        self.vocab = vocab
        self.excerpt_embedder = excerpt_embedder
        self.excerpt_encoder = excerpt_encoder
        self.hostname_embedder = hostname_embedder

        # Input width = encoder output, plus the hostname embedding when present.
        encoder_dim = self.excerpt_encoder.get_output_dim()
        hostname_dim = 0 if hostname_embedder is None else hostname_embedder.get_output_dim()

        self.classification_layer = Linear(encoder_dim + hostname_dim, 1)

    @overrides
    def forward(
        self,
        excerpt: TextFieldTensors,
        hostname: Optional[TextFieldTensors] = None,
        target: Optional[FloatTensor] = None,
    ) -> Dict[str, Tensor]:
        """Compute the prediction and, when a target is given, the loss.

        Returns a dict with ``"logit"`` (the linear layer's output) and, if
        ``target`` is not ``None``, ``"loss"``: the square root of the mean
        squared error (plus ``EPS``) between the flattened output and target.
        """
        token_mask = get_text_field_mask(excerpt)
        pooled = self.excerpt_encoder(self.excerpt_embedder(excerpt), mask=token_mask)

        use_hostname = self.hostname_embedder is not None and hostname is not None
        if use_hostname:
            # Drop dimension 1 of the hostname embedding before concatenating.
            # NOTE(review): presumably the single-token axis; confirm against the reader.
            host_vector = self.hostname_embedder(hostname).squeeze(dim=1)
            pooled = cat((pooled, host_vector), dim=1)

        prediction = self.classification_layer(pooled)

        if target is None:
            return {"logit": prediction}

        rmse = sqrt(mse_loss(prediction.view(-1), target) + EPS)
        return {"logit": prediction, "loss": rmse}

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return no extra metrics; ``reset`` is accepted but unused."""
        return dict()


from allennlp.common.util import JsonDict
from allennlp.data.instance import Instance
from allennlp.predictors import Predictor


@Predictor.register("regressor_predictor")
class RegressorPredictor(Predictor):
    """Predictor that builds instances straight from JSON dictionaries."""

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Pass the entries of ``json_dict`` as keyword arguments to the
        dataset reader's ``text_to_instance`` and return the result."""
        reader = self._dataset_reader
        return reader.text_to_instance(**json_dict)  # type: ignore


In [13]:
!ls ../input

commonlitreadabilityprize  glove-vec  roberta-base  robertalarge


In [14]:
# Training configuration for AllenNLP, written to ./baseline.jsonnet below.
#
# The learning-rate scheduler deliberately does not set `num_epochs` or
# `num_steps_per_epoch`: the trainer passes both to the scheduler itself, and
# values given in the config take precedence over the injected ones. The
# previously hard-coded 10 epochs / 3088 steps disagreed with the trainer's
# 15 epochs and the real number of batches, which stretched the slanted
# triangular schedule far beyond the actual run.
jsonnet_text = """\
{
    dataset_reader: {
        type: "commonlit_reader",
        tokenizer: {
            type: "pretrained_transformer",
            model_name: "../input/robertalarge",
        },
        excerpt_token_indexers: {
            tokens: {
                type: "pretrained_transformer",
                model_name: "../input/robertalarge",
            },
        },
    },
    train_data_path: "./processed_train.csv",
    validation_data_path: "./processed_valid.csv",
    model: {
        type: "naive",
        excerpt_embedder: {
            type: "basic",
            token_embedders: {
                tokens: {
                    type: "pretrained_transformer",
                    model_name: "../input/robertalarge",
                },
            },
        },
        excerpt_encoder: {
            type: "bert_pooler",
            pretrained_model: "../input/robertalarge",
        },
        hostname_embedder: {
            type: "basic",
            token_embedders: {
                tokens: {
                    embedding_dim: 50,
                },
            },
        },
    },
    trainer: {
        num_epochs: 15,
        learning_rate_scheduler: {
            type: "slanted_triangular",
            cut_frac: 0.06
        },
        optimizer: {
            type: "huggingface_adamw",
            lr: 5e-7,
            weight_decay: 0.05,
        },
        validation_metric: "-loss"
    },
    data_loader: {
        batch_size: 8,
        shuffle: true
    }
}
"""

# The context manager closes the file even if the write raises.
with open("baseline.jsonnet", "w") as config_file:
    config_file.write(jsonnet_text)

In [None]:
import allennlp.commands

# Train from the config file written above; results are stored under
# ./serialization/1.
allennlp.commands.train.train_model_from_file(
    serialization_dir="./serialization/1",
    parameter_filename="./baseline.jsonnet",
)

2021-07-27 05:46:10,427 - INFO - allennlp.common.params - random_seed = 13370
2021-07-27 05:46:10,428 - INFO - allennlp.common.params - numpy_seed = 1337
2021-07-27 05:46:10,430 - INFO - allennlp.common.params - pytorch_seed = 133
2021-07-27 05:46:10,486 - INFO - allennlp.common.checks - Pytorch version: 1.7.0
2021-07-27 05:46:10,487 - INFO - allennlp.common.params - type = default
2021-07-27 05:46:10,490 - INFO - allennlp.common.params - dataset_reader.type = commonlit_reader
2021-07-27 05:46:10,493 - INFO - allennlp.common.params - dataset_reader.tokenizer.type = pretrained_transformer
2021-07-27 05:46:10,494 - INFO - allennlp.common.params - dataset_reader.tokenizer.model_name = ../input/robertalarge
2021-07-27 05:46:10,495 - INFO - allennlp.common.params - dataset_reader.tokenizer.add_special_tokens = True
2021-07-27 05:46:10,497 - INFO - allennlp.common.params - dataset_reader.tokenizer.max_length = None
2021-07-27 05:46:10,501 - INFO - allennlp.common.params - dataset_reader.toke

loading instances: 0it [00:00, ?it/s]

2021-07-27 05:46:17,048 - INFO - allennlp.common.params - data_loader.type = multiprocess
2021-07-27 05:46:17,050 - INFO - allennlp.common.params - data_loader.batch_size = 8
2021-07-27 05:46:17,057 - INFO - allennlp.common.params - data_loader.drop_last = False
2021-07-27 05:46:17,058 - INFO - allennlp.common.params - data_loader.shuffle = True
2021-07-27 05:46:17,060 - INFO - allennlp.common.params - data_loader.batch_sampler = None
2021-07-27 05:46:17,061 - INFO - allennlp.common.params - data_loader.batches_per_epoch = None
2021-07-27 05:46:17,063 - INFO - allennlp.common.params - data_loader.num_workers = 0
2021-07-27 05:46:17,064 - INFO - allennlp.common.params - data_loader.max_instances_in_memory = None
2021-07-27 05:46:17,065 - INFO - allennlp.common.params - data_loader.start_method = fork
2021-07-27 05:46:17,066 - INFO - allennlp.common.params - data_loader.cuda_device = None
2021-07-27 05:46:17,067 - INFO - allennlp.common.params - data_loader.quiet = False
2021-07-27 05:46

loading instances: 0it [00:00, ?it/s]

2021-07-27 05:46:18,316 - INFO - allennlp.common.params - type = from_instances
2021-07-27 05:46:18,318 - INFO - allennlp.common.params - min_count = None
2021-07-27 05:46:18,319 - INFO - allennlp.common.params - max_vocab_size = None
2021-07-27 05:46:18,320 - INFO - allennlp.common.params - non_padded_namespaces = ('*tags', '*labels')
2021-07-27 05:46:18,321 - INFO - allennlp.common.params - pretrained_files = None
2021-07-27 05:46:18,322 - INFO - allennlp.common.params - only_include_pretrained_words = False
2021-07-27 05:46:18,325 - INFO - allennlp.common.params - tokens_to_add = None
2021-07-27 05:46:18,333 - INFO - allennlp.common.params - min_pretrained_embeddings = None
2021-07-27 05:46:18,334 - INFO - allennlp.common.params - padding_token = @@PADDING@@
2021-07-27 05:46:18,336 - INFO - allennlp.common.params - oov_token = @@UNKNOWN@@
2021-07-27 05:46:18,337 - INFO - allennlp.data.vocabulary - Fitting token dictionary from dataset.


building vocab: 0it [00:00, ?it/s]

2021-07-27 05:46:18,521 - INFO - allennlp.common.params - model.type = naive
2021-07-27 05:46:18,523 - INFO - allennlp.common.params - model.excerpt_embedder.type = basic
2021-07-27 05:46:18,525 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.type = pretrained_transformer
2021-07-27 05:46:18,527 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.model_name = ../input/robertalarge
2021-07-27 05:46:18,528 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.max_length = None
2021-07-27 05:46:18,530 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.sub_module = None
2021-07-27 05:46:18,531 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.train_parameters = True
2021-07-27 05:46:18,537 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.eval_mode = False
2021-07-27 05:46:18,538 - INFO - allennlp.common.params - mode

Some weights of the model checkpoint at ../input/robertalarge were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2021-07-27 05:46:38,523 - INFO - allennlp.common.params - model.excerpt_encoder.type = bert_pooler
2021-07-27 05:46:38,524 - INFO - allennlp.common.params - model.excerpt_encoder.pretrained_model = ../input/robertalarge
2021-07-27 05:46:38,526 - INFO - allennlp.common.params - model.excerpt_encoder.override_weights_file = None
2021-07-27 05:46:38,530 - INFO - allennlp.common.params - model.excerpt_encoder.override_weights_strip_prefix = None
2021-07-27 05:46:38,531 - INFO - allennlp.common.params - model.excerpt_encoder.load_weights = True
2021-07-27 05:46:38,532 - INFO - allennlp.common.params - model.excerpt_encoder.requires_grad = True
2021-07-27 05:46:38,533 - INFO - allennlp.common.params - model.excerpt_encoder.dropout = 0.0
2021-07-27 05:46:38,534 - INFO - allennlp.common.params - model.excerpt_encoder.transformer_kwargs = None
2021-07-27 05:46:38,539 - INFO - allennlp.common.params - model.hostname_embedder.type = basic
2021-07-27 05:46:38,542 - INFO - allennlp.common.params - 

  0%|          | 0/284 [00:00<?, ?it/s]

2021-07-27 05:46:48,756 - INFO - allennlp.training.callbacks.console_logger - Batch inputs
2021-07-27 05:46:48,757 - INFO - allennlp.training.callbacks.console_logger - batch_input/excerpt/tokens/token_ids (Shape: 8 x 245)
tensor([[    0,  1779,  7393,  ...,     1,     1,     1],
        [    0,   133,  3939,  ...,     1,     1,     1],
        [    0,  1121,  4634,  ...,     1,     1,     1],
        ...,
        [    0,   100,    56,  ...,     1,     1,     1],
        [    0, 46150, 23736,  ...,  1085,    72,     2],
        [    0, 47003,  9013,  ...,     1,     1,     1]], device='cuda:0')
2021-07-27 05:46:48,764 - INFO - allennlp.training.callbacks.console_logger - batch_input/excerpt/tokens/mask (Shape: 8 x 245)
tensor([[ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        ...,
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  Tr

### Inference

In [None]:
from allennlp.models.archival import load_archive

# Load the archive produced by training and wrap it in the regression predictor.
archive = load_archive(archive_file="serialization/1/model.tar.gz")
predictor = RegressorPredictor.from_archive(archive=archive)

In [None]:
from urllib.parse import urlparse


test_df = pandas.read_csv("../input/commonlitreadabilityprize/test.csv")
print(test_df.head())

# Derive the hostname from `url_legal`. Missing cells (non-string values) and
# URLs without a parseable host (urlparse gives hostname None) both fall back
# to the placeholder, so no request ever carries a None hostname.
test_df["hostname"] = test_df["url_legal"].apply(
    lambda url: (urlparse(url).hostname if isinstance(url, str) else None) or "EMPTY_HOSTNAME"
)

# One JSON request per row; built from the columns directly, which also works
# for an empty frame (row-wise DataFrame.apply does not return a Series there).
batch_json = [
    {"excerpt": excerpt, "hostname": hostname}
    for excerpt, hostname in zip(test_df["excerpt"], test_df["hostname"])
]

# Sanity-check the predictor on a couple of rows only. Sending the whole test
# set through the model as a single batch risks exhausting memory, and the
# full predictions are computed in batches further down anyway.
predictor.predict_batch_json(batch_json[:2])

In [None]:
class BatchIterator:
    """Single-pass iterator that yields consecutive slices of ``data``.

    Each slice holds at most ``batch_size`` items; iteration ends as soon as a
    slice comes back empty.
    """

    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size
        self.cur = 0  # start position of the next slice

    def __iter__(self):
        return self

    def __next__(self):
        start = self.cur
        # The cursor advances on every call, including the one that ends iteration.
        self.cur = start + self.batch_size
        chunk = self.data[start:self.cur]
        if len(chunk) > 0:
            return chunk
        raise StopIteration


# Run the predictor over the requests 32 at a time and collect all results
# in one flat list.
batch_iterator = BatchIterator(batch_json, batch_size=32)
predictions = []

for batch in batch_iterator:
    predictions.extend(predictor.predict_batch_json(batch))

In [None]:
# Take the first element stored under "logit" in each prediction as the target.
test_df["target"] = [prediction["logit"][0] for prediction in predictions]

# Only the id and target columns go into the submission file.
test_df.loc[:, ["id", "target"]].to_csv("submission.csv", index=False)
test_df