In [1]:
from urllib.parse import urlparse

import pandas

In [2]:
# Load the training split and derive a `hostname` column from `url_legal`.
train_df = pandas.read_csv("data/train.csv")

# Entries that are not strings map to None; every missing result is then
# replaced by the "EMPTY_HOSTNAME" placeholder.
train_df["hostname"] = (
    train_df["url_legal"]
    .apply(lambda url: None if not isinstance(url, str) else urlparse(url).hostname)
    .fillna("EMPTY_HOSTNAME")
)

train_df["hostname"]

0          EMPTY_HOSTNAME
1          EMPTY_HOSTNAME
2          EMPTY_HOSTNAME
3          EMPTY_HOSTNAME
4          EMPTY_HOSTNAME
              ...        
2829    sites.ehe.osu.edu
2830     en.wikibooks.org
2831     en.wikibooks.org
2832     en.wikibooks.org
2833     en.wikibooks.org
Name: hostname, Length: 2834, dtype: object

In [3]:
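# Command-line alternative to the in-notebook `train_model_from_file` call further down
# (same config, serialization dir "serialization/1"); kept commented out for reference.
# !poetry run allennlp train training_configs/baseline.jsonnet --serialization-dir serialization/1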

In [4]:
import allennlp.commands

In [5]:
from typing import Any, Dict, Iterable, MutableMapping, Optional
from urllib.parse import urlparse

from allennlp.data import DatasetReader
from allennlp.data import Tokenizer
from allennlp.data.fields.field import Field
from allennlp.data.fields import ArrayField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.token_indexers.token_indexer import TokenIndexer
import pandas
import numpy
from overrides import overrides


@DatasetReader.register("commonlit_reader")
class CommonlitDatasetReader(DatasetReader):
    """Dataset reader that turns each row of a CSV file into an ``Instance``.

    The CSV must contain an ``excerpt`` column. A ``target`` column is
    optional; when it is present its value is attached to the instance as the
    regression label.

    Parameters
    ----------
    tokenizer : Tokenizer
        Splits each excerpt into tokens.
    token_indexers : Optional[Dict[str, TokenIndexer]]
        Indexers applied to the ``excerpt`` field. Defaults to a single
        ``SingleIdTokenIndexer`` under the key ``"tokens"``.
    **kwargs
        Forwarded unchanged to ``DatasetReader.__init__``.
    """

    def __init__(
        self,
        tokenizer: Tokenizer,
        token_indexers: Optional[Dict[str, TokenIndexer]] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)

        self.tokenizer = tokenizer
        self.token_indexers: Dict[str, TokenIndexer] = token_indexers or {
            "tokens": SingleIdTokenIndexer(),
        }

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one instance per row of the CSV file at ``file_path``."""
        dataframe = pandas.read_csv(file_path)

        # Decide once, not per row, whether the file carries labels.
        if "target" in dataframe.columns:
            targets = dataframe["target"]
        else:
            targets = [None] * len(dataframe)

        # Instances are yielded lazily instead of being collected in a list.
        for excerpt, target in zip(dataframe["excerpt"], targets):
            yield self.text_to_instance(excerpt, target)

    @overrides
    def text_to_instance(self, excerpt: str, target: Optional[float] = None) -> Instance:
        """Build an instance from one excerpt and, optionally, its target.

        Parameters
        ----------
        excerpt : str
            Raw text; tokenized with ``self.tokenizer``.
        target : Optional[float]
            Regression label. When given it is stored as a ``float32``
            ``ArrayField`` under ``"target"``; when ``None`` the field is
            omitted.

        Returns
        -------
        Instance
            Holds an ``"excerpt"`` ``TextField`` and, if a target was given,
            a ``"target"`` ``ArrayField``.
        """
        tokens = self.tokenizer.tokenize(excerpt)
        # The TextField is created without indexers; ``apply_token_indexers``
        # assigns them.
        fields: MutableMapping[str, Field[Any]] = {
            "excerpt": TextField(tokens),
        }
        if target is not None:
            fields["target"] = ArrayField(numpy.asarray(target, dtype=numpy.float32))
        return Instance(fields=fields)

    def apply_token_indexers(self, instance: Instance) -> None:
        """Assign this reader's token indexers to the instance's ``excerpt`` field."""
        assert isinstance(instance.fields["excerpt"], TextField)
        instance.fields["excerpt"].token_indexers = self.token_indexers

        
from typing import Dict, Optional
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder
from allennlp.modules import Seq2VecEncoder
from allennlp.nn.util import get_text_field_mask
from allennlp.data.fields.text_field import TextFieldTensors
from overrides.overrides import overrides
from torch import FloatTensor
from torch.functional import Tensor
from torch.nn.functional import mse_loss
from torch.nn import Linear


@Model.register("baseline")
class BaselineRegressor(Model):
    """Regressor that embeds an excerpt, encodes it to one vector and maps it to a scalar.

    Parameters
    ----------
    vocab : Vocabulary
        Passed to ``Model.__init__`` and kept on the instance.
    excerpt_embedder : TextFieldEmbedder
        Embeds the ``excerpt`` text field tensors.
    excerpt_encoder : Seq2VecEncoder
        Reduces the embedded sequence to a single vector; its output
        dimension sizes the final linear layer.
    """

    def __init__(
        self,
        vocab: Vocabulary,
        excerpt_embedder: TextFieldEmbedder,
        excerpt_encoder: Seq2VecEncoder,
    ) -> None:

        super().__init__(vocab)

        self.vocab = vocab
        self.excerpt_embedder = excerpt_embedder
        self.excerpt_encoder = excerpt_encoder

        # Single output unit: one predicted value per excerpt.
        self.dense = Linear(
            in_features=self.excerpt_encoder.get_output_dim(),
            out_features=1,
        )

    @overrides
    def forward(
        self,
        excerpt: TextFieldTensors,
        target: Optional[FloatTensor] = None,
    ) -> Dict[str, Tensor]:
        """Predict one value per excerpt and, if targets are given, the MSE loss.

        Parameters
        ----------
        excerpt : TextFieldTensors
            Indexed excerpt tokens.
        target : Optional[FloatTensor]
            Gold values; when provided, a ``"loss"`` entry is added.

        Returns
        -------
        Dict[str, Tensor]
            ``"logit"`` holds the output of the final linear layer; ``"loss"``
            is present only when ``target`` is not ``None``.
        """
        mask = get_text_field_mask(excerpt)
        excerpt_emb = self.excerpt_embedder(excerpt)
        encoded = self.excerpt_encoder(excerpt_emb, mask=mask)
        logit = self.dense(encoded)

        output_dict = {"logit": logit}
        if target is not None:
            # Flatten both sides so that a target with a trailing singleton
            # dimension cannot silently broadcast against the flattened
            # predictions inside mse_loss.
            output_dict["loss"] = mse_loss(logit.view(-1), target.reshape(-1))

        return output_dict

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return an empty dict: this model tracks no metrics."""
        return {}

    
from allennlp.common.util import JsonDict
from allennlp.data.instance import Instance
from allennlp.predictors import Predictor


@Predictor.register("regressor_predictor")
class RegressorPredictor(Predictor):
    """Predictor that builds instances directly from the JSON payload."""

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Convert a JSON payload into an ``Instance``.

        Every key of ``json_dict`` is passed as a keyword argument to the
        dataset reader's ``text_to_instance``.
        """
        reader = self._dataset_reader
        return reader.text_to_instance(**json_dict)  # type: ignore

In [7]:
# Train the model described by the jsonnet config, using "./serialization/3"
# as the serialization directory for this run.
# NOTE(review): the inference cell further down loads "serialization/2", not
# this run's directory — confirm which run is meant to be evaluated.
allennlp.commands.train.train_model_from_file(
    parameter_filename="./training_configs/baseline.jsonnet",
    serialization_dir="./serialization/3"
)

2021-07-26 12:13:09,986 - INFO - allennlp.common.params - random_seed = 13370
2021-07-26 12:13:09,987 - INFO - allennlp.common.params - numpy_seed = 1337
2021-07-26 12:13:09,987 - INFO - allennlp.common.params - pytorch_seed = 133
2021-07-26 12:13:09,990 - INFO - allennlp.common.checks - Pytorch version: 1.9.0+cu102
2021-07-26 12:13:09,991 - INFO - allennlp.common.params - type = default
2021-07-26 12:13:09,993 - INFO - allennlp.common.params - dataset_reader.type = commonlit_reader
2021-07-26 12:13:09,994 - INFO - allennlp.common.params - dataset_reader.tokenizer = whitespace
2021-07-26 12:13:09,995 - INFO - allennlp.common.params - type = whitespace
2021-07-26 12:13:09,996 - INFO - allennlp.common.params - train_data_path = data/train.csv
2021-07-26 12:13:09,999 - INFO - allennlp.common.params - vocabulary = <allennlp.common.lazy.Lazy object at 0x7f7218d509a0>
2021-07-26 12:13:10,002 - INFO - allennlp.common.params - datasets_for_vocab_creation = None
2021-07-26 12:13:10,003 - INFO -

loading instances: 0it [00:00, ?it/s]

2021-07-26 12:13:11,388 - INFO - allennlp.common.params - data_loader.type = multiprocess
2021-07-26 12:13:11,389 - INFO - allennlp.common.params - data_loader.batch_size = 16
2021-07-26 12:13:11,390 - INFO - allennlp.common.params - data_loader.drop_last = False
2021-07-26 12:13:11,391 - INFO - allennlp.common.params - data_loader.shuffle = True
2021-07-26 12:13:11,392 - INFO - allennlp.common.params - data_loader.batch_sampler = None
2021-07-26 12:13:11,393 - INFO - allennlp.common.params - data_loader.batches_per_epoch = None
2021-07-26 12:13:11,394 - INFO - allennlp.common.params - data_loader.num_workers = 0
2021-07-26 12:13:11,396 - INFO - allennlp.common.params - data_loader.max_instances_in_memory = None
2021-07-26 12:13:11,396 - INFO - allennlp.common.params - data_loader.start_method = fork
2021-07-26 12:13:11,398 - INFO - allennlp.common.params - data_loader.cuda_device = None
2021-07-26 12:13:11,398 - INFO - allennlp.common.params - data_loader.quiet = False
2021-07-26 12:1

loading instances: 0it [00:00, ?it/s]

2021-07-26 12:13:11,442 - INFO - allennlp.common.params - type = from_instances
2021-07-26 12:13:11,445 - INFO - allennlp.common.params - min_count = None
2021-07-26 12:13:11,446 - INFO - allennlp.common.params - max_vocab_size = None
2021-07-26 12:13:11,448 - INFO - allennlp.common.params - non_padded_namespaces = ('*tags', '*labels')
2021-07-26 12:13:11,449 - INFO - allennlp.common.params - pretrained_files = None
2021-07-26 12:13:11,450 - INFO - allennlp.common.params - only_include_pretrained_words = False
2021-07-26 12:13:11,451 - INFO - allennlp.common.params - tokens_to_add = None
2021-07-26 12:13:11,451 - INFO - allennlp.common.params - min_pretrained_embeddings = None
2021-07-26 12:13:11,452 - INFO - allennlp.common.params - padding_token = @@PADDING@@
2021-07-26 12:13:11,453 - INFO - allennlp.common.params - oov_token = @@UNKNOWN@@
2021-07-26 12:13:11,453 - INFO - allennlp.data.vocabulary - Fitting token dictionary from dataset.


building vocab: 0it [00:00, ?it/s]

2021-07-26 12:13:11,806 - INFO - allennlp.common.params - model.type = baseline
2021-07-26 12:13:11,807 - INFO - allennlp.common.params - model.excerpt_embedder.type = basic
2021-07-26 12:13:11,808 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.type = embedding
2021-07-26 12:13:11,809 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.embedding_dim = 50
2021-07-26 12:13:11,810 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.num_embeddings = None
2021-07-26 12:13:11,811 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.projection_dim = None
2021-07-26 12:13:11,811 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.weight = None
2021-07-26 12:13:11,813 - INFO - allennlp.common.params - model.excerpt_embedder.token_embedders.tokens.padding_index = None
2021-07-26 12:13:11,814 - INFO - allennlp.common.params - model.excerpt_embedder.token_

  0%|          | 0/178 [00:00<?, ?it/s]

2021-07-26 12:13:12,423 - INFO - allennlp.training.callbacks.console_logger - Batch inputs
2021-07-26 12:13:12,426 - INFO - allennlp.training.callbacks.console_logger - batch_input/excerpt/tokens/tokens (Shape: 16 x 198)
tensor([[   81,  1452,  1228,  ...,     0,     0,     0],
        [  181,     6,    90,  ...,     0,     0,     0],
        [  155,    40,  2439,  ...,     0,     0,     0],
        ...,
        [46411,     2,   371,  ...,     0,     0,     0],
        [   76, 49403, 22407,  ..., 49408,     4, 14702],
        [  154,    95,    27,  ...,     0,     0,     0]])
2021-07-26 12:13:12,430 - INFO - allennlp.training.callbacks.console_logger - batch_input/target (Shape: 16)
tensor([-0.1954, -0.8521, -2.0287,  ..., -3.1648, -0.8207, -1.7276])


KeyboardInterrupt: 

In [None]:
from allennlp.models.archival import load_archive
from commonlitreadabilityprize.predictors import RegressorPredictor

In [None]:
# Load a trained model archive and wrap it in a predictor.
# NOTE(review): this reads "serialization/2", while the training cell above
# writes to "./serialization/3" — confirm the intended run.
# NOTE(review): `RegressorPredictor` here is the one imported from
# `commonlitreadabilityprize.predictors`, which shadows the class of the same
# name defined earlier in this notebook — verify they are equivalent.
archive = load_archive("serialization/2")
predictor = RegressorPredictor.from_archive(archive)

In [None]:
# Load the test split and preview its first rows.
test_df = pandas.read_csv("data/test.csv")
print(test_df.head())

# One JSON payload per excerpt, keyed by "excerpt".
batch_json = [{"excerpt": excerpt} for excerpt in test_df["excerpt"]]
predictor.predict_batch_json(batch_json)

In [None]:
class BatchIterator:
    """One-shot iterator over consecutive fixed-size slices of a sequence.

    Each step returns ``data[cur:cur + batch_size]``; the last batch may be
    shorter. Iteration ends when the next slice would be empty.

    Parameters
    ----------
    data
        Any object that supports slicing and ``len()`` on its slices
        (e.g. a list or a string).
    batch_size
        Maximum number of items per batch; must be positive.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """

    def __init__(self, data, batch_size):
        # A zero size would silently yield nothing and a negative size would
        # produce wrong slices, so reject both up front.
        if batch_size <= 0:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size!r}"
            )
        self.data = data
        self.batch_size = batch_size
        self.cur = 0

    def __iter__(self):
        """Return the iterator itself; the cursor is not reset."""
        return self

    def __next__(self):
        """Return the next batch, or raise ``StopIteration`` when exhausted."""
        batch = self.data[self.cur:self.cur + self.batch_size]
        if len(batch) == 0:
            # Leave the cursor where it is once the data is exhausted.
            raise StopIteration
        self.cur += self.batch_size
        return batch


# Run inference two payloads at a time and collect the results in input order.
batch_iterator = BatchIterator(batch_json, batch_size=2)
predictions = []

for batch in batch_iterator:
    predictions.extend(predictor.predict_batch_json(batch))

In [None]:
# Store the first element of each prediction's "logit" as the predicted target.
test_df["target"] = [prediction["logit"][0] for prediction in predictions]
test_df