In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import *

from overrides import overrides
import warnings

from allennlp.data import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.nn import util as nn_util


# Directory (relative to the notebook's working directory) that holds the
# input CSVs and the fastText binary, and receives every file this notebook writes.
DATA_ROOT = Path("../data/jigsaw")

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Label names. Not referenced by the other visible cells -- presumably the six
# target columns of the Jigsaw CSVs, in file order (TODO confirm).
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [3]:
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, SequenceLabelField, LabelField, MetadataField, ArrayField
import csv

# Default value for the ``max_seq_len`` argument of JigsawDatasetReader.
max_seq_len = 512

class JigsawDatasetReader(DatasetReader):
    """Turn rows of a Jigsaw-style CSV into AllenNLP ``Instance`` objects.

    Every instance carries four fields: the processed tokens (``"tokens"``),
    one row of word-level features per token (``"word_level_features"``), a
    vector of sentence-level features (``"sentence_level_features"``) and the
    label vector parsed from the row (``"label"``).

    Parameters
    ----------
    tokenizer:
        Callable mapping the raw comment text to a list of string tokens.
        Defaults to whitespace splitting.
    token_indexers:
        Indexers handed to the text field. Defaults to a single
        ``SingleIdTokenIndexer`` registered under ``"tokens"``.
    max_seq_len:
        Stored on the reader. NOTE(review): nothing in this class truncates
        to this length -- confirm whether truncation is meant to happen in
        the tokenizer or further downstream.
    """

    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None, # TODO: Handle mapping from BERT
                 max_seq_len: Optional[int]=max_seq_len) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[str], id: str,
                         labels: np.ndarray) -> Instance:
        """Build one ``Instance`` from an already-tokenized comment.

        ``proc``, ``word_level_features``, ``sentence_level_features`` and
        ``MemoryOptimizedTextField`` are module-level names looked up at call
        time, so they must be defined before the reader is used.

        Parameters
        ----------
        tokens: string tokens of the comment.
        id: row identifier; accepted for interface compatibility, currently unused.
        labels: label vector for the row, stored as-is in the ``"label"`` field.
        """
        # NOTE(review): tokens are handed to the text field as plain strings,
        # not ``Token`` objects -- confirm the configured indexers accept that.
        sentence_field = MemoryOptimizedTextField([proc(x) for x in tokens],
                                   self.token_indexers)
        fields = {"tokens": sentence_field}

        # Features are computed on the raw tokens (before ``proc``): one row
        # per token, one column per word-level feature function.
        wl_feats = np.array([[func(w) for func in word_level_features] for w in tokens])
        fields["word_level_features"] = ArrayField(array=wl_feats)

        # One value per sentence-level feature function.
        sl_feats = np.array([func(tokens) for func in sentence_level_features])
        fields["sentence_level_features"] = ArrayField(array=sl_feats)

        label_field = ArrayField(array=labels)
        fields["label"] = label_field

        return Instance(fields)

    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield one instance per data row of the CSV at ``file_path``.

        The first row is skipped as a header. Rows must have 8 values
        (id, text, labels...) or 9 values (one extra leading column that is
        ignored, then id, text, labels...).

        Raises
        ------
        ValueError
            If a row has neither 8 nor 9 values.
        """
        # newline="" is what the csv module requires so that quoted fields
        # containing line breaks are parsed correctly on every platform; the
        # explicit encoding avoids depending on the locale default.
        with open(file_path, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            # Skip the header. The default keeps an empty file from raising
            # StopIteration inside this generator (a RuntimeError on 3.7+).
            next(reader, None)
            for line in reader:
                if len(line) == 9:
                    _, id_, text, *labels = line
                elif len(line) == 8:
                    id_, text, *labels = line
                else:
                    raise ValueError(f"line has {len(line)} values")
                yield self.text_to_instance(
                    self.tokenizer(text),
                    id_, np.array([int(x) for x in labels]),
                )

In [4]:
#from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from pytorch_pretrained_bert.tokenization import BasicTokenizer
from allennlp.data.token_indexers import WordpieceIndexer, SingleIdTokenIndexer
import re

#_spacy_tok = SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).split_words
# pytorch_pretrained_bert's BasicTokenizer, configured with do_lower_case=True;
# used by ``tokenizer`` to split comment text into tokens.
_bert_tok = BasicTokenizer(do_lower_case=True)

from allennlp.data.token_indexers import SingleIdTokenIndexer
# Indexer passed to the dataset reader under the key "tokens";
# configured with lowercase_tokens=True.
token_indexer = SingleIdTokenIndexer(
    lowercase_tokens=True,
)

from itertools import groupby

def remove_url(s):
    """Return ``s`` with every "http" followed by one or more non-whitespace characters removed."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", s)

def remove_extra_chars(s, max_qty=2):
    """Shorten every run of identical consecutive characters to at most ``max_qty``."""
    pieces = []
    for char, run in groupby(s):
        run_length = sum(1 for _ in run)
        pieces.append(char * min(max_qty, run_length))
    return ''.join(pieces)

def tokenizer(x: str):
    """Tokenize a raw comment.

    Steps: strip URLs with ``remove_url``, split with ``_bert_tok.tokenize``,
    then cap repeated characters in each token with ``remove_extra_chars``.
    """
    without_urls = remove_url(x)
    raw_tokens = _bert_tok.tokenize(without_urls)
    return list(map(remove_extra_chars, raw_tokens))
    # Disabled spaCy alternative:
    #return [w.text for w in _spacy_tok(x.lower())]

In [5]:
# Reader using the custom ``tokenizer`` function and the single-id
# ``token_indexer`` registered under the "tokens" key.
reader = JigsawDatasetReader(
    tokenizer=tokenizer,
    token_indexers={"tokens": token_indexer}
)

In [6]:
from allennlp.data.fields import TextField, SequenceLabelField, LabelField, MetadataField, ArrayField
import string
# The 26 lower-case ASCII letters; anything else counts as "non-alphabet" below.
alphabet = set(string.ascii_lowercase)

# Each function maps the full token list of a comment to one float.
# Currently empty: the only candidate feature is commented out.
sentence_level_features: List[Callable[[List[str]], float]] = [
#     lambda x: (np.log1p(len(x)) - 3.628) / 1.065, # stat computed on train set
]

# Each function maps a single token to one number; the dataset reader applies
# all of them to every token.
word_level_features: List[Callable[[str], float]] = [
    # 1 when the token is equal to its lower-cased form, else 0.
    # NOTE(review): if the tokenizer already lower-cases its output this is
    # constant -- confirm the feature is still informative.
    lambda x: 1 if (x.lower() == x) else 0,
    # Fraction of characters (after lower-casing) that are not in ``alphabet``.
    # An empty token yields 0.0 instead of raising ZeroDivisionError.
    lambda x: len([c for c in x.lower() if c not in alphabet]) / len(x) if x else 0.0,
]

def proc(x: str) -> str:
    """Return the lower-cased form of token ``x``."""
    lowered = x.lower()
    return lowered

class MemoryOptimizedTextField(TextField):
    """``TextField`` subclass that drops its token list once it has been indexed.

    The constructor assigns the attributes directly instead of calling
    ``TextField.__init__``, and ``index`` sets ``self.tokens`` to ``None``
    after the parent implementation has run. Anything that reads
    ``self.tokens`` after ``index`` will therefore see ``None``.
    """
    @overrides
    def __init__(self, tokens: List[str], token_indexers: Dict[str, TokenIndexer]) -> None:
        """Store ``tokens`` and ``token_indexers`` without running the parent constructor.

        NOTE(review): the private attribute names set here presumably mirror
        what ``TextField.__init__`` sets in the installed AllenNLP version --
        confirm when upgrading the library.
        """
        self.tokens = tokens
        self._token_indexers = token_indexers
        # NOTE(review): ``TokenList`` is not defined or imported in the visible
        # cells. Annotations inside a function body are not evaluated, so this
        # does not fail at runtime, but the name should be imported or replaced.
        self._indexed_tokens: Optional[Dict[str, TokenList]] = None
        self._indexer_name_to_indexed_token: Optional[Dict[str, List[str]]] = None
        # skip checks for tokens
    @overrides
    def index(self, vocab):
        """Index through the parent implementation, then release the token list."""
        super().index(vocab)
        self.tokens = None # empty tokens

In [7]:
# Read both CSV files through the reader (constructed with lazy=False) and
# show how many instances each one produced.
train_ds, test_ds = (reader.read(DATA_ROOT / fname) for fname in ["train.csv", "test_proced.csv"])
len(train_ds), len(test_ds)

159571it [05:53, 451.00it/s]
63978it [02:17, 466.99it/s]


(159571, 63978)

In [8]:
# Combine both splits; the vocabulary is later built from this combined collection.
full_ds = train_ds + test_ds

In [1]:
# NOTE(review): this cell has execution count 1 and its recorded output is a
# NameError, i.e. it was last run before ``full_ds`` was defined. When it does
# succeed it displays the entire combined dataset. Consider removing it.
full_ds

NameError: name 'full_ds' is not defined

In [9]:
from allennlp.data.vocabulary import Vocabulary
# Build the vocabulary from every instance of the combined train + test data.
vocab = Vocabulary.from_instances(full_ds)

100%|██████████| 223549/223549 [00:19<00:00, 11292.57it/s]


In [10]:
# Spot check: vocabulary index of the token "n't" (recorded output: 1).
vocab.get_token_index("n't")

1

In [13]:
# Spot check: vocabulary index of the token "http:" (recorded output: 1).
vocab.get_token_index("http:")

1

In [15]:
# Persist the vocabulary to disk so it can be reloaded with
# Vocabulary.from_files after the kernel has been restarted.
vocab.save_to_files(DATA_ROOT / "data_ft_vocab")

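To avoid memory errors, restart the kernel here and then build the embedding matrix. After restarting, re-run the imports and the `DATA_ROOT` definition from the first cell, and uncomment the next cell to reload the saved vocabulary.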

In [None]:
#from allennlp.data.vocabulary import Vocabulary
#vocab = Vocabulary.from_files(DATA_ROOT / "data_ft_vocab")

In [16]:
import fastText
# Load the fastText binary model stored as "wiki.en.bin" under DATA_ROOT.
ft_model = fastText.load_model(str(DATA_ROOT / "wiki.en.bin"))

In [17]:
# For every vocabulary entry, look up its fastText vector, write it to a text
# file as "<token> <v1> <v2> ..." (values rounded to 4 decimals) and keep the
# unrounded vector in ``ft_emb`` for stacking into a matrix afterwards.
ft_emb = []
# Explicit UTF-8: tokens may contain non-ASCII characters, and the platform's
# default encoding is not guaranteed to be able to represent them.
with (DATA_ROOT / "ft_model_bert_basic_tok.txt").open("wt", encoding="utf-8") as f:
    # Rows are appended in the iteration order of the index -> token mapping.
    for idx, token in vocab.get_index_to_token_vocabulary().items():
        emb = ft_model.get_word_vector(token)
        emb_as_str = " ".join(["%.4f" % x for x in emb])
        ft_emb.append(np.array(emb))
        f.write(f"{token} {emb_as_str}\n")

In [18]:
# Stack the collected per-token vectors into a single 2-D array and show its
# shape (recorded output: (254407, 300)).
# NOTE(review): this rebinds ``ft_emb`` from a list to an ndarray.
ft_emb = np.vstack(ft_emb)
ft_emb.shape

(254407, 300)

In [19]:
# Save the stacked embedding matrix as a .npy file under DATA_ROOT.
np.save(str(DATA_ROOT / "ft_model_bert_basic_tok.npy"),ft_emb)