In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import *

from overrides import overrides
import warnings

from allennlp.data import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.nn import util as nn_util


# Directory containing the Jigsaw CSV files. The path is relative, so it is
# resolved against the working directory the notebook kernel runs in.
DATA_ROOT = Path("../data/jigsaw")

In [18]:
# CSV columns that are read together, in this order, to form the label
# vector of every instance.
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [19]:
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, SequenceLabelField, LabelField, MetadataField, ArrayField

class JigsawDatasetReader(DatasetReader):
    """Dataset reader that turns rows of a Jigsaw CSV file into ``Instance`` objects.

    Every instance carries four fields: ``tokens`` (the tokenized comment),
    ``id`` (the row identifier), ``meta`` (character length of each token)
    and ``label`` (the values of the ``label_cols`` columns).

    Args:
        tokenizer: Callable mapping a raw comment string to a list of token
            strings. Defaults to plain whitespace splitting.
        token_indexers: Indexers used by the ``tokens`` text field. When
            omitted, a single ``SingleIdTokenIndexer`` keyed ``"tokens"`` is used.
        max_seq_len: Maximum number of tokens kept per comment. ``None``
            (the default) disables truncation.
    """

    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None, # TODO: Handle mapping from BERT
                 max_seq_len: Optional[int]=None) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[str], id: str,
                         labels: np.ndarray) -> Instance:
        """Build one ``Instance`` from an already tokenized comment.

        Args:
            tokens: Token strings of a single comment; each one is wrapped
                in a ``Token`` here.
            id: Identifier of the comment, stored as metadata.
            labels: Label values of the comment, stored in an ``ArrayField``.

        Returns:
            An ``Instance`` with the fields ``tokens``, ``id``, ``meta`` and
            ``label``.
        """
        # Honour the configured length limit; previously the setting was
        # stored on the reader but never applied.
        if self.max_seq_len is not None:
            tokens = tokens[:self.max_seq_len]

        fields = {
            "tokens": TextField([Token(x) for x in tokens], self.token_indexers),
            "id": MetadataField(id),
            # Character length of every (kept) token.
            "meta": MetadataField({"lengths": np.array([len(t) for t in tokens])}),
            "label": ArrayField(array=labels),
        }
        return Instance(fields)

    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield one ``Instance`` per row of the CSV file at ``file_path``.

        The file must provide the columns ``comment_text``, ``id`` and every
        name listed in the module-level ``label_cols``.
        """
        df = pd.read_csv(file_path)
        # Extract the needed columns once instead of using ``iterrows``:
        # ``iterrows`` builds a Series per row and turns the label values into
        # an object-dtype array, whereas this keeps the labels in the dtype
        # pandas parsed them with.
        label_matrix = df[label_cols].values
        for text, row_id, labels in zip(df["comment_text"], df["id"], label_matrix):
            yield self.text_to_instance(self.tokenizer(text), row_id, labels)

In [20]:
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers import WordpieceIndexer, SingleIdTokenIndexer

# Bound ``split_words`` method of a spaCy-backed word splitter, created once
# and reused for every comment; POS tagging is switched off.
_spacy_tok = SpacyWordSplitter(
    language='en_core_web_sm',
    pos_tags=False,
).split_words

from allennlp.data.token_indexers import SingleIdTokenIndexer
# Single-id indexer with token lower-casing enabled.
token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
def tokenizer(x: str):
    """Lower-case ``x``, split it with ``_spacy_tok`` and return the text of each token."""
    lowered = x.lower()
    words = []
    for tok in _spacy_tok(lowered):
        words.append(tok.text)
    return words

In [21]:
# Reader wired up with the notebook's tokenizer function and token indexer.
reader = JigsawDatasetReader(tokenizer=tokenizer, token_indexers={"tokens": token_indexer})

In [22]:
# Read both CSV files with the same reader; the training file is read first.
train_ds, test_ds = [
    reader.read(DATA_ROOT / csv_name)
    for csv_name in ("train.csv", "test_proced.csv")
]



0it [00:00, ?it/s][A[A

1it [00:02,  2.38s/it][A[A

9it [00:02,  1.67s/it][A[A

16it [00:02,  1.17s/it][A[A

23it [00:02,  1.21it/s][A[A

30it [00:02,  1.72it/s][A[A

36it [00:02,  2.42it/s][A[A
42it [00:03,  3.23it/s][A
47it [00:03,  4.47it/s][A
52it [00:03,  6.05it/s][A
57it [00:03,  8.20it/s][A
62it [00:03, 10.12it/s][A
70it [00:04, 13.66it/s][A
77it [00:04, 17.97it/s][A
83it [00:04, 22.07it/s][A
92it [00:04, 28.30it/s][A
99it [00:04, 34.17it/s][A
106it [00:04, 39.25it/s][A
113it [00:04, 39.56it/s][A
119it [00:04, 42.47it/s][A
127it [00:05, 43.52it/s][A
135it [00:05, 50.08it/s][A
143it [00:05, 53.66it/s][A
150it [00:05, 55.85it/s][A
157it [00:05, 51.71it/s][A
163it [00:05, 41.73it/s][A
171it [00:05, 48.67it/s][A
181it [00:05, 56.02it/s][A
188it [00:06, 57.52it/s][A
195it [00:06, 52.04it/s][A
201it [00:06, 46.20it/s][A
207it [00:06, 42.88it/s][A
212it [00:06, 43.64it/s][A
217it [00:06, 40.48it/s][A
226it [00:06, 47.54it/s][A
232it [00:07, 4

2377it [00:40, 95.89it/s][A
2402it [00:40, 117.50it/s][A
2417it [00:40, 115.94it/s][A
2431it [00:40, 57.39it/s] [A
2442it [00:40, 61.80it/s][A
2452it [00:41, 69.57it/s][A
2465it [00:41, 80.61it/s][A
2476it [00:41, 84.90it/s][A
2487it [00:41, 85.95it/s][A
2499it [00:41, 92.18it/s][A
2510it [00:41, 74.60it/s][A
2519it [00:41, 77.74it/s][A
2530it [00:41, 85.08it/s][A
2567it [00:42, 110.58it/s][A
2587it [00:42, 127.21it/s][A
2610it [00:42, 145.74it/s][A
2630it [00:42, 135.90it/s][A
2648it [00:42, 125.81it/s][A
2664it [00:42, 126.55it/s][A
2679it [00:42, 121.26it/s][A
2693it [00:42, 121.92it/s][A
2707it [00:43, 116.75it/s][A
2720it [00:43, 109.69it/s][A
2733it [00:43, 113.68it/s][A
2745it [00:43, 108.86it/s][A
2757it [00:43, 105.80it/s][A
2769it [00:43, 109.42it/s][A
2781it [00:43, 110.19it/s][A
2795it [00:43, 116.89it/s][A
2807it [00:43, 115.67it/s][A
2829it [00:44, 132.28it/s][A
2864it [00:44, 162.26it/s][A
2885it [00:44, 154.23it/s][A
2904it [00:44, 152.5

7589it [01:14, 104.71it/s][A
7603it [01:15, 108.58it/s][A
7616it [01:15, 111.34it/s][A
7629it [01:15, 113.51it/s][A
7642it [01:15, 96.68it/s] [A
7653it [01:15, 83.32it/s][A
7663it [01:15, 86.68it/s][A
7678it [01:15, 97.80it/s][A
7693it [01:15, 108.91it/s][A
7719it [01:16, 131.54it/s][A
7753it [01:16, 160.35it/s][A
7775it [01:16, 156.63it/s][A
7795it [01:16, 128.19it/s][A
7812it [01:16, 120.70it/s][A
7827it [01:16, 123.08it/s][A
7845it [01:16, 131.06it/s][A
7860it [01:17, 130.04it/s][A
7876it [01:17, 137.47it/s][A
7891it [01:17, 99.04it/s] [A
7903it [01:17, 94.41it/s][A
7914it [01:17, 91.38it/s][A
7925it [01:17, 88.62it/s][A
7935it [01:17, 81.78it/s][A
7957it [01:18, 81.44it/s][A
7970it [01:18, 90.79it/s][A
7980it [01:18, 62.16it/s][A
7988it [01:18, 64.46it/s][A
7998it [01:18, 71.85it/s][A
8010it [01:18, 81.55it/s][A
8025it [01:19, 93.01it/s][A
8038it [01:19, 101.20it/s][A
8050it [01:19, 83.01it/s] [A
8060it [01:19, 81.76it/s][A
8070it [01:19, 75.02it/s]

15363it [01:49, 291.84it/s][A
15394it [01:49, 295.88it/s][A
15429it [01:49, 308.94it/s][A
15461it [01:49, 306.25it/s][A
15503it [01:49, 332.66it/s][A
15538it [01:49, 320.96it/s][A
15571it [01:49, 286.12it/s][A
15618it [01:49, 322.60it/s][A
15662it [01:50, 350.64it/s][A
15700it [01:50, 348.25it/s][A
15737it [01:50, 332.05it/s][A
15772it [01:50, 303.24it/s][A
15804it [01:50, 282.60it/s][A
15834it [01:50, 264.25it/s][A
15862it [01:50, 239.00it/s][A
15894it [01:50, 257.34it/s][A
15927it [01:50, 275.12it/s][A
15962it [01:51, 293.09it/s][A
15997it [01:51, 304.97it/s][A
16032it [01:51, 316.57it/s][A
16069it [01:51, 329.94it/s][A
16103it [01:51, 332.39it/s][A
16137it [01:51, 317.58it/s][A
16170it [01:51, 315.77it/s][A
16205it [01:51, 324.90it/s][A
16238it [01:51, 309.55it/s][A
16279it [01:52, 332.48it/s][A
16313it [01:52, 312.95it/s][A
16346it [01:52, 303.22it/s][A
16387it [01:52, 327.47it/s][A
16421it [01:52, 319.62it/s][A
16454it [01:52, 274.21it/s][A
16484it 

24357it [02:19, 350.73it/s][A
24393it [02:19, 348.48it/s][A
24432it [02:19, 357.45it/s][A
24471it [02:19, 366.18it/s][A
24509it [02:19, 369.43it/s][A
24547it [02:20, 358.00it/s][A
24584it [02:20, 345.00it/s][A
24619it [02:20, 341.65it/s][A
24654it [02:20, 331.88it/s][A
24688it [02:20, 331.52it/s][A
24722it [02:20, 315.01it/s][A
24754it [02:20, 284.26it/s][A
24786it [02:20, 289.45it/s][A
24816it [02:20, 289.32it/s][A
24846it [02:21, 277.07it/s][A
24882it [02:21, 297.42it/s][A
24917it [02:21, 311.07it/s][A
24949it [02:21, 311.69it/s][A
24985it [02:21, 311.96it/s][A
25017it [02:21, 309.15it/s][A
25050it [02:21, 313.15it/s][A
25092it [02:21, 331.14it/s][A
25127it [02:21, 335.69it/s][A
25161it [02:22, 324.49it/s][A
25194it [02:22, 323.08it/s][A
25227it [02:22, 312.17it/s][A
25268it [02:22, 335.38it/s][A
25307it [02:22, 348.83it/s][A
25343it [02:22, 349.89it/s][A
25379it [02:22, 304.44it/s][A
25411it [02:22, 293.30it/s][A
25451it [02:22, 317.91it/s][A
25486it 

32730it [02:53, 205.52it/s][A
32752it [02:53, 189.48it/s][A
32772it [02:53, 183.24it/s][A
32791it [02:53, 172.13it/s][A
32809it [02:53, 152.21it/s][A
32837it [02:53, 176.33it/s][A
32858it [02:53, 183.14it/s][A
32893it [02:53, 211.48it/s][A
32924it [02:54, 233.23it/s][A
32962it [02:54, 262.74it/s][A
32996it [02:54, 281.04it/s][A
33028it [02:54, 290.13it/s][A
33062it [02:54, 298.19it/s][A
33094it [02:54, 298.98it/s][A
33125it [02:54, 297.52it/s][A
33156it [02:54, 278.38it/s][A
33185it [02:54, 276.04it/s][A
33223it [02:55, 299.03it/s][A
33254it [02:55, 265.35it/s][A
33282it [02:55, 219.76it/s][A
33307it [02:55, 200.38it/s][A
33335it [02:55, 218.84it/s][A
33359it [02:55, 223.55it/s][A
33388it [02:55, 239.15it/s][A
33424it [02:55, 265.81it/s][A
33453it [02:56, 251.62it/s][A
33480it [02:56, 255.73it/s][A
33514it [02:56, 271.93it/s][A
33544it [02:56, 279.35it/s][A
33573it [02:56, 280.40it/s][A
33604it [02:56, 288.07it/s][A
33634it [02:56, 269.57it/s][A
33662it 

41138it [03:24, 302.34it/s][A
41172it [03:24, 312.59it/s][A
41204it [03:24, 306.33it/s][A
41236it [03:24, 287.58it/s][A
41268it [03:24, 294.83it/s][A
41298it [03:24, 293.93it/s][A
41337it [03:24, 314.21it/s][A
41370it [03:24, 314.19it/s][A
41402it [03:25, 288.68it/s][A
41433it [03:25, 294.17it/s][A
41471it [03:25, 315.39it/s][A
41505it [03:25, 319.89it/s][A
41539it [03:25, 317.94it/s][A
41578it [03:25, 334.74it/s][A
41618it [03:25, 350.85it/s][A
41654it [03:25, 344.68it/s][A
41694it [03:25, 357.92it/s][A
41731it [03:26, 330.99it/s][A
41769it [03:26, 339.93it/s][A
41804it [03:26, 331.67it/s][A
41842it [03:26, 342.26it/s][A
41877it [03:26, 335.53it/s][A
41911it [03:26, 307.46it/s][A
41943it [03:26, 290.04it/s][A
41973it [03:26, 284.10it/s][A
42002it [03:26, 284.02it/s][A
42036it [03:27, 298.14it/s][A
42067it [03:27, 270.58it/s][A
42095it [03:27, 246.57it/s][A
42124it [03:27, 255.78it/s][A
42151it [03:27, 254.69it/s][A
42184it [03:27, 272.76it/s][A
42214it 

49517it [03:56, 123.72it/s][A
49548it [03:56, 150.91it/s][A
49581it [03:56, 178.82it/s][A
49609it [03:56, 197.78it/s][A
49641it [03:56, 221.60it/s][A
49670it [03:57, 227.87it/s][A
49702it [03:57, 249.29it/s][A
49731it [03:57, 259.21it/s][A
49769it [03:57, 284.17it/s][A
49804it [03:57, 297.33it/s][A
49840it [03:57, 312.59it/s][A
49873it [03:57, 284.19it/s][A
49904it [03:57, 272.25it/s][A
49933it [03:57, 242.25it/s][A
49961it [03:58, 248.62it/s][A
49992it [03:58, 263.19it/s][A
50035it [03:58, 295.95it/s][A
50067it [03:58, 290.50it/s][A
50105it [03:58, 311.98it/s][A
50139it [03:58, 318.39it/s][A
50172it [03:58, 289.96it/s][A
50205it [03:58, 298.24it/s][A
50236it [03:58, 291.97it/s][A
50266it [03:59, 283.94it/s][A
50297it [03:59, 276.99it/s][A
50326it [03:59, 280.11it/s][A
50356it [03:59, 285.02it/s][A
50385it [03:59, 279.43it/s][A
50414it [03:59, 276.80it/s][A
50446it [03:59, 279.26it/s][A
50475it [03:59, 276.92it/s][A
50503it [03:59, 237.02it/s][A
50528it 

57019it [04:30, 281.16it/s][A
57055it [04:30, 299.73it/s][A
57090it [04:30, 313.06it/s][A
57125it [04:31, 322.75it/s][A
57163it [04:31, 333.04it/s][A
57197it [04:31, 321.24it/s][A
57230it [04:31, 317.36it/s][A
57263it [04:31, 311.23it/s][A
57303it [04:31, 332.21it/s][A
57347it [04:31, 355.11it/s][A
57384it [04:31, 341.05it/s][A
57419it [04:31, 319.46it/s][A
57452it [04:32, 311.71it/s][A
57484it [04:32, 290.82it/s][A
57514it [04:32, 279.82it/s][A
57543it [04:32, 282.58it/s][A
57576it [04:32, 294.64it/s][A
57606it [04:32, 288.54it/s][A
57636it [04:32, 273.24it/s][A
57664it [04:32, 256.71it/s][A
57700it [04:32, 280.78it/s][A
57730it [04:33, 274.76it/s][A
57765it [04:33, 293.30it/s][A
57800it [04:33, 301.70it/s][A
57831it [04:33, 282.05it/s][A
57860it [04:33, 219.51it/s][A
57885it [04:33, 172.23it/s][A
57906it [04:33, 178.42it/s][A
57930it [04:34, 193.11it/s][A
57952it [04:34, 180.35it/s][A
57972it [04:34, 164.37it/s][A
57990it [04:34, 162.29it/s][A
58008it 

64845it [05:00, 307.50it/s][A
64877it [05:01, 296.88it/s][A
64914it [05:01, 311.36it/s][A
64946it [05:01, 302.00it/s][A
64977it [05:01, 284.31it/s][A
65013it [05:01, 300.42it/s][A
65047it [05:01, 310.78it/s][A
65079it [05:01, 309.35it/s][A
65111it [05:01, 286.33it/s][A
65141it [05:01, 263.58it/s][A
65169it [05:02, 249.35it/s][A
65201it [05:02, 266.71it/s][A
65229it [05:02, 252.79it/s][A
65260it [05:02, 267.47it/s][A
65288it [05:02, 269.28it/s][A
65316it [05:02, 271.85it/s][A
65347it [05:02, 281.85it/s][A
65376it [05:02, 278.57it/s][A
65405it [05:02, 271.68it/s][A
65433it [05:03, 250.60it/s][A
65459it [05:03, 243.29it/s][A
65484it [05:03, 197.11it/s][A
65506it [05:03, 173.96it/s][A
65526it [05:03, 147.79it/s][A
65543it [05:03, 141.10it/s][A
65560it [05:03, 148.09it/s][A
65580it [05:04, 156.91it/s][A
65599it [05:04, 164.03it/s][A
65618it [05:04, 168.63it/s][A
65638it [05:04, 176.11it/s][A
65662it [05:04, 191.22it/s][A
65682it [05:04, 192.88it/s][A
65703it 

74075it [05:31, 321.47it/s][A
74119it [05:31, 347.23it/s][A
74162it [05:31, 365.95it/s][A
74201it [05:31, 371.93it/s][A
74239it [05:31, 365.24it/s][A
74276it [05:31, 366.63it/s][A
74313it [05:31, 325.88it/s][A
74347it [05:31, 318.25it/s][A
74380it [05:32, 316.16it/s][A
74421it [05:32, 337.84it/s][A
74456it [05:32, 322.88it/s][A
74489it [05:32, 306.24it/s][A
74521it [05:32, 299.02it/s][A
74553it [05:32, 304.36it/s][A
74584it [05:32, 301.76it/s][A
74616it [05:32, 299.74it/s][A
74648it [05:32, 304.83it/s][A
74679it [05:33, 292.99it/s][A
74715it [05:33, 307.80it/s][A
74747it [05:33, 307.13it/s][A
74778it [05:33, 294.84it/s][A
74809it [05:33, 285.32it/s][A
74839it [05:33, 288.22it/s][A
74872it [05:33, 298.13it/s][A
74903it [05:33, 286.27it/s][A
74938it [05:33, 302.01it/s][A
74969it [05:34, 280.47it/s][A
74998it [05:34, 273.99it/s][A
75026it [05:34, 274.96it/s][A
75054it [05:34, 265.52it/s][A
75083it [05:34, 271.66it/s][A
75111it [05:34, 243.94it/s][A
75142it 

78953it [06:14, 60.60it/s][A
78966it [06:14, 71.20it/s][A
78975it [06:14, 66.52it/s][A
78984it [06:14, 71.59it/s][A
78993it [06:14, 73.07it/s][A
79001it [06:14, 61.41it/s][A
79008it [06:14, 56.99it/s][A
79015it [06:15, 50.05it/s][A
79021it [06:15, 48.30it/s][A
79027it [06:15, 48.90it/s][A
79033it [06:15, 47.43it/s][A
79039it [06:15, 49.64it/s][A
79045it [06:15, 47.42it/s][A
79051it [06:15, 49.41it/s][A
79057it [06:15, 50.30it/s][A
79063it [06:16, 37.55it/s][A
79068it [06:16, 31.42it/s][A
79072it [06:16, 24.03it/s][A
79076it [06:16, 27.03it/s][A
79081it [06:16, 30.41it/s][A
79090it [06:16, 37.15it/s][A
79095it [06:17, 36.06it/s][A
79101it [06:17, 39.01it/s][A
79110it [06:17, 46.40it/s][A
79127it [06:17, 59.18it/s][A
79152it [06:17, 76.67it/s][A
79167it [06:17, 85.81it/s][A
79182it [06:17, 97.81it/s][A
79199it [06:17, 111.07it/s][A
79214it [06:17, 115.15it/s][A
79228it [06:18, 91.60it/s] [A
79240it [06:18, 86.90it/s][A
79251it [06:18, 87.82it/s][A
79261it

85508it [06:50, 276.79it/s][A
85542it [06:50, 293.08it/s][A
85575it [06:50, 279.41it/s][A
85611it [06:50, 298.95it/s][A
85643it [06:50, 279.22it/s][A
85673it [06:51, 280.48it/s][A
85713it [06:51, 308.07it/s][A
85746it [06:51, 310.04it/s][A
85784it [06:51, 322.29it/s][A
85819it [06:51, 329.55it/s][A
85853it [06:51, 302.12it/s][A
85889it [06:51, 316.52it/s][A
85922it [06:51, 310.81it/s][A
85954it [06:51, 303.64it/s][A
85989it [06:52, 315.96it/s][A
86022it [06:52, 295.11it/s][A
86061it [06:52, 318.09it/s][A
86102it [06:52, 339.78it/s][A
86142it [06:52, 353.90it/s][A
86179it [06:52, 350.11it/s][A
86215it [06:52, 314.12it/s][A
86248it [06:52, 315.60it/s][A
86281it [06:52, 314.92it/s][A
86315it [06:53, 319.28it/s][A
86348it [06:53, 319.70it/s][A
86381it [06:53, 314.75it/s][A
86415it [06:53, 321.06it/s][A
86454it [06:53, 338.75it/s][A
86489it [06:53, 311.92it/s][A
86521it [06:53, 307.00it/s][A
86553it [06:53, 215.11it/s][A
86594it [06:54, 250.77it/s][A
86625it 

94081it [07:23, 316.28it/s][A
94114it [07:23, 315.04it/s][A
94147it [07:23, 308.52it/s][A
94179it [07:23, 296.35it/s][A
94210it [07:23, 284.15it/s][A
94244it [07:23, 293.06it/s][A
94277it [07:23, 301.12it/s][A
94308it [07:24, 286.71it/s][A
94343it [07:24, 302.86it/s][A
94374it [07:24, 288.39it/s][A
94413it [07:24, 312.05it/s][A
94457it [07:24, 340.47it/s][A
94493it [07:24, 316.30it/s][A
94529it [07:24, 322.22it/s][A
94563it [07:24, 323.95it/s][A
94597it [07:24, 319.33it/s][A
94630it [07:25, 312.81it/s][A
94662it [07:25, 288.94it/s][A
94692it [07:25, 273.37it/s][A
94720it [07:25, 260.29it/s][A
94757it [07:25, 285.07it/s][A
94797it [07:25, 311.43it/s][A
94833it [07:25, 323.13it/s][A
94867it [07:25, 293.81it/s][A
94906it [07:25, 311.65it/s][A
94939it [07:26, 273.60it/s][A
94975it [07:26, 292.70it/s][A
95012it [07:26, 301.34it/s][A
95044it [07:26, 284.81it/s][A
95075it [07:26, 291.07it/s][A
95105it [07:26, 292.92it/s][A
95141it [07:26, 309.13it/s][A
95174it 

100527it [07:56, 150.69it/s][A
100559it [07:56, 177.62it/s][A
100585it [07:56, 190.03it/s][A
100607it [07:56, 180.84it/s][A
100630it [07:56, 192.87it/s][A
100651it [07:56, 194.95it/s][A
100675it [07:56, 205.05it/s][A
100704it [07:56, 223.79it/s][A
100728it [07:56, 225.58it/s][A
100754it [07:56, 233.68it/s][A
100779it [07:57, 222.91it/s][A
100802it [07:57, 224.85it/s][A
100831it [07:57, 240.57it/s][A
100857it [07:57, 245.56it/s][A
100882it [07:57, 235.95it/s][A
100906it [07:57, 228.65it/s][A
100930it [07:57, 214.43it/s][A
100957it [07:57, 227.47it/s][A
100981it [07:57, 222.97it/s][A
101009it [07:58, 236.63it/s][A
101036it [07:58, 241.75it/s][A
101062it [07:58, 244.52it/s][A
101090it [07:58, 252.79it/s][A
101118it [07:58, 260.22it/s][A
101145it [07:58, 209.46it/s][A
101178it [07:58, 234.66it/s][A
101210it [07:58, 252.64it/s][A
101238it [07:59, 234.19it/s][A
101264it [07:59, 229.78it/s][A
101289it [07:59, 213.70it/s][A
101315it [07:59, 225.62it/s][A
101341it

106892it [08:30, 54.50it/s][A
106901it [08:30, 61.04it/s][A
106908it [08:31, 54.04it/s][A
106914it [08:31, 55.14it/s][A
106923it [08:31, 61.70it/s][A
106932it [08:31, 68.07it/s][A
106940it [08:31, 61.66it/s][A
106949it [08:31, 67.29it/s][A
106957it [08:31, 69.93it/s][A
106965it [08:31, 62.61it/s][A
106973it [08:31, 66.34it/s][A
106980it [08:32, 62.20it/s][A
106987it [08:32, 55.13it/s][A
106995it [08:32, 58.59it/s][A
107002it [08:32, 50.66it/s][A
107008it [08:32, 50.77it/s][A
107015it [08:32, 54.74it/s][A
107023it [08:32, 58.61it/s][A
107032it [08:33, 62.46it/s][A
107039it [08:33, 50.45it/s][A
107045it [08:33, 43.07it/s][A
107080it [08:33, 57.95it/s][A
107094it [08:33, 70.06it/s][A
107108it [08:33, 82.36it/s][A
107122it [08:33, 90.83it/s][A
107138it [08:33, 100.87it/s][A
107152it [08:34, 99.67it/s] [A
107165it [08:34, 97.78it/s][A
107177it [08:34, 102.50it/s][A
107199it [08:34, 121.44it/s][A
107228it [08:34, 146.32it/s][A
107254it [08:34, 165.53it/s][A
10

112901it [09:04, 271.95it/s][A
112929it [09:04, 265.26it/s][A
112962it [09:04, 281.23it/s][A
112998it [09:04, 296.62it/s][A
113038it [09:04, 320.13it/s][A
113077it [09:04, 335.26it/s][A
113120it [09:04, 357.58it/s][A
113159it [09:04, 366.35it/s][A
113200it [09:05, 375.67it/s][A
113239it [09:05, 341.67it/s][A
113275it [09:05, 305.22it/s][A
113307it [09:05, 298.38it/s][A
113346it [09:05, 319.73it/s][A
113380it [09:05, 320.15it/s][A
113414it [09:05, 324.81it/s][A
113451it [09:05, 332.99it/s][A
113486it [09:06, 337.83it/s][A
113521it [09:06, 316.67it/s][A
113554it [09:06, 307.27it/s][A
113590it [09:06, 319.45it/s][A
113627it [09:06, 332.77it/s][A
113661it [09:06, 317.13it/s][A
113694it [09:06, 319.49it/s][A
113727it [09:06, 274.68it/s][A
113756it [09:06, 271.60it/s][A
113785it [09:07, 227.67it/s][A
113810it [09:07, 196.01it/s][A
113832it [09:07, 182.94it/s][A
113852it [09:07, 146.02it/s][A
113869it [09:07, 148.23it/s][A
113894it [09:07, 168.25it/s][A
113913it

118752it [09:40, 62.68it/s][A
118773it [09:40, 78.92it/s][A
118785it [09:40, 81.74it/s][A
118796it [09:40, 86.13it/s][A
118807it [09:40, 73.87it/s][A
118816it [09:40, 76.81it/s][A
118826it [09:41, 80.32it/s][A
118835it [09:41, 74.45it/s][A
118844it [09:41, 62.37it/s][A
118855it [09:41, 70.60it/s][A
118866it [09:41, 78.78it/s][A
118875it [09:41, 76.26it/s][A
118884it [09:41, 60.43it/s][A
118892it [09:42, 62.43it/s][A
118906it [09:42, 74.69it/s][A
118915it [09:42, 60.69it/s][A
118923it [09:42, 64.31it/s][A
118934it [09:42, 71.22it/s][A
118943it [09:42, 72.60it/s][A
118951it [09:42, 67.50it/s][A
118960it [09:42, 72.24it/s][A
118970it [09:43, 78.10it/s][A
118996it [09:43, 98.69it/s][A
119029it [09:43, 124.84it/s][A
119062it [09:43, 153.40it/s][A
119091it [09:43, 177.47it/s][A
119116it [09:43, 185.27it/s][A
119144it [09:43, 205.74it/s][A
119169it [09:43, 214.74it/s][A
119196it [09:43, 227.63it/s][A
119222it [09:44, 217.34it/s][A
119246it [09:44, 219.36it/s][A

124801it [10:11, 240.84it/s][A
124844it [10:12, 276.70it/s][A
124880it [10:12, 291.62it/s][A
124912it [10:12, 156.19it/s][A
124937it [10:12, 135.15it/s][A
124958it [10:13, 108.69it/s][A
124975it [10:13, 108.08it/s][A
124995it [10:13, 124.94it/s][A
125027it [10:13, 152.84it/s][A
125048it [10:13, 162.64it/s][A
125079it [10:13, 189.09it/s][A
125106it [10:13, 199.69it/s][A
125130it [10:14, 170.39it/s][A
125151it [10:14, 175.83it/s][A
125171it [10:14, 158.96it/s][A
125189it [10:14, 142.80it/s][A
125205it [10:14, 124.68it/s][A
125219it [10:14, 113.56it/s][A
125240it [10:14, 131.18it/s][A
125256it [10:15, 121.77it/s][A
125270it [10:15, 100.71it/s][A
125282it [10:15, 90.02it/s] [A
125293it [10:15, 69.97it/s][A
125305it [10:15, 78.96it/s][A
125315it [10:15, 82.90it/s][A
125325it [10:16, 70.95it/s][A
125334it [10:16, 74.59it/s][A
125345it [10:16, 81.71it/s][A
125354it [10:16, 81.16it/s][A
125363it [10:16, 68.24it/s][A
125377it [10:16, 79.54it/s][A
125389it [10:16, 

129278it [10:47, 218.95it/s][A
129320it [10:47, 252.61it/s][A
129351it [10:47, 172.48it/s][A
129376it [10:48, 137.86it/s][A
129396it [10:48, 145.37it/s][A
129420it [10:48, 164.79it/s][A
129441it [10:48, 167.77it/s][A
129461it [10:48, 162.69it/s][A
129482it [10:48, 173.48it/s][A
129521it [10:48, 207.32it/s][A
129546it [10:49, 206.65it/s][A
129570it [10:49, 204.20it/s][A
129593it [10:49, 186.53it/s][A
129614it [10:49, 148.16it/s][A
129635it [10:49, 161.11it/s][A
129671it [10:49, 192.86it/s][A
129702it [10:49, 213.78it/s][A
129737it [10:49, 241.03it/s][A
129766it [10:50, 253.20it/s][A
129799it [10:50, 271.58it/s][A
129829it [10:50, 279.29it/s][A
129859it [10:50, 216.22it/s][A
129889it [10:50, 235.43it/s][A
129929it [10:50, 268.46it/s][A
129968it [10:50, 293.19it/s][A
130001it [10:50, 284.34it/s][A
130032it [10:51, 261.16it/s][A
130061it [10:51, 244.13it/s][A
130091it [10:51, 257.94it/s][A
130124it [10:51, 271.28it/s][A
130161it [10:51, 290.96it/s][A
130192it

136293it [11:33, 214.43it/s][A
136321it [11:33, 230.60it/s][A
136358it [11:33, 259.90it/s][A
136388it [11:34, 245.57it/s][A
136420it [11:34, 263.60it/s][A
136453it [11:34, 280.22it/s][A
136488it [11:34, 296.88it/s][A
136520it [11:34, 295.97it/s][A
136551it [11:34, 262.36it/s][A
136579it [11:34, 264.50it/s][A
136614it [11:34, 284.69it/s][A
136645it [11:34, 288.63it/s][A
136680it [11:35, 304.40it/s][A
136712it [11:35, 293.35it/s][A
136746it [11:35, 305.01it/s][A
136778it [11:35, 288.87it/s][A
136808it [11:35, 260.25it/s][A
136835it [11:35, 241.32it/s][A
136861it [11:35, 230.75it/s][A
136892it [11:35, 249.18it/s][A
136931it [11:35, 279.30it/s][A
136972it [11:36, 308.03it/s][A
137006it [11:36, 289.26it/s][A
137037it [11:36, 286.35it/s][A
137074it [11:36, 305.55it/s][A
137106it [11:36, 235.47it/s][A
137133it [11:36, 161.65it/s][A
137155it [11:37, 132.61it/s][A
137173it [11:37, 125.66it/s][A
137189it [11:37, 121.77it/s][A
137204it [11:37, 118.53it/s][A
137219it

143542it [12:09, 301.85it/s][A
143574it [12:09, 273.93it/s][A
143603it [12:09, 272.58it/s][A
143637it [12:09, 288.76it/s][A
143668it [12:09, 294.40it/s][A
143702it [12:09, 306.10it/s][A
143734it [12:09, 288.93it/s][A
143764it [12:09, 270.01it/s][A
143792it [12:10, 256.49it/s][A
143819it [12:10, 240.38it/s][A
143844it [12:10, 234.69it/s][A
143876it [12:10, 254.47it/s][A
143903it [12:10, 257.22it/s][A
143930it [12:10, 248.65it/s][A
143960it [12:10, 261.31it/s][A
143996it [12:10, 278.58it/s][A
144025it [12:10, 279.41it/s][A
144054it [12:11, 269.91it/s][A
144092it [12:11, 295.41it/s][A
144123it [12:11, 288.70it/s][A
144153it [12:11, 275.72it/s][A
144182it [12:11, 246.02it/s][A
144208it [12:11, 229.27it/s][A
144239it [12:11, 248.38it/s][A
144265it [12:11, 249.71it/s][A
144291it [12:11, 231.45it/s][A
144320it [12:12, 244.73it/s][A
144346it [12:12, 245.95it/s][A
144375it [12:12, 256.99it/s][A
144417it [12:12, 288.82it/s][A
144449it [12:12, 296.77it/s][A
144480it

150291it [12:40, 80.37it/s][A
150301it [12:40, 70.33it/s][A
150310it [12:40, 66.16it/s][A
150320it [12:40, 73.05it/s][A
150329it [12:40, 70.35it/s][A
150340it [12:41, 78.34it/s][A
150349it [12:41, 73.25it/s][A
150357it [12:41, 58.34it/s][A
150365it [12:41, 63.41it/s][A
150375it [12:41, 70.36it/s][A
150383it [12:41, 70.20it/s][A
150391it [12:41, 72.51it/s][A
150399it [12:41, 66.10it/s][A
150408it [12:42, 71.14it/s][A
150416it [12:42, 73.36it/s][A
150429it [12:42, 84.26it/s][A
150441it [12:42, 91.62it/s][A
150458it [12:42, 102.36it/s][A
150470it [12:42, 105.57it/s][A
150482it [12:42, 95.56it/s] [A
150494it [12:42, 101.54it/s][A
150517it [12:42, 119.83it/s][A
150539it [12:43, 137.08it/s][A
150559it [12:43, 149.67it/s][A
150586it [12:43, 171.70it/s][A
150618it [12:43, 198.26it/s][A
150656it [12:43, 231.11it/s][A
150688it [12:43, 246.74it/s][A
150716it [12:43, 245.32it/s][A
150757it [12:43, 275.13it/s][A
150788it [12:43, 282.37it/s][A
150819it [12:43, 274.42it

157070it [13:12, 34.73it/s][A
157076it [13:12, 39.25it/s][A
157083it [13:13, 44.58it/s][A
157089it [13:13, 47.40it/s][A
157095it [13:13, 47.96it/s][A
157105it [13:13, 53.94it/s][A
157114it [13:13, 57.72it/s][A
157125it [13:13, 64.80it/s][A
157133it [13:13, 56.56it/s][A
157140it [13:14, 48.16it/s][A
157146it [13:14, 42.77it/s][A
157151it [13:14, 38.61it/s][A
157169it [13:14, 50.46it/s][A
157191it [13:14, 65.04it/s][A
157218it [13:14, 84.06it/s][A
157240it [13:14, 101.54it/s][A
157258it [13:14, 105.70it/s][A
157283it [13:15, 127.30it/s][A
157302it [13:15, 134.25it/s][A
157326it [13:15, 141.05it/s][A
157343it [13:15, 71.65it/s] [A
157356it [13:16, 56.01it/s][A
157367it [13:16, 63.56it/s][A
157385it [13:16, 78.58it/s][A
157401it [13:16, 92.40it/s][A
157417it [13:16, 103.78it/s][A
157443it [13:16, 126.33it/s][A
157461it [13:16, 136.31it/s][A
157479it [13:16, 145.95it/s][A
157499it [13:17, 158.34it/s][A
157519it [13:17, 166.14it/s][A
157550it [13:17, 192.76it/s

2643it [00:28, 181.26it/s][A
2679it [00:28, 212.96it/s][A
2709it [00:28, 219.52it/s][A
2737it [00:28, 224.16it/s][A
2764it [00:28, 229.94it/s][A
2796it [00:29, 239.53it/s][A
2823it [00:29, 223.87it/s][A
2848it [00:29, 217.58it/s][A
2876it [00:29, 233.13it/s][A
2903it [00:29, 239.50it/s][A
2928it [00:29, 190.69it/s][A
2950it [00:29, 176.87it/s][A
2970it [00:30, 154.77it/s][A
2988it [00:30, 130.00it/s][A
3003it [00:30, 113.17it/s][A
3033it [00:30, 138.78it/s][A
3055it [00:30, 156.00it/s][A
3086it [00:30, 183.13it/s][A
3109it [00:30, 179.87it/s][A
3134it [00:30, 196.19it/s][A
3158it [00:31, 205.30it/s][A
3184it [00:31, 218.28it/s][A
3210it [00:31, 228.65it/s][A
3241it [00:31, 246.64it/s][A
3276it [00:31, 268.34it/s][A
3305it [00:31, 262.61it/s][A
3333it [00:31, 264.69it/s][A
3361it [00:31, 260.85it/s][A
3395it [00:31, 279.24it/s][A
3424it [00:32, 265.64it/s][A
3452it [00:32, 258.86it/s][A
3479it [00:32, 224.34it/s][A
3504it [00:32, 229.75it/s][A
3535it [00

9024it [01:07, 45.48it/s][A
9030it [01:07, 43.54it/s][A
9036it [01:07, 36.56it/s][A
9047it [01:07, 45.60it/s][A
9069it [01:07, 59.82it/s][A
9103it [01:07, 79.39it/s][A
9133it [01:07, 101.72it/s][A
9158it [01:08, 123.06it/s][A
9188it [01:08, 149.42it/s][A
9217it [01:08, 173.74it/s][A
9243it [01:08, 185.88it/s][A
9282it [01:08, 219.78it/s][A
9311it [01:08, 220.44it/s][A
9346it [01:08, 246.17it/s][A
9375it [01:09, 140.96it/s][A
9398it [01:09, 133.75it/s][A
9423it [01:09, 154.93it/s][A
9459it [01:09, 186.81it/s][A
9493it [01:09, 215.41it/s][A
9521it [01:09, 226.05it/s][A
9549it [01:09, 224.65it/s][A
9575it [01:10, 158.08it/s][A
9596it [01:10, 158.20it/s][A
9625it [01:10, 182.38it/s][A
9650it [01:10, 197.18it/s][A
9677it [01:10, 212.66it/s][A
9701it [01:19,  8.54it/s] [A
9718it [01:19, 11.45it/s][A
9732it [01:20, 15.59it/s][A
9753it [01:20, 21.57it/s][A
9768it [01:20, 28.56it/s][A
9800it [01:20, 39.18it/s][A
9823it [01:20, 52.15it/s][A
9845it [01:20, 67.27it

17212it [02:15, 323.31it/s][A
17246it [02:15, 311.23it/s][A
17281it [02:15, 318.11it/s][A
17325it [02:15, 338.70it/s][A
17360it [02:16, 311.71it/s][A
17401it [02:16, 332.00it/s][A
17437it [02:16, 336.75it/s][A
17483it [02:16, 366.02it/s][A
17526it [02:16, 382.80it/s][A
17566it [02:16, 360.41it/s][A
17604it [02:16, 352.18it/s][A
17650it [02:16, 377.87it/s][A
17689it [02:16, 365.57it/s][A
17730it [02:17, 366.00it/s][A
17768it [02:17, 347.93it/s][A
17804it [02:17, 314.74it/s][A
17839it [02:17, 310.98it/s][A
17878it [02:17, 328.91it/s][A
17912it [02:17, 314.28it/s][A
17945it [02:17, 309.59it/s][A
17979it [02:17, 317.96it/s][A
18012it [02:17, 317.82it/s][A
18046it [02:18, 322.22it/s][A
18079it [02:18, 266.31it/s][A
18109it [02:18, 274.98it/s][A
18138it [02:18, 274.79it/s][A
18167it [02:18, 269.84it/s][A
18204it [02:18, 291.74it/s][A
18236it [02:18, 298.64it/s][A
18267it [02:18, 271.08it/s][A
18306it [02:18, 298.23it/s][A
18338it [02:19, 293.75it/s][A
18375it 

25649it [02:47, 268.79it/s][A
25684it [02:47, 288.69it/s][A
25716it [02:47, 295.47it/s][A
25751it [02:47, 307.18it/s][A
25783it [02:47, 291.58it/s][A
25813it [02:47, 270.29it/s][A
25843it [02:47, 253.56it/s][A
25871it [02:47, 258.63it/s][A
25905it [02:47, 252.86it/s][A
25938it [02:48, 268.49it/s][A
25967it [02:48, 274.11it/s][A
25995it [02:48, 271.03it/s][A
26023it [02:48, 257.52it/s][A
26050it [02:48, 227.55it/s][A
26074it [02:48, 217.84it/s][A
26100it [02:48, 214.03it/s][A
26145it [02:48, 253.98it/s][A
26178it [02:49, 272.84it/s][A
26209it [02:49, 277.38it/s][A
26239it [02:49, 263.60it/s][A
26267it [02:49, 260.43it/s][A
26301it [02:49, 276.29it/s][A
26330it [02:49, 261.09it/s][A
26367it [02:49, 285.29it/s][A
26403it [02:49, 303.82it/s][A
26435it [02:49, 290.50it/s][A
26466it [02:50, 267.32it/s][A
26499it [02:50, 279.93it/s][A
26528it [02:50, 279.59it/s][A
26557it [02:50, 264.15it/s][A
26592it [02:50, 284.37it/s][A
26622it [02:50, 282.37it/s][A
26651it 

33872it [03:17, 236.28it/s][A
33897it [03:17, 232.08it/s][A
33921it [03:17, 200.23it/s][A
33948it [03:17, 215.17it/s][A
33971it [03:18, 193.46it/s][A
34000it [03:18, 213.54it/s][A
34026it [03:18, 224.90it/s][A
34061it [03:18, 233.96it/s][A
34092it [03:18, 252.16it/s][A
34131it [03:18, 280.30it/s][A
34165it [03:18, 295.59it/s][A
34199it [03:18, 303.97it/s][A
34231it [03:18, 293.02it/s][A
34262it [03:19, 254.99it/s][A
34299it [03:19, 281.18it/s][A
34329it [03:19, 266.86it/s][A
34360it [03:19, 278.28it/s][A
34389it [03:19, 255.28it/s][A
34416it [03:19, 258.91it/s][A
34446it [03:19, 269.08it/s][A
34483it [03:19, 292.32it/s][A
34514it [03:19, 271.91it/s][A
34548it [03:20, 289.20it/s][A
34582it [03:20, 301.63it/s][A
34614it [03:20, 300.59it/s][A
34645it [03:20, 299.93it/s][A
34676it [03:20, 289.67it/s][A
34708it [03:20, 297.92it/s][A
34739it [03:20, 290.88it/s][A
34772it [03:20, 300.48it/s][A
34805it [03:20, 308.72it/s][A
34837it [03:21, 299.26it/s][A
34868it 

41997it [03:50, 253.81it/s][A
42038it [03:50, 285.34it/s][A
42070it [03:50, 258.67it/s][A
42102it [03:50, 274.28it/s][A
42136it [03:50, 288.90it/s][A
42172it [03:50, 305.88it/s][A
42204it [03:50, 285.54it/s][A
42234it [03:50, 251.19it/s][A
42261it [03:51, 207.80it/s][A
42285it [03:51, 195.49it/s][A
42307it [03:51, 180.04it/s][A
42327it [03:51, 156.88it/s][A
42350it [03:51, 169.97it/s][A
42384it [03:51, 199.70it/s][A
42420it [03:51, 228.77it/s][A
42457it [03:52, 251.41it/s][A
42488it [03:52, 260.33it/s][A
42517it [03:52, 267.57it/s][A
42549it [03:52, 279.31it/s][A
42579it [03:52, 274.25it/s][A
42608it [03:52, 275.83it/s][A
42637it [03:52, 271.52it/s][A
42665it [03:52, 260.28it/s][A
42692it [03:53, 180.46it/s][A
42714it [03:53, 137.34it/s][A
42732it [03:53, 111.44it/s][A
42747it [03:53, 103.36it/s][A
42761it [03:53, 108.45it/s][A
42783it [03:53, 127.54it/s][A
42816it [03:54, 156.09it/s][A
42849it [03:54, 185.40it/s][A
42878it [03:54, 205.64it/s][A
42912it 

49951it [04:25, 257.76it/s][A
49987it [04:25, 280.84it/s][A
50017it [04:25, 181.07it/s][A
50055it [04:26, 214.42it/s][A
50083it [04:26, 226.36it/s][A
50121it [04:26, 255.22it/s][A
50151it [04:26, 155.68it/s][A
50175it [04:26, 154.91it/s][A
50204it [04:26, 179.79it/s][A
50231it [04:27, 199.10it/s][A
50263it [04:27, 221.08it/s][A
50289it [04:27, 116.21it/s][A
50309it [04:27, 108.83it/s][A
50326it [04:27, 100.83it/s][A
50341it [04:28, 103.10it/s][A
50355it [04:28, 93.46it/s] [A
50377it [04:28, 110.78it/s][A
50411it [04:28, 138.82it/s][A
50447it [04:28, 169.75it/s][A
50478it [04:28, 196.37it/s][A
50505it [04:28, 153.87it/s][A
50527it [04:29, 136.61it/s][A
50546it [04:29, 138.68it/s][A
50564it [04:29, 140.55it/s][A
50581it [04:29, 142.73it/s][A
50597it [04:29, 125.30it/s][A
50614it [04:29, 133.59it/s][A
50641it [04:29, 157.09it/s][A
50660it [04:30, 163.96it/s][A
50679it [04:30, 141.92it/s][A
50695it [04:30, 134.19it/s][A
50710it [04:30, 121.77it/s][A
50724it 

57599it [05:03, 274.55it/s][A
57630it [05:03, 280.93it/s][A
57666it [05:03, 298.52it/s][A
57697it [05:04, 281.70it/s][A
57726it [05:04, 250.79it/s][A
57753it [05:04, 223.46it/s][A
57777it [05:04, 204.06it/s][A
57800it [05:04, 208.03it/s][A
57822it [05:04, 204.53it/s][A
57844it [05:04, 178.14it/s][A
57863it [05:05, 145.92it/s][A
57880it [05:05, 150.17it/s][A
57907it [05:05, 172.87it/s][A
57945it [05:05, 206.53it/s][A
57974it [05:05, 225.99it/s][A
58006it [05:05, 244.91it/s][A
58034it [05:05, 248.50it/s][A
58068it [05:05, 269.92it/s][A
58097it [05:05, 264.51it/s][A
58126it [05:05, 270.59it/s][A
58155it [05:06, 233.85it/s][A
58180it [05:06, 157.65it/s][A
58201it [05:06, 151.72it/s][A
58220it [05:06, 160.61it/s][A
58239it [05:06, 164.99it/s][A
58281it [05:06, 200.54it/s][A
58312it [05:06, 220.44it/s][A
58338it [05:07, 191.39it/s][A
58361it [05:07, 198.70it/s][A
58394it [05:07, 224.88it/s][A
58420it [05:07, 232.27it/s][A
58456it [05:07, 244.08it/s][A
58483it 

In [23]:
from allennlp.data.vocabulary import Vocabulary
# Fit the vocabulary on the train and test instances together, so that it is
# built from every instance in both datasets.
# NOTE(review): `train_ds + test_ds` presumably concatenates two in-memory
# lists of instances (the reader is constructed with lazy=False) — confirm.
vocab = Vocabulary.from_instances(train_ds + test_ds)

02/05/2019 15:17:06 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.

  0%|          | 0/223549 [00:00<?, ?it/s][A
  0%|          | 659/223549 [00:00<00:34, 6517.92it/s][A
  1%|          | 1556/223549 [00:00<00:31, 7098.65it/s][A
  1%|          | 2512/223549 [00:00<00:28, 7691.96it/s][A
  2%|▏         | 3497/223549 [00:00<00:26, 8232.61it/s][A
  2%|▏         | 4373/223549 [00:00<00:26, 8383.67it/s][A
  2%|▏         | 5230/223549 [00:00<00:25, 8438.19it/s][A
  3%|▎         | 6012/223549 [00:00<00:27, 7815.88it/s][A
  3%|▎         | 6911/223549 [00:00<00:26, 8134.16it/s][A
  3%|▎         | 7767/223549 [00:00<00:26, 8245.54it/s][A
  4%|▍         | 8578/223549 [00:01<00:26, 8063.86it/s][A
  4%|▍         | 9376/223549 [00:01<00:27, 7660.34it/s][A
  5%|▍         | 10360/223549 [00:01<00:26, 8195.87it/s][A
  5%|▌         | 11335/223549 [00:01<00:24, 8607.18it/s][A
  5%|▌         | 12208/223549 [00:01<00:24, 8602.83it/s][A
  6%|▌         | 13170/22354

 49%|████▊     | 108803/223549 [00:15<00:15, 7240.82it/s][A
 49%|████▉     | 109556/223549 [00:15<00:17, 6686.41it/s][A
 49%|████▉     | 110253/223549 [00:16<00:16, 6759.54it/s][A
 50%|████▉     | 110955/223549 [00:16<00:16, 6835.20it/s][A
 50%|████▉     | 111684/223549 [00:16<00:16, 6964.52it/s][A
 50%|█████     | 112422/223549 [00:16<00:15, 7083.53it/s][A
 51%|█████     | 113160/223549 [00:16<00:15, 7169.17it/s][A
 51%|█████     | 113935/223549 [00:16<00:14, 7325.02it/s][A
 51%|█████▏    | 114760/223549 [00:16<00:14, 7575.43it/s][A
 52%|█████▏    | 115586/223549 [00:16<00:13, 7768.15it/s][A
 52%|█████▏    | 116407/223549 [00:16<00:13, 7894.51it/s][A
 52%|█████▏    | 117252/223549 [00:16<00:13, 8052.97it/s][A
 53%|█████▎    | 118105/223549 [00:17<00:12, 8190.17it/s][A
 53%|█████▎    | 118928/223549 [00:17<00:14, 7250.09it/s][A
 54%|█████▎    | 119896/223549 [00:17<00:13, 7836.71it/s][A
 54%|█████▍    | 120752/223549 [00:17<00:12, 8039.51it/s][A
 54%|█████▍    | 121642/

In [26]:
# Persist the fitted vocabulary under DATA_ROOT / "vocab" so that it can be
# reloaded later with Vocabulary.from_files instead of being rebuilt.
vocab.save_to_files(DATA_ROOT / "vocab")

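To avoid memory errors, restart the kernel here, re-run the setup cell at the top (imports and `DATA_ROOT`), and then build the embedding matrix from the saved vocabulary.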

In [3]:
from allennlp.data.vocabulary import Vocabulary
# Reload the vocabulary from the DATA_ROOT / "vocab" directory (the location
# written by vocab.save_to_files) rather than refitting it from the datasets.
vocab = Vocabulary.from_files(DATA_ROOT / "vocab")

02/05/2019 15:26:24 - INFO - allennlp.data.vocabulary -   Loading token dictionary from ../data/jigsaw/vocab.


In [4]:
import fastText
# Load the fastText model stored at DATA_ROOT / "ft_model.bin"; the Path is
# converted to str before being handed to load_model.
# NOTE(review): presumably load_model does not accept a Path object — confirm.
ft_model = fastText.load_model(str(DATA_ROOT / "ft_model.bin"))

In [5]:
# Export one fastText vector per vocabulary token to a plain-text embedding
# file: each line is "<token> <v1> <v2> ... <vd>", values formatted to 4 decimals.
# The file is written explicitly as UTF-8: tokens originate from raw comment
# text and may contain non-ASCII characters, and the platform default encoding
# (which `open` falls back to when none is given) may be unable to encode them
# or may differ from what a later reader of this file expects.
with (DATA_ROOT / "ft_model.txt").open("wt", encoding="utf-8") as f:
    # Only the token strings are needed; the vocabulary indices are not written.
    for token in vocab.get_index_to_token_vocabulary().values():
        emb = ft_model.get_word_vector(token)
        emb_as_str = " ".join(["%.4f" % x for x in emb])
        f.write(f"{token} {emb_as_str}\n")