In [1]:
import itertools
import logging
from typing import Dict, List, Iterable
import torch
import torch.optim as optim

In [2]:
#from allennlp.data.dataset_readers import SnliReader
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
# to be checked: reads a text as a list of sentences
from allennlp.data.dataset_readers import TextClassificationJsonReader
from allennlp.data.fields import Field
from allennlp.data.fields import LabelField
from allennlp.data.fields import TextField, ListField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Tokenizer
from allennlp.modules.seq2vec_encoders import BertPooler
from allennlp.modules import TextFieldEmbedder
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding



from allennlp.modules import Seq2VecEncoder
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.nn.util import get_text_field_mask
from allennlp.data.vocabulary import Vocabulary


from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer, PretrainedTransformerTokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.training.trainer import Trainer
from allennlp.data.iterators import BucketIterator
from allennlp.common import Params

In [3]:
from dummy_chat_reader import ChatReader

In [4]:
token_indexers = {"tokens": SingleIdTokenIndexer()}

tokenizer_cfg = Params({"word_splitter": {"language": "en"}})

tokenizer = Tokenizer.from_params(tokenizer_cfg)


reader = ChatReader(
    tokenizer=tokenizer,
    token_indexers=token_indexers,
    )
train_instances = reader.read("./train_dummy.tsv")
vocab = Vocabulary.from_instances(train_instances)


for i in train_instances:
    #print(i)
    i["lines"].index(vocab)
    print(i["lines"].get_padding_lengths())

2it [00:00, 1115.65it/s]
100%|██████████| 2/2 [00:00<00:00, 15917.66it/s]

{'num_fields': 3, 'list_tokens_length': 7, 'list_num_tokens': 7}
{'num_fields': 4, 'list_tokens_length': 9, 'list_num_tokens': 9}





In [5]:
i["lines"][0].__dict__

{'tokens': [another, chat, starts],
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer at 0x7f940d963450>},
 '_indexed_tokens': {'tokens': [7, 6, 12]},
 '_indexer_name_to_indexed_token': {'tokens': ['tokens']},
 '_token_index_to_indexer_name': {'tokens': 'tokens'}}

In [6]:
turn_encoder_cfg = Params({"type":"gru",'input_size': 100, 'hidden_size': 50, 'num_layers': 1,
                  'dropout': 0.25, 'bidirectional': False
})
#can be changed dynamically encoder_cfg["type"] = "lstm"
# warning: if bidirectional, state output dimension is hidden_size x 2 -> model doesn't know that

turn_encoder = Seq2VecEncoder.from_params(turn_encoder_cfg)
turn_encoder.hidden_size = turn_encoder_cfg["hidden_size"]


chat_encoder_cfg = Params({"type":"gru",'input_size': 50, 'hidden_size': 50, 'num_layers': 1,
                  'dropout': 0.25, 'bidirectional': False
})
chat_encoder = Seq2VecEncoder.from_params(chat_encoder_cfg)
chat_encoder.hidden_size = chat_encoder_cfg["hidden_size"]



glove_text_field_embedder = Embedding.from_params(vocab,Params({"pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
                                                          "embedding_dim": 100,
                                                          "trainable": False
}))

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=100)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})



# not used for now: type should be bucket
trainer_cfg = Params({"iterator": {"type": "basic",
                                   "batch_size": 32
},
                      "trainer": {
                          "optimizer": {
                              "type": "adam"
                          },
                          "num_epochs": 3,
                          "patience": 10,
                          "cuda_device": -1
                      }
})


  "num_layers={}".format(dropout, num_layers))
400000it [00:02, 160810.00it/s]


In [7]:
from hierarchical_encoder import HierarchicalChatClassification

model = HierarchicalChatClassification(vocab,word_embeddings,turn_encoder,chat_encoder)

optimizer = optim.SGD(model.parameters(), lr=0.1)
iterator = BucketIterator(batch_size=2,sorting_keys=[("lines","list_num_tokens")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_instances,
                  should_log_parameter_statistics = False
)

In [8]:
trainer.train()

accuracy: 1.0000, loss: 0.6784 ||: 100%|██████████| 1/1 [00:00<00:00, 99.30it/s]
accuracy: 1.0000, loss: 0.5810 ||: 100%|██████████| 1/1 [00:00<00:00, 88.41it/s]
accuracy: 1.0000, loss: 0.5029 ||: 100%|██████████| 1/1 [00:00<00:00, 112.61it/s]
accuracy: 1.0000, loss: 0.4389 ||: 100%|██████████| 1/1 [00:00<00:00, 120.47it/s]
accuracy: 1.0000, loss: 0.3854 ||: 100%|██████████| 1/1 [00:00<00:00, 118.41it/s]
accuracy: 1.0000, loss: 0.3403 ||: 100%|██████████| 1/1 [00:00<00:00, 93.00it/s]
accuracy: 1.0000, loss: 0.3018 ||: 100%|██████████| 1/1 [00:00<00:00, 108.93it/s]
accuracy: 1.0000, loss: 0.2688 ||: 100%|██████████| 1/1 [00:00<00:00, 130.66it/s]
accuracy: 1.0000, loss: 0.2405 ||: 100%|██████████| 1/1 [00:00<00:00, 111.60it/s]
accuracy: 1.0000, loss: 0.2159 ||: 100%|██████████| 1/1 [00:00<00:00, 135.64it/s]
accuracy: 1.0000, loss: 0.1946 ||: 100%|██████████| 1/1 [00:00<00:00, 96.25it/s]
accuracy: 1.0000, loss: 0.1761 ||: 100%|██████████| 1/1 [00:00<00:00, 149.83it/s]
accuracy: 1.0000, lo

{'best_epoch': 19,
 'peak_cpu_memory_MB': 331.64,
 'peak_gpu_0_memory_MB': 959,
 'training_duration': '0:00:00.581604',
 'training_start_epoch': 0,
 'training_epochs': 19,
 'epoch': 19,
 'training_accuracy': 1.0,
 'training_loss': 0.08934460580348969,
 'training_cpu_memory_MB': 331.64,
 'training_gpu_0_memory_MB': 959}