In [1]:
import itertools
import logging
from typing import Dict, List, Iterable
import torch
import torch.optim as optim

In [2]:
from dummy_chat_reader import ChatReader, SimpleChatReader
from simple_encoder import ChatClassification

In [3]:
#from allennlp.data.dataset_readers import SnliReader
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
# to be checked: reads a text as a list of sentences
from allennlp.data.dataset_readers import TextClassificationJsonReader
from allennlp.data.fields import Field
from allennlp.data.fields import LabelField
from allennlp.data.fields import TextField, ListField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Tokenizer
from allennlp.modules.seq2vec_encoders import BertPooler
from allennlp.modules import TextFieldEmbedder
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding



from allennlp.modules import Seq2VecEncoder
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.nn.util import get_text_field_mask
from allennlp.data.vocabulary import Vocabulary


from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer, PretrainedTransformerTokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.training.trainer import Trainer
from allennlp.data.iterators import BucketIterator
from allennlp.common import Params

In [4]:
token_indexers = {"tokens": SingleIdTokenIndexer()}

tokenizer_cfg = Params({"word_splitter": {"language": "en"}})

tokenizer = Tokenizer.from_params(tokenizer_cfg)


reader = SimpleChatReader(
    tokenizer=tokenizer,
    token_indexers=token_indexers,
    )
train_instances = reader.read("./train_dummy.tsv")
vocab = Vocabulary.from_instances(train_instances)


for i in train_instances:
    #print(i)
    i["sentence"].index(vocab)
    print(i["sentence"].get_padding_lengths())

INFO:allennlp.common.from_params:instantiating class <class 'allennlp.data.tokenizers.tokenizer.Tokenizer'> from params {'word_splitter': {'language': 'en'}} and extras set()
INFO:allennlp.common.params:type = word
INFO:allennlp.common.from_params:instantiating class <class 'allennlp.data.tokenizers.word_tokenizer.WordTokenizer'> from params {'word_splitter': {'language': 'en'}} and extras set()
INFO:allennlp.common.from_params:instantiating class <class 'allennlp.data.tokenizers.word_splitter.WordSplitter'> from params {'language': 'en'} and extras set()
INFO:allennlp.common.params:word_splitter.type = spacy
INFO:allennlp.common.from_params:instantiating class <class 'allennlp.data.tokenizers.word_splitter.SpacyWordSplitter'> from params {'language': 'en'} and extras set()
INFO:allennlp.common.params:word_splitter.language = en
INFO:allennlp.common.params:word_splitter.pos_tags = False
INFO:allennlp.common.params:word_splitter.parse = False
INFO:allennlp.common.params:word_splitter.ne

{'tokens_length': 13, 'num_tokens': 13}
{'tokens_length': 11, 'num_tokens': 11}





In [5]:
encoder_cfg = Params({"type":"gru",'input_size': 100, 'hidden_size': 50, 'num_layers': 1,
                  'dropout': 0.25, 'bidirectional': False
})
#encoder_cfg["type"] = "gru"
encoder = Seq2VecEncoder.from_params(encoder_cfg)
encoder.hidden_size = encoder_cfg["hidden_size"]



glove_text_field_embedder = Embedding.from_params(vocab,Params({"pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
                                                          "embedding_dim": 100,
                                                          "trainable": False
}))

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=100)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})



#text_field_embedder= TextFieldEmbedder.from_params(text_field_embedder_cfg,vocab=vocab)
# """You need to be sure that the TextFieldEmbedder is expecting the same thing that your DatasetReader is producing, but that happens in the configuration file, and we'll talk about it later."""


trainer_cfg = Params({"iterator": {"type": "basic",
                                   "batch_size": 32
},
                      "trainer": {
                          "optimizer": {
                              "type": "adam"
                          },
                          "num_epochs": 3,
                          "patience": 10,
                          "cuda_device": -1
                      }
})


model = ChatClassification(vocab,word_embeddings,encoder)

optimizer = optim.SGD(model.parameters(), lr=0.1)
iterator = BucketIterator(batch_size=1,sorting_keys=[("sentence","num_tokens")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_instances,
                  should_log_parameter_statistics = False
)

INFO:allennlp.common.from_params:instantiating class <class 'allennlp.modules.seq2vec_encoders.seq2vec_encoder.Seq2VecEncoder'> from params {'type': 'gru', 'input_size': 100, 'hidden_size': 50, 'num_layers': 1, 'dropout': 0.25, 'bidirectional': False} and extras set()
INFO:allennlp.common.params:type = gru
INFO:allennlp.common.params:batch_first = True
INFO:allennlp.common.params:Converting Params object to dict; logging of default values will not occur when dictionary parameters are used subsequently.
INFO:allennlp.common.params:CURRENTLY DEFINED PARAMETERS: 
INFO:allennlp.common.params:input_size = 100
INFO:allennlp.common.params:hidden_size = 50
INFO:allennlp.common.params:num_layers = 1
INFO:allennlp.common.params:dropout = 0.25
INFO:allennlp.common.params:bidirectional = False
INFO:allennlp.common.params:batch_first = True
  "num_layers={}".format(dropout, num_layers))
INFO:allennlp.common.params:num_embeddings = None
INFO:allennlp.common.params:vocab_namespace = tokens
INFO:allen

In [6]:
trainer.train()

INFO:allennlp.training.trainer:Beginning training.
INFO:allennlp.training.trainer:Epoch 0/19
INFO:allennlp.training.trainer:Peak CPU memory usage MB: 320.816
INFO:allennlp.training.trainer:GPU 0 memory usage MB: 959
INFO:allennlp.training.trainer:Training
  0%|          | 0/2 [00:00<?, ?it/s]INFO:root:forward pass: turn encodings done
INFO:root:forward pass: turn encodings done
accuracy: 1.0000, loss: 0.6309 ||: 100%|██████████| 2/2 [00:00<00:00, 129.44it/s]
INFO:allennlp.training.tensorboard_writer:                    Training |  Validation
INFO:allennlp.training.tensorboard_writer:accuracy        |     1.000  |       N/A
INFO:allennlp.training.tensorboard_writer:loss            |     0.631  |       N/A
INFO:allennlp.training.tensorboard_writer:cpu_memory_MB   |   320.816  |       N/A
INFO:allennlp.training.tensorboard_writer:gpu_0_memory_MB |   959.000  |       N/A
INFO:allennlp.training.trainer:Epoch duration: 0:00:00.037782
INFO:allennlp.training.trainer:Estimated training time rem

INFO:allennlp.training.tensorboard_writer:accuracy        |     1.000  |       N/A
INFO:allennlp.training.tensorboard_writer:loss            |     0.093  |       N/A
INFO:allennlp.training.tensorboard_writer:cpu_memory_MB   |   330.632  |       N/A
INFO:allennlp.training.tensorboard_writer:gpu_0_memory_MB |   959.000  |       N/A
INFO:allennlp.training.trainer:Epoch duration: 0:00:00.039708
INFO:allennlp.training.trainer:Estimated training time remaining: 0:00:00
INFO:allennlp.training.trainer:Epoch 9/19
INFO:allennlp.training.trainer:Peak CPU memory usage MB: 330.632
INFO:allennlp.training.trainer:GPU 0 memory usage MB: 959
INFO:allennlp.training.trainer:Training
  0%|          | 0/2 [00:00<?, ?it/s]INFO:root:forward pass: turn encodings done
INFO:root:forward pass: turn encodings done
accuracy: 1.0000, loss: 0.0797 ||: 100%|██████████| 2/2 [00:00<00:00, 151.21it/s]
INFO:allennlp.training.tensorboard_writer:                    Training |  Validation
INFO:allennlp.training.tensorboard_

INFO:allennlp.training.trainer:Epoch 17/19
INFO:allennlp.training.trainer:Peak CPU memory usage MB: 330.632
INFO:allennlp.training.trainer:GPU 0 memory usage MB: 959
INFO:allennlp.training.trainer:Training
  0%|          | 0/2 [00:00<?, ?it/s]INFO:root:forward pass: turn encodings done
INFO:root:forward pass: turn encodings done
accuracy: 1.0000, loss: 0.0319 ||: 100%|██████████| 2/2 [00:00<00:00, 154.69it/s]
INFO:allennlp.training.tensorboard_writer:                    Training |  Validation
INFO:allennlp.training.tensorboard_writer:accuracy        |     1.000  |       N/A
INFO:allennlp.training.tensorboard_writer:loss            |     0.032  |       N/A
INFO:allennlp.training.tensorboard_writer:cpu_memory_MB   |   330.632  |       N/A
INFO:allennlp.training.tensorboard_writer:gpu_0_memory_MB |   959.000  |       N/A
INFO:allennlp.training.trainer:Epoch duration: 0:00:00.037142
INFO:allennlp.training.trainer:Estimated training time remaining: 0:00:00
INFO:allennlp.training.trainer:Epo

{'best_epoch': 19,
 'peak_cpu_memory_MB': 330.632,
 'peak_gpu_0_memory_MB': 959,
 'training_duration': '0:00:00.747169',
 'training_start_epoch': 0,
 'training_epochs': 19,
 'epoch': 19,
 'training_accuracy': 1.0,
 'training_loss': 0.02699782233685255,
 'training_cpu_memory_MB': 330.632,
 'training_gpu_0_memory_MB': 959}