In [1]:
!pip install allennlp
!git clone https://github.com/mhagiwara/realworldnlp.git
%cd realworldnlp

Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/bb/bb/041115d8bad1447080e5d1e30097c95e4b66e36074277afce8620a61cee3/allennlp-0.9.0-py3-none-any.whl (7.6MB)
[K     |████████████████████████████████| 7.6MB 8.4MB/s 
[?25hCollecting ftfy
[?25l  Downloading https://files.pythonhosted.org/packages/ec/d8/5e877ac5e827eaa41a7ea8c0dc1d3042e05d7e337604dc2aedb854e7b500/ftfy-5.7.tar.gz (58kB)
[K     |████████████████████████████████| 61kB 9.6MB/s 
Collecting flask-cors>=3.0.7
  Downloading https://files.pythonhosted.org/packages/78/38/e68b11daa5d613e3a91e4bf3da76c94ac9ee0d9cd515af9c1ab80d36f709/Flask_Cors-3.0.8-py2.py3-none-any.whl
Collecting jsonpickle
  Downloading https://files.pythonhosted.org/packages/7e/6b/fbb2d499b96861a18c1641f6fefe775110d3faba65c1524950e9ad64824a/jsonpickle-1.3-py2.py3-none-any.whl
Collecting responses>=0.7
  Downloading https://files.pythonhosted.org/packages/a5/52/8063322bd9ee6e7921b74fcb730c6ba983ff995ddfabd966bb689e313464/responses-0.

In [0]:
from typing import Dict

import numpy as np
import torch
import torch.optim as optim
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import LabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.training.trainer import Trainer
from overrides import overrides

from examples.sentiment.sst_classifier import LstmClassifier

In [0]:
# Model hyperparameters.
# EMBEDDING_DIM is used both as the Embedding size and as the LSTM input size;
# HIDDEN_DIM is the LSTM hidden size.
EMBEDDING_DIM = 16
HIDDEN_DIM = 16

In [0]:
class TatoebaSentenceReader(DatasetReader):
    """Dataset reader for tab-separated ``<language id>\\t<sentence>`` files.

    Each non-blank line becomes one ``Instance`` with a ``tokens`` text field
    (the sentence run through ``CharacterTokenizer``) and a ``label`` field
    holding the language id.
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer]=None, lazy=False):
        """Create the reader.

        Args:
            token_indexers: Indexers used for the ``tokens`` field. When omitted
                (or empty), a single ``SingleIdTokenIndexer`` under the key
                ``'tokens'`` is used.
            lazy: Forwarded unchanged to ``DatasetReader.__init__``.
        """
        super().__init__(lazy=lazy)
        self.tokenizer = CharacterTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def text_to_instance(self, tokens, label=None):
        """Build an ``Instance`` from already-tokenized text.

        Args:
            tokens: Tokens to place in the ``tokens`` ``TextField``.
            label: Optional label. A ``label`` field is added only when this
                value is truthy, so unlabeled inputs produce an instance with
                just the ``tokens`` field.

        Returns:
            The constructed ``Instance``.
        """
        fields = {}

        fields['tokens'] = TextField(tokens, self.token_indexers)
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields)

    @overrides
    def _read(self, file_path: str):
        """Yield one labeled ``Instance`` per non-blank line of ``file_path``.

        Args:
            file_path: Path or URL of the TSV file; it is passed through
                ``cached_path`` before being opened.

        Yields:
            ``Instance`` objects built by ``text_to_instance``.

        Raises:
            ValueError: If a non-blank line contains no tab separator.
        """
        file_path = cached_path(file_path)
        # Decode explicitly as UTF-8: the sentences are multilingual, and the
        # platform default encoding is locale-dependent.
        with open(file_path, "r", encoding="utf-8") as text_file:
            for line_number, line in enumerate(text_file, start=1):
                line = line.rstrip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue

                # Split on the first tab only, so a tab inside the sentence
                # does not break the two-field unpacking.
                lang_id, separator, sent = line.partition('\t')
                if not separator:
                    raise ValueError(
                        'Expected "<lang_id>\\t<sentence>" at {}:{}, got: {!r}'.format(
                            file_path, line_number, line))

                tokens = self.tokenizer.tokenize(sent)

                yield self.text_to_instance(tokens, lang_id)

In [0]:
def classify(text: str, model: LstmClassifier):
    """Predict a label for ``text`` with ``model`` and print the result.

    The text is tokenized with ``CharacterTokenizer``, wrapped in a
    single-field ``Instance`` and passed to ``model.forward_on_instance``.
    The index of the largest logit is mapped back to a label string through
    the model vocabulary's ``'labels'`` namespace.

    Args:
        text: Sentence to classify.
        model: Classifier exposing ``forward_on_instance`` and ``vocab``.

    Returns:
        None. The outcome is printed as ``text: ..., label: ...``.
    """
    characters = CharacterTokenizer().tokenize(text)
    text_field = TextField(characters, {'tokens': SingleIdTokenIndexer()})

    output = model.forward_on_instance(Instance({'tokens': text_field}))
    best_index = np.argmax(output['logits'])
    predicted = model.vocab.get_token_from_index(best_index, 'labels')

    print('text: {}, label: {}'.format(text, predicted))

In [0]:
# Reader with its defaults: a single 'tokens' SingleIdTokenIndexer and lazy=False.
reader = TatoebaSentenceReader()

In [7]:
# Load the training and dev splits from remote TSV files. The reader passes each
# URL through `cached_path` before opening it, which is where the download
# progress bars in the output come from.
train_set = reader.read('https://s3.amazonaws.com/realworldnlpbook/data/tatoeba/sentences.top10langs.train.tsv')
dev_set = reader.read('https://s3.amazonaws.com/realworldnlpbook/data/tatoeba/sentences.top10langs.dev.tsv')

0it [00:00, ?it/s]
  0%|          | 0/4414160 [00:00<?, ?B/s][A
  1%|          | 52224/4414160 [00:00<00:13, 312595.29B/s][A
  6%|▌         | 243712/4414160 [00:00<00:10, 399021.79B/s][A
 21%|██        | 922624/4414160 [00:00<00:06, 546638.44B/s][A
 60%|██████    | 2667520/4414160 [00:00<00:02, 763476.15B/s][A
100000it [00:09, 10486.73it/s]
0it [00:00, ?it/s]
  0%|          | 0/443237 [00:00<?, ?B/s][A
 12%|█▏        | 52224/443237 [00:00<00:01, 329353.31B/s][A
 55%|█████▍    | 243712/443237 [00:00<00:00, 417511.09B/s][A
10000it [00:01, 6148.44it/s]


In [8]:
# Build the vocabulary from the training instances only (dev_set is not used here).
# min_count is 3 for the 'tokens' namespace — presumably characters seen fewer
# than 3 times are left out of the vocabulary; confirm against the AllenNLP
# Vocabulary documentation.
vocab = Vocabulary.from_instances(train_set,
                                  min_count={'tokens': 3})

100%|██████████| 100000/100000 [00:01<00:00, 58096.78it/s]


In [0]:
# Embedding table with one row per entry in the 'tokens' vocabulary namespace,
# registered in the text-field embedder under the key "tokens" (the same key the
# reader's default token indexers use).
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [0]:
# LSTM taking EMBEDDING_DIM-sized inputs and producing HIDDEN_DIM-sized states,
# created with batch_first=True and wrapped in PytorchSeq2VecWrapper.
# NOTE(review): the wrapper presumably reduces each sequence to a single vector —
# confirm against the AllenNLP seq2vec encoder documentation.
encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

In [0]:
# Assemble the classifier from the embedder, encoder and vocabulary.
# NOTE(review): the meaning of positive_label is defined in
# examples/sentiment/sst_classifier.py, which is not shown here — presumably the
# class tracked by a per-class metric; confirm there.
model = LstmClassifier(word_embeddings, encoder, vocab, positive_label='eng')

In [0]:
# Batches of 32 instances, using the ("tokens", "num_tokens") sorting key.
# index_with hands the iterator the vocabulary built above.
iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)

In [0]:
# Adam over all model parameters; no hyperparameters are passed, so the
# optimizer's defaults apply.
optimizer = optim.Adam(model.parameters())

In [0]:
# Train for 10 epochs on train_set, with dev_set supplied as the validation data.
# NOTE(review): no random seed is set in the cells visible here, so results may
# differ between runs.
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_set,
                  validation_dataset=dev_set,
                  num_epochs=10)

trainer.train()

In [15]:
# English sample; the recorded run labels it `eng`.
classify('Take your raincoat in case it rains.', model)

text: Take your raincoat in case it rains., label: eng


In [20]:
# Spanish sample; the recorded run labels it `spa`.
classify('Tu me recuerdas a mi padre.', model)

text: Tu me recuerdas a mi padre., label: spa


In [21]:
# German sample; the recorded run labels it `deu`.
classify('Wie organisierst du das Essen am Mittag?', model)

text: Wie organisierst du das Essen am Mittag?, label: deu


In [22]:
# French sample; the recorded run labels it `fra`.
classify("Il est des cas où cette règle ne s'applique pas.", model)

text: Il est des cas où cette règle ne s'applique pas., label: fra


In [23]:
# Portuguese sample; the recorded run labels it `por`.
classify('Estou fazendo um passeio em um parque.', model)

text: Estou fazendo um passeio em um parque., label: por


In [24]:
# Esperanto sample; the recorded run labels it `epo`.
classify('Ve, postmorgaŭ jam estas la limdato.', model)

text: Ve, postmorgaŭ jam estas la limdato., label: epo


In [25]:
# Italian sample; the recorded run labels it `ita`.
classify('Credevo che sarebbe venuto.', model)

text: Credevo che sarebbe venuto., label: ita


In [27]:
# Hungarian sample; the recorded run labels it `hun`.
classify('Nem tudja, hogy én egy macska vagyok.', model)

text: Nem tudja, hogy én egy macska vagyok., label: hun


In [28]:
# Berber sample; the recorded run labels it `ber`.
classify('Nella ur nli qrib acemma deg tenwalt.', model)

text: Nella ur nli qrib acemma deg tenwalt., label: ber


In [29]:
# Turkish sample; the recorded run labels it `tur`.
classify('Kurşun kalemin yok, değil mi?', model)

text: Kurşun kalemin yok, değil mi?, label: tur
