In [1]:
import itertools
import logging
from typing import Any, Tuple, Dict, List, Iterable
import torch
import torch.optim as optim


from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import Field, LabelField, TextField, ListField, SequenceLabelField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer, PretrainedBertIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer, PretrainedTransformerTokenizer
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding, PretrainedBertEmbedder
from allennlp.modules.seq2vec_encoders import BertPooler

from allennlp.modules import Seq2VecEncoder, Seq2SeqEncoder
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.data.vocabulary import Vocabulary

from allennlp.training.trainer import Trainer
from allennlp.data.iterators import BucketIterator
from allennlp.common import Params
from allennlp.nn import util
from datetime import date, datetime

In [2]:
torch.cuda.is_available()

True

In [3]:
from irc_chat_reader import ChatReader

In [4]:
token_indexers = {"tokens": SingleIdTokenIndexer()}

tokenizer_cfg = Params({"word_splitter": {"language": "en"}})

word_tokenizer = Tokenizer.from_params(tokenizer_cfg)

reader = ChatReader(
    tokenizer=word_tokenizer,
    token_indexers=token_indexers,
    raw = True,
    sub_sequence = 10,
    loop = True,
    #clip = 200
    )


In [5]:
train_instances = reader.read("../data/train")


67415it [00:36, 1855.37it/s]


In [6]:
dev_instances = reader.read("../data/dev")

2497it [00:01, 2169.79it/s]


In [7]:
vocab = Vocabulary.from_instances(train_instances+dev_instances)

100%|██████████| 69912/69912 [00:05<00:00, 13943.42it/s]


In [8]:
input_size = 300

turn_encoder_cfg = Params({"type":"gru",'input_size': input_size, 'hidden_size': 100, 'num_layers': 1,
                      'dropout': 0.25, 'bidirectional': True
})
#can be changed dynamically encoder_cfg["type"] = "lstm"
# warning: if bidirectional, state output dimension is hidden_size x 2 

turn_encoder = Seq2VecEncoder.from_params(turn_encoder_cfg)
turn_encoder.hidden_size = turn_encoder_cfg["hidden_size"]*(1+turn_encoder_cfg["bidirectional"])



turn_feature_size = 1

chat_encoder_cfg = Params({"type":"gru",'input_size': turn_encoder.hidden_size, 'hidden_size': 100, 'num_layers': 3,
                  'dropout': 0.25, 'bidirectional': False
})
chat_encoder = Seq2SeqEncoder.from_params(chat_encoder_cfg)
chat_encoder.hidden_size = chat_encoder_cfg["hidden_size"]

glove_dim, glove_version = input_size, "../data/glove-ubuntu.txt"
glove_text_field_embedder = Embedding.from_params(vocab,Params({"pretrained_file": glove_version,
                                                          "embedding_dim": glove_dim,
                                                          "trainable": True
}))
    

#token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
#                        embedding_dim=300)
token_embedding = glove_text_field_embedder 
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

400000it [00:02, 190785.53it/s]


In [9]:
chat_encoder_cfg =  Params({
            "type": "stacked_bidirectional_lstm",
            "hidden_size": 400,
            "input_size": 200,
            "num_layers": 3,
            "recurrent_dropout_probability": 0.3,
            "use_highway": True
        })


In [10]:
from allennlp.models import Model
from typing import Dict, List, Iterable
from allennlp.modules import TimeDistributed
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits

In [11]:
from allennlp.models import Model
from allennlp.common.checks import check_dimensions_match, ConfigurationError
from allennlp.data import Vocabulary
from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder, Embedding, InputVariationalDropout
from allennlp.modules.matrix_attention.bilinear_matrix_attention import BilinearMatrixAttention
from allennlp.modules.matrix_attention.dot_product_matrix_attention import DotProductMatrixAttention
from allennlp.modules.matrix_attention.linear_matrix_attention import LinearMatrixAttention

from allennlp.modules import FeedForward
from allennlp.models.model import Model
from allennlp.nn import InitializerApplicator, Activation
from allennlp.nn.activations import Activation
from allennlp.nn.util import add_positional_features


#???? TODO from allennlp.nn.util import min_value_of_dtype -> only allennlp >= 1.0
def min_value_of_dtype(dtype: torch.dtype):
    """
    Returns the minimum value of a given PyTorch data type. Does not allow torch.bool.
    """
    return info_value_of_dtype(dtype).min
def info_value_of_dtype(dtype: torch.dtype):
    """
    Returns the `finfo` or `iinfo` object of a given PyTorch data type. Does not allow torch.bool.
    """
    if dtype == torch.bool:
        raise TypeError("Does not support torch.bool")
    elif dtype.is_floating_point:
        return torch.finfo(dtype)
    else:
        return torch.iinfo(dtype)

# utility for max decoding. should be a better way but...
# should also not modify the original tensor ?
def force_max(t):
    """set all values of a tensor (batch,length,length) to zero if not maximum (on a given line)"""
    dims = t.size()
    b,_ = t.max(axis=2)
    bb = b.repeat((1,dims[1]))
    bb = torch.reshape(bb,(dims[0],dims[1],dims[1])).transpose(1,2)
    #print(bb)
    t[t<bb]=0
    return t
    
    

from allennlp.nn.util import get_text_field_mask
from allennlp.nn.util import get_lengths_from_binary_sequence_mask
from allennlp.training.metrics import F1Measure

import copy
from overrides import overrides
import torch
from torch.nn.modules import Dropout
from torch.nn.modules.linear import Linear
import numpy
import pandas as pds

class ChatGraphParser(Model):
    """
    A Parser for arbitrary graph structures.

    Registered as a `Model` with name "graph_parser".

    # Parameters

    vocab : `Vocabulary`, required
        A Vocabulary, required in order to compute sizes for input/output projections.
    text_field_embedder : `TextFieldEmbedder`, required
        Used to embed the `tokens` `TextField` we get as input to the model.
    turn_encoder : `Seq2VeqEncoder`
        The encoder that we will use to generate representation for whole turns from tokens.
    chat_encoder: `Seq2SeqEncoder`   The encoder that we will use to generate representations
        of turns within a chat
    arc_representation_dim : `int`, required.
        The dimension of the MLPs used for arc prediction.
        
    feature_size : `int`
        The embedding size for all the embedded features, such as distances
        
    tag_feedforward : `FeedForward`, optional, (default = None).
        The feedforward network used to produce tag representations.
        By default, a 1 layer feedforward network with an elu activation is used.
    arc_feedforward : `FeedForward`, optional, (default = None).
        The feedforward network used to produce arc representations.
        By default, a 1 layer feedforward network with an elu activation is used.

    dropout : `float`, optional, (default = 0.0)
        The variational dropout applied to the output of the encoder and MLP layers.
    input_dropout : `float`, optional, (default = 0.0)
        The dropout applied to the embedded text input.
    edge_prediction_threshold : `int`, optional (default = 0.5)
        The probability at which to consider a scored edge to be 'present'
        in the decoded graph. Must be between 0 and 1.
    initializer : `InitializerApplicator`, optional (default=`InitializerApplicator()`)
        Used to initialize the model parameters.
    """

    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        turn_encoder: Seq2VecEncoder, 
        chat_encoder: Seq2SeqEncoder,
        arc_representation_dim: int,
        feature_size = 4,
        arc_feedforward: FeedForward = None,
        use_features = False,
        turn_feature_size = 0,# total dimension of turn-based additional features
        pair_feature_size = 0,# total dimension of pair-based additional features
        dropout: float = 0.3,
        input_dropout: float = 0.0,
        edge_prediction_threshold: float = 0.5,
        positive_class_weight = 40,
        prediction_window = 13, # dont predict edge further apart
        initializer: InitializerApplicator = InitializerApplicator(),
        debug = False,
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)
        
        self.text_field_embedder = text_field_embedder
        self.turn_encoder = TimeDistributed(turn_encoder)
        self.chat_encoder = chat_encoder
        
        self.edge_prediction_threshold = edge_prediction_threshold
        if not 0 < edge_prediction_threshold < 1:
            raise ConfigurationError(
                f"edge_prediction_threshold must be between "
                f"0 and 1 (exclusive) but found {edge_prediction_threshold}."
            )

        encoder_dim = chat_encoder.get_output_dim()
        
        self.use_features = use_features
        self.turn_feature_size = turn_feature_size
        self.pair_feature_size = pair_feature_size
        
        self.head_arc_feedforward = FeedForward(
            encoder_dim+self.turn_feature_size, 1, arc_representation_dim, Activation.by_name("relu")()
        )
        self.child_arc_feedforward = copy.deepcopy(self.head_arc_feedforward)

        # 10 possible distance buckets.
        if self.use_features:
            self._num_distance_buckets = 10
            self._distance_embedding = Embedding(
                embedding_dim=feature_size, num_embeddings=self._num_distance_buckets
            )
            total_feature_size = self.pair_feature_size
        
        #
        #self.arc_attention = BilinearMatrixAttention(
        #    arc_representation_dim, arc_representation_dim, use_input_biases=True
        #)
        #self.arc_attention = DotProductMatrixAttention()
        self.arc_attention = LinearMatrixAttention(
            arc_representation_dim, arc_representation_dim, combination="x,y,x*y,x-y", activation = None
        )
           
        self.classif_layer = torch.nn.Linear(in_features=self.chat_encoder.hidden_size, out_features=2)
        self._loss = torch.nn.CrossEntropyLoss()
        self._dropout = InputVariationalDropout(dropout)
        self._input_dropout = Dropout(input_dropout)
        self.accuracy = CategoricalAccuracy()
        self.f1measure = F1Measure(positive_label=1)
        # unused for now
        self.final_activation = Activation.by_name("sigmoid")
        # tested with distance feature (embedding) and spk_add boolean; final feedforward accounts also for attention scores
        if self.use_features:
            self.final_score = TimeDistributed(Linear(1+total_feature_size,1))#,self.final_activation))
        
        representation_dim = turn_encoder.get_output_dim()

        self._unlabelled_f1 = F1Measure(positive_label=1)
        #  with weight favoring recall of positive class 
        self._arc_loss = torch.nn.BCEWithLogitsLoss(reduction="none",
                                                    pos_weight=torch.tensor([positive_class_weight]))
        
        self.prediction_window = prediction_window
        initializer(self)
        # useful for debugging
        self.iter_count = 0 
        self.debug = debug
    # init done
        
    # todo 
    @overrides
    def forward(
        self,  # type: ignore
        lines,
        arcs: torch.LongTensor = None,
        loops = None,
        rel_features = None,
        offsets = None,
        is_server = None,
        metadata: List[Dict[str, Any]] = None,
    ) -> Dict[str, torch.Tensor]:

        """
        # Parameters

        lines: the chat as a list of turns, each being a list of token
        TODO: add metadata to instances
        metadata : List[Dict[str, Any]], optional (default = None)
            A dictionary of metadata for each batch element which has keys:
                tokens : `List[str]`, required.
                    The original string tokens in the sentence.
        arcs : a tensor containing the adjacency matrix for the instance dependencies between turns
            Has shape `(batch_size, sequence_length, sequence_length)`.

        # Returns

        An output dictionary.
        """
        #########
        # this is the part where chat is encoded as sequence of turn encodings
        #########
        # mask for each turn of each chat of the batch: shape = (batch_size x max_turns x tokens)
        token_mask = get_text_field_mask(lines,num_wrapping_dims=1)

        # chat turns fetching embedding
        # turns_embedding tensor is (batch_size x turns x max tokens x token embedding size)
        turns_embeddings = self.text_field_embedder(lines,num_wrapping_dims=1)
      
        # encoding turns
        # turn_h has shape (batch_size x turns x encoder_output_size) 
        turn_h = self.turn_encoder(turns_embeddings,token_mask)
        
        # mask for chats is now nb of turns; beware weird return type of torch.max (tuple) 
        chat_mask = token_mask.max(axis=2)[0]
        
        # renaming to mask -> easier to transpose the rest of graph_parser
        mask = chat_mask
        
        # graph parser goes on
        # leave input dropout for now
        # embedded_text_input = turn_h equivalent in hierarchical sequence -> renaming 
        #embedded_text_input = self._input_dropout(embedded_text_input)
        embedded_text_input = turn_h
        
        # encoded_turns = encoded chat = self.chat_encoder(turn_h,chat_mask) equivalent in hierarchical sequence
        encoded_turns = self.chat_encoder(embedded_text_input, mask)
        #breakpoint()
        encoded_turns = self._dropout(encoded_turns)
        
        
        
        logits = self.classif_layer(encoded_turns)[:,-1,:]
        probs = torch.nn.functional.softmax(logits, dim=-1)
        output_dict = {"logits": logits, "probs": probs, "label": loops}
        #print(logits, probs, loops)
        #for k, label in enumerate(loops):
            #if "server" not in [x["tokens"][-1] for x in metadata][k][1]:
                #print(label, probs[k], [x["tokens"][-1] for x in metadata][k])

        if loops is not None:
#            print(logits, loops)
#            print(loops.long().view(-1))
#            print(logits.size(), loops.size())
            loss = self._loss(logits, loops.long().view(-1))
            output_dict["loss"] = loss
            self.f1measure(logits, loops)
            self.accuracy(logits, loops)
        
        return output_dict
    # modified / partially tested
    def _construct_loss(
        self,
        arc_scores: torch.Tensor,
        arc_tags: torch.Tensor,
        mask: torch.BoolTensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Computes the arc loss for an adjacency matrix.

        # Parameters

        arc_scores : `torch.Tensor`, required.
            A tensor of shape (batch_size, sequence_length, sequence_length) used to generate a
            binary classification decision for whether an edge is present between two words.
        arc_tags : `torch.Tensor`, required.
            A tensor of shape (batch_size, sequence_length, sequence_length).
            The labels for every arc (0/1).
        mask : `torch.BoolTensor`, required.
            A mask of shape (batch_size, sequence_length), denoting unpadded
            elements in the sequence.

        # Returns

        arc_nll : `torch.Tensor`, required.
            The negative log likelihood from the arc loss.
        tag_nll : `torch.Tensor`, required.
            The negative log likelihood from the arc tag loss.
        """
        arc_indices = (arc_tags != -1).float()
        # Make the arc tags not have negative values anywhere
        # (by default, no edge is indicated with -1).
        arc_tags = arc_tags * arc_indices
        arc_nll = self._arc_loss(arc_scores, arc_indices) * mask.unsqueeze(1) * mask.unsqueeze(2)
        
        # We want the mask for the tags to only include the unmasked words
        # and we only care about the loss with respect to the gold arcs.
        tag_mask = mask.unsqueeze(1) * mask.unsqueeze(2) * arc_indices

        #batch_size, sequence_length, _, num_tags = arc_tag_logits.size()
        #original_shape = [batch_size, sequence_length, sequence_length]
        #reshaped_logits = arc_tag_logits.view(-1, num_tags)
        reshaped_tags = arc_tags.view(-1)
        #tag_nll = (
        #    self._tag_loss(reshaped_logits, reshaped_tags.long()).view(original_shape) * tag_mask
        #)

        valid_positions = tag_mask.sum()

        arc_nll = arc_nll.sum() / valid_positions.float()
        #tag_nll = tag_nll.sum() / valid_positions.float()
        return arc_nll#, tag_nll
    
    
    # modified / untested
    # Warning: the initial Dozat & Manning implementation filter prediction with threshold here instead
    # of decoding, but not in latest 1.1 allennlp version
    #-------
    # this is supposed to be called when ? doc unclear. says with model.forward_on_instances but does not seem
    # to be the case (see tests below)
    #-------
    # no method for Model in used version (0.9?)
    #@overrides
    def make_output_human_readable(
        self, output_dict: Dict[str, torch.Tensor]
    ) -> Dict[str, torch.Tensor]:
        #breakpoint()
        arc_probs = output_dict["arc_probs"].cpu().detach().numpy()
        mask = output_dict["mask"]
        lengths = get_lengths_from_binary_sequence_mask(mask)
        arcs = []
        arc_tags = []
        for instance_arc_probs, length in zip(
            arc_probs, lengths
        ):

            arc_matrix = instance_arc_probs > self.edge_prediction_threshold
            edges = []
            #edge_tags = []
            for i in range(length):
                for j in range(length):
                    if arc_matrix[i, j] == 1:
                        edges.append((i, j))
                        #tag = instance_arc_tag_probs[i, j].argmax(-1)
                        #edge_tags.append(self.vocab.get_token_from_index(tag, "labels"))
            arcs.append(edges)
            #arc_tags.append(edge_tags)

        output_dict["arcs"] = arcs
        #output_dict["arc_tags"] = arc_tags
        return output_dict
    
    
    
    # modified/partially tested
    @staticmethod
    def _greedy_decode(
        arc_scores: torch.Tensor, 
        mask: torch.BoolTensor,
        prediction_window: int,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Decodes the head and head tag predictions by decoding the unlabeled arcs
        independently for each word and then again, predicting the head tags of
        these greedily chosen arcs independently.

        # Parameters

        arc_scores : `torch.Tensor`, required.
            A tensor of shape (batch_size, sequence_length, sequence_length) used to generate
            a distribution over attachments of a given word to all other words.
        ###arc_tag_logits : `torch.Tensor`, required.
        ###    A tensor of shape (batch_size, sequence_length, sequence_length, num_tags) used to
        ###    generate a distribution over tags for each arc.
        mask : `torch.BoolTensor`, required.
            A mask of shape (batch_size, sequence_length).

        # Returns

        arc_probs : `torch.Tensor`
            A tensor of shape (batch_size, sequence_length, sequence_length) representing the
            probability of an arc being present for this edge.
        ####arc_tag_probs : `torch.Tensor`
        ####    A tensor of shape (batch_size, sequence_length, sequence_length, sequence_length)
        ####    representing the distribution over edge tags for a given edge.
        """
        # Mask the diagonal, because we don't self edges.
        # WARNING: might not be the case for chats ? -> should be an option
        #inf_diagonal_mask = torch.diag(arc_scores.new(mask.size(1)).fill_(-float('inf')))
        
        # no edges going backwards
        triangle_upper_mask = torch.triu(arc_scores.new(mask.size(1),mask.size(1)).fill_(-float('inf')))
        # prevent edges between turns more than prediction_window turns appart
        up = 1-torch.triu(arc_scores.new(mask.size(1),mask.size(1)).fill_(1),diagonal=prediction_window)
        down = 1-torch.tril(arc_scores.new(mask.size(1),mask.size(1)).fill_(1),diagonal=-prediction_window)
        diag_mask = torch.log(((up+down)-1))
        # up_mask = torch.triu(arc_scores.new_zeros(mask.size(1),mask.size(1),diagonal=-15)
        # down_mask = torch.tril(arc_scores.new_zeros(mask.size(1),mask.size(1),diagonal=15)
        # away_mask = (up_mask == down_mask)
        #arc_scores = arc_scores + inf_diagonal_mask 
        arc_scores = arc_scores + diag_mask + triangle_upper_mask
        
        # shape (batch_size, sequence_length, sequence_length, num_tags)
        #arc_tag_logits = arc_tag_logits + inf_diagonal_mask.unsqueeze(0).unsqueeze(-1)
        # Mask padded tokens, because we only want to consider actual word -> word edges.
        # CHAT: this is the wrong torch version lol this does not work/ confusion int/bools
        # minus_mask = ~mask.unsqueeze(2)
        # CHAT: this should work with torch>1.4
        #minus_mask = (mask<1).unsqueeze(2)
        minus_mask = (mask.unsqueeze(1) & mask.unsqueeze(2))<1
        
        arc_scores.masked_fill_(minus_mask, -float('inf'))
        # 
        
        #arc_tag_logits.masked_fill_(minus_mask.unsqueeze(-1), -float('inf'))
        # shape (batch_size, sequence_length, sequence_length)
        arc_probs = arc_scores.sigmoid()
        # best arc option: set all but the best arc for a given target to 0 (== best head) 
        #force_max(arc_probs)
        
        # shape (batch_size, sequence_length, sequence_length, num_tags)
        #arc_tag_probs = torch.nn.functional.softmax(arc_tag_logits, dim=-1)
        return arc_probs#, arc_tag_probs
    # modified / untested
    
    @overrides
#    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
#        return {"accuracy": self.accuracy.get_metric(reset)}
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        metrics = {}
        metrics["accuracy"] =  self.accuracy.get_metric(reset)
        precision, recall, f1_measure = self.f1measure.get_metric(reset)
        metrics["precision"] = precision
        metrics["recall"] = recall
        metrics["f1"] = f1_measure
        return metrics

    # since human readable not accessible, post process the outputs of
    # model.forward_on_instances
    # includes prob threshold here
    def extract_threshold(self,output_dict):
        instance_arc_probs = output_dict["arc_probs"]
        mask = output_dict["mask"]
        length = len(mask)
        if True:
            arc_matrix = instance_arc_probs > self.edge_prediction_threshold
            edges = []
            for i in range(length):
                for j in range(length):
                    if arc_matrix[i, j] == 1:
                        edges.append((i, j))

        output_dict["arcs"] = edges
        return output_dict

    def extract_best(self,output_dict):
        best_heads = output_dict["arc_probs"].argmax(axis=1)
        best_values = output_dict["arc_probs"].max(axis=1)
        heads = list(zip(range(len(best_heads)),zip(best_heads,best_values)))
        output_dict["arcs"] = heads
        return output_dict
    
    def predict_graph(self,instances,data_frame=False,best=False):
        """apply model on a bunch of instances

        data_frame: if True, produce a pandas DataFrame, otherwise just add explicit edges to model output

        best: if True, take only best head for each turn 
        """
        outputs = self.forward_on_instances(instances)
        
        if best:    
            outputs = [model.extract_best(x) for x in outputs]
        else:
            outputs = [model.extract_threshold(x) for x in outputs]
            
        if not data_frame: 
            return outputs

        # save only positive cases for now
        frame = []
        for one in outputs:
            source = one["metadata"]["file_source"]
            start_idx = one["metadata"]["first_line"]
            #print(source,start_idx)
            for (i,j) in one["arcs"]:
                if best: 
                    tid, label = j
                else:
                    tid, label = j,1
                frame.append((source,i+start_idx,tid+start_idx,"<sentence>","<sentence>",label))
        return pds.DataFrame(frame,columns=["source_file","source","target","sentence1","sentence2","label"])

In [12]:
arc_representation_dim = 50 
feature_size = 4

model = ChatGraphParser(vocab,word_embeddings,
                        turn_encoder,chat_encoder,arc_representation_dim,
                        use_features = True,
                        turn_feature_size = 1,
                        feature_size = feature_size,
                        pair_feature_size = 1+feature_size,
                        prediction_window = 10,
                        positive_class_weight = 1000,
                        debug=False,
                        edge_prediction_threshold=0.55)


if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1

#cuda_device = -1
    
    
# not used yet, but ready
from allennlp.training.optimizers import Optimizer
trainer_cfg = Params({
        "cuda_device": cuda_device,
        "grad_norm": 5,
        "num_epochs": 100,
        "optimizer": {
            "type": "dense_sparse_adam",
            "betas": [
                0.9,
                0.9
            ]
        },
        "patience": 50,
})
opt_cfg = trainer_cfg.pop("optimizer")

In [13]:
#optimizer = optim.SGD(model.parameters(), lr=0.01)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
#optimizer = optim.SparseAdam(model.parameters(), lr=0.001)

# not tested
#optimizer = Optimizer.from_params(model_parameters=model.parameters(),
#                                  params=opt_cfg)

# "lines" <-> "tokens"
iterator = BucketIterator(batch_size=512,
                          sorting_keys=[("lines","list_num_tokens")])
iterator.index_with(vocab)


# requires Allennlp > 1.0
#from allennlp.training.trainer import EpochCallback
#class Epoch_nb(EpochCallback):
#    
#    def __call__(
#          self,
#          trainer: "GradientDescentTrainer",
#          metrics: Dict[str, Any],
#          epoch: int
#      ) -> None:
#        print("Epoch nb %d"%epoch,end="\t")

    



In [None]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  num_epochs=10,
                  grad_norm=5,
                  patience=10,
                  cuda_device = cuda_device,
                  train_dataset=train_instances,
                  validation_dataset=dev_instances,
                  serialization_dir="/data/sebastien/loops/training/train_"+str(datetime.now().isoformat()), 
                  should_log_parameter_statistics = False
)

In [None]:

trainer.train()

In [None]:
# (from tutorial) Here's how to save the model.
with open("/data/sebastien/loops/model.th", 'wb') as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files("/data/sebastien/loops/vocabulary")


In [14]:
model2 = ChatGraphParser(vocab,word_embeddings,
                        turn_encoder,chat_encoder,arc_representation_dim,
                        use_features = True,
                        turn_feature_size = 1,
                        feature_size = feature_size,
                        pair_feature_size = 1+feature_size,
                        prediction_window = 10,
                        positive_class_weight = 1000,
                        debug=False,
                        edge_prediction_threshold=0.55)

In [15]:
with open("/data/sebastien/loops/training/train_2021-01-19T16:29:46.403784/model_state_epoch_5.th", 'rb') as f:
    model2.load_state_dict(torch.load(f))

In [21]:
model2.cpu()

ChatGraphParser(
  (text_field_embedder): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (turn_encoder): TimeDistributed(
    (_module): PytorchSeq2VecWrapper(
      (_module): GRU(300, 100, batch_first=True, dropout=0.25, bidirectional=True)
    )
  )
  (chat_encoder): PytorchSeq2SeqWrapper(
    (_module): GRU(200, 100, num_layers=3, batch_first=True, dropout=0.25)
  )
  (head_arc_feedforward): FeedForward(
    (_linear_layers): ModuleList(
      (0): Linear(in_features=101, out_features=50, bias=True)
    )
    (_dropout): ModuleList(
      (0): Dropout(p=0.0, inplace=False)
    )
  )
  (child_arc_feedforward): FeedForward(
    (_linear_layers): ModuleList(
      (0): Linear(in_features=101, out_features=50, bias=True)
    )
    (_dropout): ModuleList(
      (0): Dropout(p=0.0, inplace=False)
    )
  )
  (_distance_embedding): Embedding()
  (arc_attention): LinearMatrixAttention()
  (classif_layer): Linear(in_features=100, out_features=2, bias=True)
  (_loss):

In [None]:
dev_instances = reader.read("../data/dev")

In [None]:
outputs = model2.forward_on_instances(dev_instances)

In [None]:
with open("loops_prediction.tsv", "w") as f:
    f.write("index\tfile_id\tsource\ttarget\tsentence\tlabel\tproba0\tproba1\tprediction\n")
    for i in range(len(dev_instances)):
        f.write(f"{i}\t{dev_instances[i]['metadata'].metadata['file_source']}"
              + f"\t{dev_instances[i]['metadata'].metadata['token_line']}"
              + f"\t{dev_instances[i]['metadata'].metadata['token_line']}"
              + "\t"+" ".join(map(str, dev_instances[i]['metadata'].metadata['tokens'][-1]))
              + f"\t{dev_instances[i]['loops'].label}"
              + f"\t{outputs[i]['probs'][0]}"
              + f"\t{outputs[i]['probs'][1]}"
              + f"\t{outputs[i]['probs'].argmax()}\n"
             )

In [16]:
test_instances = reader.read("../data/test")

4999it [00:02, 2032.48it/s]


In [22]:
outputs = model2.forward_on_instances(test_instances)

Encountered the loss key in the model's return dictionary which couldn't be split by the batch size. Key will be ignored.


In [23]:
with open("test_loops_prediction.tsv", "w") as f:
    f.write("index\tfile_id\tsource\ttarget\tsentence\tlabel\tproba0\tproba1\tprediction\n")
    for i in range(len(test_instances)):
        f.write(f"{i}\t{test_instances[i]['metadata'].metadata['file_source']}"
              + f"\t{test_instances[i]['metadata'].metadata['token_line']}"
              + f"\t{test_instances[i]['metadata'].metadata['token_line']}"
              + "\t"+" ".join(map(str, test_instances[i]['metadata'].metadata['tokens'][-1]))
              + f"\t{test_instances[i]['loops'].label}"
              + f"\t{outputs[i]['probs'][0]}"
              + f"\t{outputs[i]['probs'][1]}"
              + f"\t{outputs[i]['probs'].argmax()}\n"
             )

In [24]:
f1m = F1Measure(positive_label=1)

In [41]:
import numpy as np
f1m(torch.tensor([x["logits"] for x in outputs]), torch.tensor([x['loops'] for x in test_instances]))

RuntimeError: Could not infer dtype of LabelField

In [34]:
[x["logits"] for x in outputs]

[array([ 0.02097616, -0.05523992], dtype=float32),
 array([ 0.02097616, -0.05523992], dtype=float32),
 array([ 0.02097616, -0.05523992], dtype=float32),
 array([ 0.02097616, -0.05523992], dtype=float32),
 array([ 0.02097616, -0.05523992], dtype=float32),
 array([ 0.02097616, -0.05523992], dtype=float32),
 array([ 0.02097616, -0.05523992], dtype=float32),
 array([ 0.02097616, -0.05523992], dtype=float32),
 array([ 0.02097616, -0.05523992], dtype=float32),
 array([ 0.02097616, -0.05523992], dtype=float32),
 array([ 2.1005635, -1.7159123], dtype=float32),
 array([ 0.76982343, -0.48564577], dtype=float32),
 array([ 1.790357 , -1.2614411], dtype=float32),
 array([ 3.362391, -3.034902], dtype=float32),
 array([-0.9483521,  0.6841781], dtype=float32),
 array([ 2.7722416, -2.4350452], dtype=float32),
 array([ 1.4719468, -1.1286428], dtype=float32),
 array([ 2.6732879, -2.068485 ], dtype=float32),
 array([ 1.7999927, -1.5936846], dtype=float32),
 array([ 3.0509598, -2.523812 ], dtype=float32),
