In [110]:
### this one to test the dgl 
import sys
sys.path.append("/home/n/nguyenpk/CS6208/GNN_ERC/baseline/DialogueGCN-mianzhang")

### Classifier

In [111]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import dgcn

log = dgcn.utils.get_logger()


class Classifier(nn.Module):
    def __init__(self, input_dim, hidden_size, tag_size, args):
        super(Classifier, self).__init__()
        self.emotion_att = MaskedEmotionAtt(input_dim)
        self.lin1 = nn.Linear(input_dim, hidden_size)
        self.drop = nn.Dropout(args.drop_rate)
        self.lin2 = nn.Linear(hidden_size, tag_size)
        if args.class_weight:
            self.loss_weights = torch.tensor([1 / 0.086747, 1 / 0.144406, 1 / 0.227883,
                                              1 / 0.160585, 1 / 0.127711, 1 / 0.252668]).to(args.device)
            self.nll_loss = nn.NLLLoss(self.loss_weights)
        else:
            self.nll_loss = nn.NLLLoss()

    def get_prob(self, h, text_len_tensor):
        # h_hat = self.emotion_att(h, text_len_tensor)
        # hidden = self.drop(F.relu(self.lin1(h_hat)))
        hidden = self.drop(F.relu(self.lin1(h)))
        scores = self.lin2(hidden)
        log_prob = F.log_softmax(scores, dim=-1)

        return log_prob

    def forward(self, h, text_len_tensor):
        log_prob = self.get_prob(h, text_len_tensor)
        y_hat = torch.argmax(log_prob, dim=-1)

        return y_hat

    def get_loss(self, h, label_tensor, text_len_tensor):
        log_prob = self.get_prob(h, text_len_tensor)
        loss = self.nll_loss(log_prob, label_tensor)

        return loss


class MaskedEmotionAtt(nn.Module):

    def __init__(self, input_dim):
        super(MaskedEmotionAtt, self).__init__()
        self.lin = nn.Linear(input_dim, input_dim)

    def forward(self, h, text_len_tensor):
        batch_size = text_len_tensor.size(0)
        x = self.lin(h)  # [node_num, H]
        ret = torch.zeros_like(h)
        s = 0
        for bi in range(batch_size):
            cur_len = text_len_tensor[bi].item()
            y = x[s: s + cur_len]
            z = h[s: s + cur_len]
            scores = torch.mm(z, y.t())  # [L, L]
            probs = F.softmax(scores, dim=1)
            out = z.unsqueeze(0) * probs.unsqueeze(-1)  # [1, L, H] x [L, L, 1] --> [L, L, H]
            out = torch.sum(out, dim=1)  # [L, H]
            ret[s: s + cur_len, :] = out
            s += cur_len

        return ret




### EdgeAtt

In [112]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import dgcn

log = dgcn.utils.get_logger()


class EdgeAtt(nn.Module):

    def __init__(self, g_dim, args):
        super(EdgeAtt, self).__init__()
        self.device = args.device
        self.wp = args.wp
        self.wf = args.wf

        self.weight = nn.Parameter(torch.zeros((g_dim, g_dim)).float(), requires_grad=True)
        var = 2. / (self.weight.size(0) + self.weight.size(1))
        self.weight.data.normal_(0, var)

    def forward(self, node_features, text_len_tensor, edge_ind):
        batch_size, mx_len = node_features.size(0), node_features.size(1)
        alphas = []

        weight = self.weight.unsqueeze(0).unsqueeze(0)
        att_matrix = torch.matmul(weight, node_features.unsqueeze(-1)).squeeze(-1)  # [B, L, D_g]
        for i in range(batch_size):
            cur_len = text_len_tensor[i].item()
            alpha = torch.zeros((mx_len, 110)).to(self.device)
            for j in range(cur_len):
                s = j - self.wp if j - self.wp >= 0 else 0
                e = j + self.wf if j + self.wf <= cur_len - 1 else cur_len - 1
                tmp = att_matrix[i, s: e + 1, :]  # [L', D_g]
                feat = node_features[i, j]  # [D_g]
                score = torch.matmul(tmp, feat)
                probs = F.softmax(score, dim=1)  # [L']
                alpha[j, s: e + 1] = probs
            alphas.append(alpha)

        return alphas

# class EdgeAtt(nn.Module):
#
#     def __init__(self, g_dim, args):
#         super(EdgeAtt, self).__init__()
#         self.device = args.device
#         self.wp = args.wp
#         self.wf = args.wf
#         self.lin = nn.Linear(g_dim, 110)
#
#     def forward(self, node_features, text_len_tensor, edge_ind):
#         h = self.lin(node_features)  # [B, L, mx]
#         alphas = F.softmax(h, dim=-1)
#         # alphas = torch.ones((node_features.size(0), node_features.size(1), 110))
#         return alphas


### GCN

In [113]:
### original

In [114]:
import torch.nn as nn
import dgl
from dgl.nn.pytorch import RelGraphConv as RGCNConv
from dgl.nn.pytorch import GraphConv

class GCN(nn.Module):

    def __init__(self, g_dim, h1_dim, h2_dim, args):
        super(GCN, self).__init__()
        self.num_relations = 2 * args.n_speakers ** 2
        self.conv1 = RGCNConv(g_dim, h1_dim, self.num_relations, num_bases=30)
        self.conv2 = GraphConv(h1_dim, h2_dim)
        if args.device != 'cpu':
            self.conv1 = self.conv1.cuda()
            self.conv2 = self.conv2.cuda()

    def forward(self, node_features, edge_index, edge_norm, edge_type):
        g = dgl.graph((edge_index[0], edge_index[1]))
        g.norm = edge_norm
        x = self.conv1(g, node_features, edge_type)
        x = self.conv2(g, x)

        return x

### SeqContext


In [238]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class SeqContext(nn.Module):

    def __init__(self, u_dim, g_dim, args):
        super(SeqContext, self).__init__()
        self.input_size = u_dim
        self.hidden_dim = g_dim
        if args.rnn == "lstm":
            self.rnn = nn.LSTM(self.input_size, self.hidden_dim // 2, dropout=args.drop_rate,
                               bidirectional=True, num_layers=2, batch_first=True)
        elif args.rnn == "gru":
            self.rnn = nn.GRU(self.input_size, self.hidden_dim // 2, dropout=args.drop_rate,
                              bidirectional=True, num_layers=2, batch_first=True)

    def forward(self, text_len_tensor, text_tensor):
        packed = pack_padded_sequence(
            text_tensor,
            text_len_tensor.cpu(),
            batch_first=True,
            enforce_sorted=False
        )
        # print(len(self.rnn(packed, None)))
        # rnn_out, (_, _) = self.rnn(packed, None)
        rnn_out,  _ = self.rnn(packed, None)
        rnn_out, _ = pad_packed_sequence(rnn_out, batch_first=True)

        return rnn_out


### function

In [239]:
import numpy as np
import torch

import dgcn

log = dgcn.utils.get_logger()


def batch_graphify(features, lengths, speaker_tensor, wp, wf, edge_type_to_idx, att_model, device):
    node_features, edge_index, edge_norm, edge_type = [], [], [], []
    batch_size = features.size(0)
    length_sum = 0
    edge_ind = []
    edge_index_lengths = []

    for j in range(batch_size):
        edge_ind.append(edge_perms(lengths[j].cpu().item(), wp, wf))

    edge_weights = att_model(features, lengths, edge_ind)

    for j in range(batch_size):
        cur_len = lengths[j].item()
        node_features.append(features[j, :cur_len, :])
        perms = edge_perms(cur_len, wp, wf)
        perms_rec = [(item[0] + length_sum, item[1] + length_sum) for item in perms]
        length_sum += cur_len
        edge_index_lengths.append(len(perms))

        for item, item_rec in zip(perms, perms_rec):
            edge_index.append(torch.tensor([item_rec[0], item_rec[1]]))
            edge_norm.append(edge_weights[j][item[0], item[1]])
            # edge_norm.append(edge_weights[j, item[0], item[1]])

            speaker1 = speaker_tensor[j, item[0]].item()
            speaker2 = speaker_tensor[j, item[1]].item()
            if item[0] < item[1]:
                c = '0'
            else:
                c = '1'
            edge_type.append(edge_type_to_idx[str(speaker1) + str(speaker2) + c])

    node_features = torch.cat(node_features, dim=0).to(device)  # [E, D_g]
    edge_index = torch.stack(edge_index).t().contiguous().to(device)  # [2, E]
    edge_norm = torch.stack(edge_norm).to(device)  # [E]
    edge_type = torch.tensor(edge_type).long().to(device)  # [E]
    edge_index_lengths = torch.tensor(edge_index_lengths).long().to(device)  # [B]

    graph_out = dgl.graph((edge_index[0], edge_index[1]))
    graph_out.norm = edge_norm
    graph_out.ndata['feat'] = node_features
    graph_out.ndata['PE'] = dgl.laplacian_pe(graph_out, k=pos_enc_size, padding=True).to(device)
    graph_out.edata['feat'] = edge_norm
    graph_out.edge_index_lengths = edge_index_lengths
    return graph_out


def edge_perms(length, window_past, window_future):
    """
    Method to construct the edges of a graph (a utterance) considering the past and future window.
    return: list of tuples. tuple -> (vertice(int), neighbor(int))
    """

    all_perms = set()
    array = np.arange(length)
    for j in range(length):
        perms = set()

        if window_past == -1 and window_future == -1:
            eff_array = array
        elif window_past == -1:  # use all past context
            eff_array = array[:min(length, j + window_future + 1)]
        elif window_future == -1:  # use all future context
            eff_array = array[max(0, j - window_past):]
        else:
            eff_array = array[max(0, j - window_past):min(length, j + window_future + 1)]

        for item in eff_array:
            perms.add((j, item))
        all_perms = all_perms.union(perms)
    return list(all_perms)


## Read data and check training

In [240]:
# node_features = model.rnn(data["text_len_tensor"], data["text_tensor"]) # [batch_size, mx_len, D_g]
# features, edge_index, edge_norm, edge_type, edge_index_lengths = batch_graphify(
#     node_features, data["text_len_tensor"], data["speaker_tensor"], args.wp, args.wf,
#     model.edge_type_to_idx, model.edge_att, args.device)

# # graph_out = self.gcn(features, edge_index, edge_norm, edge_type)

In [241]:
# import dgl
# from dgl.nn import RelGraphConv
# from dgl.nn.pytorch import GraphConv

# g_dim = 200
# h1_dim = 100
# h2_dim = 100
# hc_dim = 100
# tag_size = 6


# g = dgl.graph((edge_index[0], edge_index[1]))
# g.norm = edge_norm
# # g.etypes= edge_type
# conv = RelGraphConv(g_dim, h1_dim, h2_dim, regularizer='basis', num_bases=30).cuda()
# conv1 = GraphConv(h1_dim, h2_dim).cuda()
# # res = conv(g, feat, etype)

# res = conv(g, features, edge_type)
# res2 = conv1(g, res)

In [242]:
g

Graph(num_nodes=1567, num_edges=29387,
      ndata_schemes={'feat': Scheme(shape=(200,), dtype=torch.float32), 'PE': Scheme(shape=(2,), dtype=torch.float32)}
      edata_schemes={})

In [328]:
## graph transformer 
torch.backends.cudnn.enabled=False

In [364]:
import dgl
import dgl.nn as dglnn
import dgl.sparse as dglsp
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from dgl.data import AsGraphPredDataset
from dgl.dataloading import GraphDataLoader
from ogb.graphproppred import collate_dgl, DglGraphPropPredDataset, Evaluator
from ogb.graphproppred.mol_encoder import AtomEncoder
from tqdm import tqdm



class MLP_layer(nn.Module):
    
    def __init__(self, input_dim, output_dim, L=2): # L = nb of hidden layers
        super(MLP_layer, self).__init__()
        list_FC_layers = [ nn.Linear( input_dim, input_dim, bias=True ) for l in range(L) ]
        list_FC_layers.append(nn.Linear( input_dim, output_dim , bias=True ))
        self.FC_layers = nn.ModuleList(list_FC_layers)
        self.L = L
        
    def forward(self, x):
        y = x
        for l in range(self.L):
            y = self.FC_layers[l](y)
            y = torch.relu(y)
        y = self.FC_layers[self.L](y)
        return y

class SparseMHA(nn.Module):
    """Sparse Multi-head Attention Module"""

    def __init__(self, hidden_size=80, num_heads=8):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.scaling = self.head_dim**-0.5

        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)
        self.out_proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, A, h):
        N = len(h)
        # [N, dh, nh]
        q = self.q_proj(h).reshape(N, self.head_dim, self.num_heads)
        q *= self.scaling
        # [N, dh, nh]
        k = self.k_proj(h).reshape(N, self.head_dim, self.num_heads)
        # [N, dh, nh]
        v = self.v_proj(h).reshape(N, self.head_dim, self.num_heads)

        ######################################################################
        # (HIGHLIGHT) Compute the multi-head attention with Sparse Matrix API
        ######################################################################
        attn = dglsp.bsddmm(A, q, k.transpose(1, 0))  # (sparse) [N, N, nh]
        # Sparse softmax by default applies on the last sparse dimension.
        attn = attn.softmax()  # (sparse) [N, N, nh]
        out = dglsp.bspmm(attn, v)  # [N, dh, nh]

        return self.out_proj(out.reshape(N, -1))
    
class GTLayer(nn.Module):
    """Graph Transformer Layer"""

    def __init__(self, hidden_size=80, num_heads=8):
        super().__init__()
        self.MHA = SparseMHA(hidden_size=hidden_size, num_heads=num_heads)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)
        self.FFN1 = nn.Linear(hidden_size, hidden_size * 2)
        self.FFN2 = nn.Linear(hidden_size * 2, hidden_size)

    def forward(self, A, h):
        h1 = h
        h = self.MHA(A, h)
        h = self.batchnorm1(h + h1)

        h2 = h
        h = self.FFN2(F.relu(self.FFN1(h)))
        h = h2 + h

        return self.batchnorm2(h)

In [365]:
class GTModel(nn.Module):
    def __init__(
        self,
        out_size, # 6
        input_size=200, # g_dim
        hidden_size=80,
        pos_enc_size=2,
        num_layers=8,
        num_heads=8,
    ):
        super().__init__()
        self.embedding_h =  nn.Linear(input_dim, hidden_size)#dgl.nn.GATConv(input_dim, hidden_size, num_heads=num_heads)
        self.pos_linear = nn.Linear(pos_enc_size, hidden_size)
        self.layers = nn.ModuleList(
            [GTLayer(hidden_size, num_heads) for _ in range(num_layers)]
        )
        self.predictor = MLP_layer(hidden_size, out_size)

    def forward(self, g, X, pos_enc):
        indices = torch.stack(g.edges())
        N = g.num_nodes()
        A = dglsp.spmatrix(indices, shape=(N, N))

        h = self.embedding_h(X) + self.pos_linear(pos_enc)
        
        for layer in self.layers:
            h = layer(A, h)
        return self.predictor(h)


In [380]:
class DialogueGCN(nn.Module):
    def __init__(self, args):
        super(DialogueGCN, self).__init__()
        u_dim = 100
        g_dim = 200
        # h1_dim = 100
        # h2_dim = 100
        # hc_dim = 100
        tag_size = 6

        self.wp = args.wp
        self.wf = args.wf
        self.device = args.device
        #
        self.rnn = SeqContext(u_dim, g_dim, args)
        self.edge_att = EdgeAtt(g_dim, args)
        self.gtm = GTModel(tag_size, input_size= g_dim)
        self.softmax = nn.LogSoftmax(dim=1)
        edge_type_to_idx = {}
        for j in range(args.n_speakers):
            for k in range(args.n_speakers):
                edge_type_to_idx[str(j) + str(k) + '0'] = len(edge_type_to_idx)
                edge_type_to_idx[str(j) + str(k) + '1'] = len(edge_type_to_idx)
        self.edge_type_to_idx = edge_type_to_idx
        log.debug(self.edge_type_to_idx)

    def get_rep(self, data):
        node_features = self.rnn(data["text_len_tensor"], data["text_tensor"]) # [batch_size, mx_len, D_g]
        graph_out = batch_graphify(
            node_features, data["text_len_tensor"], data["speaker_tensor"], self.wp, self.wf,
            self.edge_type_to_idx, self.edge_att, self.device)
        return graph_out

    def forward(self, data):
        graph_out= self.get_rep(data)
        out = self.gtm(graph_out, graph_out.ndata['feat'], graph_out.ndata['PE'])
        # out = torch.argmax(out, dim=-1)
        return self.softmax(out)

#     def loss(self, y_scores, y_labels):
#         loss = nn.CrossEntropyLoss()(y_scores, y_labels)
#         return loss        
        
#     def accuracy(self, scores, targets):
#         scores = scores.detach().argmax(dim=1)
#         acc = (scores==targets).float().sum().item()
#         return acc
    
#     def update(self, lr):       
#         update = torch.optim.Adam( self.parameters(), lr=lr )
#         return update


In [381]:
import os
base_path = "/home/n/nguyenpk/CS6208/GNN_ERC/baseline/DialogueGCN-mianzhang"
data_path = os.path.join(base_path, "data/iemocap/ckpt/data.pkl")
batch_size = 32
device  = "cuda:0"
learning_rate = 0.0003
max_grad_value = -1
weight_decay = 1e-8
optimizer = "adam"

class Namespace:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

args = Namespace(batch_size=batch_size, 
                 device=device,
                 learning_rate=learning_rate,
                 max_grad_value=max_grad_value, 
                 weight_decay=weight_decay, 
                 optimizer=optimizer, 
                 from_begin=True,
                 epochs=20,
                 drop_rate=0.5,
                 wp=10,
                 wf=10,
                 n_speakers=2,
                 hidden_size=100,
                 rnn='gru',
                 class_weight=True,
                 seed=24,
                 
)
data = dgcn.utils.load_pkl(data_path)
trainset = dgcn.Dataset(data["train"], args.batch_size)
devset = dgcn.Dataset(data["dev"], args.batch_size)
testset = dgcn.Dataset(data["test"], args.batch_size)

model_file = "./save/model.pt"
model = DialogueGCN(args).to(device)
opt = dgcn.Optim(learning_rate, max_grad_value, weight_decay)
opt.set_parameters(model.parameters(), optimizer)

label_to_idx = {'hap': 0, 'sad': 1, 'neu': 2, 'ang': 3, 'exc': 4, 'fru': 5}


In [382]:
import sklearn
import copy
import logging


logging.basicConfig(filename='train_test.log', encoding='utf-8')
def evaluation(model, dataset,  device='cuda:0'):
    model.eval()
    y_true = []
    y_pred = []
    logits = []
    #-----------------------
    with torch.no_grad():
        for idx in range(len(dataset)):
            data = dataset[idx]
            y_true.append(data['label_tensor'])
            for k, v in data.items():
                data[k] = v.to(device)
            logits = model(data)
            y_hat = torch.argmax(logits, dim=-1).detach().cpu()
            y_pred.append(y_hat)
        y_true = torch.cat(y_true, dim=0).numpy()
        y_pred = torch.cat(y_pred, dim=0).numpy()
        f1  = sklearn.metrics.f1_score(y_true, y_pred,average="weighted")
        return f1
# -----------------------------------------------------------------------------

optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 20
scheduler = optim.lr_scheduler.StepLR(
    optimizer, step_size=num_epochs, gamma=0.5
)
# loss_fcn = nn.CrossEntropyLoss() # Can change as the paper itself
# loss_fcn = nn.NLLLoss()
loss_weights = torch.tensor([1 / 0.086747, 1 / 0.144406, 1 / 0.227883,
                                              1 / 0.160585, 1 / 0.127711, 1 / 0.252668]).to(args.device)
loss_fcn = nn.NLLLoss(loss_weights)
model.train()

best_state = None
best_dev_f1 = None
best_epoch = None

for epoch in range(args.epochs + 1):
    total_loss = 0
    for idx in range(len(trainset)):
        idata = trainset[idx]
        label = idata['label_tensor'].to(args.device)
        for k, v in idata.items():
            idata[k] = v.to(args.device)
        logits = model(idata)
        loss = loss_fcn(logits, label)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    ev_train = evaluation(model, trainset)
    dev1 = ev_train
    if best_epoch is None or dev1 > best_dev_f1:
        best_dev_f1 = dev1
        best_epoch = epoch
        best_state = copy.deepcopy(model.state_dict())
    ev_test = evaluation(model, testset)
    log.info('[Epochs {}: , loss: {},  f1_train:{}, f1_test: {}]'
          .format(epoch, loss,  ev_train, ev_test)
         )

  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:18:34 [Epochs 0: , loss: 1.5844167470932007,  f1_train:0.5794932159346788, f1_test: 0.4080484277190507]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:19:03 [Epochs 1: , loss: 0.9388782382011414,  f1_train:0.6339281738304317, f1_test: 0.5848785369892672]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:19:31 [Epochs 2: , loss: 0.6961487531661987,  f1_train:0.6862320217087663, f1_test: 0.6011949254569395]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:20:00 [Epochs 3: , loss: 0.5864019989967346,  f1_train:0.7321332469023084, f1_test: 0.5571674552689915]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:20:29 [Epochs 4: , loss: 0.4575531482696533,  f1_train:0.8180331575483564, f1_test: 0.571455294890675]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:20:57 [Epochs 5: , loss: 0.3425407111644745,  f1_train:0.871577075894544, f1_test: 0.5755200905115831]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:21:27 [Epochs 6: , loss: 0.21159973740577698,  f1_train:0.9027469924387953, f1_test: 0.5536779697462095]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:21:56 [Epochs 7: , loss: 0.15840855240821838,  f1_train:0.9225289329250447, f1_test: 0.5478835015279871]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:22:24 [Epochs 8: , loss: 0.1295231729745865,  f1_train:0.9290618723851851, f1_test: 0.5304414598558329]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:22:53 [Epochs 9: , loss: 0.08705036342144012,  f1_train:0.9401985142122491, f1_test: 0.5434787937526424]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:23:21 [Epochs 10: , loss: 0.07253216207027435,  f1_train:0.9470787368706426, f1_test: 0.5499216671191951]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:23:50 [Epochs 11: , loss: 0.05674681439995766,  f1_train:0.9516240612136447, f1_test: 0.5568869362142781]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:24:20 [Epochs 12: , loss: 0.04587560519576073,  f1_train:0.9595468276949358, f1_test: 0.5517029088932707]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:24:47 [Epochs 13: , loss: 0.03575770929455757,  f1_train:0.9658807524326024, f1_test: 0.5552729993697084]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:25:17 [Epochs 14: , loss: 0.028528830036520958,  f1_train:0.9698377605558662, f1_test: 0.5523048972624998]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:25:45 [Epochs 15: , loss: 0.0186869278550148,  f1_train:0.9756599078106937, f1_test: 0.5436536393286213]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:26:14 [Epochs 16: , loss: 0.014933847822248936,  f1_train:0.9788079377035477, f1_test: 0.5467255058178825]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:26:43 [Epochs 17: , loss: 0.009903491474688053,  f1_train:0.9805372607773785, f1_test: 0.5508320618659671]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:27:11 [Epochs 18: , loss: 0.008538507856428623,  f1_train:0.9808075229708823, f1_test: 0.5436553776391468]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:27:40 [Epochs 19: , loss: 0.006060297600924969,  f1_train:0.9832798747083804, f1_test: 0.5512229549189175]


  probs = F.softmax(score)  # [L']
  probs = F.softmax(score)  # [L']


03/23/2023 03:28:10 [Epochs 20: , loss: 0.005354269873350859,  f1_train:0.980759976253737, f1_test: 0.5381254114917422]


In [378]:
logits

tensor([[3.8777e-04, 4.1262e-05, 9.9952e-01, 3.6980e-05, 9.0855e-06, 8.7924e-06],
        [3.3682e-10, 9.6676e-12, 9.4629e-06, 7.3109e-10, 2.6886e-06, 9.9999e-01],
        [9.2900e-12, 5.1017e-14, 5.5641e-08, 1.5471e-11, 1.2859e-06, 1.0000e+00],
        ...,
        [1.0000e+00, 2.3056e-10, 1.7881e-07, 2.6601e-06, 3.5811e-07, 1.0478e-10],
        [1.0000e+00, 1.9890e-12, 3.0023e-09, 1.5138e-07, 1.3159e-08, 5.1179e-13],
        [1.0000e+00, 7.1199e-13, 7.7585e-10, 5.7225e-08, 6.6974e-09, 1.3705e-13]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [356]:
onehot_label

tensor([[0, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        ...,
        [0, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0]], device='cuda:0')

In [372]:
m = nn.Softmax(dim=1)
>>> input = torch.randn(2, 3)
>>> output = m(input)

In [374]:
output

tensor([[0.2959, 0.4747, 0.2294],
        [0.5376, 0.1552, 0.3072]])

In [371]:
softmax = nn.Softmax(dim=6)
softmax(logits)

IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 6)

In [351]:
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
# output = loss(input, target)

In [352]:
input

tensor([[ 1.4111,  1.2146, -0.0983, -0.6432,  1.5157],
        [ 0.5109,  0.1624,  0.2416, -1.2818, -0.1733],
        [ 0.3137, -0.8718,  0.8968,  1.9743, -1.1651]], requires_grad=True)

In [353]:
target

tensor([1, 2, 3])

In [354]:
torch.randn(3, 5).softmax(dim=1)

tensor([[0.0764, 0.0910, 0.4077, 0.2993, 0.1256],
        [0.0221, 0.0217, 0.1789, 0.0443, 0.7330],
        [0.1392, 0.4521, 0.1234, 0.0679, 0.2173]])

In [355]:
nn.CrossEntropyLoss()(input, torch.randn(3, 5).softmax(dim=1))

tensor(1.9045, grad_fn=<DivBackward1>)