In [18]:
pip list

Package                      Version
---------------------------- ---------
absl-py                      1.2.0
aiohttp                      3.8.1
aiosignal                    1.2.0
argon2-cffi                  21.1.0
astunparse                   1.6.3
async-timeout                4.0.2
attrs                        21.2.0
backcall                     0.2.0
bleach                       4.1.0
blis                         0.7.8
boto3                        1.21.39
botocore                     1.24.39
brotlipy                     0.7.0
cachetools                   5.2.0
cairocffi                    1.3.0
catalogue                    2.0.8
certifi                      2022.6.15
cffi                         1.15.0
chardet                      5.0.0
charset-normalizer           2.0.12
clean-text                   0.6.0
click                        8.1.2
convokit                     2.5.3
cryptography                 36.0.0
cycler                       0.11.0
cymem                        2.0.6


In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
from tqdm import tqdm
import torch.nn.functional as F
from gensim.models import Word2Vec
from sklearn import metrics
from sklearn.metrics import classification_report
import itertools
import random

In [20]:
device = 'cuda'

In [21]:
class BertModel(nn.Module):
    def __init__(self, voc_size:int=30000, seq_len: int=512, d_model: int=128, d_ff:int=3072, pad_idx: int=1,
                num_encoder: int=12, num_heads: int=12, n_layers=6, dropout: float=0.1):
        super(BertModel, self).__init__()
        self.pad_idx = pad_idx
        self.layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=d_model, nhead=8, batch_first=True) for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ1 = nn.Tanh()
        self.classifier = nn.Linear(d_model, 1)
        
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = input
        for layer in self.layers:
            output = layer(output)
        output = output.mean(dim=1, keepdim=True)
        h_pooled = self.activ1(self.fc(output)) # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled) # [batch_size, 2]
        return logits_clsf # [B, S, D_model]

In [22]:
bert = BertModel()

In [23]:
input = torch.FloatTensor([[0]*128]*3)

In [6]:
input = input.unsqueeze(dim=0)


In [7]:
input.shape

torch.Size([1, 3, 128])

In [8]:
bert(input)

tensor([[[0.1369]]], grad_fn=<ViewBackward0>)

In [24]:
def log_metrics(preds, labels):
    preds = torch.stack(preds)
    preds = preds.cpu().detach().numpy()
    labels = torch.stack(labels)
    labels = labels.cpu().detach().numpy()
    
    
    ## Method 2 using ravel()
    fpr_micro, tpr_micro, _ = metrics.roc_curve(labels.ravel(), preds.ravel())
    auc_micro = metrics.auc(fpr_micro, tpr_micro)
    n=10
    preds = [round(i) for i in preds.ravel()]
    acc = metrics.accuracy_score(labels.ravel(), preds)
    f1 = metrics.f1_score(labels.ravel(), preds)
    preds = [preds[i*n:(i+1)*n] for i in range(len(preds)//6)]
    #target_names = list(genre_list.keys())
    #print(classification_report(labels,preds, target_names=target_names))
    return {"auc_micro": auc_micro, "acc" : acc, "f1" : f1, 'pred' : preds}

In [25]:
def train_one_epoch(dataloader, model, optimizer, device, loss_fn):
    model.train()
    tk0 = tqdm(dataloader, total=len(dataloader))
    total_loss = 0.0
    
    for source_tensor, label in tk0:
        optimizer.zero_grad()
        output = model(source_tensor.to(device))
        loss = loss_fn(output.view(-1, output.size(-1)), torch.FloatTensor([label]).to(device))
        
        total_loss += loss
        loss.backward()
        optimizer.step()
        
    avg_train_loss = total_loss / len(dataloader) 
    print(" Average training loss: {0:.2f}".format(avg_train_loss))  

def test_one_epoch(testset, model, loss_fn):
    model.eval()
    tk0 = tqdm(testset, total=len(testset))
    targets = []
    outputs = []
    total_loss = 0.0
    with torch.no_grad():
        for source_tensor, label in tk0:
            label = torch.FloatTensor([label])
            output = model(source_tensor.to(device))
            loss = loss_fn(output.view(-1, output.size(-1)), label.to(device))
            total_loss += loss
            targets.extend(label)
            outputs.extend(torch.sigmoid(output[0]).to('cpu'))
    avg_valid_loss = total_loss / len(testset) 
    score = log_metrics(outputs, targets)
    print(" Average valid loss: {0:.2f}".format(avg_valid_loss))  
    print('AUC_SCORE: ', score["auc_micro"], " acc: ", score["acc"], "f1: ", score["f1"])
    return score, avg_valid_loss
    
def fit(model, train_dataloader, valid_dataloader=None,test_dataloader=None, EPOCHS=3, lr=0.000002):
    loss_fn = nn.BCEWithLogitsLoss() #ignore padding
    optimizer = torch.optim.AdamW(model.parameters(),lr=lr)
    for i in range(EPOCHS):
        print(f"EPOCHS:{i+1}")
        print('TRAIN')
        train_one_epoch(train_dataloader, model, optimizer, device, loss_fn)
        auc, loss = test_one_epoch(valid_dataloader,model, loss_fn)
        if i == 0:
            torch.save(model,'rnn_best.model')
            best_loss = loss
        if  loss<best_loss:
            best_loss = loss
            print(i,' EPOCH BEST MODEL!')
            torch.save(model,'rnn_best.model')
    best_model = torch.load('rnn_best.model')
    auc, loss  = test_one_epoch(test_dataloader,best_model, loss_fn)
    print('BEST MODEL')
    print('AUC_SCORE: ', auc["auc_micro"], " acc: ", auc["acc"], "f1: ", auc["f1"])
    return auc

In [26]:
from gensim.models import Word2Vec

In [27]:
import json

In [28]:
with open('incoherence.json', "r") as json_file:
    incoherence = json.load(json_file)

# 평균 임베딩 

In [215]:
import pickle
with open('emb_avg.pickle','rb') as fw:
    movie_emb = pickle.load(fw)

In [233]:
type(movie_emb[movie_name][21][0])

numpy.float64

In [234]:
correct_triple = []
wrong_triple = []
nomissing_double = []
missing_double = []
path1 = 'word2vec_model_noscene/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []
for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    if movie_name not in set(incoherence['correct_triple'].keys()):
        continue
    for scene in incoherence['correct_triple'][movie_name]:
        passs = 0
        embs = torch.zeros(1,3,128)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if str(movie_emb[movie_name][s-1][0]) != 'nan':
                embs[0][i] = torch.from_numpy(movie_emb[movie_name][s-1])
            else:
                passs = 1
                break
        if not passs:        
            correct_triple.append([embs,[0]])
    
    for scene in incoherence['wrong_triple'][movie_name]:
        passs = 0
        embs = torch.zeros(1,3,128)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if str(movie_emb[movie_name][s-1][0]) != 'nan':
                embs[0][i] = torch.from_numpy(movie_emb[movie_name][s-1])
            else:
                passs = 1
                break
        if not passs:     
            wrong_triple.append([embs,[1]])
    
    for scene in incoherence['nomissing_double'][movie_name]:
        passs = 0
        embs = torch.zeros(1,2,128)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if str(movie_emb[movie_name][s-1][0]) != 'nan':
                embs[0][i] = torch.from_numpy(movie_emb[movie_name][s-1])
            else:
                passs = 1
                break
        if not passs:     
            nomissing_double.append([embs,[0]])
        
    for scene in incoherence['missing_double'][movie_name]:
        passs = 0
        embs = torch.zeros(1,2,128)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if str(movie_emb[movie_name][s-1][0]) != 'nan':
                embs[0][i] = torch.from_numpy(movie_emb[movie_name][s-1])
            else:
                passs = 1
                break
        if not passs:     
            missing_double.append([embs,[1]])


In [235]:
print(len(correct_triple))
print(len(wrong_triple))
print(len(nomissing_double))
print(len(missing_double))

50264
47069
51224
51276


In [236]:
random.Random(4).shuffle(correct_triple)
random.Random(4).shuffle(wrong_triple)
train_data3 = correct_triple[:35000]+wrong_triple[:35000]
valid_data3 = correct_triple[35000:40000]+wrong_triple[35000:40000]
test_data3 = correct_triple[40000:]+wrong_triple[40000:]
random.Random(4).shuffle(train_data3)
random.Random(4).shuffle(valid_data3)
random.Random(4).shuffle(test_data3)

In [237]:
bert3 = BertModel().to('cuda')
score = fit(bert3, train_data3,valid_data3,test_data3,EPOCHS=20)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:45<00:00, 119.65it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1085.41it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5064958389880849  acc:  0.5138 f1:  0.31308279174908166
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:50<00:00, 107.55it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1022.62it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5196855259068243  acc:  0.518 f1:  0.3654555028962612
1  EPOCH BEST MODEL!
EPOCHS:3
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:15<00:00, 113.69it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1026.38it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5292789450077131  acc:  0.5239 f1:  0.4124398370973713
2  EPOCH BEST MODEL!
EPOCHS:4
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:36<00:00, 121.50it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1085.74it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5351959192932695  acc:  0.5271 f1:  0.4615734942502562
3  EPOCH BEST MODEL!
EPOCHS:5
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:38<00:00, 120.94it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1081.35it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5378436028990944  acc:  0.5269 f1:  0.47936612743479695
4  EPOCH BEST MODEL!
EPOCHS:6
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:33<00:00, 122.04it/s]


 Average training loss: 0.68


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1062.48it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5401773483938703  acc:  0.5266 f1:  0.48777320926206447
EPOCHS:7
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:34<00:00, 121.86it/s]


 Average training loss: 0.68


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1048.55it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5412045473236365  acc:  0.5265 f1:  0.4883846569421934
EPOCHS:8
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:22<00:00, 124.40it/s]


 Average training loss: 0.68


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1054.95it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5415936551253051  acc:  0.5289 f1:  0.4842911877394636
EPOCHS:9
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:24<00:00, 123.93it/s]


 Average training loss: 0.68


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1100.43it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5418706793701641  acc:  0.5319 f1:  0.48304803975704025
EPOCHS:10
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:19<00:00, 125.11it/s]


 Average training loss: 0.67


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1110.67it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5421407534730981  acc:  0.5311 f1:  0.4791736087970676
EPOCHS:11
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:10<00:00, 127.15it/s]


 Average training loss: 0.67


100%|███████████████████████████████████| 10000/10000 [00:08<00:00, 1127.10it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5429680607129147  acc:  0.5289 f1:  0.47168330155882027
EPOCHS:12
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:19<00:00, 125.04it/s]


 Average training loss: 0.67


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1105.45it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.543256982318879  acc:  0.5291 f1:  0.4739135292146128
EPOCHS:13
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:59<00:00, 106.15it/s]


 Average training loss: 0.66


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1030.89it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.544385549168769  acc:  0.5316 f1:  0.4781639928698752
EPOCHS:14
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:48<00:00, 108.00it/s]


 Average training loss: 0.66


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1042.77it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5452029619702254  acc:  0.5327 f1:  0.4733461061647695
EPOCHS:15
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:19<00:00, 112.96it/s]


 Average training loss: 0.66


100%|████████████████████████████████████| 10000/10000 [00:11<00:00, 842.97it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5454390064157221  acc:  0.5327 f1:  0.4726328856788173
EPOCHS:16
TRAIN


100%|████████████████████████████████████| 70000/70000 [11:12<00:00, 104.03it/s]


 Average training loss: 0.65


100%|████████████████████████████████████| 10000/10000 [00:10<00:00, 944.77it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5465671326226083  acc:  0.5329 f1:  0.46793484451532064
EPOCHS:17
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:38<00:00, 109.68it/s]


 Average training loss: 0.65


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1062.49it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5459255564089343  acc:  0.5309 f1:  0.45674580196873193
EPOCHS:18
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:49<00:00, 107.75it/s]


 Average training loss: 0.64


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1060.53it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.5471606386904025  acc:  0.5313 f1:  0.4678096968320654
EPOCHS:19
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:46<00:00, 108.30it/s]


 Average training loss: 0.64


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1035.74it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.5467447518116337  acc:  0.5322 f1:  0.4710538218000905
EPOCHS:20
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:46<00:00, 108.36it/s]


 Average training loss: 0.63


100%|████████████████████████████████████| 10000/10000 [00:12<00:00, 797.44it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.5472767481219295  acc:  0.5357 f1:  0.4671180993917135
EPOCHS:21
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:48<00:00, 107.88it/s]


 Average training loss: 0.63


100%|████████████████████████████████████| 10000/10000 [00:13<00:00, 741.39it/s]


 Average valid loss: 0.73
AUC_SCORE:  0.54658840366216  acc:  0.5344 f1:  0.4686144715818306
EPOCHS:22
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:40<00:00, 109.26it/s]


 Average training loss: 0.63


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1058.45it/s]


 Average valid loss: 0.73
AUC_SCORE:  0.5462756472755304  acc:  0.5335 f1:  0.4671616219303255
EPOCHS:23
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:58<00:00, 106.34it/s]


 Average training loss: 0.62


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1067.14it/s]


 Average valid loss: 0.73
AUC_SCORE:  0.5451962321498023  acc:  0.533 f1:  0.466773235898607
EPOCHS:24
TRAIN


100%|████████████████████████████████████| 70000/70000 [11:09<00:00, 104.52it/s]


 Average training loss: 0.62


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1057.56it/s]


 Average valid loss: 0.74
AUC_SCORE:  0.5450074767103148  acc:  0.5329 f1:  0.46525472238122495
EPOCHS:25
TRAIN


100%|████████████████████████████████████| 70000/70000 [11:27<00:00, 101.74it/s]


 Average training loss: 0.62


100%|████████████████████████████████████| 10000/10000 [00:11<00:00, 882.99it/s]


 Average valid loss: 0.74
AUC_SCORE:  0.5437200780867473  acc:  0.5327 f1:  0.47132028510012447
EPOCHS:26
TRAIN


100%|████████████████████████████████████| 70000/70000 [11:01<00:00, 105.80it/s]


 Average training loss: 0.61


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1077.52it/s]


 Average valid loss: 0.75
AUC_SCORE:  0.5424272715717684  acc:  0.5323 f1:  0.46333907056798623
EPOCHS:27
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:49<00:00, 107.73it/s]


 Average training loss: 0.61


100%|████████████████████████████████████| 10000/10000 [00:10<00:00, 999.29it/s]


 Average valid loss: 0.75
AUC_SCORE:  0.5414288145833926  acc:  0.5309 f1:  0.4740441753559816
EPOCHS:28
TRAIN


100%|████████████████████████████████████| 70000/70000 [11:05<00:00, 105.17it/s]


 Average training loss: 0.60


100%|████████████████████████████████████| 10000/10000 [00:10<00:00, 954.08it/s]


 Average valid loss: 0.75
AUC_SCORE:  0.540933291496283  acc:  0.5305 f1:  0.47664697358154046
EPOCHS:29
TRAIN


100%|████████████████████████████████████| 70000/70000 [11:16<00:00, 103.51it/s]


 Average training loss: 0.60


100%|████████████████████████████████████| 10000/10000 [00:10<00:00, 937.93it/s]


 Average valid loss: 0.76
AUC_SCORE:  0.540526798325188  acc:  0.5275 f1:  0.45064527380537145
EPOCHS:30
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:29<00:00, 111.22it/s]


 Average training loss: 0.60


100%|████████████████████████████████████| 10000/10000 [00:12<00:00, 789.97it/s]


 Average valid loss: 0.77
AUC_SCORE:  0.5396481962338724  acc:  0.5276 f1:  0.4542513863216266


100%|███████████████████████████████████| 17333/17333 [00:16<00:00, 1081.38it/s]

 Average valid loss: 0.69
AUC_SCORE:  0.5342401877851181  acc:  0.525760110771361 f1:  0.47888931152529485
BEST MODEL
AUC_SCORE:  0.5342401877851181  acc:  0.525760110771361 f1:  0.47888931152529485
{'auc_micro': 0.5342401877851181, 'acc': 0.525760110771361, 'f1': 0.47888931152529485, 'pred': [[1, 0, 0, 0, 1, 0, 1, 0, 1, 1], [0, 0, 1, 1, 0, 1, 1, 0, 0, 1], [0, 1, 1, 1, 0, 1, 0, 0, 1, 0], [1, 0, 0, 1, 0, 1, 0, 1, 0, 1], [0, 0, 0, 1, 1, 0, 0, 0, 1, 1], [0, 1, 1, 1, 1, 0, 0, 1, 0, 1], [1, 1, 1, 0, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 1, 1, 0, 1, 0, 1], [1, 0, 1, 0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 1, 0, 1, 0, 1, 0, 1], [1, 0, 0, 1, 0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 1, 1, 0, 0, 1], [1, 1, 0, 0, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 1, 0], [0, 0, 1, 1, 1, 1, 0, 0, 1, 0], [0, 0, 0, 0, 1, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 1, 1, 0], [0, 0, 1, 1, 1, 1, 0, 1, 0, 0], [1, 1, 1, 0, 1, 1, 1, 1, 0, 1], [1, 1, 1, 0, 1, 1, 1, 0, 1, 1], [




In [240]:
random.Random(4).shuffle(nomissing_double)
random.Random(4).shuffle(missing_double)
train_data4 = nomissing_double[:35000]+missing_double[:35000]
valid_data4 = nomissing_double[35000:40000]+missing_double[35000:40000]
test_data4 = nomissing_double[40000:]+missing_double[40000:]
random.Random(4).shuffle(train_data4)
random.Random(4).shuffle(valid_data4)
random.Random(4).shuffle(test_data4)

In [None]:
bert4 = BertModel().to('cuda')
score = fit(bert4, train_data4,valid_data4,test_data4,EPOCHS=20)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:34<00:00, 110.24it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:18<00:00, 539.26it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5121737243855724  acc:  0.5167 f1:  0.2526673882789547
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:35<00:00, 121.66it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1093.42it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5201349817707992  acc:  0.5193 f1:  0.376766498120057
1  EPOCH BEST MODEL!
EPOCHS:3
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:20<00:00, 124.94it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:08<00:00, 1111.74it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5277105162937965  acc:  0.5227 f1:  0.45332722483106175
2  EPOCH BEST MODEL!
EPOCHS:4
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:23<00:00, 124.18it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:08<00:00, 1116.36it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5330179010419165  acc:  0.5264 f1:  0.48677936714347636
3  EPOCH BEST MODEL!
EPOCHS:5
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:18<00:00, 125.37it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1056.89it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5374623866130412  acc:  0.5281 f1:  0.5019525065963061
4  EPOCH BEST MODEL!
EPOCHS:6
TRAIN


 32%|███████████▍                        | 22149/70000 [02:57<06:34, 121.29it/s]

# edge2vec

In [242]:
correct_triple = []
wrong_triple = []
nomissing_double = []
missing_double = []
path1 = 'word2vec_model_edge/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []
for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    if movie_name not in set(incoherence['correct_triple'].keys()):
        continue
    for scene in incoherence['correct_triple'][movie_name]:
        passs = 0
        embs = torch.zeros(1,3,128)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if str(movie_emb[movie_name][s-1][0]) != 'nan':
                embs[0][i] = torch.from_numpy(movie_emb[movie_name][s-1])
            else:
                passs = 1
                break
        if not passs:        
            correct_triple.append([embs,[0]])
    
    for scene in incoherence['wrong_triple'][movie_name]:
        passs = 0
        embs = torch.zeros(1,3,128)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if str(movie_emb[movie_name][s-1][0]) != 'nan':
                embs[0][i] = torch.from_numpy(movie_emb[movie_name][s-1])
            else:
                passs = 1
                break
        if not passs:     
            wrong_triple.append([embs,[1]])
    
    for scene in incoherence['nomissing_double'][movie_name]:
        passs = 0
        embs = torch.zeros(1,2,128)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if str(movie_emb[movie_name][s-1][0]) != 'nan':
                embs[0][i] = torch.from_numpy(movie_emb[movie_name][s-1])
            else:
                passs = 1
                break
        if not passs:     
            nomissing_double.append([embs,[0]])
        
    for scene in incoherence['missing_double'][movie_name]:
        passs = 0
        embs = torch.zeros(1,2,128)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if str(movie_emb[movie_name][s-1][0]) != 'nan':
                embs[0][i] = torch.from_numpy(movie_emb[movie_name][s-1])
            else:
                passs = 1
                break
        if not passs:     
            missing_double.append([embs,[1]])

In [243]:
random.Random(4).shuffle(correct_triple)
random.Random(4).shuffle(wrong_triple)
train_data5 = correct_triple[:35000]+wrong_triple[:35000]
valid_data5 = correct_triple[35000:40000]+wrong_triple[35000:40000]
test_data5 = correct_triple[40000:]+wrong_triple[40000:]
random.Random(4).shuffle(train_data5)
random.Random(4).shuffle(valid_data5)
random.Random(4).shuffle(test_data5)

In [245]:
bert5 = BertModel().to('cuda')
score = fit(bert5, train_data5,valid_data5,test_data5,EPOCHS=20)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 70000/70000 [10:02<00:00, 116.14it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:13<00:00, 745.65it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5016191676268636  acc:  0.5135 f1:  0.008963128946832348
EPOCHS:2
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:07<00:00, 77.15it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:15<00:00, 650.76it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5098184680385229  acc:  0.5137 f1:  0.0749476887958912
1  EPOCH BEST MODEL!
EPOCHS:3
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:11<00:00, 76.81it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:23<00:00, 421.30it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.518914841838824  acc:  0.5131 f1:  0.19799044638445065
2  EPOCH BEST MODEL!
EPOCHS:4
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:12<00:00, 76.67it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:13<00:00, 720.39it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5298673717286935  acc:  0.5196 f1:  0.3049768518518518
3  EPOCH BEST MODEL!
EPOCHS:5
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:05<00:00, 77.30it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:13<00:00, 738.10it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5345315008307205  acc:  0.5227 f1:  0.37681159420289856
4  EPOCH BEST MODEL!
EPOCHS:6
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:07<00:00, 77.12it/s]


 Average training loss: 0.68


100%|████████████████████████████████████| 10000/10000 [00:14<00:00, 705.41it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5396675051868247  acc:  0.5273 f1:  0.4099363375358881
5  EPOCH BEST MODEL!
EPOCHS:7
TRAIN


100%|█████████████████████████████████████| 70000/70000 [14:37<00:00, 79.80it/s]


 Average training loss: 0.68


100%|████████████████████████████████████| 10000/10000 [00:11<00:00, 857.93it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5425480158841517  acc:  0.5291 f1:  0.43503299340131973
EPOCHS:8
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:15<00:00, 76.46it/s]


 Average training loss: 0.68


100%|████████████████████████████████████| 10000/10000 [00:11<00:00, 884.38it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5443307535884837  acc:  0.5285 f1:  0.45459803354540196
EPOCHS:9
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:30<00:00, 75.26it/s]


 Average training loss: 0.68


100%|████████████████████████████████████| 10000/10000 [00:12<00:00, 784.77it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.54666452950044  acc:  0.5326 f1:  0.4620165745856353
EPOCHS:10
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:24<00:00, 75.74it/s]


 Average training loss: 0.67


100%|████████████████████████████████████| 10000/10000 [00:10<00:00, 913.05it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.547808479215011  acc:  0.5316 f1:  0.47909252669039143
EPOCHS:11
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:38<00:00, 74.55it/s]


 Average training loss: 0.67


100%|████████████████████████████████████| 10000/10000 [00:14<00:00, 684.39it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5476339003829405  acc:  0.5304 f1:  0.4782222222222222
EPOCHS:12
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:53<00:00, 73.38it/s]


 Average training loss: 0.67


100%|████████████████████████████████████| 10000/10000 [00:12<00:00, 806.57it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5486555328259245  acc:  0.5332 f1:  0.5002141327623126
EPOCHS:13
TRAIN


100%|████████████████████████████████████| 70000/70000 [11:26<00:00, 101.93it/s]


 Average training loss: 0.66


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1070.16it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5503726183209936  acc:  0.535 f1:  0.49598959462388903
EPOCHS:14
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:13<00:00, 126.41it/s]


 Average training loss: 0.66


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1062.87it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5501334481232856  acc:  0.5359 f1:  0.5096671949286846
EPOCHS:15
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:12<00:00, 126.65it/s]


 Average training loss: 0.66


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1075.78it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5509794408905738  acc:  0.5346 f1:  0.509692372524231
EPOCHS:16
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:14<00:00, 126.17it/s]


 Average training loss: 0.65


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1017.71it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5508778200776385  acc:  0.5363 f1:  0.5124592577016086
EPOCHS:17
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:13<00:00, 126.55it/s]


 Average training loss: 0.65


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1056.82it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5493169388024132  acc:  0.5342 f1:  0.5216676935715753
EPOCHS:18
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:17<00:00, 125.45it/s]


 Average training loss: 0.64


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1074.90it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5506466562469138  acc:  0.5315 f1:  0.528054800040294
EPOCHS:19
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:18<00:00, 125.24it/s]


 Average training loss: 0.64


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1094.46it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.5495035271849585  acc:  0.5331 f1:  0.514909090909091
EPOCHS:20
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:16<00:00, 125.68it/s]


 Average training loss: 0.64


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1066.71it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.5494412576657461  acc:  0.535 f1:  0.5303978994142596


100%|███████████████████████████████████| 17333/17333 [00:16<00:00, 1076.47it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5326423205174561  acc:  0.5244908555933768 f1:  0.40232052211747643
BEST MODEL
AUC_SCORE:  0.5326423205174561  acc:  0.5244908555933768 f1:  0.40232052211747643
{'auc_micro': 0.5326423205174561, 'acc': 0.5244908555933768, 'f1': 0.40232052211747643, 'pred': [[0, 0, 1, 0, 0, 0, 1, 1, 1, 0], [0, 0, 1, 1, 0, 1, 0, 0, 1, 0], [0, 1, 1, 1, 0, 1, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 1, 0, 1, 0], [0, 0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 1, 0], [1, 0, 0, 0, 0, 1, 1, 0, 0, 1], [0, 1, 0, 0, 0, 0, 1, 1, 0, 0], [1, 0, 1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 1], [0, 1, 1, 1, 0, 0, 0, 1, 1, 0], [1, 0, 1, 1, 1, 1, 0, 0, 1, 0], [0, 0, 0, 1, 0, 1, 1, 1, 0, 0], [0, 0, 1, 0, 1, 0, 0, 0, 1, 1], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 1, 0, 0, 1, 0, 0, 0, 1], [1, 1, 1, 1, 1, 0, 0, 1, 0, 1], [0, 0, 0, 0, 0, 1, 0, 1, 0, 0], [1, 0, 1, 0, 0, 1, 0, 0, 0, 1], [1, 0, 0, 0, 1, 1, 1, 1, 0, 0]

In [246]:
random.Random(4).shuffle(nomissing_double)
random.Random(4).shuffle(missing_double)
train_data6 = nomissing_double[:35000]+missing_double[:35000]
valid_data6 = nomissing_double[35000:40000]+missing_double[35000:40000]
test_data6 = nomissing_double[40000:]+missing_double[40000:]
random.Random(4).shuffle(train_data6)
random.Random(4).shuffle(valid_data6)
random.Random(4).shuffle(test_data6)

In [247]:
bert6 = BertModel().to('cuda')
score = fit(bert6, train_data6,valid_data6,test_data6,EPOCHS=20)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:21<00:00, 124.70it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1061.48it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5040190761301017  acc:  0.5136 f1:  0.018563357546408393
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:15<00:00, 126.04it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1071.88it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5128326650485532  acc:  0.514 f1:  0.07639680729760548
1  EPOCH BEST MODEL!
EPOCHS:3
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:13<00:00, 126.50it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1082.11it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5184970295778014  acc:  0.5129 f1:  0.1939434055932484
2  EPOCH BEST MODEL!
EPOCHS:4
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:14<00:00, 126.22it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1049.32it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5217668698856078  acc:  0.5172 f1:  0.3071182548794489
EPOCHS:5
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:11<00:00, 126.81it/s]


 Average training loss: 0.69


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1091.16it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5271760915150164  acc:  0.5229 f1:  0.37049742710120076
EPOCHS:6
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:11<00:00, 126.92it/s]


 Average training loss: 0.68


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1086.65it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5308154456750186  acc:  0.5221 f1:  0.40581872435658334
EPOCHS:7
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:10<00:00, 127.13it/s]


 Average training loss: 0.68


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1036.23it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5348960507353868  acc:  0.528 f1:  0.4300893503984545
EPOCHS:8
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:14<00:00, 126.31it/s]


 Average training loss: 0.68


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1081.16it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5371105718111271  acc:  0.5295 f1:  0.4482232907235839
EPOCHS:9
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:15<00:00, 125.95it/s]


 Average training loss: 0.68


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1072.16it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5394680465693538  acc:  0.5336 f1:  0.4735891647855531
EPOCHS:10
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:17<00:00, 125.48it/s]


 Average training loss: 0.67


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1093.50it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5421957357368874  acc:  0.5337 f1:  0.468967088030976
EPOCHS:11
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:15<00:00, 126.12it/s]


 Average training loss: 0.67


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1053.24it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5427072625234491  acc:  0.5361 f1:  0.48208105392430506
EPOCHS:12
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:09<00:00, 127.32it/s]


 Average training loss: 0.67


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1089.94it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5456195384817822  acc:  0.5383 f1:  0.4879671731174448
EPOCHS:13
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:13<00:00, 126.57it/s]


 Average training loss: 0.66


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1059.59it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5471528378227501  acc:  0.5387 f1:  0.49711108688542466
EPOCHS:14
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:14<00:00, 126.15it/s]


 Average training loss: 0.66


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1060.53it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5480413844305546  acc:  0.5367 f1:  0.49238523063438155
EPOCHS:15
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:17<00:00, 125.58it/s]


 Average training loss: 0.66


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1086.51it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5494638156047216  acc:  0.5353 f1:  0.4936253677672442
EPOCHS:16
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:09<00:00, 127.44it/s]


 Average training loss: 0.65


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1071.55it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5497860318439636  acc:  0.5358 f1:  0.4916776171703898
EPOCHS:17
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:14<00:00, 126.35it/s]


 Average training loss: 0.65


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1044.16it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5509914904728836  acc:  0.5382 f1:  0.504612744046342
EPOCHS:18
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:16<00:00, 125.83it/s]


 Average training loss: 0.65


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1066.84it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5513741948146844  acc:  0.5381 f1:  0.5092956549452884
EPOCHS:19
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:15<00:00, 125.93it/s]


 Average training loss: 0.64


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1047.52it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5504967370251519  acc:  0.5375 f1:  0.5147413702654495
EPOCHS:20
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:14<00:00, 126.24it/s]


 Average training loss: 0.64


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1093.86it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.55048328632862  acc:  0.5415 f1:  0.5211488250652742


100%|███████████████████████████████████| 10000/10000 [00:08<00:00, 1122.66it/s]

 Average valid loss: 0.69
AUC_SCORE:  0.5183098294167306  acc:  0.5163 f1:  0.18910310142497902
BEST MODEL
AUC_SCORE:  0.5183098294167306  acc:  0.5163 f1:  0.18910310142497902
{'auc_micro': 0.5183098294167306, 'acc': 0.5163, 'f1': 0.18910310142497902, 'pred': [[0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1, 1, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0




# dim 384

In [29]:
correct_triple = []
wrong_triple = []
nomissing_double = []
missing_double = []
path1 = 'word2vec_model_char/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []
for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    if movie_name not in set(incoherence['correct_triple'].keys()):
        continue
    for scene in incoherence['correct_triple'][movie_name]:
        passs = 0
        embs = torch.zeros(1,3,384)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:        
            correct_triple.append([embs,[0]])
    
    for scene in incoherence['wrong_triple'][movie_name]:
        passs = 0
        embs = torch.zeros(1,3,384)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:     
            wrong_triple.append([embs,[1]])
    
    for scene in incoherence['nomissing_double'][movie_name]:
        passs = 0
        embs = torch.zeros(1,2,384)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:     
            nomissing_double.append([embs,[0]])
        
    for scene in incoherence['missing_double'][movie_name]:
        passs = 0
        embs = torch.zeros(1,2,384)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:     
            missing_double.append([embs,[1]])


In [30]:
random.Random(4).shuffle(correct_triple)
random.Random(4).shuffle(wrong_triple)
train_data7 = correct_triple[:35000]+wrong_triple[:35000]
valid_data7 = correct_triple[35000:40000]+wrong_triple[35000:40000]
test_data7 = correct_triple[40000:]+wrong_triple[40000:]
random.Random(4).shuffle(train_data7)
random.Random(4).shuffle(valid_data7)
random.Random(4).shuffle(test_data7)

In [16]:
bert7 = BertModel(d_model=384).to('cuda')
score = fit(bert7, train_data7,valid_data7,test_data7,EPOCHS=10)
print(score)

EPOCHS:1
TRAIN


100%|█████████████████████████████████████| 70000/70000 [14:33<00:00, 80.09it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:11<00:00, 871.56it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.53696652  acc:  0.5235 f1:  0.5884080504448476
EPOCHS:2
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:10<00:00, 76.90it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:14<00:00, 687.05it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5427696399999999  acc:  0.5309 f1:  0.5988198067219703
1  EPOCH BEST MODEL!
EPOCHS:3
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:21<00:00, 75.97it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:15<00:00, 666.17it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.54316388  acc:  0.5322 f1:  0.6028862478777589
EPOCHS:4
TRAIN


100%|█████████████████████████████████████| 70000/70000 [14:35<00:00, 79.96it/s]


 Average training loss: 0.68


100%|████████████████████████████████████| 10000/10000 [00:13<00:00, 759.51it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.54088134  acc:  0.5345 f1:  0.5951117682873793
EPOCHS:5
TRAIN


100%|█████████████████████████████████████| 70000/70000 [14:22<00:00, 81.16it/s]


 Average training loss: 0.67


100%|████████████████████████████████████| 10000/10000 [00:13<00:00, 752.71it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.54047466  acc:  0.5321 f1:  0.5938015452730272
EPOCHS:6
TRAIN


100%|█████████████████████████████████████| 70000/70000 [14:50<00:00, 78.64it/s]


 Average training loss: 0.67


100%|████████████████████████████████████| 10000/10000 [00:11<00:00, 848.63it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.53899362  acc:  0.5306 f1:  0.5899720475192173
EPOCHS:7
TRAIN


100%|█████████████████████████████████████| 70000/70000 [14:51<00:00, 78.49it/s]


 Average training loss: 0.65


100%|████████████████████████████████████| 10000/10000 [00:12<00:00, 817.75it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.53668214  acc:  0.5262 f1:  0.5869944211994421
EPOCHS:8
TRAIN


 80%|█████████████████████████████▌       | 55882/70000 [11:56<03:00, 78.01it/s]


KeyboardInterrupt: 

In [14]:
random.Random(4).shuffle(nomissing_double)
random.Random(4).shuffle(missing_double)
train_data8 = nomissing_double[:35000]+missing_double[:35000]
valid_data8 = nomissing_double[35000:40000]+missing_double[35000:40000]
test_data8 = nomissing_double[40000:]+missing_double[40000:]
random.Random(4).shuffle(train_data8)
random.Random(4).shuffle(valid_data8)
random.Random(4).shuffle(test_data8)

In [15]:
bert8 = BertModel(d_model=384).to('cuda')
score = fit(bert8, train_data8,valid_data8,test_data8,EPOCHS=30)
print(score)

EPOCHS:1
TRAIN


100%|█████████████████████████████████████| 70000/70000 [14:25<00:00, 80.84it/s]


 Average training loss: 0.70


100%|████████████████████████████████████| 10000/10000 [00:12<00:00, 800.77it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.52849848  acc:  0.5203 f1:  0.5828332898512915
EPOCHS:2
TRAIN


  2%|▊                                     | 1506/70000 [00:19<15:01, 75.95it/s]


KeyboardInterrupt: 

# extra+emb

In [41]:
correct_triple = []
wrong_triple = []
nomissing_double = []
missing_double = []
path1 = 'word2vec_model_extra+emb/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []
for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    if movie_name not in set(incoherence['correct_triple'].keys()):
        continue
    for scene in incoherence['correct_triple'][movie_name]:
        passs = 0
        embs = torch.zeros(1,3,384)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:        
            correct_triple.append([embs,[0]])
    
    for scene in incoherence['wrong_triple'][movie_name]:
        passs = 0
        embs = torch.zeros(1,3,384)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:     
            wrong_triple.append([embs,[1]])
    
    for scene in incoherence['nomissing_double'][movie_name]:
        passs = 0
        embs = torch.zeros(1,2,384)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:     
            nomissing_double.append([embs,[0]])
        
    for scene in incoherence['missing_double'][movie_name]:
        passs = 0
        embs = torch.zeros(1,2,384)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:     
            missing_double.append([embs,[1]])


In [42]:
random.Random(4).shuffle(correct_triple)
random.Random(4).shuffle(wrong_triple)
train_data7 = correct_triple[:35000]+wrong_triple[:35000]
valid_data7 = correct_triple[35000:40000]+wrong_triple[35000:40000]
test_data7 = correct_triple[40000:]+wrong_triple[40000:]
random.Random(4).shuffle(train_data7)
random.Random(4).shuffle(valid_data7)
random.Random(4).shuffle(test_data7)

In [43]:
bert7 = BertModel(d_model=384).to('cuda')
score = fit(bert7, train_data7,valid_data7,test_data7,EPOCHS=10)
print(score)

EPOCHS:1
TRAIN


100%|█████████████████████████████████████| 70000/70000 [15:48<00:00, 73.80it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:13<00:00, 717.83it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5215202400000001  acc:  0.5151 f1:  0.5915943737892698
EPOCHS:2
TRAIN


100%|█████████████████████████████████████| 70000/70000 [14:38<00:00, 79.64it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:12<00:00, 814.17it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.53118524  acc:  0.5183 f1:  0.5816760746851932
EPOCHS:3
TRAIN


100%|█████████████████████████████████████| 70000/70000 [14:50<00:00, 78.62it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:13<00:00, 736.66it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5336234599999999  acc:  0.5235 f1:  0.580139219314477
EPOCHS:4
TRAIN


100%|█████████████████████████████████████| 70000/70000 [11:52<00:00, 98.31it/s]


 Average training loss: 0.68


100%|████████████████████████████████████| 10000/10000 [00:10<00:00, 955.77it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.5352274800000001  acc:  0.5222 f1:  0.588103448275862
EPOCHS:5
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:24<00:00, 124.10it/s]


 Average training loss: 0.67


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1013.79it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.53554054  acc:  0.5256 f1:  0.593278463648834
EPOCHS:6
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:24<00:00, 124.07it/s]


 Average training loss: 0.66


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1022.36it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.53529682  acc:  0.5232 f1:  0.5827791389569479
EPOCHS:7
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:22<00:00, 124.36it/s]


 Average training loss: 0.65


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1034.75it/s]


 Average valid loss: 0.73
AUC_SCORE:  0.53262508  acc:  0.5214 f1:  0.5821547057796403
EPOCHS:8
TRAIN


100%|████████████████████████████████████| 70000/70000 [09:23<00:00, 124.20it/s]


 Average training loss: 0.64


100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1019.07it/s]


 Average valid loss: 0.75
AUC_SCORE:  0.52940992  acc:  0.5165 f1:  0.5612920787587334
EPOCHS:9
TRAIN


  1%|▌                                    | 1046/70000 [00:08<09:39, 118.97it/s]


KeyboardInterrupt: 

In [None]:
random.Random(4).shuffle(nomissing_double)
random.Random(4).shuffle(missing_double)
train_data8 = nomissing_double[:35000]+missing_double[:35000]
valid_data8 = nomissing_double[35000:40000]+missing_double[35000:40000]
test_data8 = nomissing_double[40000:]+missing_double[40000:]
random.Random(4).shuffle(train_data8)
random.Random(4).shuffle(valid_data8)
random.Random(4).shuffle(test_data8)

In [None]:
bert8 = BertModel(d_model=384).to('cuda')
score = fit(bert8, train_data8,valid_data8,test_data8,EPOCHS=10)
print(score)

# 최종

In [44]:
correct_triple = []
wrong_triple = []
nomissing_double = []
missing_double = []
path1 = 'word2vec_model_extra+pos/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []
for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    if movie_name not in set(incoherence['correct_triple'].keys()):
        continue
    for scene in incoherence['correct_triple'][movie_name]:
        passs = 0
        embs = torch.zeros(1,3,400)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:        
            correct_triple.append([embs,[0]])
    
    for scene in incoherence['wrong_triple'][movie_name]:
        passs = 0
        embs = torch.zeros(1,3,400)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:     
            wrong_triple.append([embs,[1]])
    
    for scene in incoherence['nomissing_double'][movie_name]:
        passs = 0
        embs = torch.zeros(1,2,400)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:     
            nomissing_double.append([embs,[0]])
        
    for scene in incoherence['missing_double'][movie_name]:
        passs = 0
        embs = torch.zeros(1,2,400)
        for i,s in enumerate(scene):
            emb_name = base + str(s)
            if emb_name in w2v.wv.index_to_key:
                embs[0][i] = torch.from_numpy(w2v.wv[emb_name])
            else:
                passs = 1
                break
        if not passs:     
            missing_double.append([embs,[1]])


In [45]:
random.Random(4).shuffle(correct_triple)
random.Random(4).shuffle(wrong_triple)
train_data10 = correct_triple[:35000]+wrong_triple[:35000]
valid_data10 = correct_triple[35000:40000]+wrong_triple[35000:40000]
test_data10 = correct_triple[40000:]+wrong_triple[40000:]
random.Random(4).shuffle(train_data10)
random.Random(4).shuffle(valid_data10)
random.Random(4).shuffle(test_data10)

In [46]:
bert10 = BertModel(d_model=400).to('cuda')
score = fit(bert10, train_data10,valid_data10,test_data10,EPOCHS=10)
print(score)

EPOCHS:1
TRAIN


100%|█████████████████████████████████████| 70000/70000 [12:01<00:00, 96.98it/s]


 Average training loss: 0.69


100%|████████████████████████████████████| 10000/10000 [00:10<00:00, 936.27it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.5843455  acc:  0.5683 f1:  0.6439000247463499
EPOCHS:2
TRAIN


100%|█████████████████████████████████████| 70000/70000 [12:06<00:00, 96.29it/s]


 Average training loss: 0.68


100%|████████████████████████████████████| 10000/10000 [00:10<00:00, 939.02it/s]


 Average valid loss: 0.67
AUC_SCORE:  0.62601156  acc:  0.5931 f1:  0.6373763479190803
1  EPOCH BEST MODEL!
EPOCHS:3
TRAIN


100%|█████████████████████████████████████| 70000/70000 [13:06<00:00, 89.03it/s]


 Average training loss: 0.65


100%|████████████████████████████████████| 10000/10000 [00:16<00:00, 621.01it/s]


 Average valid loss: 0.65
AUC_SCORE:  0.6671278799999999  acc:  0.6149 f1:  0.6393181605319846
2  EPOCH BEST MODEL!
EPOCHS:4
TRAIN


100%|█████████████████████████████████████| 70000/70000 [14:23<00:00, 81.04it/s]


 Average training loss: 0.62


100%|████████████████████████████████████| 10000/10000 [00:17<00:00, 580.22it/s]


 Average valid loss: 0.63
AUC_SCORE:  0.69419812  acc:  0.6349 f1:  0.6485029363627611
3  EPOCH BEST MODEL!
EPOCHS:5
TRAIN


100%|█████████████████████████████████████| 70000/70000 [13:01<00:00, 89.53it/s]


 Average training loss: 0.59


100%|████████████████████████████████████| 10000/10000 [00:10<00:00, 940.45it/s]


 Average valid loss: 0.62
AUC_SCORE:  0.7139061000000001  acc:  0.6463 f1:  0.6617576742851679
4  EPOCH BEST MODEL!
EPOCHS:6
TRAIN


100%|█████████████████████████████████████| 70000/70000 [12:05<00:00, 96.52it/s]


 Average training loss: 0.56


100%|████████████████████████████████████| 10000/10000 [00:10<00:00, 939.55it/s]


 Average valid loss: 0.61
AUC_SCORE:  0.72311562  acc:  0.6503 f1:  0.6594605122212485
5  EPOCH BEST MODEL!
EPOCHS:7
TRAIN


100%|█████████████████████████████████████| 70000/70000 [13:02<00:00, 89.49it/s]


 Average training loss: 0.54


100%|████████████████████████████████████| 10000/10000 [00:12<00:00, 779.84it/s]


 Average valid loss: 0.61
AUC_SCORE:  0.73073992  acc:  0.6558 f1:  0.6689111196614083
EPOCHS:8
TRAIN


100%|█████████████████████████████████████| 70000/70000 [12:22<00:00, 94.24it/s]


 Average training loss: 0.52


100%|████████████████████████████████████| 10000/10000 [00:11<00:00, 877.00it/s]


 Average valid loss: 0.62
AUC_SCORE:  0.7317544799999999  acc:  0.6554 f1:  0.670176110260337
EPOCHS:9
TRAIN


100%|█████████████████████████████████████| 70000/70000 [12:19<00:00, 94.71it/s]


 Average training loss: 0.49


100%|████████████████████████████████████| 10000/10000 [00:20<00:00, 478.29it/s]


 Average valid loss: 0.63
AUC_SCORE:  0.7341675799999999  acc:  0.655 f1:  0.6616320125539428
EPOCHS:10
TRAIN


100%|█████████████████████████████████████| 70000/70000 [12:15<00:00, 95.21it/s]


 Average training loss: 0.47


100%|████████████████████████████████████| 10000/10000 [00:11<00:00, 907.59it/s]


 Average valid loss: 0.65
AUC_SCORE:  0.73319422  acc:  0.6532 f1:  0.6747936984246062


100%|██████████████████████████████████████| 7118/7118 [00:08<00:00, 802.24it/s]


 Average valid loss: 0.63
AUC_SCORE:  0.7107552394944355  acc:  0.6174487215509975 f1:  0.3142785192646689
BEST MODEL
AUC_SCORE:  0.7107552394944355  acc:  0.6174487215509975 f1:  0.3142785192646689
{'auc_micro': 0.7107552394944355, 'acc': 0.6174487215509975, 'f1': 0.3142785192646689, 'pred': [[0, 1, 0, 1, 0, 0, 1, 1, 1, 1], [1, 1, 0, 0, 0, 0, 1, 0, 0, 1], [1, 1, 0, 1, 0, 0, 0, 1, 1, 1], [1, 0, 1, 1, 0, 0, 0, 1, 0, 0], [0, 1, 1, 1, 0, 1, 1, 1, 0, 1], [1, 0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 1, 0, 0], [0, 1, 0, 0, 1, 0, 0, 0, 0, 1], [1, 0, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 1, 1, 1, 0, 0, 1, 1], [1, 0, 0, 0, 0, 1, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 1, 0, 0, 1, 0], [1, 1, 1, 1, 1, 1, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1, 1, 1, 0, 1], [1, 0, 0, 1, 1, 0, 0, 0, 0, 1], [0, 1, 0, 0, 0, 0, 0, 1, 0, 0], [1, 1, 1, 0, 1, 1, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1, 0, 1, 1, 1], [0, 1, 0, 0, 0, 0, 1, 1, 0, 0], [0, 1, 0, 1, 1, 0, 0, 0, 0, 0], [

In [51]:
random.Random(4).shuffle(nomissing_double)
random.Random(4).shuffle(missing_double)
train_data8 = nomissing_double[:35000]+missing_double[:35000]
valid_data8 = nomissing_double[35000:40000]+missing_double[35000:40000]
test_data8 = nomissing_double[40000:]+missing_double[40000:]
random.Random(4).shuffle(train_data8)
random.Random(4).shuffle(valid_data8)
random.Random(4).shuffle(test_data8)

In [53]:
bert8 = BertModel(d_model=400).to('cuda')
score = fit(bert8, train_data8,valid_data8,test_data8,EPOCHS=10)
print(score)

EPOCHS:1
TRAIN


 13%|█████                                 | 9421/70000 [02:07<13:37, 74.10it/s]


KeyboardInterrupt: 