In [55]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
from tqdm import tqdm
import torch.nn.functional as F
from gensim.models import Word2Vec
from sklearn import metrics
from sklearn.metrics import classification_report
import itertools
import random

In [56]:
device = 'cuda'

In [57]:
class BertModel(nn.Module):
    def __init__(self, voc_size:int=30000, seq_len: int=512, d_model: int=384, d_ff:int=3072, pad_idx: int=1,
                num_encoder: int=12, num_heads: int=12, n_layers=6, dropout: float=0.1):
        super(BertModel, self).__init__()
        self.pad_idx = pad_idx
        self.layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=d_model, nhead=8, batch_first=True) for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ1 = nn.Tanh()
        self.classifier = nn.Linear(d_model, 1)
        
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = input
        for layer in self.layers:
            output = layer(output)
        output = output[0][1].unsqueeze(0)
        h_pooled = self.activ1(self.fc(output)) # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled) # [batch_size, 2]
        return logits_clsf # [B, S, D_model]

In [58]:
bert = BertModel()

In [5]:
input = torch.FloatTensor([[0]*128]*3)

In [6]:
input = input.unsqueeze(dim=0)


In [7]:
input.shape

torch.Size([1, 3, 128])

In [9]:
bert(input)

AssertionError: was expecting embedding dimension of 384, but got 128

In [59]:
def log_metrics(preds, labels):
    preds = torch.stack(preds)
    preds = preds.cpu().detach().numpy()
    labels = torch.stack(labels)
    labels = labels.cpu().detach().numpy()
    
    
    ## Method 2 using ravel()
    fpr_micro, tpr_micro, _ = metrics.roc_curve(labels.ravel(), preds.ravel())
    auc_micro = metrics.auc(fpr_micro, tpr_micro)
    n=10
    preds = [round(i) for i in preds.ravel()]
    acc = metrics.accuracy_score(labels.ravel(), preds)
    f1 = metrics.f1_score(labels.ravel(), preds)
    preds = [preds[i*n:(i+1)*n] for i in range(len(preds)//6)]
    #target_names = list(genre_list.keys())
    #print(classification_report(labels,preds, target_names=target_names))
    return {"auc_micro": auc_micro, "acc" : acc, "f1" : f1, 'pred' : preds}

In [60]:
def train_one_epoch(dataloader, model, optimizer, device, loss_fn):
    model.train()
    tk0 = tqdm(dataloader, total=len(dataloader))
    total_loss = 0.0
    
    for source_tensor, label in tk0:
        optimizer.zero_grad()
        output = model(source_tensor.to(device))
        loss = loss_fn(output.view(-1, output.size(-1)), torch.FloatTensor([label]).to(device))
        
        total_loss += loss
        loss.backward()
        optimizer.step()
        
    avg_train_loss = total_loss / len(dataloader) 
    print(" Average training loss: {0:.2f}".format(avg_train_loss))  

def test_one_epoch(testset, model, loss_fn):
    model.eval()
    tk0 = tqdm(testset, total=len(testset))
    targets = []
    outputs = []
    total_loss = 0.0
    with torch.no_grad():
        for source_tensor, label in tk0:
            label = torch.FloatTensor([label])
            output = model(source_tensor.to(device))
            loss = loss_fn(output.view(-1, output.size(-1)), label.to(device))
            total_loss += loss
            targets.extend(label)
            outputs.extend(torch.sigmoid(output[0]).to('cpu'))
    avg_valid_loss = total_loss / len(testset) 
    score = log_metrics(outputs, targets)
    print(" Average valid loss: {0:.2f}".format(avg_valid_loss))  
    print('AUC_SCORE: ', score["auc_micro"], " acc: ", score["acc"], "f1: ", score["f1"])
    return score, avg_valid_loss
    
def fit(model, train_dataloader, valid_dataloader=None,test_dataloader=None, EPOCHS=3, lr=0.000001):
    loss_fn = nn.BCEWithLogitsLoss() #ignore padding
    optimizer = torch.optim.AdamW(model.parameters(),lr=lr)
    for i in range(EPOCHS):
        print(f"EPOCHS:{i+1}")
        print('TRAIN')
        train_one_epoch(train_dataloader, model, optimizer, device, loss_fn)
        auc, loss = test_one_epoch(valid_dataloader,model, loss_fn)
        test_one_epoch(test_dataloader,model, loss_fn)
        if i == 0:
            torch.save(model,'rnn_best.model')
            best_loss = loss
        if  loss<best_loss:
            best_loss = loss
            print(i,' EPOCH BEST MODEL!')
            torch.save(model,'rnn_best.model')
    best_model = torch.load('rnn_best.model')
    auc, loss  = test_one_epoch(test_dataloader,best_model, loss_fn)
    print('BEST MODEL')
    print('AUC_SCORE: ', auc["auc_micro"], " acc: ", auc["acc"], "f1: ", auc["f1"])
    return auc

In [61]:
from gensim.models import Word2Vec

In [62]:
import json

In [63]:
with open('protagonist.json', "r") as json_file:
    char_123 = json.load(json_file)

# Text Rank

In [72]:
mov_pro1 = []
mov_pro2 = []
mov_pro3 = []
mov_pro1_no = []
mov_pro2_no = []
mov_pro3_no = []
path1 = 'word2vec_model_text3/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []
for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    max_scene = 0
    for node in w2v.wv.index_to_key:
        if base in node:
            scene_num += 1
            if int(node[6:])>max_scene:
                max_scene = int(node[6:])
    embs = torch.zeros(1,scene_num,384)
    k = 0
    for i in range(1,max_scene+1):
        emb_name = base + str(i)
        if emb_name in w2v.wv.index_to_key:
            emb = w2v.wv[emb_name]
            emb = torch.from_numpy(emb)
            embs[0][k] = emb
            k += 1

    mov_mean = torch.mean(embs, dim=1)
    
    
    
    if movie_name not in set(char_123.keys()):
        continue        
        
    for scene in char_123[movie_name]['mov_pro1']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro1.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro1_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro1_no.append([embs,[0]])    

    for scene in char_123[movie_name]['mov_pro2']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro2_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2_no.append([embs,[0]])  

    for scene in char_123[movie_name]['mov_pro3']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro3_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3_no.append([embs,[0]])  

In [73]:
print(len(mov_pro1))
print(len(mov_pro2))
print(len(mov_pro3))
print(len(mov_pro1_no))
print(len(mov_pro2_no))
print(len(mov_pro3_no))

95014
61247
43559
48420
82187
99875


In [74]:
#random.Random(4).shuffle(mov_pro1)
#random.Random(4).shuffle(mov_pro1_no)
train_data = mov_pro1[:20000]+mov_pro1_no[:20000]
valid_data = mov_pro1[20000:24000]+mov_pro1_no[20000:24000]
test_data = mov_pro1[24000:28000]+mov_pro1_no[24000:28000]
random.Random(4).shuffle(train_data)

In [75]:
bert = BertModel().to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=2)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:47<00:00, 115.12it/s]


 Average training loss: 0.65


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1017.70it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.64391659375  acc:  0.600375 f1:  0.49803736850368985


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1000.60it/s]


 Average valid loss: 0.67
AUC_SCORE:  0.6595055937500001  acc:  0.610625 f1:  0.5113725490196078
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:50<00:00, 114.24it/s]


 Average training loss: 0.61


100%|██████████████████████████████████████| 8000/8000 [00:08<00:00, 997.44it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.6370434375  acc:  0.601125 f1:  0.5103575264692343


100%|██████████████████████████████████████| 8000/8000 [00:08<00:00, 999.89it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.6489208125000001  acc:  0.60525 f1:  0.5103875968992249


100%|██████████████████████████████████████| 8000/8000 [00:08<00:00, 985.23it/s]

 Average valid loss: 0.67
AUC_SCORE:  0.6595055937500001  acc:  0.610625 f1:  0.5113725490196078
BEST MODEL
AUC_SCORE:  0.6595055937500001  acc:  0.610625 f1:  0.5113725490196078
{'auc_micro': 0.6595055937500001, 'acc': 0.610625, 'f1': 0.5113725490196078, 'pred': [[0, 1, 0, 1, 0, 0, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1, 1, 1, 1, 1], [1, 0, 1, 1, 1, 0, 0, 0, 1, 1], [1, 1, 0, 1, 1, 0, 0, 1, 0, 0], [1, 0, 1, 0, 1, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 1, 0, 1, 1], [1, 0, 1, 1, 1, 0, 1, 0, 1, 0], [0, 1, 1, 1, 0, 0, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 1, 1, 1, 1, 1, 0, 1, 1], [1, 0, 0, 1, 1, 1, 0, 1, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 1, 1], [0, 1, 1, 0, 1, 1, 1, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 1, 0, 0, 0], [1, 0, 1, 1, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1, 0, 1, 0], [1, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 1, 0, 0, 1, 0],




In [76]:
#random.Random(4).shuffle(mov_pro1)
#random.Random(4).shuffle(mov_pro1_no)
train_data = mov_pro2[:20000]+mov_pro2_no[:20000]
valid_data = mov_pro2[20000:24000]+mov_pro2_no[20000:24000]
test_data = mov_pro2[24000:28000]+mov_pro2_no[24000:28000]
random.Random(4).shuffle(train_data)

In [77]:
bert = BertModel().to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=2)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:42<00:00, 116.75it/s]


 Average training loss: 0.66


100%|██████████████████████████████████████| 8000/8000 [00:08<00:00, 997.59it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.59489625  acc:  0.56775 f1:  0.5277246653919694


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1012.91it/s]


 Average valid loss: 0.64
AUC_SCORE:  0.6953853125  acc:  0.641375 f1:  0.5912523151446074
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:32<00:00, 120.30it/s]


 Average training loss: 0.63


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1011.08it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.58113140625  acc:  0.563375 f1:  0.5211788896504455


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1015.05it/s]


 Average valid loss: 0.64
AUC_SCORE:  0.699235  acc:  0.647 f1:  0.5958786491127647


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1002.33it/s]


 Average valid loss: 0.64
AUC_SCORE:  0.6953853125  acc:  0.641375 f1:  0.5912523151446074
BEST MODEL
AUC_SCORE:  0.6953853125  acc:  0.641375 f1:  0.5912523151446074
{'auc_micro': 0.6953853125, 'acc': 0.641375, 'f1': 0.5912523151446074, 'pred': [[0, 0, 0, 1, 1, 1, 0, 0, 0, 1], [1, 1, 1, 0, 1, 1, 1, 0, 0, 0], [1, 0, 0, 1, 1, 1, 0, 0, 0, 1], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 1, 1, 1, 0, 0, 0, 1], [0, 1, 1, 0, 1, 0, 1, 1, 0, 1], [1, 0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 0, 0, 1, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 1, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 0, 1], [0, 0, 1, 0, 1, 1, 0, 1, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 1, 1], [1, 0, 0, 1, 1, 1, 0, 0, 0, 1], [0, 1, 1, 1, 1, 0, 1, 0, 1, 1], [1, 1, 1, 0, 1, 0, 1, 0, 1, 1], [0, 0, 1, 1, 0, 0, 0, 0, 1, 0], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 1, 1, 1, 0, 1, 1, 0], [0, 0, 0, 0, 1, 1

In [78]:
#random.Random(4).shuffle(mov_pro1)
#random.Random(4).shuffle(mov_pro1_no)
train_data = mov_pro3[:20000]+mov_pro3_no[:20000]
valid_data = mov_pro3[20000:24000]+mov_pro3_no[20000:24000]
test_data = mov_pro3[24000:28000]+mov_pro3_no[24000:28000]
random.Random(4).shuffle(train_data)

In [79]:
bert = BertModel().to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=2)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:33<00:00, 120.04it/s]


 Average training loss: 0.65


100%|██████████████████████████████████████| 8000/8000 [00:08<00:00, 976.51it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5745679375  acc:  0.54625 f1:  0.5559089796917055


100%|██████████████████████████████████████| 8000/8000 [00:08<00:00, 999.98it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.627767875  acc:  0.59225 f1:  0.6031630170316302
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:33<00:00, 119.92it/s]


 Average training loss: 0.62


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1016.05it/s]


 Average valid loss: 0.74
AUC_SCORE:  0.5556298125  acc:  0.542 f1:  0.5802978235967926


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1023.73it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.62168459375  acc:  0.577375 f1:  0.6124928366762177


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1015.67it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.627767875  acc:  0.59225 f1:  0.6031630170316302
BEST MODEL
AUC_SCORE:  0.627767875  acc:  0.59225 f1:  0.6031630170316302
{'auc_micro': 0.627767875, 'acc': 0.59225, 'f1': 0.6031630170316302, 'pred': [[1, 1, 0, 0, 0, 0, 1, 0, 0, 1], [0, 0, 1, 1, 1, 1, 0, 0, 1, 0], [0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 0, 1, 1, 1, 1], [1, 1, 1, 0, 1, 1, 1, 1, 1, 0], [1, 0, 1, 1, 1, 0, 1, 1, 0, 1], [1, 0, 1, 1, 1, 0, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 0, 1, 1, 1], [0, 1, 1, 0, 0, 1, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1, 1, 1, 0, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 0, 1, 0, 1, 1, 0, 1, 0, 1], [0, 1, 0, 0, 1, 1, 0, 0, 0, 1], [0, 1, 1, 1, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 0, 1, 1, 1, 1, 0], [0, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 0, 1, 1], [1, 0, 1, 0, 1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 1, 0, 0, 1, 1, 1], [1, 1, 0, 1, 1, 1, 1, 0

In [47]:
mov_pro1 = []
mov_pro2 = []
mov_pro3 = []
mov_pro1_no = []
mov_pro2_no = []
mov_pro3_no = []
path1 = 'word2vec_model_char/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []
for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    max_scene = 0
    for node in w2v.wv.index_to_key:
        if base in node:
            scene_num += 1
            if int(node[6:])>max_scene:
                max_scene = int(node[6:])
    embs = torch.zeros(1,scene_num,384)
    k = 0
    for i in range(1,max_scene+1):
        emb_name = base + str(i)
        if emb_name in w2v.wv.index_to_key:
            emb = w2v.wv[emb_name]
            emb = torch.from_numpy(emb)
            embs[0][k] = emb
            k += 1

    mov_mean = torch.mean(embs, dim=1)
    
    
    
    if movie_name not in set(char_123.keys()):
        continue        
        
    for scene in char_123[movie_name]['mov_pro1']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro1.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro1_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro1_no.append([embs,[0]])    

    for scene in char_123[movie_name]['mov_pro2']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro2_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2_no.append([embs,[0]])  

    for scene in char_123[movie_name]['mov_pro3']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro3_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3_no.append([embs,[0]])  

In [48]:
print(len(mov_pro1))
print(len(mov_pro2))
print(len(mov_pro3))
print(len(mov_pro1_no))
print(len(mov_pro2_no))
print(len(mov_pro3_no))

95014
61247
43559
48420
82187
99875


In [49]:
#random.Random(4).shuffle(mov_pro1)
#random.Random(4).shuffle(mov_pro1_no)
train_data = mov_pro1[:20000]+mov_pro1_no[:20000]
valid_data = mov_pro1[20000:24000]+mov_pro1_no[20000:24000]
test_data = mov_pro1[24000:28000]+mov_pro1_no[24000:28000]
random.Random(4).shuffle(train_data)

In [50]:
bert = BertModel().to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=2)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:20<00:00, 124.94it/s]


 Average training loss: 0.65


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1053.03it/s]


 Average valid loss: 0.67
AUC_SCORE:  0.65563115625  acc:  0.606375 f1:  0.5215012915970217


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1054.38it/s]


 Average valid loss: 0.66
AUC_SCORE:  0.66631628125  acc:  0.618 f1:  0.5446960667461264
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:51<00:00, 113.88it/s]


 Average training loss: 0.62


100%|██████████████████████████████████████| 8000/8000 [00:11<00:00, 685.04it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.6426145937500001  acc:  0.599625 f1:  0.5168200331875096


100%|██████████████████████████████████████| 8000/8000 [00:11<00:00, 691.42it/s]


 Average valid loss: 0.67
AUC_SCORE:  0.661013125  acc:  0.613625 f1:  0.5447046693180143


100%|██████████████████████████████████████| 8000/8000 [00:10<00:00, 781.69it/s]


 Average valid loss: 0.66
AUC_SCORE:  0.66631628125  acc:  0.618 f1:  0.5446960667461264
BEST MODEL
AUC_SCORE:  0.66631628125  acc:  0.618 f1:  0.5446960667461264
{'auc_micro': 0.66631628125, 'acc': 0.618, 'f1': 0.5446960667461264, 'pred': [[0, 1, 0, 1, 0, 0, 1, 1, 1, 0], [1, 1, 0, 0, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 0, 0, 1, 1], [1, 1, 0, 1, 1, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 0, 0], [0, 1, 1, 1, 0, 1, 1, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 1, 1], [0, 1, 1, 1, 0, 0, 0, 0, 1, 0], [1, 1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 1, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 1, 1, 1, 1, 0, 0, 1], [0, 0, 0, 1, 1, 0, 0, 1, 0, 0], [0, 0, 1, 0, 1, 0, 0, 1, 0, 1], [0, 0, 0, 1, 0, 1, 1, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0, 0, 1, 1], [0, 1, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1, 0, 1], [1, 0, 0, 1, 0, 0, 1, 1, 0, 1], [1, 1, 1, 0, 1, 0, 0, 0, 1, 0], [0, 0, 1, 1, 0, 1, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0

In [51]:
#random.Random(4).shuffle(mov_pro1)
#random.Random(4).shuffle(mov_pro1_no)
train_data = mov_pro2[:20000]+mov_pro2_no[:20000]
valid_data = mov_pro2[20000:24000]+mov_pro2_no[20000:24000]
test_data = mov_pro2[24000:28000]+mov_pro2_no[24000:28000]
random.Random(4).shuffle(train_data)

In [52]:
bert = BertModel().to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=2)
print(score)

EPOCHS:1
TRAIN


100%|█████████████████████████████████████| 40000/40000 [08:38<00:00, 77.15it/s]


 Average training loss: 0.66


100%|██████████████████████████████████████| 8000/8000 [00:09<00:00, 801.15it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.62288428125  acc:  0.58575 f1:  0.5346812693063746


100%|██████████████████████████████████████| 8000/8000 [00:09<00:00, 848.55it/s]


 Average valid loss: 0.65
AUC_SCORE:  0.6844014375  acc:  0.63075 f1:  0.5659712018806935
EPOCHS:2
TRAIN


100%|█████████████████████████████████████| 40000/40000 [08:32<00:00, 78.04it/s]


 Average training loss: 0.63


100%|██████████████████████████████████████| 8000/8000 [00:09<00:00, 813.04it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.59592815625  acc:  0.569 f1:  0.5208449138410227


100%|██████████████████████████████████████| 8000/8000 [00:09<00:00, 851.21it/s]


 Average valid loss: 0.64
AUC_SCORE:  0.68494759375  acc:  0.635 f1:  0.574468085106383


100%|██████████████████████████████████████| 8000/8000 [00:09<00:00, 819.93it/s]

 Average valid loss: 0.65
AUC_SCORE:  0.6844014375  acc:  0.63075 f1:  0.5659712018806935
BEST MODEL
AUC_SCORE:  0.6844014375  acc:  0.63075 f1:  0.5659712018806935
{'auc_micro': 0.6844014375, 'acc': 0.63075, 'f1': 0.5659712018806935, 'pred': [[0, 0, 0, 1, 1, 1, 0, 0, 0, 1], [0, 1, 1, 1, 1, 1, 1, 1, 0, 0], [0, 1, 1, 0, 1, 1, 1, 0, 1, 0], [0, 1, 1, 0, 0, 0, 0, 0, 0, 1], [0, 1, 0, 1, 1, 1, 1, 0, 0, 1], [0, 1, 1, 1, 1, 0, 1, 1, 0, 1], [1, 0, 0, 1, 0, 1, 1, 0, 0, 0], [0, 1, 1, 1, 0, 1, 1, 1, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 1, 1, 0, 1, 0, 0], [1, 0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 1, 1, 0], [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], [1, 1, 0, 1, 1, 1, 1, 0, 0, 1], [1, 1, 1, 1, 1, 0, 1, 1, 1, 1], [1, 0, 0, 0, 1, 0, 1, 0, 1, 0], [0, 1, 1, 0, 0, 0, 0, 0, 1, 1], [1, 0, 1, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0




In [53]:
#random.Random(4).shuffle(mov_pro3)
#random.Random(4).shuffle(mov_pro3_no)
train_data = mov_pro3[:20000]+mov_pro3_no[:20000]
valid_data = mov_pro3[20000:24000]+mov_pro3_no[20000:24000]
test_data = mov_pro3[24000:28000]+mov_pro3_no[24000:28000]
random.Random(4).shuffle(train_data)

In [54]:
bert = BertModel().to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=2)
print(score)

EPOCHS:1
TRAIN


100%|█████████████████████████████████████| 40000/40000 [08:50<00:00, 75.36it/s]


 Average training loss: 0.65


100%|██████████████████████████████████████| 8000/8000 [00:11<00:00, 720.88it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.600325125  acc:  0.571875 f1:  0.5460569913850231


100%|██████████████████████████████████████| 8000/8000 [00:11<00:00, 674.97it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.61115734375  acc:  0.572375 f1:  0.5573812912407815
EPOCHS:2
TRAIN


100%|█████████████████████████████████████| 40000/40000 [08:42<00:00, 76.57it/s]


 Average training loss: 0.62


100%|██████████████████████████████████████| 8000/8000 [00:09<00:00, 860.57it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.6019278125  acc:  0.56875 f1:  0.5412234042553191


100%|██████████████████████████████████████| 8000/8000 [00:12<00:00, 649.37it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.6026184375  acc:  0.56925 f1:  0.5528159875421749


100%|██████████████████████████████████████| 8000/8000 [00:10<00:00, 747.74it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.61115734375  acc:  0.572375 f1:  0.5573812912407815
BEST MODEL
AUC_SCORE:  0.61115734375  acc:  0.572375 f1:  0.5573812912407815
{'auc_micro': 0.61115734375, 'acc': 0.572375, 'f1': 0.5573812912407815, 'pred': [[0, 1, 1, 0, 0, 1, 1, 1, 0, 0], [0, 1, 1, 1, 0, 1, 0, 1, 1, 0], [0, 0, 1, 0, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 0, 1, 1, 1, 0, 1], [1, 1, 1, 0, 0, 0, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 0, 1, 0, 0], [0, 1, 1, 1, 1, 0, 0, 1, 1, 1], [0, 0, 1, 1, 0, 1, 1, 0, 1, 0], [0, 1, 1, 0, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 1, 1, 1, 1, 1, 1, 1], [0, 1, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 1, 1, 0, 1, 1, 0, 1], [1, 1, 1, 0, 1, 1, 1, 0, 0, 1], [1, 0, 1, 1, 1, 0, 1, 1, 0, 1], [0, 0, 0, 0, 1, 0, 0, 0, 0, 1], [0, 0, 1, 1, 1, 0, 1, 0, 0, 0], [1, 1, 0, 1, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1, 1, 0], [1, 0, 0, 1, 1, 1, 1, 0, 1, 1], [1, 0, 1, 0, 1, 0, 0, 0, 0, 1], [0, 1, 1, 0, 0, 0, 0, 0, 1, 0], [0, 1, 0, 1, 0

# 평균 임베딩 

In [148]:
import pickle
with open('emb_avg3.pickle','rb') as fw:
    movie_emb = pickle.load(fw)

In [149]:
mov_pro1 = []
mov_pro2 = []
mov_pro3 = []
mov_pro1_no = []
mov_pro2_no = []
mov_pro3_no = []
path1 = 'word2vec_model_char/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []

for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    max_scene = 0
    scene2num = {}
    nodes = []
    for node in w2v.wv.index_to_key:
        if base in node:
            scene_num += 1
            nodes.append(int(node[6:]))
            if int(node[6:])>max_scene:
                max_scene = int(node[6:])
    k = 0
    for i in range(max_scene+1):
        if i in nodes:
            scene2num[i] = k
            k += 1
    k = 0
    if movie_name not in set(char_123.keys()):
        continue        
    real_embs = []
    
    
    
    
    mov_embs = torch.tensor(movie_emb[movie_name]).unsqueeze(0)

    #print(torch.tensor(real_embs).shape)
    
    mov_mean = torch.mean(mov_embs, dim=1).unsqueeze(0)
    
    
    
    if movie_name not in set(char_123.keys()):
        continue        
        
    for scene in char_123[movie_name]['mov_pro1']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro1.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro1_no']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro1_no.append([embs,[0]]) 

    for scene in char_123[movie_name]['mov_pro2']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro2.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro2_no']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro2_no.append([embs,[0]])

    for scene in char_123[movie_name]['mov_pro3']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue        
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro3.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro3_no']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue        
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro3_no.append([embs,[0]])

In [150]:
print(len(mov_pro1))
print(len(mov_pro2))
print(len(mov_pro3))
print(len(mov_pro1_no))
print(len(mov_pro2_no))
print(len(mov_pro3_no))

83403
54085
38558
41074
70392
85919


In [151]:
#random.Random(4).shuffle(mov_pro1)
#random.Random(4).shuffle(mov_pro1_no)
train_data = mov_pro1[:20000]+mov_pro1_no[:20000]
valid_data = mov_pro1[20000:24000]+mov_pro1_no[20000:24000]
test_data = mov_pro1[24000:28000]+mov_pro1_no[24000:28000]
random.Random(4).shuffle(train_data)

In [152]:
bert = BertModel(d_model=384).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=2)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:18<00:00, 125.76it/s]


 Average training loss: 0.64


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1044.59it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.61936103125  acc:  0.5825 f1:  0.6034196152932795


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1030.86it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.6445035625000001  acc:  0.597 f1:  0.62874251497006
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:18<00:00, 125.45it/s]


 Average training loss: 0.60


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1036.00it/s]


 Average valid loss: 0.75
AUC_SCORE:  0.585522125  acc:  0.55875 f1:  0.5363803519831889


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1051.71it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.63706525  acc:  0.592625 f1:  0.5818041832413705


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1044.78it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.6445035625000001  acc:  0.597 f1:  0.62874251497006
BEST MODEL
AUC_SCORE:  0.6445035625000001  acc:  0.597 f1:  0.62874251497006
{'auc_micro': 0.6445035625000001, 'acc': 0.597, 'f1': 0.62874251497006, 'pred': [[1, 1, 1, 0, 0, 1, 1, 0, 0, 1], [1, 1, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1, 0], [1, 1, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 1, 0, 0, 0, 0, 1], [1, 0, 0, 1, 1, 1, 0, 0, 0, 1], [1, 1, 1, 1, 0, 1, 0, 0, 0, 0], [1, 0, 1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 1, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 1, 1, 0, 0], [1, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 0, 1, 1, 1, 1], [1, 1, 1, 1, 1

In [153]:
#random.Random(4).shuffle(mov_pro2)
#random.Random(4).shuffle(mov_pro2_no)
train_data = mov_pro2[:20000]+mov_pro2_no[:20000]
valid_data = mov_pro2[20000:24000]+mov_pro2_no[20000:24000]
test_data = mov_pro2[24000:28000]+mov_pro2_no[24000:28000]
random.Random(4).shuffle(train_data)

In [154]:
bert = BertModel(d_model=384).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=2)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:30<00:00, 121.02it/s]


 Average training loss: 0.65


100%|██████████████████████████████████████| 8000/8000 [00:08<00:00, 989.46it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.624644625  acc:  0.579375 f1:  0.6536284096757591


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1002.33it/s]


 Average valid loss: 0.67
AUC_SCORE:  0.67868178125  acc:  0.605375 f1:  0.6696662132468347
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:28<00:00, 121.59it/s]


 Average training loss: 0.63


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1034.42it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.60555221875  acc:  0.56925 f1:  0.6472153972153972


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1039.19it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.6687654999999999  acc:  0.60325 f1:  0.6665966386554621


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1026.58it/s]


 Average valid loss: 0.67
AUC_SCORE:  0.67868178125  acc:  0.605375 f1:  0.6696662132468347
BEST MODEL
AUC_SCORE:  0.67868178125  acc:  0.605375 f1:  0.6696662132468347
{'auc_micro': 0.67868178125, 'acc': 0.605375, 'f1': 0.6696662132468347, 'pred': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 1], [1, 1, 0, 1, 0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 1, 1, 1, 0, 1], [0, 0, 1, 1, 1, 0, 0, 1, 0, 1], [1, 0, 0, 0, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 1, 0, 1, 1, 1], [1, 0, 1, 1, 1, 0, 1, 1, 1, 1], [1, 1, 1, 1, 0, 1, 1, 0, 1, 0], [1, 1, 1, 1, 1, 1, 0, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1

In [157]:
#random.Random(4).shuffle(mov_pro3)
#random.Random(4).shuffle(mov_pro3_no)
train_data = mov_pro3[:20000]+mov_pro3_no[:20000]
valid_data = mov_pro3[20000:24000]+mov_pro3_no[20000:24000]
test_data = mov_pro3[24000:28000]+mov_pro3_no[24000:28000]
random.Random(4).shuffle(train_data)

In [158]:
bert = BertModel(d_model=384).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:24<00:00, 123.21it/s]


 Average training loss: 0.64


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1011.29it/s]


 Average valid loss: 0.75
AUC_SCORE:  0.5849458437499999  acc:  0.5415 f1:  0.630614300100705


100%|██████████████████████████████████████| 8000/8000 [00:08<00:00, 992.48it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.616472  acc:  0.58325 f1:  0.6572074850915074


100%|██████████████████████████████████████| 8000/8000 [00:11<00:00, 719.38it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.616472  acc:  0.58325 f1:  0.6572074850915074
BEST MODEL
AUC_SCORE:  0.616472  acc:  0.58325 f1:  0.6572074850915074
{'auc_micro': 0.616472, 'acc': 0.58325, 'f1': 0.6572074850915074, 'pred': [[1, 1, 0, 0, 1, 1, 0, 0, 1, 1], [0, 0, 0, 1, 0, 1, 0, 1, 1, 0], [0, 1, 1, 1, 1, 1, 1, 0, 1, 1], [0, 1, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1, 1], [1, 0, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 1], [1, 0, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 1, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 

# sum

In [15]:
import pickle
with open('emb_sum.pickle','rb') as fw:
    movie_emb = pickle.load(fw)

In [16]:
mov_pro1 = []
mov_pro2 = []
mov_pro3 = []
mov_pro1_no = []
mov_pro2_no = []
mov_pro3_no = []
path1 = 'word2vec_model_char/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []

for filename in filenames:
    movie_name = filename[:-14]
    if movie_name not in movie_emb.keys():
        continue
    print(movie_name)
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    max_scene = 0
    scene2num = {}
    nodes = []
    for node in w2v.wv.index_to_key:
        if base in node:
            scene_num += 1
            nodes.append(int(node[6:]))
            if int(node[6:])>max_scene:
                max_scene = int(node[6:])
    k = 0
    for i in range(max_scene+1):
        if i in nodes:
            scene2num[i] = k
            k += 1
    k = 0
    if movie_name not in set(char_123.keys()):
        continue        
    real_embs = []
    
    
    
    
    mov_embs = torch.tensor(movie_emb[movie_name]).unsqueeze(0)

    #print(torch.tensor(real_embs).shape)
    
    mov_mean = torch.mean(mov_embs, dim=1).unsqueeze(0)
    
    
    
    if movie_name not in set(char_123.keys()):
        continue        
        
    for scene in char_123[movie_name]['mov_pro1']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro1.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro1_no']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro1_no.append([embs,[0]]) 

    for scene in char_123[movie_name]['mov_pro2']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro2.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro2_no']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro2_no.append([embs,[0]])

    for scene in char_123[movie_name]['mov_pro3']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue        
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro3.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro3_no']:
        if scene not in scene2num.keys():
            continue
        scene = scene2num[scene]
        emb_name = base + str(scene)
        if emb_name not in w2v.wv.index_to_key:
            continue
        if str(mov_embs[0][scene][0]) == 'nan':
            continue        
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        embs[0][0] = mov_embs[0][scene]
        mov_pro3_no.append([embs,[0]])

Stranglehold (1931 film)


  mov_embs = torch.tensor(movie_emb[movie_name]).unsqueeze(0)


The Limey
So I Married an Axe Murderer
Ghostbusters
The Searchers (film)
The Kingdom (film)
Vanilla Sky
Queen of the Damned
Fear and Loathing in Las Vegas (film)
The Body Snatcher (film)
Ace Ventura: Pet Detective
Jennifers Body
G.I. Joe: The Rise of Cobra
Sling Blade
Living in Oblivion
A Few Good Men
The Bourne Ultimatum (film)
Field of Dreams
Insomnia (2002 film)
Priest (2011 film)
The Goonies
Someone to Watch Over Me (film)
The Mask (film)
Jane Eyre (2011 film)
Pineapple Express (film)
Only God Forgives
Kids (film)
White Angel (1994 film)
Smashed (film)
Wind Chill (film)
Halloween 4: The Return of Michael Myers
Monty Python Live at the Hollywood Bowl
Out of Sight
The Hebrew Hammer
The King of Comedy (1983 film)
Bones (2001 film)
Mr. Blandings Builds His Dream House
Harold & Kumar Go to White Castle
Man Trouble
Cold Mountain (film)
The Corruptor
Nashville (film)
While She Was Out
THX 1138
Man on Fire (2004 film)
Buffy the Vampire Slayer (film)
Demolition Man (film)
Twins (1988 film)


Panther (film)
Dracula (1958 film)
Star Trek V: The Final Frontier
Drive (2011 film)
The Hustler (film)
Made for Each Other (1939 film)
Autumn in New York (film)
The Proposal (film)
Apocalypse Now Redux
Wild Hogs
Below (film)
Annie Hall
Whos Your Daddy (film)
The Majestic (film)
The Leopard Man
The Help (film)
Stolen Summer
Gladiator (2000 film)
Game 6
Dances with Wolves
Case 39
Chinatown (1974 film)
The Fifth Element
Fright Night (2011 film)
Henrys Crime
Domino (film)
Public Enemies (2009 film)
From Here to Eternity
Ghostbusters II
Crazy, Stupid, Love.
John Q
Cube (film)
The Little Mermaid (1989 film)
Gremlins
Silverado (film)
Burning Annie
Horrible Bosses
Platinum Blonde (film)
This Is 40
Grand Hotel (film)
Very Bad Things
The Godfather Part II
A Serious Man
Chaos (2005 Capitol film)
Drive Angry
Heavenly Creatures
The Addams Family (film)
Innerspace
M (1931 film)
Heathers
The Lord of the Rings: The Fellowship of the Ring
Only You (1994 film)
Music of the Heart
Enough (film)
Ring (fil

Babel (film)
The Fabulous Baker Boys
Nightbreed
Die Hard 2
April Fools Day (1986 film)
Flash Gordon (film)
Megamind
Superman (1978 film)
Wanted (2008 film)
The Best Exotic Marigold Hotel
Minority Report (film)
Three Men and a Baby
Love & Basketball
Heat (1995 film)
Go (1999 film)
Remember Me (2010 film)
Up in the Air (2009 film)
Clash of the Titans (2010 film)
Ronin (film)
Taking Lives (film)
The Distinguished Gentleman
Klute
They (2002 film)
All the Presidents Men (film)
Anna Karenina (2012 film)
8mm (film)
The Dark Knight (film)
Logans Run (film)
Mystery Men
Anastasia (1997 film)
Dark Star (film)
Monte Carlo (2011 film)
Fast Times at Ridgemont High
Affliction (film)
Darkman
Oceans Twelve
Two for the Money (film)
Yes Man (film)
Dr. Strangelove
Pet Sematary (film)
Petulia
Mission: Impossible (film)
Bound (film)
Oblivion (2013 film)
The Private Life of Sherlock Holmes
Argo (2012 film)
Rambling Rose (film)
Death to Smoochy
The Stunt Man
From Dusk till Dawn
Lethal Weapon
Slither (2006 fil

In [17]:
print(len(mov_pro1))
print(len(mov_pro2))
print(len(mov_pro3))
print(len(mov_pro1_no))
print(len(mov_pro2_no))
print(len(mov_pro3_no))

83323
54018
38503
41055
70360
85875


In [18]:
#random.Random(4).shuffle(mov_pro1)
#random.Random(4).shuffle(mov_pro1_no)
train_data = mov_pro1[:20000]+mov_pro1_no[:20000]
valid_data = mov_pro1[20000:24000]+mov_pro1_no[20000:24000]
test_data = mov_pro1[24000:28000]+mov_pro1_no[24000:28000]
random.Random(4).shuffle(train_data)

In [19]:
bert = BertModel(d_model=384).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=5)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:22<00:00, 124.15it/s]


 Average training loss: 0.68


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1061.22it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5858777500000001  acc:  0.5495 f1:  0.5959641255605381


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1063.14it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.6040329375  acc:  0.56725 f1:  0.6154187958231504
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:28<00:00, 121.71it/s]


 Average training loss: 0.67


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1031.64it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.5794192187499999  acc:  0.547125 f1:  0.5802340400880547


100%|██████████████████████████████████████| 8000/8000 [00:08<00:00, 979.06it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.6052255  acc:  0.569875 f1:  0.6061577200412042
EPOCHS:3
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:26<00:00, 122.54it/s]


 Average training loss: 0.66


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1016.72it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.57402078125  acc:  0.546875 f1:  0.563515954244431


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1010.88it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.605287375  acc:  0.572875 f1:  0.5943250623293364
EPOCHS:4
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:24<00:00, 123.11it/s]


 Average training loss: 0.65


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1051.75it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5588785625  acc:  0.537125 f1:  0.5517491829076383


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1040.99it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.59842328125  acc:  0.5665 f1:  0.5861575178997614
EPOCHS:5
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:29<00:00, 121.54it/s]


 Average training loss: 0.64


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1032.20it/s]


 Average valid loss: 0.74
AUC_SCORE:  0.52918465625  acc:  0.52075 f1:  0.511716760061131


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1058.27it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5798719375  acc:  0.556375 f1:  0.559294672792748


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1035.20it/s]

 Average valid loss: 0.68
AUC_SCORE:  0.6040329375  acc:  0.56725 f1:  0.6154187958231504
BEST MODEL
AUC_SCORE:  0.6040329375  acc:  0.56725 f1:  0.6154187958231504
{'auc_micro': 0.6040329375, 'acc': 0.56725, 'f1': 0.6154187958231504, 'pred': [[0, 0, 1, 1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 0, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 1, 1, 1, 1], [0, 1, 1, 1, 0, 1, 1, 1, 0, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 0, 0, 1, 1, 1, 0, 1], [1, 0, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 1, 1, 1], [0, 1, 1, 1, 1, 0, 1, 1, 1, 0], [1, 1, 1, 1, 1, 0, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [0, 1, 1, 0, 1, 1, 1, 1, 1, 1], [0, 0, 1, 1, 1, 0, 1, 0, 1, 0], [1, 1, 0, 1, 0, 1, 1, 1, 1, 0], [1, 0, 1, 0, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 1, 1, 0, 1], [1, 0, 0, 1, 0, 1, 1, 1, 1, 1], [1, 1, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 1, 0, 1], [1, 0, 1, 1, 0, 1, 0, 1, 1, 1], [0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 0, 1, 1, 1




In [17]:
#random.Random(4).shuffle(mov_pro2)
#random.Random(4).shuffle(mov_pro2_no)
train_data = mov_pro2[:20000]+mov_pro2_no[:20000]
valid_data = mov_pro2[20000:24000]+mov_pro2_no[20000:24000]
test_data = mov_pro2[24000:28000]+mov_pro2_no[24000:28000]
random.Random(4).shuffle(train_data)

In [18]:
bert = BertModel(d_model=384).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=2)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:20<00:00, 124.77it/s]


 Average training loss: 0.68


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1044.21it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.61201459375  acc:  0.56375 f1:  0.6476882697355139


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1045.66it/s]


 Average valid loss: 0.67
AUC_SCORE:  0.6504895312499999  acc:  0.581125 f1:  0.6643293599118503
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:18<00:00, 125.65it/s]


 Average training loss: 0.67


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1029.76it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.6093034687500001  acc:  0.5705 f1:  0.6432724252491694


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1037.54it/s]


 Average valid loss: 0.67
AUC_SCORE:  0.64837909375  acc:  0.58225 f1:  0.655392864508146


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1034.57it/s]

 Average valid loss: 0.67
AUC_SCORE:  0.6504895312499999  acc:  0.581125 f1:  0.6643293599118503
BEST MODEL
AUC_SCORE:  0.6504895312499999  acc:  0.581125 f1:  0.6643293599118503
{'auc_micro': 0.6504895312499999, 'acc': 0.581125, 'f1': 0.6643293599118503, 'pred': [[1, 1, 1, 1, 1, 1, 0, 0, 1, 1], [0, 1, 1, 1, 0, 1, 1, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1, 1], [0, 0, 1, 0, 0, 0, 1, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1, 0], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 0, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 1], [1, 1, 1, 1, 0, 1, 1, 0, 1, 1], [1, 1, 1, 1, 1, 0, 0, 0, 1, 1], [1, 1, 1, 1, 1, 1, 0, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 1, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 0, 0, 1, 1], [1, 1, 0, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 0, 1, 1, 0],




In [20]:
#random.Random(4).shuffle(mov_pro3)
#random.Random(4).shuffle(mov_pro3_no)
train_data = mov_pro3[:20000]+mov_pro3_no[:20000]
valid_data = mov_pro3[20000:24000]+mov_pro3_no[20000:24000]
test_data = mov_pro3[24000:28000]+mov_pro3_no[24000:28000]
random.Random(4).shuffle(train_data)

In [21]:
bert = BertModel(d_model=384).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=3)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:25<00:00, 123.02it/s]


 Average training loss: 0.67


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1039.92it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.57589315625  acc:  0.54975 f1:  0.62211498111624


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1028.65it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.60073459375  acc:  0.566875 f1:  0.6260925865976044
EPOCHS:2
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:25<00:00, 122.79it/s]


 Average training loss: 0.66


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1028.17it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5574775625  acc:  0.540125 f1:  0.6080749973367422


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1028.07it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.59331715625  acc:  0.565375 f1:  0.6165214514172273
EPOCHS:3
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:22<00:00, 124.14it/s]


 Average training loss: 0.65


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1027.96it/s]


 Average valid loss: 0.73
AUC_SCORE:  0.5361494375  acc:  0.52625 f1:  0.6013044393014938


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1059.97it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.58011171875  acc:  0.565375 f1:  0.622024133057941


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1053.45it/s]

 Average valid loss: 0.68
AUC_SCORE:  0.60073459375  acc:  0.566875 f1:  0.6260925865976044
BEST MODEL
AUC_SCORE:  0.60073459375  acc:  0.566875 f1:  0.6260925865976044
{'auc_micro': 0.60073459375, 'acc': 0.566875, 'f1': 0.6260925865976044, 'pred': [[1, 0, 1, 1, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 0, 1, 1, 1], [0, 1, 0, 1, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 0, 1, 0, 1, 1], [0, 0, 0, 1, 1, 1, 0, 1, 1, 1], [1, 0, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 1], [1, 1, 0, 1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 1, 0, 0, 0, 1], [0, 1, 0, 0, 0, 1, 0, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 1, 1, 1, 1, 0, 1], [0, 0, 1, 1, 0, 0, 0, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 0, 0, 1, 1, 1, 1, 1, 0, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 1, 1, 1, 0, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 0, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1




# Doc2Vec

In [159]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [160]:
mov_pro1 = []
mov_pro2 = []
mov_pro3 = []
mov_pro1_no = []
mov_pro2_no = []
mov_pro3_no = []
path1 = 'doc2vec/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []
for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Doc2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    max_scene = 0
    for node in w2v.dv.index_to_key:
        if base in node:
            scene_num += 1
            if int(node[6:])>max_scene:
                max_scene = int(node[6:])
    embs = torch.zeros(1,scene_num,384)
    k = 0
    for i in range(1,max_scene+1):
        emb_name = base + str(i)
        if emb_name in w2v.dv.index_to_key:
            emb = w2v.dv[emb_name]
            emb = torch.from_numpy(emb)
            embs[0][k] = emb
            k += 1

    mov_mean = torch.mean(embs, dim=1)
    
    
    
    if movie_name not in set(char_123.keys()):
        continue        
        
    for scene in char_123[movie_name]['mov_pro1']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        
        if emb_name in w2v.dv.index_to_key:
            
            embs[0][0] = torch.from_numpy(w2v.dv[emb_name])
        else:
            
            passs = 1
            continue
        if not passs: 
            mov_pro1.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro1_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.dv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.dv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro1_no.append([embs,[0]])    

    for scene in char_123[movie_name]['mov_pro2']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.dv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.dv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro2_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.dv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.dv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2_no.append([embs,[0]])  

    for scene in char_123[movie_name]['mov_pro3']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.dv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.dv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro3_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.dv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.dv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3_no.append([embs,[0]])

In [161]:
print(len(mov_pro1))
print(len(mov_pro2))
print(len(mov_pro3))
print(len(mov_pro1_no))
print(len(mov_pro2_no))
print(len(mov_pro3_no))

75814
48861
34694
52470
79423
93590


In [162]:
#random.Random(4).shuffle(mov_pro1)
#random.Random(4).shuffle(mov_pro1_no)
train_data = mov_pro1[:20000]+mov_pro1_no[:20000]
valid_data = mov_pro1[20000:24000]+mov_pro1_no[20000:24000]
test_data = mov_pro1[24000:28000]+mov_pro1_no[24000:28000]
random.Random(4).shuffle(train_data)

In [163]:
bert = BertModel(d_model=384).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:22<00:00, 123.99it/s]


 Average training loss: 0.68


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1013.04it/s]


 Average valid loss: 0.73
AUC_SCORE:  0.40678575000000006  acc:  0.44925 f1:  0.535917421529387


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1038.53it/s]


 Average valid loss: 0.75
AUC_SCORE:  0.34729734375  acc:  0.376125 f1:  0.44844734224776217


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1039.21it/s]

 Average valid loss: 0.75
AUC_SCORE:  0.34729734375  acc:  0.376125 f1:  0.44844734224776217
BEST MODEL
AUC_SCORE:  0.34729734375  acc:  0.376125 f1:  0.44844734224776217
{'auc_micro': 0.34729734375, 'acc': 0.376125, 'f1': 0.44844734224776217, 'pred': [[0, 0, 1, 1, 1, 1, 1, 0, 0, 1], [0, 0, 1, 1, 1, 1, 0, 1, 1, 0], [1, 1, 0, 0, 1, 1, 1, 1, 0, 1], [1, 1, 1, 0, 1, 1, 1, 0, 0, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 0, 0, 1, 1, 0, 1, 1], [1, 0, 1, 1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 0, 1, 0, 0], [1, 0, 1, 0, 1, 0, 0, 1, 1, 1], [1, 0, 1, 1, 1, 1, 0, 1, 1, 0], [0, 1, 0, 1, 1, 1, 1, 1, 1, 1], [1, 0, 1, 0, 0, 0, 1, 1, 1, 0], [1, 1, 1, 1, 0, 0, 1, 1, 1, 1], [0, 1, 1, 0, 0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 1, 1, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 1, 1, 1, 0, 0, 1, 1, 0], [0, 0, 0, 1, 0, 1, 1, 0, 0, 1], [0, 1, 0, 0, 0, 1, 1, 1, 0, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 1, 0, 0], [0, 0, 0, 1




In [164]:
#random.Random(4).shuffle(mov_pro2)
#random.Random(4).shuffle(mov_pro2_no)
train_data = mov_pro2[:20000]+mov_pro2_no[:20000]
valid_data = mov_pro2[20000:24000]+mov_pro2_no[2000:24000]
test_data = mov_pro2[24000:28000]+mov_pro2_no[24000:28000]
random.Random(4).shuffle(train_data)

In [165]:
bert = BertModel(d_model=384).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:20<00:00, 124.64it/s]


 Average training loss: 0.68


100%|███████████████████████████████████| 26000/26000 [00:25<00:00, 1021.22it/s]


 Average valid loss: 0.76
AUC_SCORE:  0.6260939545454545  acc:  0.42496153846153845 f1:  0.3047016695344835


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1038.62it/s]


 Average valid loss: 0.76
AUC_SCORE:  0.42840365625  acc:  0.450625 f1:  0.5975643256112078


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1017.63it/s]


 Average valid loss: 0.76
AUC_SCORE:  0.42840365625  acc:  0.450625 f1:  0.5975643256112078
BEST MODEL
AUC_SCORE:  0.42840365625  acc:  0.450625 f1:  0.5975643256112078
{'auc_micro': 0.42840365625, 'acc': 0.450625, 'f1': 0.5975643256112078, 'pred': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 1, 1, 1, 0, 1, 1, 1], [0, 1, 1, 1, 1, 0, 1, 1, 1, 1], [1, 1, 0, 0, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 1, 1, 1], [1, 0, 1, 1, 1, 1, 1, 0, 0, 0], [0, 1, 1, 1, 1, 1, 1, 0, 1, 0], [0, 0, 0, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0

In [166]:
#random.Random(4).shuffle(mov_pro3)
#random.Random(4).shuffle(mov_pro3_no)
train_data = mov_pro3[:20000]+mov_pro3_no[:20000]
valid_data = mov_pro3[20000:24000]+mov_pro3_no[20000:24000]
test_data = mov_pro3[24000:28000]+mov_pro3_no[24000:28000]
random.Random(4).shuffle(train_data)

In [167]:
bert = BertModel(d_model=384).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


 18%|██████▌                              | 7083/40000 [00:58<04:33, 120.30it/s]


KeyboardInterrupt: 

# edge2vec

In [168]:
mov_pro1 = []
mov_pro2 = []
mov_pro3 = []
mov_pro1_no = []
mov_pro2_no = []
mov_pro3_no = []
path1 = 'word2vec_model_edge/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []

for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    max_scene = 0
    for node in w2v.wv.index_to_key:
        if base in node:
            scene_num += 1
            if int(node[6:])>max_scene:
                max_scene = int(node[6:])
    embs = torch.zeros(1,scene_num,400)
    k = 0
    for i in range(1,max_scene+1):
        emb_name = base + str(i)
        if emb_name in w2v.wv.index_to_key:
            emb = w2v.wv[emb_name]
            emb = torch.from_numpy(emb)
            embs[0][k] = emb
            k += 1

    mov_mean = torch.mean(embs, dim=1)
    
    
    
    if movie_name not in set(char_123.keys()):
        continue        
        
    for scene in char_123[movie_name]['mov_pro1']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro1.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro1_no']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro1_no.append([embs,[0]])    

    for scene in char_123[movie_name]['mov_pro2']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro2_no']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2_no.append([embs,[0]])  

    for scene in char_123[movie_name]['mov_pro3']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro3_no']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3_no.append([embs,[0]])  

In [169]:
#random.Random(4).shuffle(mov_pro1)
#random.Random(4).shuffle(mov_pro1_no)
train_data = mov_pro1[:20000]+mov_pro1_no[:20000]
valid_data = mov_pro1[20000:24000]+mov_pro1_no[20000:24000]
test_data = mov_pro1[24000:28000]+mov_pro1_no[24000:28000]
random.Random(4).shuffle(train_data)

In [170]:
bert = BertModel(d_model=400).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:31<00:00, 120.63it/s]


 Average training loss: 0.65


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1045.65it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.63051640625  acc:  0.587875 f1:  0.49033853764105734


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1036.38it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.6521768749999999  acc:  0.600375 f1:  0.5257380210651239


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1054.72it/s]

 Average valid loss: 0.68
AUC_SCORE:  0.6521768749999999  acc:  0.600375 f1:  0.5257380210651239
BEST MODEL
AUC_SCORE:  0.6521768749999999  acc:  0.600375 f1:  0.5257380210651239
{'auc_micro': 0.6521768749999999, 'acc': 0.600375, 'f1': 0.5257380210651239, 'pred': [[0, 1, 0, 1, 1, 1, 1, 1, 1, 0], [1, 0, 1, 0, 0, 1, 1, 1, 0, 1], [1, 1, 1, 1, 1, 1, 0, 0, 1, 1], [1, 1, 0, 1, 1, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 0, 1, 1, 1, 0], [1, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 0, 1, 1, 1, 1], [1, 1, 1, 0, 0, 1, 0, 1, 1, 1], [1, 1, 0, 0, 1, 1, 0, 1, 1, 1], [1, 0, 0, 1, 1, 0, 1, 1, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 1, 1, 1, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 1, 0, 1, 1], [0, 0, 0, 1, 0, 1, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 1, 0, 1, 0, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 1, 0, 1, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 0, 1], [0, 0, 0, 1, 0, 0, 0, 0, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0],




In [171]:
#random.Random(4).shuffle(mov_pro2)
#random.Random(4).shuffle(mov_pro2_no)
train_data = mov_pro2[:20000]+mov_pro2_no[:20000]
valid_data = mov_pro2[20000:24000]+mov_pro2_no[20000:24000]
test_data = mov_pro2[24000:28000]+mov_pro2_no[24000:28000]
random.Random(4).shuffle(train_data)

In [172]:
bert = BertModel(d_model=400).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:20<00:00, 124.63it/s]


 Average training loss: 0.66


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1026.86it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.58580803125  acc:  0.558 f1:  0.5688856376493538


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1048.62it/s]


 Average valid loss: 0.65
AUC_SCORE:  0.67210596875  acc:  0.6275 f1:  0.6114732724902218


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1039.93it/s]

 Average valid loss: 0.65
AUC_SCORE:  0.67210596875  acc:  0.6275 f1:  0.6114732724902218
BEST MODEL
AUC_SCORE:  0.67210596875  acc:  0.6275 f1:  0.6114732724902218
{'auc_micro': 0.67210596875, 'acc': 0.6275, 'f1': 0.6114732724902218, 'pred': [[1, 0, 1, 1, 0, 1, 0, 1, 0, 1], [1, 0, 1, 1, 1, 1, 1, 0, 0, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 1, 0, 0, 1, 1, 0], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 0, 1, 1, 1, 1, 1, 1], [1, 0, 0, 1, 1, 1, 1, 1, 0, 1], [0, 0, 1, 1, 1, 0, 1, 1, 0, 0], [1, 0, 1, 0, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 1, 1, 0, 1, 1, 1], [1, 0, 0, 1, 0, 0, 1, 0, 0, 0], [1, 0, 1, 1, 1, 0, 1, 0, 0, 1], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 1, 0, 1, 0, 1, 0], [1, 0, 1, 1, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 1, 1, 1, 1], [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], [0, 0, 1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0




In [173]:
#random.Random(4).shuffle(mov_pro3)
#random.Random(4).shuffle(mov_pro3_no)
train_data = mov_pro3[:20000]+mov_pro3_no[:20000]
valid_data = mov_pro3[20000:24000]+mov_pro3_no[20000:24000]
test_data = mov_pro3[24000:28000]+mov_pro3_no[24000:28000]
random.Random(4).shuffle(train_data)

In [175]:
bert = BertModel(d_model=400).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:20<00:00, 124.64it/s]


 Average training loss: 0.66


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1029.96it/s]


 Average valid loss: 0.72
AUC_SCORE:  0.5477542187500001  acc:  0.530125 f1:  0.49833177632456965


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1034.30it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.606707875  acc:  0.572375 f1:  0.5603392880092534


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1037.45it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.606707875  acc:  0.572375 f1:  0.5603392880092534
BEST MODEL
AUC_SCORE:  0.606707875  acc:  0.572375 f1:  0.5603392880092534
{'auc_micro': 0.606707875, 'acc': 0.572375, 'f1': 0.5603392880092534, 'pred': [[0, 1, 1, 0, 1, 1, 1, 0, 0, 0], [1, 1, 0, 1, 1, 1, 0, 1, 1, 0], [0, 1, 1, 1, 1, 1, 1, 1, 0, 1], [1, 1, 1, 1, 0, 0, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 0, 1, 0, 1, 1, 1, 1, 1], [1, 0, 0, 0, 1, 0, 0, 1, 0, 0], [0, 0, 1, 0, 1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 0, 1, 1, 1], [0, 1, 1, 1, 1, 1, 0, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 0, 1, 1, 0, 1], [1, 1, 1, 1, 0, 1, 1, 0, 0, 0], [0, 0, 1, 0, 0, 1, 1, 1, 0, 1], [1, 1, 0, 1, 0, 0, 0, 1, 0, 1], [1, 1, 0, 1, 0, 1, 0, 0, 0, 0], [1, 1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 1, 0, 0, 0, 0, 1], [1, 1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 1

# 연결추가 + 임베딩 유

In [176]:
mov_pro1 = []
mov_pro2 = []
mov_pro3 = []
mov_pro1_no = []
mov_pro2_no = []
mov_pro3_no = []
path1 = 'word2vec_model_extra+emb/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []

for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    max_scene = 0
    for node in w2v.wv.index_to_key:
        if base in node:
            scene_num += 1
            if int(node[6:])>max_scene:
                max_scene = int(node[6:])
    embs = torch.zeros(1,scene_num,384)
    k = 0
    for i in range(1,max_scene+1):
        emb_name = base + str(i)
        if emb_name in w2v.wv.index_to_key:
            emb = w2v.wv[emb_name]
            emb = torch.from_numpy(emb)
            embs[0][k] = emb
            k += 1

    mov_mean = torch.mean(embs, dim=1)
    
    
    
    if movie_name not in set(char_123.keys()):
        continue        
        
    for scene in char_123[movie_name]['mov_pro1']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro1.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro1_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro1_no.append([embs,[0]])    

    for scene in char_123[movie_name]['mov_pro2']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro2_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2_no.append([embs,[0]])  

    for scene in char_123[movie_name]['mov_pro3']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro3_no']:
        passs = 0
        embs = torch.zeros(1,3,384)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3_no.append([embs,[0]])  


In [177]:
#random.Random(4).shuffle(mov_pro1)
#random.Random(4).shuffle(mov_pro1_no)
train_data = mov_pro1[:20000]+mov_pro1_no[:20000]
valid_data = mov_pro1[20000:24000]+mov_pro1_no[20000:24000]
test_data = mov_pro1[24000:28000]+mov_pro1_no[24000:28000]
random.Random(4).shuffle(train_data)

In [178]:
bert = BertModel().to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:20<00:00, 124.80it/s]


 Average training loss: 0.65


100%|██████████████████████████████████████| 8000/8000 [00:08<00:00, 970.49it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.6217812812500001  acc:  0.5875 f1:  0.5178258328462888


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1025.01it/s]


 Average valid loss: 0.65
AUC_SCORE:  0.6796928125000001  acc:  0.623375 f1:  0.5687705739229998


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1016.79it/s]

 Average valid loss: 0.65
AUC_SCORE:  0.6796928125000001  acc:  0.623375 f1:  0.5687705739229998
BEST MODEL
AUC_SCORE:  0.6796928125000001  acc:  0.623375 f1:  0.5687705739229998
{'auc_micro': 0.6796928125000001, 'acc': 0.623375, 'f1': 0.5687705739229998, 'pred': [[0, 1, 1, 1, 0, 0, 1, 1, 1, 1], [1, 1, 1, 0, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 1, 1, 1], [1, 1, 0, 1, 1, 0, 1, 1, 1, 0], [1, 1, 1, 0, 0, 1, 1, 1, 0, 0], [0, 1, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 0, 0, 0, 1, 1, 1, 0], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 1, 0, 1, 1, 1], [1, 0, 0, 0, 1, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 1, 1, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 1, 1, 1], [0, 0, 1, 0, 1, 0, 0, 1, 1, 0], [1, 0, 1, 1, 1, 1, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1, 0, 0, 1], [1, 1, 0, 0, 1, 1, 1, 0, 1, 1], [1, 0, 0, 0, 0, 0, 0, 1, 1, 1], [1, 0, 1, 0, 1, 0, 1, 0, 0, 1], [1, 0, 0, 0, 0, 0, 1, 1, 1, 1], [1, 1, 0, 1, 0, 0, 0, 0, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 1],




In [179]:
#random.Random(4).shuffle(mov_pro2)
#random.Random(4).shuffle(mov_pro2_no)
train_data = mov_pro2[:20000]+mov_pro2_no[:20000]
valid_data = mov_pro2[20000:24000]+mov_pro2_no[20000:24000]
test_data = mov_pro2[24000:28000]+mov_pro2_no[24000:28000]
random.Random(4).shuffle(train_data)

In [180]:
bert = BertModel().to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:20<00:00, 124.81it/s]


 Average training loss: 0.66


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1028.13it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.60633603125  acc:  0.5765 f1:  0.5310077519379846


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1047.02it/s]


 Average valid loss: 0.64
AUC_SCORE:  0.6917506250000001  acc:  0.6315 f1:  0.5778923253150058


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1047.60it/s]


 Average valid loss: 0.64
AUC_SCORE:  0.6917506250000001  acc:  0.6315 f1:  0.5778923253150058
BEST MODEL
AUC_SCORE:  0.6917506250000001  acc:  0.6315 f1:  0.5778923253150058
{'auc_micro': 0.6917506250000001, 'acc': 0.6315, 'f1': 0.5778923253150058, 'pred': [[1, 0, 0, 0, 0, 1, 0, 1, 1, 0], [0, 0, 1, 1, 1, 1, 1, 0, 0, 1], [0, 1, 1, 1, 0, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 1, 1, 0, 1, 0, 1], [0, 1, 0, 0, 0, 0, 1, 0, 0, 1], [1, 0, 0, 1, 1, 0, 1, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0, 0], [1, 0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 1, 1, 0, 1, 0, 0, 1, 0, 0], [1, 0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 1, 0, 0, 1], [0, 1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 1, 0], [1, 0, 0, 0, 1, 0, 0, 0, 1, 0], [1, 0, 0, 1, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 1, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 1, 0, 0], [0, 0

In [181]:
#random.Random(4).shuffle(mov_pro3)
#random.Random(4).shuffle(mov_pro3_no)
train_data = mov_pro3[:20000]+mov_pro3_no[:20000]
valid_data = mov_pro3[20000:24000]+mov_pro3_no[20000:24000]
test_data = mov_pro3[24000:28000]+mov_pro3_no[24000:28000]
random.Random(4).shuffle(train_data)

In [182]:
bert = BertModel().to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:22<00:00, 124.02it/s]


 Average training loss: 0.64


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1007.39it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5862926875000001  acc:  0.55875 f1:  0.5333157059756742


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1029.73it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.61723784375  acc:  0.5855 f1:  0.5712438582880786


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1030.76it/s]


 Average valid loss: 0.69
AUC_SCORE:  0.61723784375  acc:  0.5855 f1:  0.5712438582880786
BEST MODEL
AUC_SCORE:  0.61723784375  acc:  0.5855 f1:  0.5712438582880786
{'auc_micro': 0.61723784375, 'acc': 0.5855, 'f1': 0.5712438582880786, 'pred': [[1, 1, 1, 0, 1, 1, 1, 0, 0, 0], [0, 0, 1, 1, 1, 1, 0, 1, 1, 0], [0, 1, 1, 1, 0, 1, 1, 1, 1, 1], [1, 0, 1, 0, 0, 0, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 1, 0], [1, 1, 1, 1, 0, 1, 1, 1, 0, 0], [1, 0, 0, 0, 1, 1, 0, 1, 1, 1], [1, 1, 1, 0, 1, 0, 1, 1, 1, 0], [0, 0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 0, 1, 1, 1], [0, 1, 1, 0, 0, 1, 1, 1, 1, 1], [1, 0, 0, 1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 1, 1, 0, 1, 1, 1, 1], [1, 1, 1, 1, 0, 1, 1, 0, 1, 1], [1, 0, 1, 0, 1, 0, 0, 1, 0, 1], [0, 1, 0, 0, 1, 1, 0, 0, 0, 1], [0, 1, 0, 1, 1, 0, 1, 0, 0, 0], [1, 1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 0, 1, 1, 1, 1, 1, 1, 0], [1, 0, 1, 1, 1, 1, 1, 0, 1, 1], [0, 1, 1, 0, 1, 1, 1, 0, 0, 1], [1, 1, 1, 1, 0, 0, 1, 1, 0, 1], [0, 1, 0, 1, 0, 1, 1

# 연결 유 + pos

In [183]:
mov_pro1 = []
mov_pro2 = []
mov_pro3 = []
mov_pro1_no = []
mov_pro2_no = []
mov_pro3_no = []
path1 = 'word2vec_model_extra+pos/'

filenames = os.listdir(path1)
train_data = []
valid_data = []
test_data = []

for filename in filenames:
    movie_name = filename[:-14]
    
    w2v=Word2Vec.load(path1 + filename)
    base = 'scene_'
    scene_num = 0
    passs = 0
    max_scene = 0
    for node in w2v.wv.index_to_key:
        if base in node:
            scene_num += 1
            if int(node[6:])>max_scene:
                max_scene = int(node[6:])
    embs = torch.zeros(1,scene_num,400)
    k = 0
    for i in range(1,max_scene+1):
        emb_name = base + str(i)
        if emb_name in w2v.wv.index_to_key:
            emb = w2v.wv[emb_name]
            emb = torch.from_numpy(emb)
            embs[0][k] = emb
            k += 1

    mov_mean = torch.mean(embs, dim=1)
    
    
    
    if movie_name not in set(char_123.keys()):
        continue        
        
    for scene in char_123[movie_name]['mov_pro1']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro1.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro1_no']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro1_no.append([embs,[0]])    

    for scene in char_123[movie_name]['mov_pro2']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro2_no']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro2_no.append([embs,[0]])  

    for scene in char_123[movie_name]['mov_pro3']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3.append([embs,[1]])
            
    for scene in char_123[movie_name]['mov_pro3_no']:
        passs = 0
        embs = torch.zeros(1,3,400)
        embs[0][-1] = mov_mean
        
        emb_name = base + str(scene)
        if emb_name in w2v.wv.index_to_key:
            embs[0][0] = torch.from_numpy(w2v.wv[emb_name])
        else:
            passs = 1
            continue
        if not passs:        
            mov_pro3_no.append([embs,[0]])  


In [184]:
#random.Random(4).shuffle(mov_pro1)
#random.Random(4).shuffle(mov_pro1_no)
train_data = mov_pro1[:20000]+mov_pro1_no[:20000]
valid_data = mov_pro1[20000:24000]+mov_pro1_no[20000:24000]
test_data = mov_pro1[24000:28000]+mov_pro1_no[24000:28000]
random.Random(4).shuffle(train_data)

In [185]:
bert = BertModel(d_model=400).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:20<00:00, 124.94it/s]


 Average training loss: 0.64


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1013.20it/s]


 Average valid loss: 0.68
AUC_SCORE:  0.6373437500000001  acc:  0.598625 f1:  0.5504689906201875


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1035.10it/s]


 Average valid loss: 0.64
AUC_SCORE:  0.6993004374999999  acc:  0.652375 f1:  0.6124041811846689


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1022.21it/s]

 Average valid loss: 0.64
AUC_SCORE:  0.6993004374999999  acc:  0.652375 f1:  0.6124041811846689
BEST MODEL
AUC_SCORE:  0.6993004374999999  acc:  0.652375 f1:  0.6124041811846689
{'auc_micro': 0.6993004374999999, 'acc': 0.652375, 'f1': 0.6124041811846689, 'pred': [[0, 1, 0, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 1, 1, 1], [1, 1, 0, 1, 1, 1, 1, 1, 0, 1], [0, 1, 1, 0, 0, 1, 1, 1, 1, 1], [0, 1, 1, 0, 1, 0, 1, 1, 1, 0], [1, 0, 1, 1, 1, 1, 1, 0, 1, 0], [0, 1, 1, 1, 1, 0, 1, 1, 1, 1], [1, 1, 1, 0, 0, 1, 1, 1, 1, 1], [1, 1, 0, 1, 1, 1, 1, 0, 1, 1], [1, 1, 0, 0, 1, 0, 0, 1, 0, 1], [1, 0, 1, 0, 0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0, 1, 1, 1], [0, 0, 1, 0, 0, 0, 1, 0, 1, 1], [0, 0, 0, 1, 0, 1, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0, 0, 0, 1], [0, 1, 0, 0, 1, 0, 0, 0, 1, 1], [1, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 1, 1, 1, 0], [1, 0, 0, 0, 0, 0, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 1, 0, 0, 1, 0],




In [186]:
#random.Random(4).shuffle(mov_pro2)
#random.Random(4).shuffle(mov_pro2_no)
train_data = mov_pro2[:20000]+mov_pro2_no[:20000]
valid_data = mov_pro2[20000:24000]+mov_pro2_no[20000:24000]
test_data = mov_pro2[24000:28000]+mov_pro2_no[24000:28000]
random.Random(4).shuffle(train_data)

In [187]:
bert = BertModel(d_model=400).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:22<00:00, 124.01it/s]


 Average training loss: 0.65


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1013.25it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.5901612500000001  acc:  0.57025 f1:  0.5846822904083111


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1033.62it/s]


 Average valid loss: 0.64
AUC_SCORE:  0.689416375  acc:  0.639375 f1:  0.6335577289470341


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1021.81it/s]


 Average valid loss: 0.64
AUC_SCORE:  0.689416375  acc:  0.639375 f1:  0.6335577289470341
BEST MODEL
AUC_SCORE:  0.689416375  acc:  0.639375 f1:  0.6335577289470341
{'auc_micro': 0.689416375, 'acc': 0.639375, 'f1': 0.6335577289470341, 'pred': [[1, 0, 1, 1, 0, 1, 1, 1, 0, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 0, 1, 1, 1, 0, 0, 1, 1], [1, 1, 0, 1, 0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 1, 1], [1, 0, 0, 1, 1, 1, 1, 1, 1, 1], [1, 0, 0, 1, 1, 1, 1, 0, 1, 1], [0, 0, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [0, 0, 1, 0, 0, 1, 0, 0, 1, 0], [1, 1, 1, 0, 1, 1, 0, 1, 1, 0], [1, 0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 1, 1, 0, 1], [0, 1, 0, 0, 0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 1, 1, 0, 1], [0, 1, 1, 0, 0, 1, 0, 1, 1, 1], [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], [1, 0, 0, 1, 0, 0, 1, 1, 1, 0], [1, 1, 0, 1, 1, 1, 1, 0, 1, 1], [1, 1, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 1, 0, 0, 0, 0, 0, 1, 1], [0, 0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 1, 0, 1], [0, 0, 0, 1, 0, 0, 0

In [188]:
#random.Random(4).shuffle(mov_pro3)
#random.Random(4).shuffle(mov_pro3_no)
train_data = mov_pro3[:20000]+mov_pro3_no[:20000]
valid_data = mov_pro3[20000:24000]+mov_pro3_no[20000:24000]
test_data = mov_pro3[24000:28000]+mov_pro3_no[24000:28000]
random.Random(4).shuffle(train_data)

In [189]:
bert = BertModel(d_model=400).to('cuda')
score = fit(bert, train_data,valid_data,test_data,EPOCHS=1)
print(score)

EPOCHS:1
TRAIN


100%|████████████████████████████████████| 40000/40000 [05:21<00:00, 124.40it/s]


 Average training loss: 0.65


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1011.99it/s]


 Average valid loss: 0.71
AUC_SCORE:  0.57878259375  acc:  0.554 f1:  0.547093170855547


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1033.58it/s]


 Average valid loss: 0.70
AUC_SCORE:  0.60536115625  acc:  0.574625 f1:  0.5746781652293463


100%|█████████████████████████████████████| 8000/8000 [00:07<00:00, 1024.24it/s]

 Average valid loss: 0.70
AUC_SCORE:  0.60536115625  acc:  0.574625 f1:  0.5746781652293463
BEST MODEL
AUC_SCORE:  0.60536115625  acc:  0.574625 f1:  0.5746781652293463
{'auc_micro': 0.60536115625, 'acc': 0.574625, 'f1': 0.5746781652293463, 'pred': [[1, 0, 1, 0, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 1, 1, 1], [1, 0, 1, 1, 1, 1, 1, 1, 0, 1], [0, 1, 1, 1, 0, 1, 1, 1, 1, 1], [1, 0, 1, 0, 1, 1, 0, 1, 1, 0], [1, 1, 1, 1, 1, 0, 1, 1, 0, 1], [1, 0, 1, 1, 1, 0, 1, 0, 1, 1], [0, 1, 1, 1, 1, 0, 1, 0, 1, 0], [0, 1, 0, 0, 1, 0, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 0, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 1, 1, 1, 1, 0, 0, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 0, 1, 0, 1, 1, 0, 1, 0, 1], [1, 0, 0, 0, 1, 1, 0, 0, 0, 1], [1, 1, 1, 1, 1, 1, 0, 1, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0, 1], [0, 0, 0, 1, 1, 1, 1, 1, 1, 1], [0, 1, 0, 1, 1, 1, 1, 1, 1, 0], [1, 0, 1, 1, 1, 1, 1, 0, 1, 1], [1, 0, 1, 0, 1, 1, 1, 0, 0, 1], [0, 1, 0, 0, 0, 0, 0, 0, 0, 1], [1, 1, 0, 1, 1




In [None]:
.0
3..0