In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline

import csv
import pandas as pd
pd.options.display.max_rows = 100

import pickle
import math
import numpy as np

from collections import Counter
from tqdm import tqdm

#from konlpy.tag import Kkma
#from konlpy.utils import pprint
#kkma = Kkma()

In [2]:
# Pretrained Korean BERT checkpoint: https://huggingface.co/kykim/bert-kor-base
# Tip: evaluating `model.eval()` in a cell displays the model architecture.
# Loading the checkpoint as a masked-LM leaves its `cls.seq_relationship.*`
# weights unused, so the "Some weights ... were not used" warning is expected.

tokenizer = AutoTokenizer.from_pretrained("kykim/bert-kor-base")
model = AutoModelForMaskedLM.from_pretrained("kykim/bert-kor-base")

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Fill-mask pipeline built from the model and tokenizer loaded above.
# top_k is the number of candidates returned per [MASK]; it can be adjusted
# from 1 (only the highest-probability token) up to tokenizer.vocab_size.
# NOTE: main() records a single candidate per row and writes its results to
# "<name>_topk1_output.csv", so it is written with top_k=1 in mind.
unmask = pipeline(task="fill-mask", model=model, tokenizer=tokenizer, top_k=1)

In [15]:
def getentropy(q):
    """Return the binary (Bernoulli) entropy of probability ``q`` in bits.

    Computes ``H(q) = -(q*log2(q) + (1-q)*log2(1-q))``. At the boundaries
    ``q == 0`` and ``q == 1`` the result is 0 (the limit of ``p*log2(p)`` as
    ``p -> 0``) rather than ``nan``.

    Parameters
    ----------
    q : float or array-like of float
        Probability (or probabilities) of one of the two outcomes.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Entropy in bits, computed elementwise for array input. Values of
        ``q`` outside ``[0, 1]`` still produce ``nan``.
    """
    q = np.asarray(q, dtype=float)

    def plogp(p):
        # Swap exact zeros for 1 before taking the log so that
        # 0 * log2(0) evaluates to 0 instead of 0 * -inf = nan.
        return p * np.log2(np.where(p == 0, 1.0, p))

    # ``0.0 - x`` rather than ``-x`` so the boundary result is 0.0, not -0.0.
    return 0.0 - (plogp(q) + plogp(1 - q))

In [25]:
def main(filename):
    """Run the BERT fill-mask pipeline over every experiment item.

    Reads ``{filename}.csv``, passes each sentence in its ``Item`` column to
    the module-level ``unmask`` pipeline, and records the top-ranked candidate
    for the masked position. Four columns are added to the frame:

    * ``text``      - predicted token string
    * ``pos``       - part-of-speech entry for that token looked up in
      ``vocab_pos_df.pickle`` (``None`` when the token is not listed there)
    * ``prob``      - pipeline score of the prediction
    * ``surprisal`` - ``-log2(prob)`` in bits

    The augmented frame is written to ``{filename}_topk1_output.csv``.

    Parameters
    ----------
    filename : str
        Path of the input CSV without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The input rows plus the four prediction columns.

    Notes
    -----
    Only the first (highest-ranked) candidate is used, so the result matches
    the ``topk1`` file name even if the pipeline is built with ``top_k > 1``.
    Assumes each item contains exactly one ``[MASK]`` - TODO confirm for new
    item files.
    """
    data_path = "{FILE}.csv".format(FILE=filename)
    # NOTE(review): unpickling can execute arbitrary code; only load a
    # vocab_pos_df.pickle that comes from a trusted source.
    vocab_dict = pd.read_pickle('vocab_pos_df.pickle').set_index('token').to_dict()['pos']
    df_output = pd.read_csv(data_path)

    text = []
    pos = []
    prob = []
    surprisal = []

    for sentence in tqdm(df_output['Item'], total=len(df_output)):
        responses = unmask(sentence)

        # Take the best candidate explicitly instead of relying on a leftover
        # loop index.
        best = responses[0]
        token = best['token_str']
        score = best['score']

        text.append(token)
        # Direct dictionary lookup; tokens missing from the vocabulary table
        # get None rather than raising KeyError.
        pos.append(vocab_dict.get(token))
        prob.append(score)
        surprisal.append(-math.log2(score))

    df_output['text'] = text
    df_output['pos'] = pos
    df_output['prob'] = prob
    df_output['surprisal'] = surprisal

    df_output.to_csv("{TASK}_topk1_output.csv".format(TASK=filename), index=False)

    return df_output

In [26]:
# Run the experiment on 3cond_bert.csv; main() also writes 3cond_bert_topk1_output.csv.
output = main("3cond_bert")

100%|█████████████████████████████████████████| 360/360 [00:46<00:00,  7.80it/s]


In [27]:
# Sanity check: main() only adds columns, so this is the number of rows in the input CSV.
len(output)

360

In [28]:
# Display the result frame; pandas truncates the view because it exceeds
# pd.options.display.max_rows (set to 100 in the import cell).
output

Unnamed: 0,Idx,Lexicalization,Condition,Condition name,Item,text,pos,prob,surprisal
0,0,1,1,nc-nom,지은이 사실을 [MASK].,밝혔다,"[(밝히, VV), (었, EPT), (다, EFN)]",0.103494,3.272378
1,1,2,1,nc-nom,호준이 용돈을 [MASK].,.,"[(., SF)]",0.179533,2.477681
2,2,3,1,nc-nom,은지가 불을 [MASK].,끈다,"[(끄, VV), (ㄴ다, EFN)]",0.117682,3.087038
3,3,4,1,nc-nom,용준이 마중을 [MASK].,나왔어요,"[(나오, VV), (았, EPT), (어요, EFN)]",0.165672,2.593596
4,4,5,1,nc-nom,은영이 저녁을 [MASK].,먹는다,"[(먹, VV), (는, EPT), (다, EFN)]",0.252464,1.985851
...,...,...,...,...,...,...,...,...,...
355,355,56,6,nom-to,경희가 잘못을 일삼았다. 규호도 잘못을 [MASK].,했다,"[(하, VV), (었, EPT), (다, EFN)]",0.665108,0.588340
356,356,57,6,nom-to,호영이 싸움을 일으켰다. 은주도 싸움을 [MASK].,일어났다,"[(일어나, VV), (었, EPT), (다, EFN)]",0.554497,0.850748
357,357,58,6,nom-to,미진이 지갑을 잃었다. 윤호도 지갑을 [MASK].,찾았다,"[(찾, VV), (았, EPT), (다, EFN)]",0.519524,0.944737
358,358,59,6,nom-to,두호가 약속을 잊었다. 하원도 약속을 [MASK].,했다,"[(하, VV), (었, EPT), (다, EFN)]",0.505473,0.984295
