In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from random import sample

In [72]:
# pt1 = "beomi/KoAlpaca-Polyglot-5.8B"
# pt2 = "EleutherAI/polyglot-ko-5.8b"

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/polyglot-ko-5.8b")
model = AutoModelForCausalLM.from_pretrained("beomi/KoAlpaca-Polyglot", torch_dtype=torch.float16).cuda()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [73]:
model.config.max_length = 2048
model.config.pad_token_id = 0

In [74]:
DATA_IN_PATH = './data_in/KOR'
DATA_OUT_PATH = './data_out/KOR'
DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, 'naver_movie', 'ratings_train.txt')
DATA_TEST_PATH = os.path.join(DATA_IN_PATH, 'naver_movie', 'ratings_test.txt')

train_data = pd.read_csv(DATA_TRAIN_PATH, header=0, delimiter='\t')
train_data = train_data.dropna()

In [75]:
sent_lens = [len(tokenizer(s).input_ids) for s in tqdm(train_data['document'])]

print('Few shot 케이스 토큰 평균 길이: ', np.mean(sent_lens))
print('Few shot 케이스 토큰 최대 길이: ', np.max(sent_lens))
print('Few shot 케이스 토큰 길이 표준편차: ',np.std(sent_lens))
print('Few shot 케이스 토큰 길이 80 퍼센타일: ',np.percentile(sent_lens, 80))

  0%|          | 0/149995 [00:00<?, ?it/s]

Few shot 케이스 토큰 평균 길이:  20.19508650288343
Few shot 케이스 토큰 최대 길이:  280
Few shot 케이스 토큰 길이 표준편차:  16.431122357214313
Few shot 케이스 토큰 길이 80 퍼센타일:  27.0


In [76]:
train_fewshot_data = list()
for train_sent, train_label in tqdm(train_data[['document', 'label']].values):
    tokens = tokenizer(train_sent).input_ids
    if len(tokens) <= 25:
        train_fewshot_data.append((train_sent, train_label))

  0%|          | 0/149995 [00:00<?, ?it/s]

In [77]:
print(len(train_fewshot_data))
print(train_fewshot_data[:3])

115629
[('아 더빙.. 진짜 짜증나네요 목소리', 0), ('흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 1), ('너무재밓었다그래서보는것을추천한다', 0)]


In [78]:
# Sampled Dataset
sample_size = 500

train_fewshot_samples = []

for _ in range(sample_size):
    fewshot_examples = sample(train_fewshot_data, 10)
    train_fewshot_samples.append(fewshot_examples)

In [79]:
test_data = pd.read_csv(DATA_TEST_PATH, header=0, delimiter='\t', quoting=3)
test_data = test_data.dropna()
test_data.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [80]:
if sample_size < len(test_data['id']):
    test_data = test_data.sample(sample_size, random_state=0)

In [81]:
import re

def build_prompt_text(sent):
    return "문장: " + sent + "\n감정: "

def clean_text(sent):
    sent_clean = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", sent)
    return sent_clean

In [82]:
real_labels = list()
pred_tokens = list()

total_len = len(test_data[['document', 'label']].values)

In [None]:
for i, (test_sent, test_label) in tqdm(enumerate(test_data[['document', 'label']].values), total=total_len):
    prompt_text = ''
    for ex in train_fewshot_samples[i]:
        example_text, example_label = ex
        cleaned_example_text = clean_text(example_text)
        appended_prompt_example_text = build_prompt_text(cleaned_example_text)
        appended_prompt_example_text += '긍정\n' if example_label == 1 else '부정\n'
        prompt_text += appended_prompt_example_text
        
    cleaned_sent = clean_text(test_sent)
    appended_prompt_sent = build_prompt_text(cleaned_sent)
    prompt_text += appended_prompt_sent
    
    tokens = tokenizer(prompt_text, return_tensors='pt')
    token_ids, attn_mask = tokens.input_ids.cuda(), tokens.attention_mask.cuda()
    gen_tokens = model.generate(input_ids=token_ids, attention_mask=attn_mask, max_new_tokens=1, pad_token_id=0)
    # print(tokenizer.batch_decode(gen_tokens))
    pred = tokenizer.batch_decode(gen_tokens[:, -1])[0].strip()
    
    pred_tokens.append(pred)
    real_labels.append('긍정' if test_label==1 else "부정")

  0%|          | 0/500 [00:00<?, ?it/s]

In [84]:
print(pred_tokens)
print(real_labels)

['', '', 'ing', '', '....', '', '긍정', '', '', '긍정', '不', '', '...', '이런', '좋', '', '', '긍정', '....', '', '긍정', '', '', '....', '미스터', '....', '', '', '', '', '....', '', '...', '', '긍정', '', '', '', '', '', '긍정', 'low', '....', '', '', '', '', '...', '', '', '', '', '', '....', '긍정', '....', '긍정', '별로', '', '...', '', '', 'ab', '', '긍정', '긍정', '', '긍정', '보', 'ok', '....', '', '...', '', '...', '', '보', '저', '', '1', '', '', '', '....', '....', '', '', 'ing', '....', '보', '', '', 'o', 'z', '', '', '', '', 'ab', '긍정', '', '....', '<|endoftext|>', '....', '좋', '', '아', '긍정', '', '존', '캐스팅', '<|endoftext|>', '음', '긍정', '', '긍정', '', '', '', '....', '', '<|endoftext|>', '긍정', '', '부정', '긍정', '', '....', '...', '', 'ing', '', '긍정', '....', '....', '', '좋', '보', '', 'o', '', '', '', '', 'z', '', '긍정', '', '좋', '', '^', '', '보', '...', '긍정', '', '좋', 'ap', 'ant', '', '....', '', '', '긍정', '', '', 'ol', 'ab', '긍정', 'z', '', '긍정', '....', '긍정', '', '', '음', '긍정', '이', '<|endoftext|>', '', '긍정', 

In [86]:
accuracy_match = [p == t for p, t in zip(pred_tokens, real_labels)]
accuracy = len([m for m in accuracy_match if m]) / len(real_labels)

print(accuracy)

0.122
