<a href="https://colab.research.google.com/github/kmalicekim/NLP_practice_mine/blob/master/14.%20BERT%EC%9D%98%20%EC%9D%B4%ED%95%B4%EC%99%80%20%EA%B0%84%EB%8B%A8%ED%95%9C%20%ED%99%9C%EC%9A%A9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from transformers import pipeline

clf = pipeline('sentiment-analysis')
result = clf('what a beautiful day!')[0]
print('감성분석 결과: %s, 감성스코어: %.4f'%(result['label'], result['score']))

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


감성분석 결과: POSITIVE, 감성스코어: 0.9999


In [4]:
clf = pipeline('sentiment-analysis')
result = clf('is it right?')
print(result)
# print('감성분석 결과: %s, 감성스코어: %.4f'%(result['label'], result['score']))

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9992733597755432}]


In [5]:
# 문서 생성 예시

# 사용은 쉬우나, 이런 식의 사용은 제약이 많음. 
# e.g. BERT는 기본적으로 512개의 토큰을 사용하나, 문장을 토큰화한 결과가 이 숫자를 넘게 되면 바로 에러 발생
# 그렇게 되면 더 많은 지식과 다른 사용법이 필요함

text_generator = pipeline('text-generation')
result = text_generator('Alice was beginning to get very tired of sitting by her sister on the bank,')
print(result)
print(result[0]['generated_text'])


No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Alice was beginning to get very tired of sitting by her sister on the bank, and she knew she was not being paid for her work in the country, or at least not on her own account. There didn't seem to be anyone who knew how"}]
Alice was beginning to get very tired of sitting by her sister on the bank, and she knew she was not being paid for her work in the country, or at least not on her own account. There didn't seem to be anyone who knew how


## 14.5 자동 클래스를 이용한 토크나이저와 모형의 사용

In [6]:
# mrpc (The Microsoft Research Paraphrase Corpus): 의미적으로 유사한 문장의 페어와 그렇지 않은 문장의 페어로 구성하여 두 문장의 의미적 유사성을 학습할 수 있도록 만든 데이터셋 
#'bert-base-cased-finetuned-mrpc : 사전학습된 모형으로 다시 MRPC 데이터셋에 대해 미세조정학습을 진행함

from transformers import AutoTokenizer, AutoModelForSequenceClassification 
import torch

# Auto Classes를 이용해 사전학습된 내용에 맞는 토크나이저와 모형을 자동으로 설정
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased-finetuned-mrpc') 
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased-finetuned-mrpc') 

# 의미적으로 유사한 두 문장을 선언
input_sentence = "She angered me with her inappropriate comments, rumor-spreading, and discrespectfulness at the formal dinner table"
target_sequence = 'She made me angry when she was rude at dinner'


# 토큰화
tokens = tokenizer(input_sentence, target_sequence, return_tensors='pt') 

# 모형으로 결과 예측 
# logits : the raw(non-normalized) scores for classification model before Softmax
logits = model(**tokens).logits 

# 소프트맥스를 이용하여 결과값을 클래스에 대한 확률로 변환 
results = torch.softmax(logits, dim=1).tolist()[0] 

for i, label in enumerate(['no', 'yes']):
  print(f"{label}: {int(round(results[i]*100))}%")

no: 26%
yes: 74%


In [7]:
# 전혀 관련 없는 문장을 target_sequence로 설정하여 실행

target_sequence = 'The boy quickly ran across the finish line, seizing yet another victory'
tokens = tokenizer(input_sentence, target_sequence, return_tensors='pt') 
logits = model(**tokens).logits 
results = torch.softmax(logits, dim=1).tolist()[0] 

for i, label in enumerate(['no', 'yes']):
  print(f"{label}:{int(round(results[i]*100))}%")

no:95%
yes:5%


In [None]:
# 감성분석에 BERT 이용하기

import nltk 
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split 
import numpy as np

nltk.download('movie_reviews') 

# movie review data에서 file id를 가져옴 
fileids = movie_reviews.fileids() 

# file id를 이용하여 raw text file 가져옴
reviews = [movie_reviews.raw(fileids) for fileid in fileids] 
categories = [movie_reviews.categories(fileid)[0] for fileid in fileids] 

# label을 0,1의 값으로 변환
label_dict = {'pos':1, 'neg':0} 
y = np.array([label_dict[c] for c in categories]) 

X_train, X_test, y_train, y_test = train_test_split(reviews, y, test_size=0.2, random_state=7) 
print('Train set count:', len(X_train)) 
print('Test set count:', len(X_test))


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification 
import torch
import torch.nn.functional as F  

# cuda를 이용한 GPU연산이 가능하다면 cuda를 사용하고, 아니면 cpu를 사용 
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 

# Auto Classes를 이용하여 사전학습된 내용에 맞는 토크나이저와 모형을 자동으로 설정 
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english') 
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english') 

# 모델을 gpu로 옮겨서 연산 준비
model = model.to(device) 

# 모형으로 한번에 예측할 데이터의 수
batch_size = 10

# 전체 예측결과를 저장 
y_pred = [] 

num_batch = len(y_test)//batch_size 

for i in range(num_batch):
  inputs = tokenizer(X_test[i*batch_size:(i+1)*batch_size], truncation=True, padding=True, return_tensors='pt') 

  # 토큰화 결과를 GPU로 이동
  inputs = inputs.to(device) 

  # 모형으로 결과를 예측
  logits = model(**inputs).logits 

  # 결과값을 클래스에 대한 확률로 변환 
  pred = F.softmax(logits, dim=-1) 

  # 예측결과를 CPU로 가져와서 넘파이로 변환한 후, argmax로 확률이 가장 큰 클래스를 선택함 
  results = pred.cpu().detach().numpy().argmax(axis=1) 

  # 전체 예측결과에 추가 
  y_pred.extend(results.tolist()) 


# gpu 메모리를 비움
torch.cuda.empty_cache() 

score = sum(y_test == np.array(y_pred)/len(y_test)) 
print('NLTK 영화리뷰 감성분석 정확도:', score)