# BERT(Bidirectional Encoder Representations from Transformers)

---

# 1 영화 리뷰(IMDB) 감성 분석 - BERT by Hugging Face

- [BERT](https://wikidocs.net/115055)
- [Hugging Face](https://huggingface.co/models)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Default figure size (width, height) for the plots in this notebook.
plt.rc('figure', figsize=(10, 6))

from matplotlib import rcParams

# Font settings applied in one call. 'New Gulim' is presumably chosen so that
# Korean labels render (verify the font is installed on this machine);
# axes.unicode_minus=False makes tick labels use the ASCII hyphen for minus.
rcParams.update({
    'font.family': 'New Gulim',
    'font.size': 10,
    'axes.unicode_minus': False,
})

#### 데이터 로드 및 분할

In [None]:
#!pip install tensorflow-datasets

In [None]:
# Load the IMDB review dataset via TensorFlow Datasets.
import tensorflow_datasets as tfds

# The training split is cut 60% / 40%: 15,000 examples for training and
# 10,000 for validation. The 25,000-example test split is used as-is.
# as_supervised=True makes each element an (input, label) pair.
train_data, validation_data, test_data = tfds.load(
    'imdb_reviews',
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True,
)

#### 데이터 확인하기
- 0: 부정
- 1: 긍정

In [None]:
# Pull one batch of 10 (review, label) pairs from the training split for a quick look.
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))

In [None]:
# Labels of the sampled batch (0 = negative, 1 = positive)
train_labels_batch

In [None]:
# First review of the sampled batch
train_examples_batch[0]

#### Set GPU device name

In [None]:
import torch

# Choose the torch device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of GPU index 0.
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU:{torch.cuda.get_device_name(0)}')

#### Load model

In [None]:
#!pip install transformers

In [None]:
from transformers import pipeline

In [None]:
# Sentiment classifier built from the checkpoint
# 'distilbert-base-uncased-finetuned-sst-2-english'.
#
# The device chosen in the "Set GPU device name" cell is passed on explicitly.
# Previously `device` was computed but never handed to the pipeline, so the
# detected GPU was never requested (depending on the transformers version the
# pipeline then defaults to the CPU). An integer index is used (0 = first GPU,
# -1 = CPU) because that form is accepted across transformers releases.
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    device=0 if device.type == 'cuda' else -1,
)

##### 예제 문장 테스트

In [None]:
# Two hand-written sentences used as a smoke test for the classifier.
sent = [
    'Today is a beautiful day!',
    'we all still have terrible days',
]

# truncation=True asks the tokenizer to cut inputs that exceed the model's
# maximum length (512 per the original note).
result = classifier(sent, truncation=True)

# Each prediction is a dict with 'label' and 'score'; print it next to its sentence.
for sentence, res in zip(sent, result):
    print(f"Label: {res['label']}, Score: {res['score']:.2f}, Sentence: {sentence}")

#### 데이터 전처리

In [None]:
# Select 1,000 examples: take the first batch of CNT items yielded by train_data.
CNT = 1000
sentences, labels = next(iter(train_data.batch(CNT)))

In [None]:
# Ground-truth labels of the selected examples, converted from a tensor via
# .numpy() (so, despite the name, this is an array rather than a Python list).
label_list = labels.numpy()

In [None]:
# Review texts come out of the tensor as raw bytes objects.
sent_bytes = sentences.numpy().tolist()

# Decode every review from UTF-8 bytes into a str for the classifier.
sent_list = [raw.decode('utf-8') for raw in sent_bytes]

#### Sentiment analysis: IMDB

In [None]:
%%time
# Classify every selected review (the cell magic above times the whole cell).
# truncation=True asks the tokenizer to cut inputs that exceed the model's
# maximum length (512 per the original note; that limit counts tokens).
result = classifier(sent_list, truncation=True)

#### 결과 확인

In [None]:
# Inspect a single prediction next to its review; change idx to view another one.
idx = 0
prediction, review = result[idx], sent_list[idx]

print(f"Label: {prediction['label']}, Score: {prediction['score']:.2f}")
print(f"Sentence: {review}")

In [None]:
# Print every prediction followed by the review it belongs to.
for res, sentence in zip(result, sent_list):
    print(f"Label: {res['label']}, Score: {res['score']:.2f}")
    print(f"Sentence: {sentence}")

#### 결과 예측

In [None]:
# Turn the classifier's string labels into the dataset's integer labels
# (0 = negative, 1 = positive). Any other label maps to 2 so that an
# unexpected output shows up as its own class instead of being dropped.
LABEL_TO_ID = {'NEGATIVE': 0, 'POSITIVE': 1}
lst = [LABEL_TO_ID.get(res['label'], 2) for res in result]

pred = np.array(lst)

#### 결과 평가

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the true labels.
# zero_division=1 reports 1.0 (without a warning) for any metric whose
# denominator is zero.
report = classification_report(label_list, pred, zero_division=1)
print(report)

---

In [None]:
# End of file