<a href="https://colab.research.google.com/github/jeongminia/NLP_paper_study/blob/main/code/RoBERTa_1006.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RoBERTa 모델
RoBERTa를 사용하여 문장을 인코딩하고, 그로부터 특징(features)을 추출한 다음 분류 작업을 수행합니다.


https://pytorch.kr/hub/pytorch_fairseq_roberta/

In [1]:
# Install regex, requests, hydra-core and omegaconf with pip (no versions are pinned here).
!pip install regex requests hydra-core omegaconf



In [3]:
# Install bitarray with pip (version not pinned; the recorded run resolved bitarray-2.9.2).
!pip install bitarray

Collecting bitarray
  Downloading bitarray-2.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading bitarray-2.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (288 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/288.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m286.7/288.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.3/288.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitarray
Successfully installed bitarray-2.9.2


In [9]:
!pip uninstall fairseq -y
!pip install fairseq

[0mCollecting fairseq
  Using cached fairseq-0.12.2.tar.gz (9.6 MB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core<1.1,>=1.0.7 (from fairseq)
  Using cached hydra_core-1.0.7-py3-none-any.whl.metadata (3.7 kB)
Collecting omegaconf<2.1 (from fairseq)
  Using cached omegaconf-2.0.6-py3-none-any.whl.metadata (3.0 kB)
Requested omegaconf<2.1 from https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl (from fairseq) has invalid metadata: .* suffix can only be used with `==` or `!=` operators
    PyYAML (>=5.1.*)
            ~~~~~~^
Please use pip<24.1 if you need to use this version.[0m[33m
[0m  Using cached omegaconf-2.0.5-py3-none-any.whl.metadata (3.0 kB)
Requested omegaconf<2.1 from https://files.python

In [None]:
import torch

# Fetch the pretrained 'roberta.base' entry point of the pytorch/fairseq
# GitHub repository through torch.hub.
roberta = torch.hub.load(
    'pytorch/fairseq',
    'roberta.base',
    source='github',
)

# Switch to evaluation mode so that dropout is disabled.
roberta.eval()

### 입력 텍스트에 Byte-Pair Encoding (BPE) 적용하기

In [None]:
# BPE-encode a sample sentence and compare the ids with the expected values.
sample_text = 'Hello world!'
expected_ids = [0, 31414, 232, 328, 2]

tokens = roberta.encode(sample_text)
assert tokens.tolist() == expected_ids

# Decoding the ids must give back the original string.
assert roberta.decode(tokens) == sample_text

### RoBERTa에서 특징(feature) 추출

In [None]:
# The checkpoint loaded earlier in this notebook is 'roberta.base': 12 transformer
# layers with hidden size 768. The 1024 / 25 figures used previously only hold
# for 'roberta.large' and made both assertions fail with the base model.
HIDDEN_SIZE = 768
NUM_LAYERS = 12

# Extract the features of the last layer; `tokens` holds the 5 ids of 'Hello world!'.
last_layer_features = roberta.extract_features(tokens)
assert last_layer_features.size() == torch.Size([1, 5, HIDDEN_SIZE])

# Extract the features of all layers: the embedding output plus one entry per layer.
all_layers = roberta.extract_features(tokens, return_all_hiddens=True)
assert len(all_layers) == NUM_LAYERS + 1
# The final entry must be identical to the last-layer features extracted above.
assert torch.all(all_layers[-1] == last_layer_features)

### 문장 관계 분류(sentence-pair classification) 태스크에 RoBERTa 사용하기

In [None]:
# Download RoBERTa fine-tuned on MNLI.
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
roberta.eval()  # disable dropout for evaluation

premise = 'Roberta is a heavily optimized version of BERT.'

# (second sentence, expected class index) pairs, checked in this order.
cases = (
    ('Roberta is not very optimized.', 0),  # contradiction
    ('Roberta is based on BERT.', 2),       # entailment
)

with torch.no_grad():
    for hypothesis, expected_label in cases:
        # Encode the sentence pair and take the highest-scoring class.
        tokens = roberta.encode(premise, hypothesis)
        prediction = roberta.predict('mnli', tokens).argmax().item()
        assert prediction == expected_label

 roberta.large.mnli 모델을 로드하여 MNLI 작업을 수행
- 두 개의 문장을 입력으로 제공하여, 두 문장의 논리적 관계(모순, 중립, 함축)를 예측
- predict('mnli', tokens): MNLI 작업에 대한 예측 결과를 반환하며, argmax()로 가장 높은 확률을 가진 클래스를 반환
- prediction == 0: 예측이 **모순**(contradiction)을 나타냄을 확인

### 새로운 분류층 적용하기

In [None]:
# Register a 3-class classification head under a new task name, then run it
# on the most recently encoded `tokens`.
head_name = 'new_task'
roberta.register_classification_head(head_name, num_classes=3)

# Example result: tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=<LogSoftmaxBackward>)
logprobs = roberta.predict(head_name, tokens)

분류 헤드 추가
- register_classification_head('new_task', num_classes=3): 새로운 작업을 위한 분류 헤드를 등록하며, 이 작업은 3개의 클래스로 분류
- predict('new_task', tokens): 새로 등록한 작업에 대해 입력된 토큰의 분류 결과를 예측합니다.
- logprobs: 예측된 로그 확률이 반환


# Huggingface 비교
NSP 제거한 것 확인 가능

In [2]:
# Install pinned versions of transformers, datasets and huggingface_hub.
!pip install transformers==4.40.1 datasets==2.19.0 huggingface_hub==0.23.0 -qqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency r

In [12]:
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer

# A single Korean sentence pair, wrapped in a batch of size one.
sample_pair = [['배가 고프다', '밥 먹고 싶다']]

# Load both KLUE tokenizers up front.
bert_tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
roberta_tokenizer = AutoTokenizer.from_pretrained('klue/roberta-base')

# Optional check against the original English checkpoint (left disabled):
# en_roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# en_roberta_tokenizer([['I am hungry', 'I want to eat meal']])

# Only the final expression of a cell is displayed, so the BERT result is
# computed but not shown; the RoBERTa result becomes the cell output.
bert_tokenizer(sample_pair)
roberta_tokenizer(sample_pair)

{'input_ids': [[0, 1131, 2116, 22779, 2062, 2, 1127, 1059, 2088, 1335, 2062, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [13]:
sentence_pair = [['첫 번째 문장', '두 번째 문장']]

# klue/bert-base: trained with NSP (predicting whether two sentences follow each
# other), so its tokenizer emits token type ids that tell the sentences apart.
# klue/roberta-base: NSP was removed from training, so no per-sentence token
# type distinction is needed.
labelled_tokenizers = (
    ('klue/bert-base > ', bert_tokenizer),
    ('klue/roberta-base > ', roberta_tokenizer),
)

for heading, tokenizer in labelled_tokenizers:
    print(heading)
    print(tokenizer(sentence_pair))
    print()

# roberta-base: the original English model has no token type ids at all (left disabled).
# print('roberta-base > ')
# print(en_roberta_tokenizer([['first sentence', 'second sentence']]))

klue/bert-base > 
{'input_ids': [[2, 1656, 1141, 3135, 6265, 3, 864, 1141, 3135, 6265, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

klue/roberta-base > 
{'input_ids': [[0, 1656, 1141, 3135, 6265, 2, 864, 1141, 3135, 6265, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}



# 감정분류
https://www.kaggle.com/code/guslovesmath/go-emotions-hugging-face
- Text classification 모델로 `AutoModelForSequenceClassification` 클래스 적용
- Text Sequence Classification을 위해 Head가 포함된 모델을 불러옴

In [4]:
from transformers import AutoModelForSequenceClassification

# Hub identifier of the checkpoint to load.
model_id = 'SamLowe/roberta-base-go_emotions'

# Load the checkpoint through the sequence-classification auto class.
classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
)

config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [19]:
import transformers

# Inputs to classify.
sentences = [
    "I LOVE math!",
    "I am not having a great day",
    "Leave me alone, please",
]

# Build the text-classification pipeline for the GoEmotions checkpoint.
# With top_k=None the recorded output contains a score for each label.
classifier = transformers.pipeline(
    task="text-classification",
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,
)

# One list of {'label', 'score'} dicts per input sentence.
model_outputs = classifier(sentences)

tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [20]:
model_outputs

[[{'label': 'love', 'score': 0.9506372809410095},
  {'label': 'admiration', 'score': 0.04889809712767601},
  {'label': 'approval', 'score': 0.017201809212565422},
  {'label': 'joy', 'score': 0.01613575592637062},
  {'label': 'neutral', 'score': 0.008875682950019836},
  {'label': 'gratitude', 'score': 0.006547166034579277},
  {'label': 'excitement', 'score': 0.006077087018638849},
  {'label': 'optimism', 'score': 0.005017284769564867},
  {'label': 'annoyance', 'score': 0.004670716356486082},
  {'label': 'realization', 'score': 0.004402701742947102},
  {'label': 'disapproval', 'score': 0.00425735954195261},
  {'label': 'desire', 'score': 0.004224502947181463},
  {'label': 'anger', 'score': 0.003921826835721731},
  {'label': 'disappointment', 'score': 0.0036757574416697025},
  {'label': 'sadness', 'score': 0.00364437117241323},
  {'label': 'amusement', 'score': 0.003196363802999258},
  {'label': 'caring', 'score': 0.003026994178071618},
  {'label': 'confusion', 'score': 0.0028469152748584

In [21]:
# pandas is used here (and by the following cell) but was never imported in
# this notebook, which raises NameError on a fresh kernel.
import pandas as pd

# One record per (sentence, label) pair: `model_outputs[i]` is the list of
# {'label', 'score'} dicts produced for `sentences[i]`.
data = [
    {'sentence': sentence, 'label': label_dict['label'], 'score': label_dict['score']}
    for sentence, label_list in zip(sentences, model_outputs)
    for label_dict in label_list
]

# Build the frame and index it by (sentence, label) in one expression, without
# in-place mutation. Passing the columns explicitly keeps set_index from
# failing with KeyError when `data` is empty.
df = pd.DataFrame(data, columns=['sentence', 'label', 'score']).set_index(['sentence', 'label'])

In [22]:
# Order rows by sentence (ascending), then by score (descending) within each sentence.
df_sorted = df.sort_values(by=['sentence', 'score'], ascending=[True, False])

# For every sentence keep its three highest-scoring labels.
top_rows = df_sorted.groupby('sentence').head(3)
df_top3 = pd.DataFrame(top_rows)

# Render the table with a colour gradient over the scores.
styled_top3 = df_top3.style.background_gradient()
display(styled_top3)

Unnamed: 0_level_0,Unnamed: 1_level_0,score
sentence,label,Unnamed: 2_level_1
I LOVE math!,love,0.950637
I LOVE math!,admiration,0.048898
I LOVE math!,approval,0.017202
I am not having a great day,disappointment,0.466695
I am not having a great day,sadness,0.398495
I am not having a great day,annoyance,0.068066
"Leave me alone, please",neutral,0.688471
"Leave me alone, please",sadness,0.141374
"Leave me alone, please",annoyance,0.069881
