# WWM
### Hugging Face의 Transformers 라이브러리와 WWM(Whole Word Masking) 모델을 활용한 질의응답

## English

In [None]:
# Install the Hugging Face Transformers library (-q keeps pip's output quiet).
# The leading "!" runs the line as a shell command from the notebook.
!pip install -q transformers

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [None]:
# Load the tokenizer and question-answering model for the checkpoint below
# (per its name: BERT-large, uncased, whole-word masking, fine-tuned on SQuAD).
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Define the question and the context passage.
# The answer is later cut out of the context's own token ids (a start/end
# slice of the input), so the passage has to contain the answer; the original
# passage never mentioned the capital.
question = "What is the capital of France?"
context = (
    "France, officially the French Republic, is a country primarily located "
    "in Western Europe. Its capital is Paris."
)

In [None]:
# Tokenize the question/context pair into PyTorch tensors.
# The tokenizer is called directly: `encode_plus` is marked deprecated in the
# Transformers documentation in favour of `__call__`, which takes the same
# arguments.
inputs = tokenizer(question, context, add_special_tokens=True, return_tensors="pt")

In [None]:
# Predict: run the model once and read both logit tensors from that single
# output (the previous form executed the full forward pass twice).
# Gradients are not needed for inference, so tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)
start_scores, end_scores = outputs.start_logits, outputs.end_logits

In [None]:
# Turn the logits into an answer string.
# start_scores / end_scores come from the prediction cell; running the model
# again here (twice) only repeated the same computation.
start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores)
# end_index is the last answer token itself, hence the +1 on the slice bound.
answer_ids = inputs["input_ids"][0][start_index:end_index + 1]
span_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
answer = tokenizer.convert_tokens_to_string(span_tokens)

In [None]:
# Show the question and the extracted answer, one labelled line each.
for label, value in (("Question:", question), ("Answer:", answer)):
    print(label, value)

## 한글 QnA 기초

In [None]:
# 라이브러리 설치 및 로드
!pip install -q transformers

from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [None]:
# Load the model and tokenizer for the Korean example.
# NOTE(review): unlike the English checkpoint, this checkpoint name indicates
# neither whole-word masking nor question-answering fine-tuning. If it is a
# plain pretrained model, the QA head is presumably freshly initialized and
# the predicted span will not be meaningful — confirm, and consider a
# checkpoint fine-tuned for (Korean) QA.
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
# Passage to search for the answer, followed by the question asked about it.
context = "대한민국의 수도는 서울입니다."
question = "한국의 수도는 어디인가요?"

In [None]:
# Tokenize the question/context pair into PyTorch tensors.
# The tokenizer is called directly: `encode_plus` is marked deprecated in the
# Transformers documentation in favour of `__call__`, which takes the same
# arguments.
inputs = tokenizer(question, context, add_special_tokens=True, return_tensors="pt")

In [None]:
# Predict: run the model once and read both logit tensors from that single
# output (the previous form executed the full forward pass twice).
# Gradients are not needed for inference, so tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)
start_scores, end_scores = outputs.start_logits, outputs.end_logits

In [None]:
# Locate the predicted answer span: the arg-max of each logit tensor.
# end_index is moved one past the best end token so it can be used directly
# as an exclusive slice bound.
start_index = torch.argmax(start_scores)
best_end = torch.argmax(end_scores)
end_index = best_end + 1

In [None]:
# Convert the selected token ids back into text.
# Take the first (only) row of input ids, slice out the predicted span, and
# let the tokenizer decode it.
sequence_ids = inputs["input_ids"][0]
answer_tokens = sequence_ids[start_index:end_index]
answer = tokenizer.decode(answer_tokens)

In [None]:
# Show the question and the extracted answer, one labelled line each.
for label, value in (("질문:", question), ("답변:", answer)):
    print(label, value)