In [18]:
from IPython.display import display, HTML
display(HTML("""
<style>
div.container{width:99% !important;}
div.cell.code_cell.rendered{width:100%;}
div.input_prompt{padding:0px;}
div.CodeMirror {font-family:Consolas; font-size:24pt;}
div.text_cell_render.rendered_html{font-size:20pt;}
div.text_cell_render ul li, div.text_cell_render ol li p, code{font-size:22pt; line-height:30px;}
div.output {font-size:24pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:24pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:24pt;padding:5px;}
table.dataframe{font-size:24px;}
</style>
"""))

In [36]:
import warnings
import os
import logging
# 경고 제거
warnings.filterwarnings('ignore')

# transformers 로깅 레벨 조정
logging.getLogger("transformers").setLevel(logging.ERROR)

# Hugging Face symlink 경고 제거
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# from transformers import pipeline, logging as hf_logging
# hf_logging.set_verbosity_error()

# ch1. 허깅페이스
- Inference API 이용 : 모델의 결과를 surver에서
- pipeline() 이용 : 모델을 다운로드받아 모델의 결과를 local에서
- raw text -> tokenizer -> model -> [0.11, 0.55, 0.xx, ~] logits값으로 prediction 결과 출력
- 허깅페이스 transformers에서 지원하는 task
-"sentiment-analysis" : "text-classification"의 별칭(감정분석 전용으로 사용)
-"text-classification" : 감정분석, 뉴스분류, 리뷰 분류 등 일반적인 문장 분류
-"zero-shot-classification" : 레이블을 학습 없이 주어진 후보군 중에서 분류
-"token-classification" : 개체명 인식(NER ; Named Entity REcognition) 등 단위 라벨링
-"ner" : "token-classification"의 별칭
-"fill-mask" : 빈칸 채우기
-"text-generation" : 텍스트 생성 (GPT류 모델에 사용)
-"text2text-generation" : 번역, 요약 등 입력 -> 출력 변환 
-"translation" : 번역
-"summarization" : 텍스트요약
-"question-answering" : 주어진 context를 보고 질문에 답하기.
-"image-to-text" : 그림을 설명
-"image-classification" : 이미지분류

# 1. 텍스트 기반 감정분석(긍정/부정)

In [1]:
from transformers import pipeline
classifier = pipeline(task="sentiment-analysis")
classifier("I've been waiting for a HuggingFace course my whole life.")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9598049521446228}]

In [2]:
from transformers import pipeline
classifier = pipeline(task="text-classification",
                     model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
# 감정분석시 내용이 많으면 list로
classifier([
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!"
])

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [3]:
classifier(["이 영화 정말 최고였어요. 감동적이고 연기가 대단해",
            "This movie was the best. It's touching, and the acting is amazing"])

[{'label': 'POSITIVE', 'score': 0.857815682888031},
 {'label': 'POSITIVE', 'score': 0.9998821020126343}]

In [4]:
classifier("이 물건 정말 사고 싶어요")

[{'label': 'POSITIVE', 'score': 0.8577604293823242}]

In [24]:
classifier(["I like you", "I hat you", "나 너가 싫어", "힘들어요"])

[{'label': 'POSITIVE', 'score': 0.9998695850372314},
 {'label': 'POSITIVE', 'score': 0.999488353729248},
 {'label': 'NEGATIVE', 'score': 0.599323034286499},
 {'label': 'POSITIVE', 'score': 0.8669533729553223}]

In [34]:
from transformers import pipeline
classifier = pipeline(task="sentiment-analysis",
                     model="matthewburke/korean_sentiment")
texts = ['나는 너가 좋아', "당신이 싫어요", "힘들어요", "오늘 기분이 최고야"]
result = classifier(texts)

In [27]:
for text, result in zip(texts, classifier(texts)):
    label = "긍정" if result['label']=='LABEL_1' else "부정"
    print(f"{text} => {label} : {result['score']:.4f}")

나는 너가 좋아 => 긍정 : 0.9558
당신이 싫어요 => 부정 : 0.9093
힘들어요 => 부정 : 0.9140
오늘 기분이 최고야 => 긍정 : 0.9714


## 2. 제로샷분류(Zero-shot분류)
- 기계학습 및 자연어처리에서 각 개별 직업에 대한 특정 교육없이 작업을 수행할 수 있는 모형(지도학습)

In [5]:
classifier = pipeline("zero-shot-classification",
                     # model="facebook/bart-large-mnli"
                     )
classifier(
    "I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"]
)


No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


{'sequence': 'I have a problem with my iphone that needs to be resolved asap!',
 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'],
 'scores': [0.5227580070495605,
  0.45814019441604614,
  0.0142647260800004,
  0.0026850001886487007,
  0.002152054337784648]}

In [33]:
sequence_to_classify = "언젠가 나는 세상을 볼 것이다"
candidate_labels = ['여행', '요리', '댄스']
classifier(sequence_to_classify, candidate_labels)

{'sequence': '언젠가 나는 세상을 볼 것이다',
 'labels': ['댄스', '여행', '요리'],
 'scores': [0.4455035626888275, 0.337342232465744, 0.21715418994426727]}

# 3. text 생성

In [30]:
from transformers import pipeline, set_seed
# set_seed(2)
generation = pipeline("text-generation", "gpt2") # 텍스트 생성 gpt3부터는 허깅페이스없음
generation(
    "in this course. We will teach you how to",
    pad_token_id=generation.tokenizer.eos_token_id
) # pad_token_id 경고를 없애려고 setting

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

[{'generated_text': 'in this course. We will teach you how to navigate a complex web application with a small team of experienced developers who will build and maintain a web application for the iPhone.\n\nIt will be a very challenging introduction to the fundamentals of programming with HTML. It will cover HTML-enabled web technologies such as:\n\n• CSS and JavaScript\n\n• CSS Grid and Header\n\n• CSS-based CSS\n\n• CSS-based JavaScript\n\n• HTML\n\n• CSS Grid\n\n• HTML5\n\n• Flash\n\n• HTML5\n\n• CSS Grid\n\n• HTML5\n\n• Flash\n\n• HTML5\n\n• CSS Grid\n\n• HTML5\n\n• Flash\n\n• HTML5\n\n• Flash\n\n• HTML5\n\n• HTML5\n\n• Flash\n\n• HTML5\n\n• Flash\n\n• HTML5\n\n• Flash\n\n• HTML5\n\n• HTML5\n\n• Flash\n\n• HTML5\n\n• HTML5\n\n• HTML5\n\n• CSS Grid\n\n• CSS-based CSS\n\n• CSS-based JavaScript\n\n• HTML\n\n• CSS Grid\n\n• CSS-based JavaScript\n\n• HTML5\n\n• Flash\n\n• HTML5\n\n'}]

In [31]:
result = generation(
    "in this course. We will teach you how to",
    pad_token_id=generation.tokenizer.eos_token_id
) 
print(result[0]['generated_text'])

in this course. We will teach you how to set up a computer, a printer with a keyboard, a laptop, and a mouse. For the first three hours we will teach you how to use a computer, then show you how to use a mouse. For the last three hours we will show you how to use a mouse. You will find a list of topics on each page. This course is for those who want to learn how to program computers and how to set up a computer.

You will learn how to learn how to open a computer and how to create a computer at the same time.

You will learn how to program computers and how to set up a computer at the same time. We will teach you how to use a computer, a printer with a keyboard, a laptop, and a mouse. We will show you how to use a mouse and keyboard.

You will learn how to use a computer, a printer with a keyboard, a laptop, and a mouse. We will show you how to use a mouse and keyboard. You will learn how to use the mouse and keyboard. You will learn how to use the mouse and keyboard. You will learn ho

In [32]:
# generation = pipeline("text-generation", "gpt2")
result = generation(
    "이 과정은 다음과 같은 방법을 알려드려요. ",
    pad_token_id = generation.tokenizer.eos_token_id
)
print(result[0]['generated_text'])

이 과정은 다음과 같은 방법을 알려드려요. 과정은 다음과 알려드려요. 과정은 다음과 알려드려요. 알려드려요. 알려드려요. 과정은 다음과 알려드려요. 알려드려요. 알려드려요. 알려드려요. 알려드려요. 알려드려요. 알려드려요. 알려드려요. 알려드려�


# 4. 마스크(빈칸) 채우기

In [8]:
unmasker = pipeline(task='fill-mask',
                   model='distilbert/distilroberta-base') # 마스크 채우기
unmasker("I'm going to hospital and meet a <mask>", top_k=2) # top_k 기본값 5

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'score': 0.19275707006454468,
  'token': 3299,
  'token_str': ' doctor',
  'sequence': "I'm going to hospital and meet a doctor"},
 {'score': 0.06794589757919312,
  'token': 27321,
  'token_str': ' psychiatrist',
  'sequence': "I'm going to hospital and meet a psychiatrist"}]

In [None]:
#unmasker("병원에 가서 <mask>를 만날 거예요")

In [9]:
unmasker("Hello, I'm a <mask> model.")

[{'score': 0.0629730075597763,
  'token': 265,
  'token_str': ' business',
  'sequence': "Hello, I'm a business model."},
 {'score': 0.038101598620414734,
  'token': 18150,
  'token_str': ' freelance',
  'sequence': "Hello, I'm a freelance model."},
 {'score': 0.03764132782816887,
  'token': 774,
  'token_str': ' role',
  'sequence': "Hello, I'm a role model."},
 {'score': 0.037326786667108536,
  'token': 2734,
  'token_str': ' fashion',
  'sequence': "Hello, I'm a fashion model."},
 {'score': 0.026023676618933678,
  'token': 24526,
  'token_str': ' Playboy',
  'sequence': "Hello, I'm a Playboy model."}]

In [10]:
unmasker("안녕하세요? 나는 <mask> 모델이예요.", top_k=3)

[{'score': 0.14130638539791107,
  'token': 35,
  'token_str': ':',
  'sequence': '안녕하세요? 나는: 모델이예요.'},
 {'score': 0.1223798543214798,
  'token': 116,
  'token_str': '?',
  'sequence': '안녕하세요? 나는? 모델이예요.'},
 {'score': 0.08188082277774811,
  'token': 328,
  'token_str': '!',
  'sequence': '안녕하세요? 나는! 모델이예요.'}]

In [11]:
unmasker = pipeline(task="fill-mask",
                   model="google-bert/bert-base-uncased")
unmasker("Hello, I'm a [MASK] model.")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'score': 0.1441437155008316,
  'token': 2535,
  'token_str': 'role',
  'sequence': "hello, i ' m a role model."},
 {'score': 0.14175789058208466,
  'token': 4827,
  'token_str': 'fashion',
  'sequence': "hello, i ' m a fashion model."},
 {'score': 0.062214579433202744,
  'token': 2047,
  'token_str': 'new',
  'sequence': "hello, i ' m a new model."},
 {'score': 0.041028350591659546,
  'token': 3565,
  'token_str': 'super',
  'sequence': "hello, i ' m a super model."},
 {'score': 0.025911200791597366,
  'token': 2449,
  'token_str': 'business',
  'sequence': "hello, i ' m a business model."}]

※ InferenceAPI 사용

In [14]:
from dotenv import load_dotenv
import os
load_dotenv()
# os.environ['HF_TOKEN']
# 허깅페이스 토큰을 READ 권한으로 생성하여 .env에 추가

True