## Step 1: 필수 라이브러리 설치

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:

## Step 2: Hugging Face 로그인

In [2]:
from huggingface_hub import login
from google.colab import userdata

# Hugging Face 로그인
login(userdata.get('HF_TOKEN'))


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Step 3: 모델과 토크나이저 불러오기

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# 모델과 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

## Step 4: 간단한 텍스트 생성

In [4]:
# 간단한 텍스트 생성
input_text = "What is your name?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))




<bos>What is your name?

What is your age?

What is your gender?

What


## Step 5: Logit 계산

In [5]:
# Logit 계산
tokens = input_ids['input_ids']
print(tokens)

logits = model(**input_ids).logits
for i in range(tokens.shape[-1]):
    token = tokens[0, i].item()
    print(logits[0, i, token])


tensor([[     2,   1841,    603,    861,   1503, 235336]], device='cuda:0')
tensor(-18.2747, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(-33.2665, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(-23.9536, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(-27.7627, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(-19.6064, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(-21.0372, device='cuda:0', grad_fn=<SelectBackward0>)


## Step 6: Zero-shot 분류 구현

In [6]:
import torch

def zero_shot_classification(text, task_description, labels):
    # task_description과 text를 이어붙여서 tokenize
    text_ids = tokenizer(task_description + text, return_tensors="pt").to("cuda")
    probs = []

    for label in labels:
        # label을 tokenize하고 이어붙여서 Gemma-2B에 넣기
        label_ids = tokenizer(label, return_tensors="pt").to("cuda")
        n_label_tokens = label_ids['input_ids'].shape[-1] - 1

        input_ids = {
            'input_ids': torch.cat([text_ids['input_ids'], label_ids['input_ids'][:, 1:]], axis=-1),
            'attention_mask': torch.cat([text_ids['attention_mask'], label_ids['attention_mask'][:, 1:]], axis=-1)
        }

        logits = model(**input_ids).logits
        prob = 0
        n_total = input_ids['input_ids'].shape[-1]

        # logit 계산
        for i in range(n_label_tokens, 0, -1):
            token = label_ids['input_ids'][0, i].item()
            prob += logits[0, n_total - i, token].item()

        probs.append(prob)

        del input_ids
        del logits
        torch.cuda.empty_cache()

    return probs


## Step 7: ag_news 데이터셋 불러오기

In [14]:
from datasets import load_dataset

# fancyzhx/ag_news 데이터셋 불러오기
dataset = load_dataset("fancyzhx/ag_news")


## Step 8: 테스트 데이터에 Zero-shot 분류 수행

In [15]:
import numpy as np
from tqdm import tqdm

# 테스트 데이터에서 50개의 문장만 사용
n_corrects = 0
for i in tqdm(range(50)):
    text = dataset['test'][i]['text']  # 문장 가져오기
    label = dataset['test'][i]['label']  # 레이블 가져오기

    # zero-shot classification 수행
    probs = zero_shot_classification(
        text,
        "Classify the news article into one of the following categories: ",
        labels=["World", "Sports", "Business", "Technology"]
    )

    pred = np.argmax(np.array(probs))  # 가장 높은 logit 값을 가진 class 예측
    if pred == label:
        n_corrects += 1

# 정확도 출력
accuracy = n_corrects / 50 * 100
print(f"Accuracy: {accuracy}%")


100%|██████████| 50/50 [00:38<00:00,  1.29it/s]

Accuracy: 38.0%





#### Zero-shot으로 실행하였을 때 gemma 자체의 성능이 좋지 않아서 그런지 38%의 정확도가 나왔습니다.
#### 정확도를 높이기 위해서 아래와 같은 방법을 추가로 시도해보았습니다.

## 추가 시도 1 - 전처리 및 Task Description 수정

In [11]:
import numpy as np
from tqdm import tqdm

# 전처리 함수 정의 (소문자 변환 및 공백 제거)
def preprocess_text(text):
    text = text.lower()
    text = text.strip()
    return text

# Task Description 간결하게 작성
task_description = "Classify the news article into one of these categories: World, Sports, Business, Technology."
labels = ["This article is about World news.", "This article is about Sports.", "This article is about Business.", "This article is about Technology."]

n_corrects = 0

for i in tqdm(range(50)):
    text = preprocess_text(dataset['test'][i]['text'])  # 텍스트 전처리
    label = dataset['test'][i]['label']

    # 문장 길이 제한 (128자로 설정)
    input_ids = tokenizer(text, max_length=128, truncation=True, return_tensors="pt").to("cuda")

    # zero-shot classification 수행
    probs = zero_shot_classification(text, task_description, labels)
    pred = np.argmax(np.array(probs))

    if pred == label:
        n_corrects += 1

# 정확도 출력
accuracy = n_corrects / 50 * 100
print(f"Accuracy: {accuracy}%")


100%|██████████| 50/50 [00:48<00:00,  1.03it/s]

Accuracy: 34.0%





## 추가 시도 2 - Few-shot 학습 예시 추가
Zero-shot을 하는 과제이지만 예시를 주면 정확도가 높아질지 궁금해 추가로 테스트 해보았습니다.

In [13]:
import numpy as np
from tqdm import tqdm

# 전처리 함수 정의 (소문자 변환 및 공백 제거)
def preprocess_text(text):
    text = text.lower()
    text = text.strip()
    return text

# Few-shot learning을 위한 예시 문장 추가
few_shot_examples = """
Classify the following news article into one of these categories: World, Sports, Business, or Technology.
Here are some examples:
1. The president met with other world leaders to discuss climate change and global policies. This is World news.
2. A massive earthquake affected several countries in Asia, causing global concern. This is World news.
3. United Nations releases new report on global human rights violations. This is World news.
4. The soccer team won their last match after an intense game. This is Sports news.
5. The Olympic Games have introduced new events to attract a younger audience. This is Sports news.
6. A famous athlete broke the world record in the 100m sprint. This is Sports news.
7. The stock market saw a sharp rise in tech stocks today. This is Business news.
8. The latest quarterly earnings of major corporations indicate a strong recovery. This is Business news.
9. A new startup raises millions in funding for a unique business idea. This is Business news.
10. The latest breakthrough in quantum computing has revolutionized the tech industry. This is Technology news.
11. Artificial intelligence continues to grow, with new applications in healthcare. This is Technology news.
12. A tech company launches a new innovative smartphone with cutting-edge features. This is Technology news.
"""

# 실제 Task Description에 few-shot 예시 포함
task_description = few_shot_examples + " Now, classify the following news article:"

# 레이블 설정
labels = ["World", "Sports", "Business", "Technology"]

# 테스트 데이터에서 50개의 문장만 사용하여 정확도 계산
n_corrects = 0

for i in tqdm(range(50)):
    text = preprocess_text(dataset['test'][i]['text'])  # 텍스트 전처리
    label = dataset['test'][i]['label']

    # 문장 길이 제한 (128자로 설정)
    input_ids = tokenizer(text, max_length=128, truncation=True, return_tensors="pt").to("cuda")

    # zero-shot classification 수행 (Few-shot 적용 후)
    probs = zero_shot_classification(text, task_description, labels)
    pred = np.argmax(np.array(probs))

    if pred == label:
        n_corrects += 1

# 정확도 출력
accuracy = n_corrects / 50 * 100
print(f"Accuracy: {accuracy}%")


100%|██████████| 50/50 [02:07<00:00,  2.55s/it]

Accuracy: 36.0%





## 시도 3 - 코사인 유사도 기반 분류 사용

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

def zero_shot_classification_cosine(text, labels):
    # Embed the input text
    text_input_ids = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
    text_embedding = model.get_input_embeddings()(text_input_ids).mean(dim=1).detach().cpu().numpy()

    label_embeddings = []
    for label in labels:
        label_ids = tokenizer(label, return_tensors="pt").input_ids.to("cuda")
        label_embedding = model.get_input_embeddings()(label_ids).mean(dim=1).detach().cpu().numpy()
        label_embeddings.append(label_embedding)

    # Calculate cosine similarity between text and each label
    similarities = [cosine_similarity(text_embedding.reshape(1, -1), label_embedding.reshape(1, -1))[0][0] for label_embedding in label_embeddings]

    return similarities

# Evaluate the new approach
n_corrects_cosine = 0
for i in tqdm(range(50)):
    text = dataset['test'][i]['text']  # 문장 가져오기
    label = dataset['test'][i]['label']  # 레이블 가져오기
    similarities = zero_shot_classification_cosine(
        text,
        labels=["World", "Sports", "Business", "Technology"]
    )

    pred = np.argmax(np.array(similarities))
    if pred == label:
        n_corrects_cosine += 1

# 정확도 출력
accuracy_cosine = n_corrects_cosine / 50 * 100
print(f"Accuracy using cosine similarity: {accuracy_cosine}%")

100%|██████████| 50/50 [00:00<00:00, 256.42it/s]

Accuracy using cosine similarity: 54.0%





#### 코사인 유사도를 사용한 결과, 54%의 정확도로 가장 높은 성능을 보였습니다.