# Hugging Face의 LLM 모델 활용

### 1. 환경 설정

In [1]:
# Report whether PyTorch can see a CUDA-capable GPU in this kernel.
import torch
torch.cuda.is_available()

True

In [26]:
# Show the GPU model, driver/CUDA versions and current GPU memory usage.
!nvidia-smi

Mon May  6 15:59:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.13                 Driver Version: 537.13       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080      WDDM  | 00000000:0A:00.0  On |                  N/A |
| 35%   32C    P8              23W / 225W |   7885MiB /  8192MiB |      3%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [6]:
!pip install torch
!pip install transformers
!pip install bitsandbytes
!pip install accelerate

Collecting torch
  Downloading torch-2.3.0-cp39-cp39-win_amd64.whl.metadata (26 kB)
Collecting filelock (from torch)
  Using cached filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy (from torch)
  Using cached sympy-1.12-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)
  Using cached mkl-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Using cached intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.2 kB)
Collecting tbb==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Using cached tbb-2021.12.0-py3-none-win_amd64.whl.metadata (1.1 kB)
Collecting mpmath>=0.19 (from sympy->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2

In [19]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp312-cp312-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.5 MB 960.0 kB/s eta 0:00:12
   ----- ---------------------------------- 1.6/11.5 MB 20.1 MB/s eta 0:00:01
   ---------------- ----------------------- 4.7/11.5 MB 37.4 MB/s eta 0:00:01
   ----------------------- ---------------- 6.7/11.5 MB 39.0 MB/s eta 0:00:01
   ------------------------------ --------- 8.6/11.5 MB 39.4 MB/s eta 0:00:01
   ---------------------------------------  11.5/11.5 MB 54.4 MB/s eta 0:00:01
   ---------------------------------------  11.5/11.5 MB 54.4 MB/s eta 0:00:01
   ---------------------------------------- 11.5/11.5 MB 40.9 MB/s eta 0:00:00
Downloading tzdata-2024.1-py2.py3-none-any.whl

In [1]:
# 라이브러리 불러오기
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import json

In [2]:
# Log in to the Hugging Face Hub (renders an interactive login widget in the notebook).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Never commit access tokens to a notebook: read the token from the environment.
# SECURITY: the token that used to be hardcoded here has been exposed and
# should be revoked in the Hugging Face account settings.
# When the variable is unset this is None, and the `token=` arguments below
# fall back to the credentials stored by `notebook_login()`.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [4]:
# Project directory holding the article database.
# Can be overridden with the LLM_CHATBOT_WORK_DIR environment variable so the
# notebook is not tied to one user's machine; the original location remains
# the default for backward compatibility.
WORK_DIR = os.environ.get(
    'LLM_CHATBOT_WORK_DIR',
    'C:/Users/dmlql/KT_AIVLE/Project/증시 상황 요약 LLM Chat bot/',
)
# File of articles read by the inference cell (one JSON object per line).
ARTICLE_PATH = os.path.join(WORK_DIR, 'articles_db.json')

### 2. 모델 불러오기

In [5]:
# Hugging Face Hub repository id of the checkpoint used for the model and tokenizer.
MODEL_ID = "microsoft/Phi-3-mini-128k-instruct"

In [6]:
# Load the weights in 4-bit NF4 precision via bitsandbytes, computing in float16.
# `llm_int8_enable_fp32_cpu_offload` permits modules that `device_map="auto"`
# places on the CPU to be kept there in fp32 instead of raising an error.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type='nf4',
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
    quantization_config=quantization_config,
    trust_remote_code=True,
    token=HF_TOKEN,
)

# Fix: the keyword was misspelled `toekn`, so the access token was never
# passed to the tokenizer download.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    add_special_tokens=True,
    token=HF_TOKEN,
)

# NOTE(review): 32064 looks like the model's vocabulary size rather than a
# deliberate generation budget; a one-word sentiment answer needs far fewer
# new tokens. Value kept unchanged here - confirm and lower if unintended.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=32064,
)


`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
def gen_prompt(article_list, pipe):
    """Build a chat-formatted sentiment prompt from the first article.

    Args:
        article_list: Indexable collection of article texts; only element 0
            is embedded in the prompt.
        pipe: Object exposing ``tokenizer.apply_chat_template`` (the
            text-generation pipeline in this notebook).

    Returns:
        The result of ``apply_chat_template`` for a single user turn, called
        with ``tokenize=False`` and ``add_generation_prompt=True``.

    Raises:
        IndexError: If ``article_list`` is empty.
    """
    first_article = article_list[0]
    # Korean instruction: answer using only the document below and judge it
    # as '긍정' (positive) or '부정' (negative).
    user_message = f'''아래 문서 내용만을 참조하여 질문에 답해줘.
    문서 : {first_article}

    [질문] 문서의 내용을 보고 '긍정' 또는 '부정'으로 판단해줘!
    '''
    messages = [{"role": "user", "content": user_message}]
    return pipe.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

In [None]:
# Read the article bodies from the JSON-Lines file (one JSON object per line,
# each with a 'content' field).
article_list = []
# Open read-only: the original 'r+' mode requested write access that is never used.
with open(ARTICLE_PATH, 'r', encoding='utf-8') as f:
    for line in tqdm(f, desc="Processing items", unit='item'):
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue
        article_list.append(json.loads(line)['content'])

# Fail with a clear message rather than an IndexError inside gen_prompt.
if not article_list:
    raise ValueError(f"No articles found in {ARTICLE_PATH}")

# gen_prompt only embeds the first article in the prompt.
prompt = gen_prompt(article_list, pipe)
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True,
)
print(outputs[0]["generated_text"])


Processing items: 0item [00:00, ?item/s]

You are not running the flash-attention implementation, expect numerical differences.


In [None]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B"

# Bind the result to `llama_pipe`, not `pipeline`: the original assignment
# overwrote the `pipeline` factory imported from transformers at the top of
# the notebook, so re-running the earlier model-loading cell would fail.
llama_pipe = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)
llama_pipe("Hey how are you doing today?")



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
