# HuggingFace with GPU
###### 참고
- [Hugging Face - Inference on One GPU](https://huggingface.co/docs/transformers/perf_infer_gpu_one)

#### Inference
학습된 모델을 불러와 데이터 처리

## CPU and GPU Offload

KoLLaMa-13b 모델을 load하는 중 아래 에러가 발생하여 [여기](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu)를 참고함.

```text
ValueError:
Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
`device_map` to `from_pretrained`. Check
https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
for more details.
```

In [1]:
import gc, torch
from transformers import AutoTokenizer, AutoModelForCausalLM

import traceback

# Two prompts of different length, so the tokenizer has to pad the batch.
input_string = ["한국어 명령어를 이해하는 오픈소스 언어모델", "오픈소스를 제공하는 사이트 목록"]

# Bind the names up front so the cleanup in `finally` works even when
# loading fails half-way through.
tokenizer = model_input = model = None

try:
    print(f"Input String is \"{input_string}\"")

    # Tokenizer that adds padding on the left side
    tokenizer = AutoTokenizer.from_pretrained("beomi/kollama-7b", padding_side="left")

    # Tokenize input_string
    model_input = tokenizer(input_string, return_tensors="pt", return_token_type_ids=False, padding=True)
    print(f"Model Input is \"{model_input}\"")

    # device_map="auto" so that both CPU and GPU are used for inference
    model = AutoModelForCausalLM.from_pretrained("beomi/kollama-7b", device_map="auto")
    print(f"model.hf_device_map: {model.hf_device_map}")

    print("\n===================================== Output =====================================")
    # The generated ids are decoded inline so that no extra tensor stays bound to a global name.
    for result in tokenizer.batch_decode(model.generate(**model_input), clean_up_tokenization_spaces=True, skip_special_tokens=True):
        print(result)

except Exception:
    # Show the full stack trace; printing `e.__traceback__` only shows the object's repr.
    traceback.print_exc()

finally:
    # Drop the references on success AND on failure; otherwise the collection
    # below cannot release whatever was loaded before the error.
    del tokenizer, model_input, model


gc.collect()
torch.cuda.empty_cache()

print(f"Allocated GPU Memory: {torch.cuda.memory_allocated('cuda:0')/(1024**3):.2f}GB")
print(f"Reserved GPU Memory: {torch.cuda.memory_reserved('cuda:0')/(1024**3):.2f}GB")


Input String is "['한국어 명령어를 이해하는 오픈소스 언어모델', '오픈소스를 제공하는 사이트 목록']"
Model Input is "{'input_ids': tensor([[ 3243,   355, 13263,  6078,  2684,   463,  6609, 39661,  1297, 31962,
          2682],
        [    0,     0,     0,     0, 16623,   560,  3341, 12673,  7551,  1450,
           966]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])}"
bin C:\Users\jongg\PycharmProjects\HuggingFaceKoLLaMa13b\venv\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

model.hf_device_map: {'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 'cpu', 'model.layers.13': 'cpu', 'model.layers.14': 'cpu', 'model.layers.15': 'cpu', 'model.layers.16': 'cpu', 'model.layers.17': 'cpu', 'model.layers.18': 'cpu', 'model.layers.19': 'cpu', 'model.layers.20': 'cpu', 'model.layers.21': 'cpu', 'model.layers.22': 'cpu', 'model.layers.23': 'cpu', 'model.layers.24': 'cpu', 'model.layers.25': 'cpu', 'model.layers.26': 'cpu', 'model.layers.27': 'cpu', 'model.layers.28': 'cpu', 'model.layers.29': 'cpu', 'model.layers.30': 'cpu', 'model.layers.31': 'cpu', 'model.norm': 'cpu', 'lm_head': 'cpu'}





한국어 명령어를 이해하는 오픈소스 언어모델에 대한 자세한 내용은 [https://www.
오픈소스를 제공하는 사이트 목록을 생성합니다.
    
Allocated GPU Memory: 0.01GB
Reserved GPU Memory: 0.02GB


In [2]:
from transformers import pipeline

# pipeline() arguments used below:
#   device_map="auto": resource (device) allocation is decided automatically
#   framework="pt":    PyTorch ("tf" = TensorFlow); the default auto-detects the
#                      installed framework, it is set explicitly just to be safe
#   revision="140k":   the model repository's "140k" branch
pipe = pipeline("text-generation", model="beomi/kollama-7b", device_map="auto", framework="pt", revision="140k")
outputs = pipe([
    "너 이름이 뭐니",
    "내 자산 맡길 수 있겠니",
    "애국가 1절 가사",
    "오늘 저녁 메뉴는 치킨"
])
for output in outputs:
    print(output[0]['generated_text'])

# Release the pipeline and the whole result list, not only the last loop item.
del pipe, outputs, output
gc.collect()
torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


너 이름이 뭐니 부여 viaalysis 여대생+---------1270SLE974 청년도약계좌693pictureMu별로더럽고AHL693이혼
내 자산 맡길 수 있겠니~1애성제 ᅵ 만수목 시곱홍아아
애국가 1절 가사옹SET 기자간담회에서 안전사고어부행사 안산병원은 쫓상을QPGDebug 안했는데说话순환을조카
오늘 저녁 메뉴는 치킨 끝으로설에 실 *보기이다 실 연토 사랑하는 마음에 표 모 나의 가사


In [3]:
# Try again with the main branch (no `revision` argument this time)

from transformers import pipeline

pipe = pipeline("text-generation", model="beomi/kollama-7b", device_map="auto", framework="pt")
outputs = pipe([
    "너 이름이 뭐니",
    "내 자산 맡길 수 있겠니",
    "애국가 1절 가사",
    "오늘 저녁 메뉴는 치킨"
])
for output in outputs:
    print(output[0]['generated_text'])

# Release the pipeline and the whole result list, not only the last loop item.
del pipe, outputs, output
gc.collect()
torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

너 이름이 뭐니.`n",
       "     
내 자산 맡길 수 있겠니?\n",
       "  
애국가 1절 가사\n",
       "    
오늘 저녁 메뉴는 치킨\n",
       "     


## Model Inference with KoLLaMa-7b

#### 변수 선언 및 메모리 정리 함수 정의

In [4]:
import gc, torch
from typing import Any

# Every variable used during model inference is stored in inference_dict
inference_dict = {}

model_name = "beomi/kollama-7b"

def clear():
    """Empty ``inference_dict``, collect garbage, empty the CUDA cache and print memory usage of cuda:0."""
    # Remove every stored variable
    inference_dict.clear()

    gc.collect()
    torch.cuda.empty_cache()

    bytes_per_gb = 1024**3
    print("\n======================== GPU Memory ======================== ")
    allocated_gb = torch.cuda.memory_allocated('cuda:0') / bytes_per_gb
    print(f"Allocated GPU Memory: {allocated_gb:.2f}GB")
    reserved_gb = torch.cuda.memory_reserved('cuda:0') / bytes_per_gb
    print(f"Reserved GPU Memory: {reserved_gb:.2f}GB")


clear()


Allocated GPU Memory: 0.01GB
Reserved GPU Memory: 0.02GB


#### Pipeline Inference

In [5]:
from transformers import pipeline

def _pipeInference() -> Any:
    """Create a text-generation pipeline for ``model_name`` and run it on the stored input.

    The pipeline object is kept in ``inference_dict["pipe"]`` and is called
    with ``inference_dict["inference_input"]``.

    Returns:
        Whatever the pipeline call returns, unchanged.
    """
    pipe_options = {
        "model": model_name,
        "device_map": "auto",
        "framework": "pt",
    }
    inference_dict["pipe"] = pipeline("text-generation", **pipe_options)

    prompts = inference_dict["inference_input"]
    return inference_dict["pipe"](prompts)

def pipeInference():
    """Run ``_pipeInference()`` and always release memory afterwards.

    Returns:
        The value produced by ``_pipeInference()``, or ``None`` when it raised.
        The failure is reported on stderr instead of being dropped silently.
    """
    try:
        return _pipeInference()
    except Exception:
        # Best effort: report the failure and hand back None so the caller can go on.
        # (A lone `;` is not a valid Python statement, so it cannot be used as a no-op here.)
        import traceback
        traceback.print_exc()
        return None
    finally:
        # Runs on both paths, before the value is handed back to the caller.
        clear()


#### Model Load Inference 

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def _modelInference() -> Any:
    """Tokenize the stored input, generate with the causal LM and decode the output.

    Every intermediate object is written to ``inference_dict`` under the keys
    "tokenizer", "inference_token", "model" and "result_tensor".

    Returns:
        The result of ``batch_decode`` on the generated ids, with special
        tokens skipped and tokenization spaces cleaned up.
    """
    store = inference_dict

    store["tokenizer"] = AutoTokenizer.from_pretrained(model_name, padding_side="left")

    tokenize_options = {
        "return_tensors": "pt",
        "return_token_type_ids": False,
        "padding": True,
    }
    store["inference_token"] = store["tokenizer"](store["inference_input"], **tokenize_options)
    print(f"\nInference Token is \"{store['inference_token']}\"")

    store["model"] = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
    print(f"model.hf_device_map: {store['model'].hf_device_map}")

    # Calling the model directly on the tokens (to list the keys of its output)
    # was commented out by the author because of OOM, so only generate() is used.
    # For the accepted parameters see LlamaForCausalLM.forward() in modeling_llama.py
    # (around line 760 in the author's installed version).
    store["result_tensor"] = store["model"].generate(**store["inference_token"])
    print(f"\nGeneration Result Tensor is \"{store['result_tensor']}\"")

    # Decoding follows the usage example in the LlamaForCausalLM.forward() docstring
    # (around line 795 of the same file in the author's installed version).
    decode_options = {
        "clean_up_tokenization_spaces": True,
        "skip_special_tokens": True,
    }
    return store["tokenizer"].batch_decode(store["result_tensor"], **decode_options)
	


def modelInference():
    """Run ``_modelInference()`` and always release memory afterwards.

    Returns:
        The value produced by ``_modelInference()``, or ``None`` when it raised.
        The failure is reported on stderr instead of being dropped silently.
    """
    try:
        return _modelInference()
    except Exception:
        # Best effort: report the failure and hand back None so the caller can go on.
        # (A lone `;` is not a valid Python statement, so it cannot be used as a no-op here.)
        import traceback
        traceback.print_exc()
        return None
    finally:
        # Runs on both paths, before the value is handed back to the caller.
        clear()


#### Input Strings, Results of Pipeline and Model

In [7]:
# unused
# Hand-written placement map: decoder layers 0-10 on GPU 0, everything else on the CPU.
# NOTE(review): the "transformer.*" keys do not follow the "model.*" naming used by the
# other entries; presumably left over from a map for another architecture — confirm.
device_map = {
    "lm_head": "cpu",
    "model.embed_tokens": "cpu",
    **{f"model.layers.{layer}": 0 for layer in range(11)},
    **{f"model.layers.{layer}": "cpu" for layer in range(11, 32)},
    "model.norm": "cpu",
    "transformer.h": 0,
    "transformer.ln_f": 0,
    "transformer.word_embeddings": 0,
    "transformer.word_embeddings_layernorm": 0,
}

inputs = [
    "구글에 파이썬 검색",
    "파이썬으로 가위바위보 게임 개발",
    "너 이름이 뭐니",
    "내 자산 맡길 수 있겠니",
    "애국가 1절 가사",
    "오늘 저녁 메뉴는 치킨"
]

# pipeInference()/modelInference() finish by calling clear(), which empties
# inference_dict, so the prompts have to be stored again before each run.
inference_dict["inference_input"] = inputs
print("Pipeline Inference...")
pipeline_val = pipeInference()

print()

inference_dict["inference_input"] = inputs
print("Model Inference...")
model_val = modelInference()

print(inputs, pipeline_val, model_val)

print("============================= Inference Result =============================")
if pipeline_val is None or model_val is None:
    # A wrapper hands back None when its inference raised; zip() over None
    # would only add a confusing TypeError on top of the real failure.
    print("Inference failed; no results to compare.")
else:
    for inf_input, pipe_output, model_output in zip(inputs, pipeline_val, model_val):
        print(f"{inf_input}, {pipe_output[0]['generated_text']}, {model_output}")

del pipeline_val, model_val
clear()

Pipeline Inference...


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]


Allocated GPU Memory: 0.01GB
Reserved GPU Memory: 0.02GB

Model Inference...

Inference Token is "{'input_ids': tensor([[    0,     0,     0,     0, 26660,   279, 16862,  6991,   109,  7087],
        [13903,  6991,   109,   378, 31143,  1004,   594,   472,  2233,  2409],
        [    0,     0,     0,     0,     0,     0,     0,  1338, 10598, 21807],
        [    0,     0,     0,     0,   856, 10402, 34973,   422, 11783,   386],
        [    0,     0,     0,     0,     0,  1491,  6722,   315,  1310,  9995],
        [    0,     0,     0,     0,     0,     0,  4662,  7211, 35772, 18245]]), 'attention_mask': tensor([[0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]])}"


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

model.hf_device_map: {'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 'cpu', 'model.layers.12': 'cpu', 'model.layers.13': 'cpu', 'model.layers.14': 'cpu', 'model.layers.15': 'cpu', 'model.layers.16': 'cpu', 'model.layers.17': 'cpu', 'model.layers.18': 'cpu', 'model.layers.19': 'cpu', 'model.layers.20': 'cpu', 'model.layers.21': 'cpu', 'model.layers.22': 'cpu', 'model.layers.23': 'cpu', 'model.layers.24': 'cpu', 'model.layers.25': 'cpu', 'model.layers.26': 'cpu', 'model.layers.27': 'cpu', 'model.layers.28': 'cpu', 'model.layers.29': 'cpu', 'model.layers.30': 'cpu', 'model.layers.31': 'cpu', 'model.norm': 'cpu', 'lm_head': 'cpu'}

Generation Result Tensor is "tensor([[    0,     0,     0,     0, 26660,   279, 16862,  6991,   109,  7087,
           284,  5967,   96

## 결과

내 질문에 대답하는 게 아니라 입력한 문자열을 그럴듯하게 포장함.

> The model is not intended to inform decisions about matters central to human life, and should not be used in such a way.
> ([원본](https://huggingface.co/beomi/kollama-7b#ethical-considerations))

반대로, 사람이 입력한 문자열에서 키워드만 뽑아 명령어로 만드는 방식도 고려해 볼 만함.

#### Pipeline(task = "text-generation")
- Model Load
	- Model.from_pretrained()
- Tokenize input
	- Tokenizer()
- Generate Token Ids
	- Model.generate(tokenized_input)
- Decode
	- Tokenizer.batch_decode(generated_ids)

# Solving "CUDA out of memory" Error

[참고](https://www.kaggle.com/discussions/getting-started/140636)

In [11]:
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilisation, empty the GPU caches, then print utilisation again.

    Takes no arguments and returns nothing; everything is reported on stdout.
    """
    print("Initial GPU Usage")
    gpu_usage()  # GPUtil.showUtilization, imported under this alias

    # Empty PyTorch's CUDA cache.
    torch.cuda.empty_cache()

    # numba: select device 0, close it, then select it again.
    # NOTE(review): presumably this resets numba's CUDA context on device 0, which may
    # also affect tensors PyTorch still holds on that device — confirm before use mid-session.
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  2% |  4% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 13% |  4% |


In [19]:
import gc, torch
from numba import cuda

def clear_cache(gpu_cnt: int = 1):
    """Free cached GPU memory on the first ``gpu_cnt`` devices and print before/after usage.

    Args:
        gpu_cnt: Number of devices to process, starting at index 0. Values larger
            than the number of devices torch can see are capped, so asking for
            more GPUs than exist (or running without any GPU) is not an error.

    Returns:
        None. All information is printed to stdout.
    """
    bytes_per_gb = 1024**3

    def report(title: str, device: int) -> None:
        # Print torch's allocated/reserved memory for one device.
        print(f"\n/* {title} */")
        print(f"Allocated GPU Memory: {torch.cuda.memory_allocated(device)/bytes_per_gb:.2f}GB")
        print(f"Reserved GPU Memory: {torch.cuda.memory_reserved(device)/bytes_per_gb:.2f}GB")

    # Ask torch how many devices exist instead of probing each index and
    # waiting for get_device_name() to raise on the first missing one.
    device_total = min(gpu_cnt, torch.cuda.device_count())

    for i in range(device_total):
        cuda_name = torch.cuda.get_device_name(i)
        print("\n/*====================================================================*/")
        print(f"device:{i}")
        print(cuda_name)

        report("Before", i)

        gc.collect()
        torch.cuda.empty_cache()

        # numba: select the device, close it, then select it again.
        # NOTE(review): presumably this resets numba's CUDA context on the device, which may
        # also affect tensors PyTorch still holds there — confirm before use mid-session.
        cuda.select_device(i)
        cuda.close()
        cuda.select_device(i)

        report("After", i)

    print("/*====================================================================*/")


clear_cache(2)


device:0
NVIDIA GeForce RTX 3060

/* Before */
Allocated GPU Memory: 0.01GB
Reserved GPU Memory: 0.02GB

/* After */
Allocated GPU Memory: 0.01GB
Reserved GPU Memory: 0.02GB
