## Importing Libraries

In [1]:
import os
from dotenv import load_dotenv
from IPython.display import display, Markdown

# pytorch
import torch

# huggingface
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer

## Login to Hugging Face

In [2]:
login(
    token="",
    add_to_git_credential=True
)

Token is valid (permission: read).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /home/owiequhf/.cache/huggingface/token
Login successful


In [3]:
load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")
login(
    token="", # ADD YOUR TOKEN HERE
    add_to_git_credential=True
)

Token is valid (permission: read).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /home/owiequhf/.cache/huggingface/token
Login successful


## Device

In [4]:
# Device setup
device = (
    "cuda:0" if torch.cuda.is_available() else # Nvidia GPU
    "mps" if torch.backends.mps.is_available() else # Apple Silicon GPU
    "cpu"
)
print(f"Device = {device}")

Device = cuda:0


In [5]:
# Flash Attention Implementation
if device == "cuda:0":
    if torch.cuda.get_device_capability()[0] >= 8: # Ampere, Ada, or Hopper GPUs
        attn_implementation = "flash_attention_2"
        torch_dtype = torch.bfloat16
    else:
        attn_implementation = "eager"
        torch_dtype = torch.float16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float32
print(f"Attention Implementation = {attn_implementation}")

Attention Implementation = flash_attention_2


## Hyperparameters

In [6]:
################################################################################
# Tokenizer parameters
################################################################################
max_length=8192
padding="do_not_pad"  # "max_length", "longest", "do_not_pad"
truncation=True

################################################################################
# Generation parameters
################################################################################
num_return_sequences=1
max_new_tokens=1024
do_sample=True  # True for sampling, False for greedy decoding
temperature=0.6
top_p=0.9
repetition_penalty=1.1

################################################################################
# bitsandbytes parameters
################################################################################
load_in_4bit=True
bnb_4bit_compute_dtype=torch_dtype
bnb_4bit_quant_type="nf4"  # "nf4", #fp4"
bnb_4bit_use_double_quant=True

## Model

In [7]:
# Model ID
model_id = "beomi/Llama-3-KoEn-8B-Instruct-preview"

In [8]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant
)

In [10]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    attn_implementation=attn_implementation,
    torch_dtype=torch_dtype,
    quantization_config=quantization_config
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [11]:
# Display the model architecture
display(Markdown(f'```{model}```'))

```LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)```

In [12]:
# Number of parameters
print(f"Number of parameters (in billions): {model.num_parameters() / 1e9:.2f}")

Number of parameters (in billions): 8.03


## Inference

In [13]:
def generate_question():
    messages = [
        {"role": "system", "content": "You are a Korean interviewer. "
                                      "You are interviewing a candidate for a computer science position. "
                                      "Please ask a question related to computer science. "
                                      "Use Korean only. 한국어만 사용하세요."},
        {"role": "user", "content": "Please create an open-ended question related to computer science. "
                                    "Do not provide an answer."},
    ]
    
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    
    outputs = model.generate(
        input_ids=input_ids,
        eos_token_id=terminators,
        num_return_sequences=num_return_sequences,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("assistant")[1].strip()
    return response

In [14]:
def generate_answer(question):
    messages = [
        {"role": "system", "content": "You are the interviewee. "
                                      "Please provide an answer to the following question."
                                      "Answer should be 3 to 5 sentences long. "
                                      "Use Korean only. 한국어만 사용하세요."},
        {"role": "user", "content": f"###Qustion: {question}"},
    ]
    
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(device)
    
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    
    outputs = model.generate(
        input_ids=input_ids,
        eos_token_id=terminators,
        num_return_sequences=num_return_sequences,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("assistant")[1].strip()
    return response

In [15]:
def generate_response(question, context, answer):
    messages = [
        {"role": "system", "content": "You are an interviewer. "
                                      "Use Korean only. 한국어만 사용하세요."
                                      "First, observe the 'Question'. "
                                      "Then, based on the 'Context', evaluate the 'User Answer' in a scale of 0 to 5."
                                      "Explain the reasoning behind your evaluation and provide feedback."
                                      "###Output format: "
                                      "**평가**: + 평가 내용 + \n"
                                      "**이유**: + 이유 내용 + \n"
                                      "**피드백**: + 피드백 내용 + \n"}, 
        {"role": "user", "content": f"###Question: + {question} + \n"
                                    f"###Context: + {context} + \n"
                                    f"###User Answer: + {answer} + \n"},
    ]
    
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(device)
    
    outputs = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=num_return_sequences,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("assistant")[1].strip()
    return response

In [16]:
# 사용자에게 주어지는 첫 번째 질문
first_question = generate_question()
print(first_question)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Here's the question: "컴퓨터 과학에서 '이산 공간'과 '연속 공간'의 차이점을 설명하고, 이 개념을 프로그래밍에 적용하는 방법을 예를 들어 설명하십시오."


In [None]:
# 사용자에게 주어지는 첫 번째 질문에 대한 완벽한 답변 (모델이 생성, 사용자에게 보여지지 않음)
perfect_answer = generate_answer(first_question)
print(perfect_answer)

In [None]:
# 사용자의 답변
user_answer = "별로라고 생각합니다"

In [None]:
# 사용자의 답변에 대한 평가 및 피드백
first_response = generate_response(first_question, perfect_answer, user_answer)
print(first_response)

In [None]:
from flask import Flask, request, jsonify, Response
from flask_cors import CORS
from flask_session import Session

app = Flask(__name__)
CORS(app)
# 임시 세션 저장소 (초기 질문 저장용 - 배포시 반드시 분리)
session_store = {}


# init에서의 응답은 면접자 와 컴퓨터가 동시에 대답해야하는 질문 : 전역으로 저장해둠 -> interview에서 string을 조합할때 question에서 씀
@app.route("/init", methods=["GET"])
def init():
    system_prompt = "당신은 테크 기업 개발자의 기술면접관 입니다. 컴퓨터공학 전공 지식과 관련된 간단한 면접 질문을 시작하세요."

    init_question = generate_question()

    #초기 질문에 대한 사전 답변 미리 생성
    model_answer = generate_answer(init_question)
    session_store[1] = {'init_question': init_question , 'model_answer' : model_answer}
    return Response(init_question, content_type="text/plain")


@app.route("/ask", methods=["POST"])
def ask():
    # 임시 세션 저장소 - 초기 질문 저장용
    session_id = 1
    init_question = session_store[session_id].get('init_question')
    model_answer = session_store[session_id].get('model_answer')
    
    params = request.get_json()
    # system_prompt = "'Context' 의 내용을 바탕으로 질문 'Question' 에 대한 답변 'Answer' 가 5점 만점에 몇 점짜리 답변인지 평가하고 그 이유를 설명해주세요. 그리고 만점을 받기 위해서 답변을 어떻게 개선해야할지 설명해주세요."
    user_answer = params["answer"]  # prompt :사용자가 답변한 내용.
    # context: init에서만든질문을 모델이답변 question: init에서 얻은 질문  answer: 사용자 답변

    evaluate_result = generate_response(init_question, model_answer, user_answer) # 답변평가용 (세개 인자)
    # 질문 = generate_question() - init_question
    # 인공지능답변 = generate_answer(question) - model_answer
    # 질문_인공지능_사용자_평가결과 =generate_response(question, context, answer)
    return Response(evaluate_result, content_type="text/plain")


app.run(host="163.180.160.32", port=5000)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://163.180.160.32:5000
Press CTRL+C to quit
