In [1]:
import torch
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer  # noqa: F402
import json

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load the instruction-tuning training records from disk.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's locale default (which may not be UTF-8, e.g. on Windows).
with open('../data/movie_rating_instruction/train.json', encoding='utf-8') as file:
    data = json.load(file)

In [9]:
base_model = "../model"  # path to a pre-downloaded LLaMA model
cutoff_len = 512  # maximum number of tokens per example

# Load the tokenizer
tokenizer = LlamaTokenizer.from_pretrained(base_model)
# NOTE(review): ID 0 is presumably <unk> in the LLaMA vocabulary, chosen so
# padding differs from EOS — confirm against the model's tokenizer config.
tokenizer.pad_token_id = 0  # set the padding token ID to 0
tokenizer.padding_side = "left"

# Tokenization helper
def tokenize(prompt, add_eos_token=True):
    """Tokenize ``prompt`` and attach labels, optionally appending EOS.

    Relies on the module-level ``tokenizer`` and ``cutoff_len``.

    Args:
        prompt: Text to encode.
        add_eos_token: When true, append ``tokenizer.eos_token_id`` (with a
            matching attention-mask entry of 1) unless the sequence already
            ends with EOS or already holds ``cutoff_len`` tokens.

    Returns:
        The tokenizer result with an extra ``"labels"`` entry that is a copy
        of ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,  # do not return tensors; the ids are appended to below
    )
    input_ids = result["input_ids"]

    # An empty encoding has no last token to inspect; treat it as "does not
    # end with EOS" rather than raising IndexError on input_ids[-1].
    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id

    # Check the flag first so nothing is inspected when EOS is not wanted.
    # A sequence that already fills cutoff_len is left as-is.
    if add_eos_token and not ends_with_eos and len(input_ids) < cutoff_len:
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    # Labels start as a copy of the inputs.
    result["labels"] = input_ids.copy()
    return result

# Prompt-building helper
def generate_prompt(data_point):
    """Render one training record as an instruction-style prompt.

    Args:
        data_point: Mapping with ``"instruction"`` and ``"output"`` keys and
            an optional ``"input"`` key.

    Returns:
        The prompt text including the expected response. The template with an
        ``### Input:`` section is used only when ``"input"`` is present and
        truthy; otherwise the shorter instruction-only template is used.

    Raises:
        KeyError: If ``"instruction"`` or ``"output"`` is missing.
    """
    # `.get` lets records without an "input" key use the no-input template
    # instead of failing with KeyError.
    context = data_point.get("input")
    if context:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{context}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""

# Find the longest tokenized prompt in the dataset (stays 0 if `data` is empty).
# tokenize() truncates at cutoff_len, so a result equal to cutoff_len would
# mean at least one prompt may have been cut short.
max_token = 0
for entry in data:
    prompt = generate_prompt(entry)
    tokenized_result = tokenize(prompt)
    max_token = max(max_token, len(tokenized_result["input_ids"]))

print(max_token)

364
