In [None]:
!pip install -q bitsandbytes accelerate sentence-transformers==2.2.2

In [None]:
import torch
import pandas as pd

from google.colab import drive
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

In [None]:
# Mount Google Drive and make the project folder the working directory;
# the data and submission files are addressed through relative paths.
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/wallpaper-defects-qa/

In [None]:
# Read the test set from the data folder and preview its first rows.
test_file_name = 'test.csv'
test_df = pd.read_csv(f'data/open/{test_file_name}')
test_df.head()

In [None]:
# Collect the question column ('질문') as a plain Python list of prompts.
q_list = test_df['질문'].tolist()

In [None]:
# Display the first question as a quick sanity check.
q_list[0]

In [None]:
# Hugging Face Hub id of the checkpoint used for generation in this notebook
# (the notebook only runs inference; nothing is trained here).
model_name = "hongzoh/wdqa-v1"

In [None]:
# --- Placement -------------------------------------------------------------
# Map the whole model ("" = root module) onto GPU 0.
device_map = {"": 0}

# --- bitsandbytes 4-bit settings --------------------------------------------
# Load the base model weights in 4-bit precision.
use_4bit = True

# Quantization data type: "fp4" or "nf4".
bnb_4bit_quant_type = "nf4"

# Name of the torch dtype used for computation on the 4-bit weights.
bnb_4bit_compute_dtype = "float16"

# Nested (double) quantization of the 4-bit base model; disabled here.
use_nested_quant = False

In [None]:
# Turn the dtype name from the settings into the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Bundle the 4-bit quantization settings into the config object that is
# handed to the model loader.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    load_in_4bit=use_4bit,
)

In [None]:
# Load the causal language model with the quantization config and device
# placement defined in the settings cells.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# Keep the key/value cache enabled: this notebook only generates text, and
# `use_cache = False` is a training-time setting (used with gradient
# checkpointing) that can force every decoding step to recompute attention
# over the whole sequence.
model.config.use_cache = True
model.config.pretraining_tp = 1

In [None]:
# Load the tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the EOS token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the LEFT for batched generation: a causal LM continues from the last
# position of each row, so right-padding would make it continue after pad
# tokens. (Right padding is a fine-tuning setting, not an inference one.)
tokenizer.padding_side = "left"

In [None]:
def gen(x, max_new_tokens=512, eos_token_id=46332):
    """Sample a completion for one prompt or a batch of prompts.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        x: A prompt string or a list of prompt strings; tokenized with
            padding so a batch forms one rectangular tensor.
        max_new_tokens: Upper bound on the number of tokens generated per
            sequence (default 512, as before).
        eos_token_id: Token id that ends generation. The default 46332 is
            carried over unchanged. NOTE(review): presumably the
            end-of-answer token of this checkpoint -- confirm against the
            tokenizer vocabulary.

    Returns:
        The result of ``model.generate`` for the tokenized inputs.
        NOTE(review): for a causal LM this is expected to be a tensor of
        token ids, one row per prompt, with the prompt tokens followed by
        the generated tokens -- confirm.
    """
    inputs = tokenizer(x, return_tensors='pt', padding=True, return_token_type_ids=False)
    # The tokenizer returns CPU tensors; place them on the model's device
    # explicitly instead of relying on an implicit transfer.
    inputs = inputs.to(model.device)
    # `early_stopping` is a beam-search option and has no effect when
    # sampling without beams, so it is no longer passed.
    return model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        eos_token_id=eos_token_id,
    )

In [None]:
# gen() samples tokens, so seed the RNG to make repeated runs produce the
# same answers as far as the hardware allows.
torch.manual_seed(42)

# Generate in small batches instead of one call over the whole test set, so
# peak GPU memory does not grow with the number of questions.
GEN_BATCH_SIZE = 8
gen_text_encoded = []
for batch_start in range(0, len(q_list), GEN_BATCH_SIZE):
    batch_output = gen(q_list[batch_start:batch_start + GEN_BATCH_SIZE])
    # Keep one 1-D tensor of token ids per question, on the CPU, so finished
    # batches do not keep occupying GPU memory.
    gen_text_encoded.extend(batch_output.cpu())

In [None]:
# Turn each sequence of token ids back into text. Special tokens (padding,
# EOS) are dropped so they do not end up in the text that gets embedded.
# NOTE(review): the sequences presumably still start with the prompt tokens;
# confirm whether the question should be removed before embedding.
gen_text_decoded = [
    tokenizer.decode(encoded, skip_special_tokens=True)
    for encoded in gen_text_encoded
]

In [None]:
# Sentence-embedding model used to vectorise the generated texts.
emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [None]:
# Encode every decoded generation; `emb` holds one embedding per text.
emb = emb_model.encode(gen_text_decoded)

In [None]:
# Read the sample submission to use as the template for the output file and
# preview its first rows.
submission_file_name = 'sample_submission.csv'
submission_df = pd.read_csv(f'data/open/{submission_file_name}')
submission_df.head()

In [None]:
# One embedding per submission row is required before the values are copied.
assert len(emb) == len(submission_df), (
    f"embedding count ({len(emb)}) does not match "
    f"submission rows ({len(submission_df)})"
)

In [None]:
# Copy the embeddings into the vec_<i> columns in a single column-wise
# assignment instead of one scalar write per cell. The number of columns is
# taken from the embedding width rather than a hard-coded 512.
vec_columns = ['vec_' + str(vec_idx) for vec_idx in range(emb.shape[1])]
# Store plain 64-bit floats (the encoder output is presumably float32).
submission_df[vec_columns] = emb.astype('float64')
submission_df.head()

In [None]:
# Tag appended to the output file name, then write the CSV without the index.
suffix = 'test'
submission_df.to_csv(f'submission_{suffix}.csv', index=False)