In [None]:
import getpass
import os

# os.environ["OPENAI_API_KEY"] = getpass.getpass() # 만약 Openai Embedding 미사용시 key 입력 필요 x

In [None]:
# Turn the LangChain tracing flag off for this session.
os.environ["LANGCHAIN_TRACING_V2"] = "false"
# os.environ["LANGCHAIN_API_KEY"] = getpass.getpass() # needed when downloading a prompt through the LangChain Hub

In [None]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma, FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import DataFrameLoader

In [None]:
import pandas as pd
from itertools import product

# Load the competition data from the working directory.
train = pd.read_csv("./train.csv")
test = pd.read_csv('./test.csv')

# Column names for the question variants and the answer variants.
question_columns = [f"질문_{x}" for x in range(1, 3)]
answer_columns = [f"답변_{x}" for x in range(1, 6)]

# One "### 질문 / ### 답변" text per row for every (question column,
# answer column) combination, in the same order as a nested loop over
# the column pairs and then over the rows.
train_data = [
    "### 질문: " + question + "\n### 답변 : " + answer
    for q_col, a_col in product(question_columns, answer_columns)
    for question, answer in zip(train[q_col], train[a_col])
]
len(train_data)

In [None]:
# Wrap the generated texts in a single-column frame for DataFrameLoader.
df = pd.DataFrame({"context": train_data})

In [None]:
# Build a loader that reads the document text from the "context" column.
loader = DataFrameLoader(df, page_content_column="context")

In [None]:
# Materialize the loader's documents; these are indexed by the vector store below.
docs = loader.load()

아래는 document size가 큰 경우에 적용하면 좋은 splitter 코드 입니다.

In [None]:
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000, chunk_overlap=200, add_start_index=True
# )
# all_splits = text_splitter.split_documents(docs)
# 이 부분은 그런 document들이 길면 짜르는 부분인데 저희 train set은 짧기 때문에 안했습니다


In [None]:
# all_splits[1].metadata['start_index']

- (선택 사항) OpenAI Embedding을 통해 document searching을 진행하는 예시입니다. 아래 셀은 주석 처리되어 있으며, 이 노트북에서는 실제로 뒤에 나오는 HuggingFace Embedding과 FAISS를 사용합니다.
- 이 예시에서 document를 저장하는 DB는 [Chroma](https://docs.trychroma.com/getting-started)입니다.

In [None]:
# embeddings = OpenAIEmbeddings()
# vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
# retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})

- 만약 embedding 방식이나 DB 호출 방식을 변경하고 싶으면 아래 코드를 참고해주세요

In [None]:
### HuggingFace SentTran의 Embedding 사용
from langchain_community.embeddings import HuggingFaceEmbeddings

# Name of the sentence-embedding checkpoint used to embed the documents.
modelPath = "distiluse-base-multilingual-cased-v1"

# Keyword arguments forwarded to the embedding model.
# NOTE(review): 'cuda' is hard-coded, so this presumably fails on a CPU-only machine — confirm.
model_kwargs = {'device':'cuda'}

# Keyword arguments forwarded to the encode call; embeddings are not normalized.
encode_kwargs = {'normalize_embeddings': False}

# Embedding object shared by the FAISS build and load steps below.
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [None]:
### Use FAISS as the vector store

# BUILD: embed the documents and create the index
db = FAISS.from_documents(documents=docs, embedding=embeddings)
db.save_local("faiss_index")    # save the FAISS index to disk

# The line below loads the saved FAISS index. Once the index has been built, the BUILD step can be skipped and only this load is needed.
# NOTE(review): newer LangChain releases may require allow_dangerous_deserialization=True here because part of the index is pickled — confirm against the installed version, and only load index files you created yourself.
db = FAISS.load_local("faiss_index", embeddings)

# Define the retriever (k=10 documents per query)
retriever = db.as_retriever(search_kwargs={"k": 10})

- 아래 `get_seperated_question` 함수는 test의 복합 질문을 문장 종결 부호(`.`, `?`)를 기준으로 여러 개의 단일 질문으로 분리하는 함수입니다.
- 이 함수를 사용하지 않았을 때 점수가 더 높았으나, 분리된 각 질문에 대해 간단한 답변을 생성한 뒤 이어 붙인다면(concat) 성능이 향상될 가능성은 있습니다.

In [None]:
def get_seperated_question(q: str) -> list:
    """Split a compound question into single questions.

    The separator is whichever of ``'?'`` and ``'.'`` occurs more often in
    ``q``; on a tie, the one that appears first is used.  Every fragment that
    was followed by the separator gets it back, and the text after the last
    separator is kept as written.

    Args:
        q: The question text, possibly holding several sentences.

    Returns:
        The stripped, non-blank fragments in their original order.  ``q`` is
        returned unchanged (as a one-element list) when it contains at most
        one ``'.'``/``'?'`` in total.
    """
    q_count = q.count('?')
    d_count = q.count('.')

    # Zero or one terminator: nothing to split.  Handling the zero case here
    # also avoids calling q.index() on a character that is not present.
    if q_count + d_count <= 1:
        return [q]

    if q_count > d_count:
        sep = '?'
    elif q_count < d_count:
        sep = '.'
    else:
        # Tie (both counts >= 1 here): split on whichever terminator is first.
        sep = '.' if q.index('.') < q.index('?') else '?'

    # Everything before the last separator was terminated by it; the tail is
    # whatever follows the last separator and may carry its own terminator.
    *terminated, tail = q.split(sep)

    q_list = [part.strip() + sep for part in terminated if part.strip()]
    if tail.strip():
        q_list.append(tail.strip())

    return q_list

- Langchain Prompt 입니다.
- Langchain Hub에서 사용한 몇 예시들을 합쳐놓은 겁니다.

In [None]:
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# Prompt text for the RAG chain. The {context} and {question} placeholders
# are filled in when the chain runs.
template = """You are a kind guide who answers user questions. Your main task is to answer the given question. Here is the instruction when you will provide the answer.
1. Use the "Following Context" to answer the question. If the answer can find in the context, use the context that exactly matches given answer. If not, use the context as a knowledge document and provide the answer by yourself.
2. Provide the most direct and brief answers. Let your answer be not longer than length 300.
3. Refrain from adding any supplementary comments, such as apologies or additional explanations.
4. Do not reply with recurring sentences.
5. Your answer should always be in the same language as the query.

Following Context:
{context}

 ### 질문: {question} 
 ### 답변: """

In [None]:
# Turn the template string into a prompt object for the chain.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
import torch

from transformers import (AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

from peft import PeftModel

In [None]:
# Hub id used further down to load the tokenizer.
model_name = 'heavytail/kullm-solar-S'

# Values fed into BitsAndBytesConfig further down.
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# Passed as device_map when the model is loaded.
device_map = "auto"

In [None]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    'CurtisJeon/heavytail-kullm-solar-S-4bit',
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LoRA Adaptor
model = PeftModel.from_pretrained(
    model,
    'CurtisJeon/heavytail-kullm-solar-S-lora'
)

model.eval()

In [None]:
# Load LLaMA tokenizer
# Load the tokenizer for model_name and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import pipeline

# Text-generation pipeline around the loaded model, capped at 300 new tokens per call.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)
# Wrap the pipeline so it can be used as the LLM step of a LangChain chain.
hf = HuggingFacePipeline(pipeline=pipe)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = []
    for document in docs:
        contents.append(document.page_content)
    return "\n\n".join(contents)


# RAG chain: the incoming question is sent to the retriever (whose documents
# are merged into one string by format_docs) and is also passed through
# unchanged; both fill the prompt, which goes to the model, and the output
# is parsed into a string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | hf
    | StrOutputParser()
)

In [None]:
# for test
# Quick manual check: stream the answer to one sample question and print it as it arrives.
for chunk in rag_chain.stream('당신은 누구입니까?'):
    print(chunk, end="", flush=True)

In [None]:
from tqdm.auto import tqdm

# One entry per test question. Each entry is a list of answers (here always
# a single answer), because the post-processing step iterates over every
# entry as a list of answer strings.
result = []

for q in tqdm(test['질문'], total=len(test)):
    print('질문:', q)
    print('답변:', end=" ")
    # A stream may deliver the answer in several chunks; gather them so the
    # question contributes exactly one answer string.
    chunks = []
    for chunk in rag_chain.stream(q):
        chunks.append(chunk)
        print(chunk, end="", flush=True)
    result.append(["".join(chunks)])
    print()
    print('------------------------------------------------------------------')

In [None]:
# 만약 get_seperated question을 사용한다면
from tqdm.auto import tqdm

# One entry per test question; each entry holds one answer per sub-question
# produced by get_seperated_question.
result = []

for original_question in tqdm(test['질문'], total=len(test)):
    question_split = get_seperated_question(original_question)
    print('원초질문:', original_question)
    answers = []
    for q in question_split:
        print('질문:', q)
        print('답변:', end=" ")
        # A stream may deliver the answer in several chunks; gather them so
        # each sub-question contributes exactly one answer string.
        chunks = []
        for chunk in rag_chain.stream(q):
            chunks.append(chunk)
            print(chunk, end="", flush=True)
        answers.append("".join(chunks))
        print()

    result.append(answers)
    print('------------------------------------------------------------------')

In [None]:
def remove_repetitions(text):
    """Drop repeated sentences from ``text``, keeping the first occurrence.

    Sentences are the pieces obtained by splitting on ``'. '``.  Two pieces
    count as the same sentence when they are equal after removing surrounding
    whitespace and trailing periods, so a final sentence that still carries
    its closing ``'.'`` is recognised as a repeat of an earlier one.

    Args:
        text: The generated answer.

    Returns:
        The text with later duplicates removed.  If ``text`` ended with a
        period, the result does too.
    """
    seen = set()
    unique_sentences = []
    for sentence in text.split('. '):
        # The last piece keeps its trailing '.', so compare on a normalized key.
        key = sentence.strip().rstrip('.')
        if key in seen:
            continue
        seen.add(key)
        unique_sentences.append(sentence)

    deduplicated = '. '.join(unique_sentences)
    # Dropping a duplicated final sentence also drops the closing period;
    # put it back so the text still ends on a complete sentence.
    if text.endswith('.') and not deduplicated.endswith('.'):
        deduplicated += '.'
    return deduplicated

def cut_to_last_dot(text):
    """Trim ``text`` so that it ends at its last period.

    Args:
        text: The generated answer.

    Returns:
        ``text`` up to and including the last ``'.'``.  When ``text`` has no
        period (including the empty string) it is returned unchanged, so an
        answer is never reduced to a single character.
    """
    last_dot = text.rfind('.')
    if last_dot == -1:
        return text
    return text[:last_dot + 1]

In [None]:
# Clean every answer (drop repeated sentences, then trim to the last period)
# and merge the answers belonging to one question with newlines.
preds = [
    "\n".join([cut_to_last_dot(remove_repetitions(answer)) for answer in answers])
    for answers in result
]

In [None]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
# 평가를 위한 Embedding Vector 추출에 활용하는 모델은 'distiluse-base-multilingual-cased-v1' 이므로 반드시 확인해주세요.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Load the model used to extract embedding vectors (distiluse-base-multilingual-cased-v1)
emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Extract an embedding vector from every generated answer
pred_embeddings = emb_model.encode(preds)
pred_embeddings.shape

In [None]:
# Read the submission template.
submit = pd.read_csv('./sample_submission.csv')
# Insert the embedding vectors into the submission template (sample_submission.csv); every column after the first is overwritten
submit.iloc[:,1:] = pred_embeddings
# submit.head()

In [None]:
# Write the filled-in submission file without the index column.
submit.to_csv('./submission_rag.csv', index=False)