In [None]:
!pip install -q langchain faiss-gpu sentence-transformers==2.2.2

In [None]:
import os
import pickle

import pandas as pd
from google.colab import drive
from langchain.vectorstores import FAISS
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, set_seed

In [None]:
# Mount Google Drive at /content/drive, then switch into the project folder so the
# relative paths used by later cells (data/, faiss_index/, outputs/, submission/)
# resolve inside it.
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/wallpaper-defects-qa/

# Documents

In [None]:
# CSV file (read from data/documents/) that is loaded as the retrieval corpus.
# The same name is reused when naming the FAISS index folder and the output files.
document_name = 'document_using_train_data.csv'

In [None]:
# Load the selected CSV from data/documents/ into LangChain documents.
document_path = f'data/documents/{document_name}'
loader = CSVLoader(file_path=document_path)
data = loader.load()

In [None]:
# Number of Document objects returned by loader.load().
len(data)

# Embedding model

In [None]:
# Settings for the retrieval embedding model: which checkpoint to load, the device
# it runs on, and the options forwarded to encode() (vectors are left unnormalized).
embedding_model_name = 'distiluse-base-multilingual-cased-v1'
model_kwargs = dict(device='cuda')
encode_kwargs = dict(normalize_embeddings=False)

In [None]:
# Build the LangChain embedding wrapper from the settings defined in the previous
# cell; it is handed to FAISS when the index is built.
embedding_settings = {
    'model_name': embedding_model_name,
    'model_kwargs': model_kwargs,
    'encode_kwargs': encode_kwargs,
}
embeddings = HuggingFaceEmbeddings(**embedding_settings)

# Index

In [None]:
# Embed every loaded document and build the FAISS index from the resulting vectors.
vectorstore = FAISS.from_documents(documents=data, embedding=embeddings)

In [None]:
# Persist the index in a folder named after the document file it was built from.
index_dir = f'faiss_index/faiss_index_{os.path.basename(document_name)}'
vectorstore.save_local(index_dir)

In [None]:
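# To reuse a previously saved index instead of rebuilding it, load it from the same
# folder that save_local() wrote to:
# vectorstore = FAISS.load_local(f'faiss_index/faiss_index_{os.path.basename(document_name)}', embeddings)
# NOTE(review): recent LangChain releases may also require allow_dangerous_deserialization=True here - confirm for the installed version.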

# Retriever

In [None]:
# Retrieve the 3 nearest documents for each query.
retrieval_settings = {"k": 3}
retriever = vectorstore.as_retriever(search_kwargs=retrieval_settings)

# LLM

In [None]:
# Load the causal LM published as hongzoh/wdqa-v2 together with its tokenizer,
# then move the model to GPU 0.
model_name = "hongzoh/wdqa-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(0)

# Generation settings forwarded to the text-generation pipeline: sampled decoding,
# one sequence per prompt, the tokenizer's EOS id, and at most 512 new tokens.
generation_settings = dict(
    do_sample=True,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,
    **generation_settings,
)

# Wrap the Hugging Face pipeline so it can be used as the LLM step of a LangChain chain.
hf = HuggingFacePipeline(pipeline=pipe)

# Chain

In [None]:
# Prompt text with two placeholders: {context} receives the retrieved documents and
# {question} receives the user question.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

In [None]:
# Turn the raw template string into a chat prompt with {context} and {question} inputs.
prompt = ChatPromptTemplate.from_template(template=template)

In [None]:
def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The contents in input order, separated by a blank line ("\\n\\n");
        an empty string when ``docs`` is empty.
    """
    contents = (doc.page_content for doc in docs)
    return "\n\n".join(contents)

In [None]:
# The incoming question is sent down two branches: one retrieves documents and
# formats them as the prompt context, the other passes the question through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Fill the prompt, generate with the LLM, and return the result as a plain string.
chain = chain_inputs | prompt | hf | StrOutputParser()

# Test data

In [None]:
# Read the test questions from data/open/ and preview the first rows.
test_file_name = 'test.csv'
test_path = f'data/open/{test_file_name}'
test_df = pd.read_csv(test_path)
test_df.head()

In [None]:
# Extract the question column ('질문') as a plain Python list.
question_column = test_df['질문']
q_list = question_column.values.tolist()

# Output

In [None]:
# The generation pipeline samples (do_sample=True), so fix the RNG seed right before
# generating. Re-running this cell then starts from the same random state instead of
# producing a different set of answers on every run.
GENERATION_SEED = 42
set_seed(GENERATION_SEED)

# Run the RAG chain once per test question, keeping the answers in question order.
answers = []

for q in tqdm(q_list):
    answer = chain.invoke(q)
    answers.append(answer)
    print(answer)

In [None]:
# open() does not create missing directories, so make sure the output folder exists
# before writing; otherwise the generated answers would be lost to a FileNotFoundError.
os.makedirs('outputs', exist_ok=True)

# Persist the raw generated answers, named after the model and the document set.
with open(f'outputs/output_{os.path.basename(model_name)}_rag_{document_name}.pickle', 'wb') as f:
    pickle.dump(answers, f)

# Embedding

In [None]:
# Sentence-embedding model used to turn the generated answers into vectors.
answer_embedding_model_name = 'distiluse-base-multilingual-cased-v1'
emb_model = SentenceTransformer(answer_embedding_model_name)

In [None]:
# Encode every generated answer; emb holds one vector per answer, in answer order.
emb = emb_model.encode(sentences=answers)

In [None]:
# Load the submission template from data/open/; its rows are filled in below.
submission_file_name = 'sample_submission.csv'
submission_path = f'data/open/{submission_file_name}'
submission_df = pd.read_csv(submission_path)

In [None]:
# The submission template must contain exactly one row per embedded answer.
n_rows, n_embeddings = len(submission_df), len(emb)
assert n_embeddings == n_rows

In [None]:
# Number of embedding dimensions written to the submission (columns vec_0 .. vec_511).
EMBEDDING_DIM = 512
vec_columns = [f'vec_{vec_idx}' for vec_idx in range(EMBEDDING_DIM)]

# Write all vectors in one column-wise assignment instead of one scalar .at write per
# cell. Rows are matched by position, which the length assertion above guarantees.
# Casting to float64 keeps the stored values the same as the element-wise writes
# produced, and replacing whole columns avoids dtype-mismatch warnings when the
# template columns were read as integers.
submission_df[vec_columns] = emb[:, :EMBEDDING_DIM].astype('float64')
submission_df.head()

In [None]:
# Name the submission after the model and the document set. document_name already
# ends in ".csv", so strip that extension first; otherwise the file is written as
# "...csv.csv".
document_stem = os.path.splitext(document_name)[0]
suffix = f'{os.path.basename(model_name)}_rag_{document_stem}'

# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs('submission', exist_ok=True)
submission_df.to_csv(f'submission/submission_{suffix}.csv', index=False)