In [None]:
!pip install -q langchain faiss-gpu sentence-transformers==2.2.2

In [None]:
import os
import pickle
import pandas as pd

from tqdm import tqdm
from google.colab import drive
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader
from sentence_transformers import SentenceTransformer
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain

In [None]:
# Mount Google Drive, then move into the project folder so that the relative
# paths used in later cells (data/, faiss_index/, outputs/, submission/) resolve.
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/wallpaper-defects-qa/

# Documents

In [None]:
# CSV file (looked up under data/documents/) that is loaded as the document set.
document_name = "document_using_train_data.csv"

In [None]:
# Read the document CSV into LangChain documents.
loader = CSVLoader(
    file_path=f'data/documents/{document_name}',
    encoding='utf-8',
)
data = loader.load()

In [None]:
# Sanity check: number of documents returned by the loader.
len(data)

# Embedding model

In [None]:
# Name of the sentence-embedding model used to vectorise the documents.
embedding_model_name = "distiluse-base-multilingual-cased-v1"
# Passed to the model constructor: run on the GPU.
model_kwargs = dict(device="cuda")
# Passed to the encode call: leave the vectors un-normalised.
encode_kwargs = dict(normalize_embeddings=False)

In [None]:
# Embedding wrapper that the vector store calls to embed documents and queries.
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Index

In [None]:
# Embed every loaded document and build a FAISS vector store from them.
vectorstore = FAISS.from_documents(documents=data, embedding=embeddings)

In [None]:
# Persist the index under faiss_index/, in a folder named after the document file.
index_dir = 'faiss_index/faiss_index_' + os.path.basename(document_name)
vectorstore.save_local(index_dir)

In [None]:
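# To reuse a previously saved index instead of rebuilding it (path must match the save cell):
# vectorstore = FAISS.load_local(f'faiss_index/faiss_index_{os.path.basename(document_name)}', embeddings)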

# Retriever

In [None]:
# Retriever over the vector store; k=3 asks for three documents per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# LLM

In [None]:
# Load the tokenizer, config and causal LM identified by model_id.
model_id = "hongzoh/wdqa-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model_config = AutoConfig.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, config=model_config)
model = model.to(0)  # move the weights to device index 0

In [None]:
# Text-generation pipeline around the loaded model and tokenizer.
generate_text = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,      # upper bound on newly generated tokens per call
    repetition_penalty=1.1,
    return_full_text=True,
)

In [None]:
# Wrap the transformers pipeline so LangChain chains can use it as an LLM.
llm = HuggingFacePipeline(pipeline=generate_text)

# Chain

In [None]:
# Retrieval chain combining the LLM with the document retriever.
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

# Test data

In [None]:
# Load the test questions and preview the first rows.
test_file_name = "test.csv"
test_df = pd.read_csv(f"data/open/{test_file_name}")
test_df.head()

In [None]:
# Questions to answer, taken from the '질문' (question) column as a plain list.
q_list = test_df['질문'].tolist()

# Output

In [None]:
# Run every test question through the chain and keep the full result dicts.
results = []

for q in tqdm(q_list):
    # chat_history is passed empty on every call, so no earlier question or
    # answer is carried over between iterations.
    result = chain({"question": q, "chat_history": []})
    results.append(result)

    print(f'\n질문: {q}')
    # Double quotes for the key: reusing the enclosing single quote inside an
    # f-string is a SyntaxError on Python versions before 3.12.
    print(f'\n답변: {result["answer"]}')
    print('\n================================================================')

In [None]:
# Keep only the generated answer text from each chain result.
answers = [chain_output['answer'] for chain_output in results]

In [None]:
# Save the generated answers to disk as a pickle.
# Note: document_name keeps its '.csv' extension, so the file ends in '.csv.pickle'.
answers_path = f'outputs/output_{os.path.basename(model_id)}_rag_conv_{document_name}.pickle'
# open() does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(answers_path), exist_ok=True)
with open(answers_path, 'wb') as f:
    pickle.dump(answers, f)

# Answer embedding

In [None]:
# Sentence-embedding model used to turn the generated answers into vectors.
emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [None]:
# Encode the generated answers; the result is indexed below as emb[row][dimension].
emb = emb_model.encode(answers)

In [None]:
# Load the submission template that the answer embeddings are written into.
submission_file_name = "sample_submission.csv"
submission_df = pd.read_csv(f"data/open/{submission_file_name}")

In [None]:
# The submission template must have exactly one row per embedded answer.
assert len(emb) == len(submission_df)

In [None]:
# Copy the answer embeddings into the vec_<i> columns of the submission frame.
# One whole-column assignment per dimension replaces the previous element-wise
# .at loop (rows x 512 scalar writes), and the number of dimensions is taken
# from the embedding matrix instead of being hard-coded.
# Cast to float64 so the stored values match what scalar .at writes produced.
emb_matrix = emb.astype('float64')
for vec_idx in range(emb_matrix.shape[1]):
    submission_df['vec_' + str(vec_idx)] = emb_matrix[:, vec_idx]
submission_df.head()

In [None]:
# Write the submission CSV; the suffix combines the model id's base name and the document file name.
suffix = f'{os.path.basename(model_id)}_rag_conv_{document_name}'
submission_path = f'submission/submission_{suffix}.csv'
# to_csv() does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_df.to_csv(submission_path, index=False)