In [1]:
# LangChain + JSON + vector-store test (this notebook uses FAISS, not Qdrant)

In [35]:
# FAISS vector store + GPT4All local embedding model + Ollama LLM
from langchain.vectorstores import FAISS
from langchain.document_loaders import JSONLoader
from langchain.prompts import PromptTemplate 
from langchain.embeddings import GPT4AllEmbeddings
from langchain.llms import Ollama
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [36]:
import json

# Load the song records with the stdlib `json` module and wrap each one in a
# LangChain Document (lyrics as page content, the other fields as metadata).
# Original note: "windows jq error" -- presumably JSONLoader's `jq` dependency
# could not be used on Windows, hence the manual loading; TODO confirm.

MAX_DOCS = 100            # only the first 100 records of the file are used
TITLE_SUFFIX = " Lyrics"  # assumed to be the 7 characters the original `[:-7]` removed -- TODO confirm
VIEW_MULTIPLIERS = {"K": 1_000, "M": 1_000_000}


def _strip_title_suffix(raw_title):
    """Return the title without its trailing suffix.

    The suffix is only removed when it is actually present, so a title that
    lacks it is no longer truncated by seven characters.
    """
    if raw_title and raw_title.endswith(TITLE_SUFFIX):
        return raw_title[:-len(TITLE_SUFFIX)]
    return raw_title


def _parse_views(raw_views):
    """Normalise a view counter such as "8.7K" or "1.4M" to a float string.

    Returns "0.0" for a missing or empty value, e.g. "8.7K" -> "8700.0" and
    "953" -> "953.0". A value that cannot be parsed as a number is returned
    unchanged rather than raising.
    """
    if raw_views is None:
        return "0.0"
    text = str(raw_views).strip()
    if not text:
        return "0.0"
    multiplier = VIEW_MULTIPLIERS.get(text[-1].upper())
    number = text[:-1] if multiplier else text
    try:
        return str(float(number) * (multiplier or 1))
    except ValueError:
        return raw_views


docs = []

file_path = "./drake_data.json"
# NOTE(review): the encoding is kept exactly as in the original. If the file is
# UTF-8 and contains non-ASCII characters this should be 'utf-8' -- confirm
# against the data file.
with open(file_path, "r", encoding='cp949') as f:
    json_load = json.load(f)

for data in json_load[:MAX_DOCS]:
    lyrics = data["lyrics"]
    if not lyrics:
        # Nothing to embed for a record without lyrics.
        continue

    metadata = dict(
        album=data["album"],
        title=_strip_title_suffix(data["lyrics_title"]),
        views=_parse_views(data["track_views"]),
        ref_url=data["lyrics_url"],
        ref_file=file_path,
    )
    docs.append(Document(page_content=lyrics, metadata=metadata))

In [37]:
# Sanity check: display the first Document built from the JSON file
# (lyrics in `page_content`, album/title/views/ref_url/ref_file in `metadata`).
docs[0]

Document(page_content="Lyrics from CLB Merch\n\n[Verse]\nPut my feelings on ice\nAlways been a gem\nCertified lover boy, somehow still heartless\nHeart is only gettin' colder", metadata={'album': 'Certified Lover Boy', 'title': 'Certified Lover Boy*', 'views': '8700.0', 'ref_url': 'https://genius.com/Drake-certified-lover-boy-lyrics', 'ref_file': './drake_data.json'})

In [39]:
# Split each song into chunks of at most 1000 characters with a 100-character
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# NOTE: `docs` is rebound here -- from this point on it holds the chunks, not
# the whole-song Documents created by the loading cell.
docs = text_splitter.split_documents(docs)

In [40]:
# Connect the embedding model and build the vector DB.
# Uses FAISS (CPU build).

# The recorded output of this cell shows a ~45.9 MB download when the
# embedding model is instantiated.
embedding = GPT4AllEmbeddings()
# Embed every chunk in `docs` and index the vectors in a FAISS store.
vector_store = FAISS.from_documents(docs, embedding)

Downloading: 100%|██████████| 45.9M/45.9M [00:01<00:00, 24.4MiB/s]
Verifying: 100%|██████████| 45.9M/45.9M [00:00<00:00, 753MiB/s]


In [43]:
# vector store test
# Print the index's `ntotal` counter (394 in the recorded run), then run a
# similarity search for a sample query and print the matching Documents.
print(vector_store.index.ntotal)
query = "Lyrics about humanity"
result = vector_store.similarity_search(query)
print(result)

394
[Document(page_content="[Verse 1: Drake]\nUh, man, fresh up out the sand, February tan\nIt's The Boy but I'm still the man, come and get your mans\nI don't know, first you caught the hands, then you took the stand\nIt's a joke, but you say you real—I don't understand\nOn a yacht, me and all the dogs actin' like some dogs\nWe evolved, used to think vacation meant Niagara Falls\nSwear to God, shout to Buffalo, never duckin' low\nI don't stop, man, I'm stuck on go, always hug the road\nFuck a opp, make his body roll, yeah, a lot of those\nStarted out doin' college shows, Calipari flow\nThen I popped like you never seen\nWe with everything, I went off in the '16, give me '17\n\n[Chorus: Drake]\nWant a lot, can't have everything, can't have everything\nWant a lot, can't have everything but I want everything", metadata={'album': 'More Life', 'title': "Can't Have Everything", 'views': '597200.0', 'ref_url': 'https://genius.com/Drake-cant-have-everything-lyrics', 'ref_file': './drake_data.

In [45]:
# retriever
# Search options for the retriever: return the 5 most similar chunks per query.
search_config = {
    "k" : 5,
}

# `as_retriever` takes its search options through `search_kwargs`; the previous
# `search_config=` keyword is not a recognised option, so `k` never reached
# the retriever.
retriever = vector_store.as_retriever(search_kwargs=search_config)
# Local LLM served by Ollama (4-bit quantised Llama 3 8B instruct model tag).
llm = Ollama(model="llama3:8b-instruct-q4_K_M")

In [49]:
# RAG

from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

template = """
The following information will help you. You can refer to this to answer.
{context}

Question : {question}
Answer :
"""


def _format_docs(retrieved_docs):
    """Render retrieved Documents as plain text for the prompt.

    Without this step the `{context}` slot receives the Python repr of a list
    of Document objects (escaped newlines, constructor noise). Every metadata
    field is kept, so questions about album or view counts stay answerable.
    """
    sections = []
    for doc in retrieved_docs:
        header = "\n".join(f"{key}: {value}" for key, value in doc.metadata.items())
        sections.append(f"{header}\nlyrics:\n{doc.page_content}")
    return "\n\n---\n\n".join(sections)


rag_prompt = PromptTemplate.from_template(template)

output_parser = StrOutputParser()
# The question string is passed through unchanged and is also sent to the
# retriever; the retrieved chunks are formatted into the {context} slot.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | output_parser
)

In [50]:
# Ask the RAG chain a question. The prompt context comes only from the chunks
# the retriever returns for this query, so the answer draws on those chunks
# and not on the whole data set.
rag_chain.invoke("What are top 5 famous songs of Drake.")

'Based on the provided documents, here are the top 5 most famous songs of Drake:\n\n1. **"5 AM in Toronto"** from the album "Care Package" - with 1,400,000 views.\n2. **"That\'s How You Feel"** (feat. Nicki Minaj and DJ Boof) from the album "Scorpion" - with 660,400 views.\n3. **"Money in the Grave"** (feat. Rick Ross) from the album "The Best in the World Pack" - with 1,200,000 views.\n4. **"Nice For What"** (feat. Big Freedia and Letitia Wright) from the album "Scorpion" - with 2,800,000 views.\n5. **"In My Feelings"** (not included in the provided documents, but it\'s one of Drake\'s most famous songs, with over 4 billion views on YouTube).\n\nNote that these view counts are based on the provided data and may not reflect the actual popularity or views of these songs at the time of your query.'