In [None]:
# Install the core dependencies. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel, so the packages are importable afterwards.
%pip install -q langchain sentence-transformers faiss-cpu pandas transformers torch

In [None]:
# List the current working directory (presumably to check that archive.zip,
# which the next cell unzips, is present -- confirm).
!ls


In [None]:
# Extract the dataset archive into the working directory.
# -o overwrites existing files without asking, so re-running this cell (or
# "Run All" on a warm runtime) does not stall on unzip's interactive
# "replace?" prompt.
!unzip -o archive.zip


In [None]:
import pandas as pd

# First look at the raw CSV, before any cleaning is applied.
df = pd.read_csv(filepath_or_buffer="/content/vgsales.csv")
df.head(n=5)


In [None]:
import pandas as pd

# Reload the CSV and clean it in one chain:
#   1. drop rows missing any field that is later written into a document,
#   2. cast Year to int (possible only once the missing years are gone).
df = (
    pd.read_csv("/content/vgsales.csv")
    .dropna(subset=["Name", "Platform", "Genre", "Publisher", "Year", "Global_Sales"])
    .astype({"Year": int})
)

df.head()


In [None]:
# Install the LangChain split packages used by the imports below. `%pip`
# (rather than `!pip`) targets the environment of the running kernel.
%pip install -q langchain langchain-community langchain-core


In [None]:
from langchain_core.documents import Document


In [None]:
from langchain_core.documents import Document

# Turn every cleaned row into one Document whose text is a single
# " | "-separated line of labelled fields.
documents = [
    Document(
        page_content=" | ".join(
            [
                f"Name: {row['Name']}",
                f"Platform: {row['Platform']}",
                f"Genre: {row['Genre']}",
                f"Publisher: {row['Publisher']}",
                f"Year: {row['Year']}",
                f"Global Sales: {row['Global_Sales']} million",
            ]
        )
    )
    for _, row in df.iterrows()
]

len(documents)

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding wrapper configured with the all-MiniLM-L6-v2 sentence-transformers checkpoint.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
from langchain_community.vectorstores import FAISS

# Build the FAISS vector store from the documents, using `embeddings` to vectorise them.
vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)

In [None]:
# Retrieval-only smoke test: fetch the five nearest documents for a sample query.
query = "Which platform has the highest global sales?"
results = vectorstore.similarity_search(query, k=5)

# Print each hit under a numbered header.
for rank, hit in enumerate(results, start=1):
    print(f"\n--- Source {rank} ---\n{hit.page_content}")

In [None]:
# Make sure the generation stack is installed (both packages are also listed in
# the first install cell). `%pip` targets the environment of the running kernel.
%pip install -q transformers torch

In [None]:
from transformers import pipeline

# Text-to-text generation pipeline backed by the google/flan-t5-base checkpoint.
generator = pipeline(task="text2text-generation", model="google/flan-t5-base")

In [None]:
def rag_answer(question: str, k: int = 5, max_new_tokens: int = 128):
    """Answer ``question`` with retrieval-augmented generation.

    Retrieves the ``k`` most similar documents from the module-level
    ``vectorstore``, joins their text into a context block, and passes a
    prompt built from that context and the question to the module-level
    ``generator``.

    Args:
        question: The user's question; used both as the retrieval query and
            in the prompt.
        k: Number of documents to retrieve.
        max_new_tokens: Forwarded to ``generator`` as ``max_new_tokens``.

    Returns:
        A ``(answer, hits)`` tuple: the ``"generated_text"`` of the first
        generator result, and the list of retrieved documents so the caller
        can display the sources.
    """
    # 1) Retrieve
    hits = vectorstore.similarity_search(question, k=k)
    context = "\n".join(hit.page_content for hit in hits)

    # 2) Generate (with constraints)
    # The prompt is assembled from explicit pieces so that no stray characters
    # end up in it (the instruction line previously ended in a dangling `"`).
    prompt = (
        "Answer the question using ONLY the information below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n"
    )

    answer = generator(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]

    return answer, hits


In [None]:
# End-to-end run: retrieve five documents, generate an answer, and keep the sources.
answer, sources = rag_answer(
    question="What are some best-selling sports games after 2010?",
    k=5,
)
print(answer)

In [None]:
# Show the documents the answer above was generated from.
for position, source in enumerate(sources, start=1):
    print("\n[Source {}] {}".format(position, source.page_content))

In [None]:
%%writefile requirements.txt
# Runtime dependencies of the vgsales RAG notebook.
pandas
langchain
langchain_core
langchain_community
sentence-transformers
faiss-cpu
transformers
torch