# 1. Packages

In [30]:
import os 
from getpass import getpass 
import sys

from langchain_mistralai import ChatMistralAI, MistralAIEmbeddings

from operator import itemgetter
from typing import Dict, List

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import Runnable, RunnableParallel, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_core.vectorstores import VectorStore
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain_core.runnables import chain
from langchain_core.messages import AIMessage, HumanMessage
from pinecone import Pinecone


from pinecone.data.index import Index
from dotenv import load_dotenv

# 2. Setup

In [2]:
# Load environment variables from the `.env` file located two directories up
# (the path is relative, so it depends on the notebook's working directory).
load_dotenv(dotenv_path="../../.env")

True

In [67]:
# LangSmith tracing configuration: switch tracing on, point at the LangSmith API
# endpoint and name the project the traces are filed under.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_PROJECT": "kn-synthetic-data",
    }
)

In [4]:
# Read the Mistral API key from the environment; a missing variable raises KeyError,
# so the notebook stops here rather than failing later with an authentication error.
MISTRAL_API_KEY = os.environ["MISTRAL_API_KEY"]

In [48]:
# Chat model; wrapped with structured output further down to generate the QA pairs.
mistral = ChatMistralAI(model="mistral-large-latest", api_key=MISTRAL_API_KEY)
# Embedding model client; not referenced by the cells visible in this part of the notebook.
embeddings = MistralAIEmbeddings(model="mistral-embed", api_key=MISTRAL_API_KEY)



### Ingest evaluation sources for RAG if necessary

In [10]:
# Web articles whose text is loaded and used as source documents for the evaluation set.
articles_sources_links = [
    "https://paulgraham.com/foundermode.html",
    "https://www.paulgraham.com/persistence.html",
    "https://www.paulgraham.com/reddits.html",
    "https://www.paulgraham.com/google.html",
    "https://www.hopsworks.ai/post/mlops-to-ml-systems-with-fti-pipelines",
    "https://www.palladiummag.com/2024/08/30/when-the-mismanagerial-class-destroys-great-companies/",
]

# Collection of short news videos that the LLM could have had access to when it was trained.
video_sources_links = [
    "https://www.youtube.com/watch?v=8QLVX9A7hqI",
    "https://www.youtube.com/watch?v=TNc14W8YOuI",
    "https://www.youtube.com/watch?v=sic0OJyyeZ0",
    "https://www.youtube.com/watch?v=2HGWuflXCUY",
    "https://www.youtube.com/watch?v=EDgD7NMY60U",
    "https://www.youtube.com/watch?v=GUr2AA6ljeU",
    "https://www.youtube.com/watch?v=DUPH2n3g5bg",
    "https://www.youtube.com/watch?v=0kOu4GLZRo0",
    "https://www.youtube.com/watch?v=SsH23u6XiGY",
    "https://www.youtube.com/watch?v=rvu8N6bA3PI"
]

In [8]:
from langchain_community.document_loaders.web_base import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [15]:
def load_article(url):
    """Fetch a web page and return the text of the first document loaded from it.

    Args:
        url: Address of the article; it is converted with ``str`` before use.

    Returns:
        The ``page_content`` of the first document returned by ``WebBaseLoader``.
    """
    page_loader = WebBaseLoader(web_paths=(str(url),))
    first_document = page_loader.load()[0]
    return first_document.page_content

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

In [21]:
# Download the articles concurrently and store each text under its source URL.
articles = {}
with ThreadPoolExecutor(max_workers=5) as executor:
    # Remember which URL every submitted future is responsible for.
    futures = dict(
        (executor.submit(load_article, link), link)
        for link in articles_sources_links
    )
    # Results are collected in completion order; `.result()` re-raises loader errors.
    for done in as_completed(futures):
        articles[futures[done]] = done.result()

In [23]:
from langchain_community.document_loaders import YoutubeLoader

In [24]:
def load_video(url):
    """Load video and extract text transcript.

    Args:
        url: YouTube URL of the video to load.

    Returns:
        The ``page_content`` of the first document returned by ``YoutubeLoader``.

    Raises:
        IndexError: If the loader returns no document for ``url`` (presumably when
            the video exposes no transcript; verify against ``YoutubeLoader``).
    """
    loader = YoutubeLoader.from_youtube_url(url, add_video_info=False)
    docs = loader.load()
    if not docs:
        # Same exception type a bare `docs[0]` would raise, but naming the URL:
        # this function runs in worker threads, where the failing input is
        # otherwise not visible in the traceback.
        raise IndexError(f"No transcript document could be loaded for {url}")
    return docs[0].page_content

In [27]:
# Fetch the video transcripts concurrently and store each one under its source URL.
videos = {}
with ThreadPoolExecutor(max_workers=5) as executor:
    # Remember which URL every submitted future is responsible for.
    futures = dict(
        (executor.submit(load_video, link), link)
        for link in video_sources_links
    )
    # Results are collected in completion order; `.result()` re-raises loader errors.
    for done in as_completed(futures):
        videos[futures[done]] = done.result()

# 4. Evals

### A. Create Dataset

In [32]:
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List

In [76]:
# Schema for a single generated question/answer pair; used as the item type of
# `SyntheticQAPairs.pairs`.
# NOTE(review): the class docstring and the field descriptions are presumably
# forwarded to the LLM as part of the structured-output schema, so they are left
# untouched here; confirm before rewording them.
class SyntheticQAPair(BaseModel):
    """A QA Pair for a specific document"""

    # Required field: the generated question.
    question: str = Field(..., description="A question which answers lies in the document, should be precise and be completely answered with the document")
    # Required field: the reference answer for that question.
    answer: str = Field(..., description="The answer expected from the question, only reformulating material from the document")

# Container schema passed to `with_structured_output` when generating QA pairs.
# It has a single field, so iterating an instance yields one ("pairs", [...]) tuple.
# NOTE(review): the docstring and description below are presumably sent to the LLM
# with the schema; confirm before rewording them.
class SyntheticQAPairs(BaseModel):
    """A list of QA pairs from a specific document"""
    # Required field: the QA pairs extracted from one document.
    pairs: List[SyntheticQAPair] = Field(..., description="A list of question / answer pair extracted from the document, always includes between 2 and 5 pairs")



In [36]:
from langchain_core.prompts import PromptTemplate

In [77]:
def generate_document_pairs(document):
    """Ask the Mistral chat model for question/answer pairs about one document.

    Args:
        document: Text of the source document; it fills the ``{document}``
            placeholder of the prompt.

    Returns:
        The output of the structured-output chain built for the
        ``SyntheticQAPairs`` schema (expected to be a ``SyntheticQAPairs``
        instance; TODO confirm what is returned when parsing fails).
    """
    instructions = """
                You are an expert teacher tasked with generating question/answer pairs for an exam.
                You are given a document and your goal is to create meaningful and relevant questions 
                that can be answered using material contained in the document.

                Instructions:
                
                Read the entire document carefully to understand its content and context.
                Identify key points and significant information within the document, don't focus too much on details.
                Generate a set of questions that are directly answerable using the information from the document.
                Ensure that each question has an interest that extends beyond the scope of this document, yet can be
                answered from the document.
                For each question, provide a corresponding answer that is accurate and concise.
                Ensure that the questions are varied in nature, covering different aspects of the document.
                Avoid creating questions that require external knowledge or information not present in the document.
                
                Your Task:
                
                Generate a set of question/answer pairs based on the following document: \n\n{document}
            """
    qa_prompt = PromptTemplate.from_template(instructions)
    # Constrain the model output to the SyntheticQAPairs schema.
    structured_llm = mistral.with_structured_output(SyntheticQAPairs)
    # Named `pipeline` so the `chain` helper imported at the top is not shadowed.
    pipeline = qa_prompt | structured_llm
    return pipeline.invoke({"document": document})


In [85]:
# `generate_document_pairs` returns a `SyntheticQAPairs` model and `extend` iterates
# over it. Iterating a pydantic model yields `(field_name, value)` tuples, so every
# document contributes one `("pairs", [...])` entry to this list; the flattening
# step further down relies on that shape.
qa_pairs = []

# Articles are processed first, then the video transcripts.
for corpus in (articles, videos):
    for text in corpus.values():
        qa_pairs.extend(generate_document_pairs(text))

In [88]:
# Counts one entry per processed document, not one per question/answer pair.
len(qa_pairs)

16

In [89]:
# Each `qa_pairs` entry is a (field_name, list_of_pairs) tuple; keep only the
# individual QA pairs, flattened into a single list.
true_pairs = [
    pair
    for _field_name, document_pairs in qa_pairs
    for pair in document_pairs
]

In [95]:
from langsmith import Client
from langsmith.evaluation import evaluate

In [96]:
# LangSmith client, constructed with no explicit arguments.
client = Client()
# Common prefix of the evaluation dataset name; a version suffix is appended below.
base_dataset_name = "kn-eval-synthetic-qa"

In [99]:
dataset_name = f"{base_dataset_name}-v0"
datasets = client.list_datasets(dataset_name=dataset_name)

# Pull at most one match from the listing: any hit means the dataset already
# exists and must not be created (or populated) a second time.
if next(datasets, None) is not None:
    print(f"Dataset {dataset_name} already exists")
else:
    # No dataset with that name yet: create it and upload one example per QA pair.
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Synthetic QA pairs for evaluation of RAG KN",
    )
    client.create_examples(
        inputs=[{"question": qa.question} for qa in true_pairs],
        outputs=[{"answer": qa.answer} for qa in true_pairs],
        dataset_id=dataset.id,
    )