<a href="https://colab.research.google.com/github/jaydenchoe/ragas-test/blob/main/generate_RAGAS_QnA_samples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Generate synthetic Q&A test samples with RAGAS**

In [3]:
import os
COLAB = os.getenv("COLAB_RELEASE_TAG") is not None

if COLAB:
  print ( "COLAB" )
  !pip install --quiet langchain==0.0.170
  !pip install --quiet pyarrow==14.0.1
  !pip install --quiet requests==2.31.0
  !pip install --quiet cudf-cu12==24.4.1 ibis-framework==8.0.0 google-colab==1.0.0
  !pip install --quiet datasets==2.19.0
  !pip install --quiet --upgrade langchain-openai
  !pip install --quiet pypdf

COLAB
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m834.2/834.2 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
if COLAB:
  print ( "COLAB" )
  !pip install --quiet \
    chromadb \
    langchain \
    langchain_chroma \
    optuna \
    plotly \
    polars \
    ragas
else:
  !pip install --quiet \
    chromadb \
    langchain \
    datasets \
    langchain_chroma \
    optuna \
    plotly \
    polars \
    ragas

COLAB
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.1/86.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━

In [5]:
# Importing the packages
from functools import reduce
import json
import os
import requests
import warnings

import chromadb
from chromadb.api.models.Collection import Collection as ChromaCollection
from datasets import load_dataset, Dataset
from getpass import getpass
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.runnables.base import RunnableSequence
from langchain_community.document_loaders import WebBaseLoader, PolarsDataFrameLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from operator import itemgetter
import optuna
import pandas as pd
import plotly.express as px
import polars as pl
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness
)
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional



In [6]:
# Providing the API key for OpenAI.
# Resolution order: Colab secrets -> existing OPENAI_API_KEY environment
# variable -> interactive prompt.
if COLAB:
  # google.colab is imported only inside this branch so that the cell does not
  # depend on that package when COLAB is False (the notebook installs it only
  # in the Colab-only setup cell).
  from google.colab import userdata, data_table
  print( "COLAB" )
  # Secrets
  OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
  os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
  runtime_info = "Colab runtime"

  # Enabling Colab's data formatter for pandas
  data_table.enable_dataframe_formatter()
elif OPENAI_API_KEY := os.environ.get('OPENAI_API_KEY'):
  # The key is already present in the environment, so there is nothing to set.
  runtime_info = "Non Colab runtime"
else:
  # No key available anywhere: ask for it without echoing it to the output.
  OPENAI_API_KEY = getpass("OPENAI_API_KEY")
  os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
  runtime_info = "Non Colab runtime"

print(runtime_info)

COLAB
Colab runtime


In [8]:
# Loading the example documents (a Wikipedia page and a local PDF).
# PyPDFLoader is taken from langchain_community, the same package the import
# cell already uses for WebBaseLoader, instead of the legacy
# langchain.document_loaders path.
from langchain_community.document_loaders import PyPDFLoader

urls = ["https://en.wikipedia.org/wiki/Large_language_model"]

wikis_loader = WebBaseLoader(urls)
wikis = wikis_loader.load()

# Path of the PDF file; change it to the actual location of your copy.
pdf_path = "ENN SDK Developer Guide.pdf"

# Load the PDF with PyPDFLoader.
pdf_loader = PyPDFLoader(pdf_path)
pdf_pages = pdf_loader.load()

# Print the text of the first loaded page as a quick sanity check (optional).
print(pdf_pages[0].page_content)

ENN SDK Dev eloper Guide
Abstract
This guide describes the method to use Exynos Neural Network Software Development Kit (ENN SDK). It provides instructions for converting Neural Network (NN)
models to Neural Network Container (NNC) models. It also provides information about the ENN framework, providing input to the model, executing the model, and
obtaining the output.
1. Intr oduction
ENN SDK  allows users to convert trained TensorFlow Lite  neural network models to a format that can run efficiently in Samsung Exynos  hardware.
This guide is applicable for users who want to test or construct an application to run inference on ENN SDK.
Structur e of Documentation
Chapter 1  introduces ENN SDK and its eco-system.
Chapter 2  provides information on the features of ENN SDK.
Chapter 3  provides information on tools provided with ENN SDK.
Chapter 4  provides information on ENN framework API.
The subsequent chapters provide additional information on ENN SDK.
Samples
The list of samples for EN

In [9]:
# Examining question evolution types available in the ragas library

# Chat models. Both generator and critic roles are assigned the gpt-3.5 model
# below; llm4 is created but not assigned to either role in this cell.
llm35 = ChatOpenAI(model="gpt-3.5-turbo")
llm4 = ChatOpenAI(model="gpt-4-turbo")
generator_llm = critic_llm = llm35

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    deployment="text-embedding-3-small",
)

# Reset the name before building the generator -- presumably so that a failed
# construction cannot leave a generator from an earlier run in place.
example_generator = None
example_generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings,
    chunk_size=1024,
)

# One distribution per evolution type, each mapping that single type to a
# weight of 1. The generation cell runs the generator once per entry.
list_of_distributions = [
    {evolution: 1}
    for evolution in (simple, reasoning, multi_context, conditional)
]

In [10]:
# This step COSTS $$$ while avoid_costs is False, because it calls the
# generator once per evolution type. Set avoid_costs = True to download
# pre-generated examples instead.
avoid_costs = False

# Number of questions requested from the generator for each evolution type.
questions_per_type = 10

if not avoid_costs:
  # Running ragas to get examples of question evolutions: one generation run
  # per distribution in list_of_distributions.
  question_evolution_types = [
    example_generator.generate_with_langchain_docs(pdf_pages, questions_per_type, distribution)
    for distribution in list_of_distributions
  ]
  # A single concat stacks all frames in order; the previous pairwise
  # reduce(pd.concat) produced the same rows but re-copied the accumulated
  # frame on every step.
  question_evolution_types_pd = pd.concat(
    [testset.to_pandas() for testset in question_evolution_types],
    axis=0,
  )
  # Keep only the columns shown in the table below.
  question_evolution_types_pd = question_evolution_types_pd.loc[:, ["evolution_type", "question", "ground_truth"]]
else:
  # Downloading examples for question evolutions discussed in the article:
  question_evolution_types_pd  = pl.read_csv(
    "https://gist.github.com/gox6/bfd422a6f203ba73f081b08c9bb25e66/raw/example-question-evolution-types-in-ragas.csv",
    separator=",",
  ).drop("index").to_pandas()

In [11]:
# Show the question / ground-truth table: the default rich display outside
# Colab, Colab's DataTable widget (index hidden, 100 rows per page) on Colab.
if not COLAB:
  display(question_evolution_types_pd)
else:
  display(
    data_table.DataTable(
      question_evolution_types_pd,
      include_index=False,
      num_rows_per_page=100,
    )
  )

Unnamed: 0,evolution_type,question,ground_truth
0,simple,What are some key models developed by OpenAI i...,"GPT-1, GPT-2, GPT-3, GPT-4"
1,reasoning,"How do ""sleeper agents"" in LLM models pose sec...",The potential presence of 'sleeper agents' wit...
2,multi_context,How do researchers perceive large language mod...,NLP researchers were split on whether LLMs cou...
3,conditional,How does toxic content and low-quality data im...,Toxic content and low-quality data impact LLM ...
