In [1]:
# Loader that fetches raw HTML, and transformer that turns HTML into plain text
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer

# Wikipedia article on the 2023 Cricket World Cup, used as the source document
url = "https://en.wikipedia.org/wiki/2023_Cricket_World_Cup"

# Fetch the page
loader = AsyncHtmlLoader(url)
data = loader.load()

# Convert the fetched HTML documents to text
html2text = Html2TextTransformer()
data_transformed = html2text.transform_documents(data)

Fetching pages: 100%|###########################################################################| 1/1 [00:00<00:00, 17.17it/s]


In [2]:
# Pinned install of ragas. The install log for this cell shows that it also
# downgrades langchain-text-splitters (0.2.4 -> 0.0.2) and that pip suggests
# restarting the kernel before the updated packages are used.
%pip install ragas==0.1.11
 
# Import necessary libraries
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Chat models handed to the generator as its generator and critic LLMs,
# plus the embedding model it uses
generator_llm = ChatOpenAI(model="gpt-4o-mini")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

# Build the test-set generator from the LangChain model objects
generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Generate 20 samples: 50% simple, 25% reasoning, 25% multi-context
testset = generator.generate_with_langchain_docs(
    data_transformed,
    test_size=20,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)

Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain->ragas==0.1.11)
  Using cached langchain_text_splitters-0.0.2-py3-none-any.whl.metadata (2.2 kB)
Using cached langchain_text_splitters-0.0.2-py3-none-any.whl (23 kB)
Installing collected packages: langchain-text-splitters
  Attempting uninstall: langchain-text-splitters
    Found existing installation: langchain-text-splitters 0.2.4
    Uninstalling langchain-text-splitters-0.2.4:
      Successfully uninstalled langchain-text-splitters-0.2.4
Successfully installed langchain-text-splitters-0.0.2
Note: you may need to restart the kernel to use updated packages.


embedding nodes:   0%|          | 0/36 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/20 [00:00<?, ?it/s]

In [4]:
# Display the generated test set as a pandas DataFrame for inspection
testset.to_pandas()

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What was the public's reaction to the Official...,"[ 2023.\n 24. **^** ""ICC announce Match Offic...",The public's reaction to the Official Cricket ...,simple,[{'source': 'https://en.wikipedia.org/wiki/202...,True
1,What was the deadline for teams to finalize th...,[ full members who took\npart in the knock-out...,Teams were asked to finalize their 15-player s...,simple,[{'source': 'https://en.wikipedia.org/wiki/202...,True
2,What was the outcome of the match between Indi...,[ **Netherlands** \n223 (46.3 overs) \n---|-...,India won by 7 wickets against Pakistan on 14 ...,simple,[{'source': 'https://en.wikipedia.org/wiki/202...,True
3,What events are scheduled for international cr...,[ Qatar\n * Ireland v Afghanistan in the UAE\...,The context does not provide specific events s...,simple,[{'source': 'https://en.wikipedia.org/wiki/202...,True
4,What was the margin of victory for India in th...,[ \n388 (49.2 overs) | **v** | **New Zealand*...,India won by 100 runs against England.,simple,[{'source': 'https://en.wikipedia.org/wiki/202...,True
5,What significant cricket match took place at E...,[ \n**India won by 243 runs** \nEden Gardens...,"On 5 November 2023, India secured a significan...",simple,[{'source': 'https://en.wikipedia.org/wiki/202...,True
6,What was the outcome of the match between Sout...,"[ runs** \nM. A. Chidambaram Stadium, Chennai...",South Africa won by 229 runs against England o...,simple,[{'source': 'https://en.wikipedia.org/wiki/202...,True
7,What was the margin of victory for South Afric...,[ \n388 (49.2 overs) | **v** | **New Zealand*...,South Africa won by 190 runs against New Zealand.,simple,[{'source': 'https://en.wikipedia.org/wiki/202...,True
8,What significant events marked the start of th...,[\n---|---|--- \nMehidy Hasan 74 (89) \nReec...,The group stage of the 2023 Cricket World Cup ...,simple,[{'source': 'https://en.wikipedia.org/wiki/202...,True
9,What was the outcome of the match between Aust...,[ \n388 (49.2 overs) | **v** | **New Zealand*...,Australia won by 5 runs against New Zealand on...,simple,[{'source': 'https://en.wikipedia.org/wiki/202...,True


# Recreating RAG pipeline

#### Indexing and storage

In [6]:

#import libraries
from langchain_text_splitters import CharacterTextSplitter
# Character-based splitter: break on newlines, target 1000 characters per
# chunk, and let consecutive chunks share 200 characters
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separator="\n",
)

# Split the text of the first (only) transformed document into chunks
chunks = text_splitter.create_documents([data_transformed[0].page_content])

# Report how many chunks were produced
print(f"The number of chunks created : {len(chunks)}")


The number of chunks created : 65


In [9]:

# Import FAISS class from vectorstore library
from langchain_community.vectorstores import FAISS

# Import OpenAIEmbeddings from the library
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorise the chunks created by the text splitter.
# NOTE: this rebinds `embeddings`, which earlier held the default
# OpenAIEmbeddings() instance passed to the TestsetGenerator.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Embed every chunk and build the FAISS vector index from them
db = FAISS.from_documents(chunks, embeddings)

# Sanity check: the number of indexed vectors should equal len(chunks)
db.index.ntotal

65

#### Retrieval function

In [11]:
# Import FAISS class from vectorstore library
from langchain_community.vectorstores import FAISS

# Import OpenAIEmbeddings from the library
from langchain_openai import OpenAIEmbeddings

def retrieve_context(query, db):
    """Return the text of the chunk in ``db`` that best matches ``query``.

    Args:
        query: The question to search the vector store with.
        db: A vector store exposing ``similarity_search(query, k=...)``
            (for example a LangChain FAISS index).

    Returns:
        The ``page_content`` of the top-ranked chunk as a string, or an empty
        string when the search returns no documents.
    """
    # No embeddings object is built here: the search is delegated entirely to
    # the store that is passed in. Only the best hit is used, so ask for one.
    docs = db.similarity_search(query, k=1)

    # Guard against an empty index / no results instead of raising IndexError
    if not docs:
        return ""

    return str(docs[0].page_content)


#### Augmentation function

In [12]:
def create_augmeted(query, db_path):
    """Build the augmented prompt for ``query`` from its retrieved context.

    Args:
        query: The user question.
        db_path: Despite the name, this value is forwarded unchanged to
            ``retrieve_context`` as its ``db`` argument.

    Returns:
        A ``(retrieved_context, augmented_prompt)`` tuple.
    """
    # Fetch the supporting context for this question
    retrieved_context = retrieve_context(query, db_path)

    # Embed the question and the context in the instruction template
    prompt = f"""

    Given the context below answer the question.

    Question: {query} 

    Context : {retrieved_context}

    Remember to answer only based on the context provided and not from any other source. 

    If the question cannot be answered based on the provided context, say I don’t know.

    """

    return retrieved_context, prompt

#### RAG function

In [13]:
# Importing the OpenAI library
from openai import OpenAI

def create_rag(query, db, model="gpt-4o-mini", client=None):
    """Run the retrieve -> augment -> generate pipeline for one question.

    Args:
        query: The user question.
        db: Vector store passed through to ``create_augmeted``.
        model: Name of the chat model to call. Defaults to ``"gpt-4o-mini"``,
            the value that was previously hard-coded.
        client: Optional pre-built ``OpenAI`` client. When omitted, a new
            client is created for this call (the original behaviour). Passing
            one lets callers reuse a single client across many questions.

    Returns:
        A ``(retrieved_context, answer)`` tuple, where ``answer`` is the
        message content of the first completion choice.
    """
    retrieved_context, augmented_prompt = create_augmeted(query, db)

    # Instantiate the OpenAI client only if the caller did not supply one
    if client is None:
        client = OpenAI()

    # Make the API call passing the augmented prompt to the LLM
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": augmented_prompt},
        ],
    )

    # Extract the answer from the response object
    answer = response.choices[0].message.content

    return retrieved_context, answer

# Testing the pipeline

In [14]:

# Smoke test: create_rag returns a (retrieved_context, answer) tuple
create_rag("Who won the 2023 Cricket World Cup?", db )

("Participants| 10  \nMatches| 48  \nAttendance| 1,250,307 (26,048 per match)  \nPlayer of the series|  Virat Kohli  \nMost runs|  Virat Kohli (765)  \nMost wickets|  Mohammed Shami (24)  \nOfficial website| cricketworldcup.com  \n<- 2019 England & Wales _South Africa, Zimbabwe and Namibia 2027_ ->  \n  \nThe **2023 ICC Men's Cricket World Cup** (also referred to as simply the\n**2023 Cricket World Cup**) was the 13th edition of the  Cricket World Cup, a\nquadrennial One Day International (ODI) cricket tournament organized by the\nInternational Cricket Council (ICC). It was hosted from 5 October to 19\nNovember 2023 across ten venues in India. This was the fourth World Cup held\nin India, but the first where India was the sole host.\nThe tournament was contested by ten national teams, maintaining the same\nformat used in 2019. After six weeks of round-robin matches, India, South\nAfrica, Australia, and New Zealand finished as the top four and qualified for",
 'I don’t know.')

In [15]:
# Second smoke test with a different question against the same index
create_rag("What was Virat Kohli's achievement in the Cup?", db)

('format used in 2019. After six weeks of round-robin matches, India, South\nAfrica, Australia, and New Zealand finished as the top four and qualified for\nthe knockout stage. In the knockout stage, India and Australia beat New\nZealand and South Africa, respectively, to advance to the final, played on 19\nNovember at the Narendra Modi Stadium in Ahmedabad. Australia won the final by\nsix wickets, winning their sixth Cricket World Cup title.\nVirat Kohli was named the player of the tournament and also scored the most\nruns, while Mohammed Shami was the leading wicket-taker. A total of 1,250,307\nspectators attended the matches, the highest number in any Cricket World Cup\nto date.[1] The tournament final set viewership records in India, drawing 518\nmillion viewers, with a peak of 57 million streaming viewers.\n## Background\nOn 11 December 2017, India was announced by the ICC as hosts of the 2023\nCricket World Cup; while India had served as a co-host during three previous',
 'Virat K

# Evaluations

In [17]:
# Convert the test set once and take questions and ground truths from the
# same DataFrame so the two lists are aligned row by row
testset_df = testset.to_pandas()
questions_list = testset_df.question.to_list()
gt_list = testset_df.ground_truth.to_list()

answer_list = []
context_list = []

# Run the RAG pipeline on every question, in the same order as questions_list,
# so answers and contexts line up with the questions and ground truths
for question in questions_list:
    rag_context, rag_answer = create_rag(question, db)
    answer_list.append(rag_answer)
    # 'contexts' holds a list of strings per sample; the pipeline returns a
    # single retrieved chunk, so wrap it in a one-element list
    context_list.append([rag_context])

# Create dictionary of question, answer, context and ground truth
data_samples = {
    'question': questions_list,
    'answer': answer_list,
    'contexts': context_list,
    'ground_truth': gt_list,
}

RAGAs expects the evaluation set as a Hugging Face `Dataset` object. `datasets` is a lightweight library from Hugging Face for working with such datasets.

In [18]:
# Install the datasets package (version pinned). As the pip note in this
# cell's output says, a kernel restart may be needed to use updated packages.
%pip install datasets==2.20.0

# Import the Datasets library
from datasets import Dataset

# Create Dataset from the dictionary; each key of data_samples
# ('question', 'answer', 'contexts', 'ground_truth') becomes a field of
# every record
dataset = Dataset.from_dict(data_samples)


Note: you may need to restart the kernel to use updated packages.


In [21]:
# Inspect a single evaluation record (index 2): question, answer, contexts, ground truth
dataset[2]

{'question': 'What was the outcome of the match between India and Pakistan on 14 October 2023?',
 'answer': 'India won the match against Pakistan on 14 October 2023 by 7 wickets.',
 'contexts': ['261/3 (41.3 overs)  \n---|---|---  \n|  |   \n**India won by 7 wickets**  \nMaharashtra Cricket Association Stadium, Pune  \n---  \n  \n20 October 2023  \nScorecard  \n---  \n**Australia **  \n367/9 (50 overs) | **v** | **Pakistan**  \n305 (45.3 overs)  \n---|---|---  \n|  |   \n**Australia won by 62 runs**  \nM. Chinnaswamy Stadium, Bangalore  \n---  \n  \n21 October 2023  \nScorecard  \n---  \n**Netherlands **  \n262 (49.4 overs) | **v** | **Sri Lanka**  \n263/5 (48.2 overs)  \n---|---|---  \n|  |   \n**Sri Lanka won by 5 wickets**  \nBRSABV Ekana Cricket Stadium, Lucknow  \n---  \n  \n21 October 2023  \nScorecard  \n---  \n**South Africa **  \n399/7 (50 overs) | **v** | **England**  \n170 (22 overs)  \n---|---|---  \n|  |   \n**South Africa won by 229 runs**  \nWankhede Stadium, Mumbai  \n-

In [22]:
# Inspect another evaluation record (index 7)
dataset[7]

{'question': 'What was the margin of victory for South Africa in their match against New Zealand?',
 'answer': 'I don’t know.',
 'contexts': ['* The match was reduced to 37 overs per side due to rain.\n  * England were set a revised target of 197 runs from 37 overs due to rain. \n2 October 2023  \n14:00  \nScorecard  \n---  \n**New Zealand **  \n321/6 (50 overs) | **v** | **South Africa**  \n211/4 (37 overs)  \n---|---|---  \nDevon Conway 78 (73)  \nLungi Ngidi 3/33 (7 overs) |  | Quinton de Kock 84* (89)   \nTrent Boult 2/20 (5 overs)  \n**New Zealand won by 7 runs (DLS method)**  \nGreenfield International Stadium, Thiruvananthapuram  \nUmpires: Chris Gaffaney (NZ) and Richard Kettleborough (Eng)  \n---  \n  \n  * New Zealand won the toss and elected to bat.\n  * South Africa innings curtailed at 37 overs due to rain; DLS par score was 219. \n3 October 2023  \n14:00  \nScorecard  \n---  \n**Sri Lanka **  \n294 (46.2 overs) | **v** | **Afghanistan**  \n261/4 (38.1 overs)  \n---|---|--

## Calculating evaluation metrics

We will use RAGAs metrics, which score the retrieved contexts and the generated answers against the questions and the ground truths.

In [19]:
#Import all the libraries
from ragas import evaluate

from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_entity_recall,
    answer_similarity,
    answer_correctness
)

from ragas.metrics.critique import (
    harmfulness, 
    maliciousness, 
    coherence, 
    correctness, 
    conciseness
)

# Score the evaluation dataset on every selected metric
result = evaluate(
    dataset=dataset,
    metrics=[
        # Metrics imported from ragas.metrics
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_entity_recall,
        answer_similarity,
        answer_correctness,
        # Aspect critiques imported from ragas.metrics.critique
        harmfulness,
        maliciousness,
        coherence,
        correctness,
        conciseness,
    ],
)


Evaluating:   0%|          | 0/240 [00:00<?, ?it/s]

In [20]:
import json

# Serialise the aggregate metric scores as indented JSON and print them
formatted_scores = json.dumps(result, indent=4)
print(formatted_scores)

{
    "context_precision": 0.749999999925,
    "faithfulness": 0.48583333333333334,
    "answer_relevancy": 0.6636592815353355,
    "context_recall": 0.6083333333333333,
    "context_entity_recall": 0.3728388270962975,
    "answer_similarity": 0.8874326228624116,
    "answer_correctness": 0.46588081819686256,
    "harmfulness": 0.0,
    "maliciousness": 0.05,
    "coherence": 0.7,
    "correctness": 0.7,
    "conciseness": 0.75
}
