In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
#from datasets import load_dataset
#from nltk.tokenize import word_tokenize
import re

  from .autonotebook import tqdm as notebook_tqdm





Task
1. Create a RAG pipeline that can take the following text and answer the following questions
2. Try different types of chunking to get better answers?
3. Does asking questions differently give better answers? Why?
4. Try a different similarity search instead of cosine similarity - do the answers improve?



In [6]:
# Source document for the retrieval experiments: paragraphs about the Amazon
# rainforest, separated by blank lines. The chunking cells split this string.
sample_text = """
The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers. It spans across nine countries, including Brazil, Peru, and Colombia. The rainforest is home to around 10% of the known species on Earth, including jaguars, sloths, and thousands of species of insects and birds.

Deforestation is a significant threat to the Amazon, with thousands of square kilometers lost each year due to agriculture, logging, and urbanization. This deforestation contributes to climate change, as the rainforest acts as a major carbon sink, absorbing millions of tons of carbon dioxide annually.

Indigenous tribes have lived in the Amazon for thousands of years, relying on its rich biodiversity for food, medicine, and shelter. These tribes have unique languages, traditions, and knowledge of the ecosystem. However, many face threats from illegal land encroachment and industrial activities.

Scientists believe that the Amazon plays a crucial role in global weather patterns by releasing water vapor into the atmosphere, which influences rainfall across South America and even other continents. The Amazon River, which flows through the rainforest, is the second longest river in the world and carries more water than any other river.

Efforts to protect the Amazon include international agreements, conservation programs, and sustainable development projects that aim to balance economic growth with environmental protection. Many organizations and governments are working to reduce illegal logging and promote reforestation initiatives.
"""

In [7]:
# Baseline question set; each entry is used as a retrieval query against the
# chunks of sample_text by the pipeline cells.
questions = [
    "What is the Amazon rainforest?",
    "Which countries does the Amazon span across?",
    "Why is deforestation a problem in the Amazon?",
    "How does the Amazon rainforest affect global weather patterns?",
    "What role do indigenous tribes play in the Amazon?",
    "What is the importance of the Amazon River?",
    "What types of wildlife can be found in the Amazon?",
    "How does deforestation contribute to climate change?",
    "What efforts are being made to protect the Amazon?",
    "Why is the Amazon considered a major carbon sink?"
]

### 1. Create a RAG pipeline that can take the following text and answer the following questions

In [2]:
# Identifier of the sentence-embedding checkpoint handed to SentenceTransformer.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Shared embedding model: the embedding helpers and RAG_pipeline functions read
# this module-level name, so this cell must run before any of them are called.
model = SentenceTransformer(MODEL_NAME)

In [3]:
def get_transformer_embeddings(texts):
    """Embed ``texts`` with the module-level sentence-transformer ``model``.

    Args:
        texts: The strings to embed; forwarded unchanged to ``model.encode``.

    Returns:
        The value returned by ``model.encode(texts, convert_to_numpy=True)``.
    """
    embeddings = model.encode(texts, convert_to_numpy=True)
    return embeddings

def retrieve_passage(stored_texts, stored_embeddings, query):
    """Return the stored chunk whose embedding is most cosine-similar to ``query``.

    Args:
        stored_texts: Indexable collection of text chunks.
        stored_embeddings: Embeddings for ``stored_texts``, in the same order.
        query: The question to look up.

    Returns:
        The element of ``stored_texts`` at the position of the highest
        similarity score.
    """
    # The query is wrapped in a list so it is embedded as a batch of one.
    query_vec = get_transformer_embeddings([query])

    # cosine_similarity produces one row of scores per query; only one query
    # was passed, so row 0 holds the score of every stored chunk.
    scores = cosine_similarity(query_vec, stored_embeddings)[0]
    return stored_texts[np.argmax(scores)]

def answer_question(stored_texts, stored_embeddings, query):
    """Answer ``query`` with the single best-matching stored chunk.

    This is a thin wrapper: the retrieved passage is returned as-is, with no
    further processing of the text.

    Args:
        stored_texts: Indexable collection of text chunks.
        stored_embeddings: Embeddings for ``stored_texts``, in the same order.
        query: The question to answer.

    Returns:
        Whatever ``retrieve_passage`` returns for these arguments.
    """
    return retrieve_passage(stored_texts, stored_embeddings, query)

In [20]:
def RAG_pipeline(stored_texts, _questions):
    """Print the retrieved answer for every question in ``_questions``.

    Args:
        stored_texts: Text chunks to search over.
        _questions: Iterable of question strings.

    Returns:
        None. Results are written to stdout as ``Q:`` / ``A:`` pairs.
    """
    # Embed all chunks once up front; every question reuses the same embeddings.
    chunk_embeddings = model.encode(stored_texts, convert_to_numpy=True)

    print("\nSample Questions and Answers:\n")
    for question in _questions:
        response = answer_question(stored_texts, chunk_embeddings, question)
        print(f"Q: {question}\nA: {response}\n")

### 2. Try different types of chunking to get better answers?

In [8]:
def split_text(text):
    """Split ``text`` into paragraph chunks.

    The text is cut at every run of one or more newline characters; each piece
    is stripped of surrounding whitespace and empty pieces are discarded.

    Args:
        text: The document to chunk.

    Returns:
        A list of non-empty, stripped paragraph strings in document order.
    """
    stripped_pieces = (piece.strip() for piece in re.split("\n+", text))
    return [piece for piece in stripped_pieces if piece]

# Chunk the sample document. Only the text chunks are stored here; their
# embeddings are computed later, inside the RAG_pipeline call.
stored_texts = split_text(sample_text)  # One chunk per paragraph

In [9]:
# Display the paragraph chunks for inspection.
stored_texts

['The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers. It spans across nine countries, including Brazil, Peru, and Colombia. The rainforest is home to around 10% of the known species on Earth, including jaguars, sloths, and thousands of species of insects and birds.',
 'Deforestation is a significant threat to the Amazon, with thousands of square kilometers lost each year due to agriculture, logging, and urbanization. This deforestation contributes to climate change, as the rainforest acts as a major carbon sink, absorbing millions of tons of carbon dioxide annually.',
 'Indigenous tribes have lived in the Amazon for thousands of years, relying on its rich biodiversity for food, medicine, and shelter. These tribes have unique languages, traditions, and knowledge of the ecosystem. However, many face threats from illegal land encroachment and industrial activities.',
 'Scientists believe that the Amazon plays a cruci

In [None]:
# Run the baseline questions against whatever chunks stored_texts currently
# holds (paragraph chunks when the cells are run in document order).
RAG_pipeline(stored_texts, questions)


Sample Questions and Answers:

Q: What is the Amazon rainforest?
A: The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers. It spans across nine countries, including Brazil, Peru, and Colombia. The rainforest is home to around 10% of the known species on Earth, including jaguars, sloths, and thousands of species of insects and birds.

Q: Which countries does the Amazon span across?
A: The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers. It spans across nine countries, including Brazil, Peru, and Colombia. The rainforest is home to around 10% of the known species on Earth, including jaguars, sloths, and thousands of species of insects and birds.

Q: Why is deforestation a problem in the Amazon?
A: Deforestation is a significant threat to the Amazon, with thousands of square kilometers lost each year due to agriculture, logging, and urbanization. T

##### This is the best approach: chunking by sentence gives the best and shortest answers

In [13]:
#Alternative form of chunking

def split_text(text):
    """Split ``text`` into sentence chunks using a regular expression.

    A split happens at a whitespace character that directly follows ``.`` or
    ``?``, except where the preceding characters match one of the two negative
    look-behinds in the pattern (``\\w\\.\\w.`` or ``[A-Z][a-z]\\.``, e.g. "Mr.").
    This redefines any earlier ``split_text`` in the notebook.

    Args:
        text: The document to chunk.

    Returns:
        A list of non-empty, stripped sentence strings in document order.
    """
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'
    chunks = []
    for piece in re.split(boundary, text):
        piece = piece.strip()
        if piece:
            chunks.append(piece)
    return chunks

# Re-chunk the sample document with the sentence splitter. Only text chunks are
# stored here; embeddings are computed later, inside the RAG_pipeline call.
stored_texts = split_text(sample_text)  # One chunk per sentence

In [14]:
# Display the sentence chunks for inspection.
stored_texts

['The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers.',
 'It spans across nine countries, including Brazil, Peru, and Colombia.',
 'The rainforest is home to around 10% of the known species on Earth, including jaguars, sloths, and thousands of species of insects and birds.',
 'Deforestation is a significant threat to the Amazon, with thousands of square kilometers lost each year due to agriculture, logging, and urbanization.',
 'This deforestation contributes to climate change, as the rainforest acts as a major carbon sink, absorbing millions of tons of carbon dioxide annually.',
 'Indigenous tribes have lived in the Amazon for thousands of years, relying on its rich biodiversity for food, medicine, and shelter.',
 'These tribes have unique languages, traditions, and knowledge of the ecosystem.',
 'However, many face threats from illegal land encroachment and industrial activities.',
 'Scientists believe that the 

In [None]:
# Run the baseline questions against whatever chunks stored_texts currently
# holds (sentence chunks when the cells are run in document order).
RAG_pipeline(stored_texts, questions)


Sample Questions and Answers:

Q: What is the Amazon rainforest?
A: The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers.

Q: Which countries does the Amazon span across?
A: It spans across nine countries, including Brazil, Peru, and Colombia.

Q: Why is deforestation a problem in the Amazon?
A: Deforestation is a significant threat to the Amazon, with thousands of square kilometers lost each year due to agriculture, logging, and urbanization.

Q: How does the Amazon rainforest affect global weather patterns?
A: Scientists believe that the Amazon plays a crucial role in global weather patterns by releasing water vapor into the atmosphere, which influences rainfall across South America and even other continents.

Q: What role do indigenous tribes play in the Amazon?
A: Indigenous tribes have lived in the Amazon for thousands of years, relying on its rich biodiversity for food, medicine, and shelter.

Q: What is the 

In [76]:
#Alternative form of chunking

def split_text(text):
    """Split ``text`` into comma-separated fragments.

    The text is cut at every comma. Inside each fragment, a period followed by
    two newlines is rewritten to a period and a single space, so a fragment can
    run across a paragraph break. Fragments are stripped and empty ones are
    dropped. This redefines any earlier ``split_text`` in the notebook.

    Args:
        text: The document to chunk.

    Returns:
        A list of non-empty, stripped fragments in document order.
    """
    fragments = []
    for raw_piece in re.split(r",", text):
        joined = re.sub(r"\.\n\n", ". ", raw_piece).strip()
        if joined:
            fragments.append(joined)
    return fragments

# Re-chunk the sample document with the comma splitter. Only text chunks are
# stored here; embeddings are computed later, inside the RAG_pipeline call.
stored_texts = split_text(sample_text)  # One chunk per comma-delimited fragment

In [77]:
# Display the comma-delimited fragments for inspection.
stored_texts

['The Amazon rainforest is the largest tropical rainforest in the world',
 'covering approximately 5.5 million square kilometers. It spans across nine countries',
 'including Brazil',
 'Peru',
 'and Colombia. The rainforest is home to around 10% of the known species on Earth',
 'including jaguars',
 'sloths',
 'and thousands of species of insects and birds. Deforestation is a significant threat to the Amazon',
 'with thousands of square kilometers lost each year due to agriculture',
 'logging',
 'and urbanization. This deforestation contributes to climate change',
 'as the rainforest acts as a major carbon sink',
 'absorbing millions of tons of carbon dioxide annually. Indigenous tribes have lived in the Amazon for thousands of years',
 'relying on its rich biodiversity for food',
 'medicine',
 'and shelter. These tribes have unique languages',
 'traditions',
 'and knowledge of the ecosystem. However',
 'many face threats from illegal land encroachment and industrial activities. Scient

In [78]:
# Run the baseline questions against whatever chunks stored_texts currently
# holds (comma-delimited fragments when the cells are run in document order).
RAG_pipeline(stored_texts, questions)


Sample Questions and Answers:

Q: What is the Amazon rainforest?
A: The Amazon rainforest is the largest tropical rainforest in the world

Q: Which countries does the Amazon span across?
A: is the second longest river in the world and carries more water than any other river. Efforts to protect the Amazon include international agreements

Q: Why is deforestation a problem in the Amazon?
A: and thousands of species of insects and birds. Deforestation is a significant threat to the Amazon

Q: How does the Amazon rainforest affect global weather patterns?
A: many face threats from illegal land encroachment and industrial activities. Scientists believe that the Amazon plays a crucial role in global weather patterns by releasing water vapor into the atmosphere

Q: What role do indigenous tribes play in the Amazon?
A: absorbing millions of tons of carbon dioxide annually. Indigenous tribes have lived in the Amazon for thousands of years

Q: What is the importance of the Amazon River?
A: is t

#### letting a tokenizer do the chunking

In [35]:
import nltk
# Download the 'punkt_tab' NLTK data package; presumably this is the tokenizer
# data that nltk.sent_tokenize (used by the chunker) relies on — TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Bobby\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [None]:
def chunk_text_by_sentences(text, chunk_length=100, drop_empty=False):
    """Pack NLTK-tokenized sentences into chunks of bounded character length.

    Sentences are appended to the current chunk while the chunk (including one
    trailing space per sentence) stays within ``chunk_length``. Otherwise the
    current chunk is emitted and a new one is started with that sentence. A
    sentence longer than ``chunk_length`` is never split; it becomes a chunk of
    its own.

    Args:
        text: The document to chunk.
        chunk_length: Maximum chunk size, measured in characters (``len``),
            not model tokens. Defaults to 100, the value previously hard-coded.
        drop_empty: When the very first sentence already exceeds
            ``chunk_length``, the still-empty initial chunk is emitted as
            ``""`` at index 0. The default (``False``) keeps that empty chunk,
            matching the original behaviour; pass ``True`` to remove it.

    Returns:
        A list of stripped chunk strings in document order.
    """
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        # +1 accounts for the space appended after each sentence.
        if len(current_chunk) + len(sentence) + 1 <= chunk_length:
            current_chunk += sentence + " "
        else:
            # NOTE: on the first iteration current_chunk is still "", which is
            # where the empty leading chunk comes from.
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())

    if drop_empty:
        chunks = [chunk for chunk in chunks if chunk]
    return chunks

In [80]:
# Chunk the sample document with the NLTK-based chunker, then display the
# result for inspection.
sentence_chunks = chunk_text_by_sentences(sample_text)
sentence_chunks

['',
 'The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers.',
 'It spans across nine countries, including Brazil, Peru, and Colombia.',
 'The rainforest is home to around 10% of the known species on Earth, including jaguars, sloths, and thousands of species of insects and birds.',
 'Deforestation is a significant threat to the Amazon, with thousands of square kilometers lost each year due to agriculture, logging, and urbanization.',
 'This deforestation contributes to climate change, as the rainforest acts as a major carbon sink, absorbing millions of tons of carbon dioxide annually.',
 'Indigenous tribes have lived in the Amazon for thousands of years, relying on its rich biodiversity for food, medicine, and shelter.',
 'These tribes have unique languages, traditions, and knowledge of the ecosystem.',
 'However, many face threats from illegal land encroachment and industrial activities.',
 'Scientists believe that

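##### This adds an empty chunk (the first sentence is longer than the chunk limit, so the still-empty buffer is emitted first). It can be returned when nothing matches the query, which is potentially a better model, depending on your business logic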

In [47]:
# Run the baseline questions against the NLTK-based chunks.
RAG_pipeline(sentence_chunks, questions)


Sample Questions and Answers:

Q: What is the Amazon rainforest?
A: The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers.

Q: Which countries does the Amazon span across?
A: It spans across nine countries, including Brazil, Peru, and Colombia.

Q: Why is deforestation a problem in the Amazon?
A: Deforestation is a significant threat to the Amazon, with thousands of square kilometers lost each year due to agriculture, logging, and urbanization.

Q: How does the Amazon rainforest affect global weather patterns?
A: Scientists believe that the Amazon plays a crucial role in global weather patterns by releasing water vapor into the atmosphere, which influences rainfall across South America and even other continents.

Q: What role do indigenous tribes play in the Amazon?
A: Indigenous tribes have lived in the Amazon for thousands of years, relying on its rich biodiversity for food, medicine, and shelter.

Q: What is the 

### 3. Does asking questions differently give better answers? Why?

In [81]:
#Alternative form of chunking

def split_text(text):
    """Split ``text`` into sentence chunks using a regular expression.

    Same pattern as the earlier regex-based sentence splitter in this notebook:
    split at whitespace that directly follows ``.`` or ``?``, unless one of the
    pattern's two negative look-behinds matches. Defining it again here makes
    ``split_text`` refer to the sentence splitter from this cell onward.

    Args:
        text: The document to chunk.

    Returns:
        A list of non-empty, stripped sentence strings in document order.
    """
    pieces = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    # filter(None, ...) discards the pieces that are empty after stripping.
    return list(filter(None, map(str.strip, pieces)))

# Re-chunk the sample document with the sentence splitter. Only text chunks are
# stored here; embeddings are computed later, inside the RAG_pipeline call.
stored_texts = split_text(sample_text)  # One chunk per sentence

In [256]:
# Reworded variants of the baseline questions, used to see how phrasing
# changes which chunk is retrieved.
questions1 = [
    "Tell me about the Amazon rainforest",
    "How many countries does the Amazon span across?",
    "Why is deforestation a problem for rainforest",
    "Does the Amazon rainforest affect global weather patterns?",
    "Do indigenous tribes play a role in the Amazon?",
    "What about the Amazon River?",
    "What types of wildlife cannot be found in the Amazon?",
    "What type of species life in the rainforest?",
    "How does climate change?",
    "What efforts are being made to defend the Amazon?",
    "Why is the Amazon considered a carbon sink?"
]

In [249]:
# Run the reworded questions against the current stored_texts chunks.
RAG_pipeline(stored_texts, questions1)


Sample Questions and Answers:

Q: Tell me about the Amazon rainforest
A: The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers.

Q: How many countries does the Amazon span across?
A: It spans across nine countries, including Brazil, Peru, and Colombia.

Q: Why is deforestation a problem for rainforest
A: Deforestation is a significant threat to the Amazon, with thousands of square kilometers lost each year due to agriculture, logging, and urbanization.

Q: Does the Amazon rainforest affect global weather patterns?
A: Scientists believe that the Amazon plays a crucial role in global weather patterns by releasing water vapor into the atmosphere, which influences rainfall across South America and even other continents.

Q: Do indigenous tribes play a role in the Amazon?
A: Indigenous tribes have lived in the Amazon for thousands of years, relying on its rich biodiversity for food, medicine, and shelter.

Q: What about 

In [96]:
# Stress-test queries: vague, off-topic, misspelled, and very long phrasings.
# NOTE(review): the misspellings look deliberate for this experiment — confirm
# before "fixing" them.
questions2 = [
    "What is rain",
    "What can you say about species?",
    "How old is the rainforest",
    "what is the meaning of life",
    "How are you today, mister robot",
    "What issues does the ranforest face",
    "Is it illegal to cut down the rainforst?",
    "Is it against the law to cut down the rainforst?",
    "If i may ask a stupid question, my good sir, on the behalf of the rainforest of course, what would you dare say be the most important issue that the rainforest currently faces?"
]

In [95]:
# Run the stress-test queries against the current stored_texts chunks.
RAG_pipeline(stored_texts, questions2)


Sample Questions and Answers:

Q: What is rain
A: Scientists believe that the Amazon plays a crucial role in global weather patterns by releasing water vapor into the atmosphere, which influences rainfall across South America and even other continents.

Q: What can you say about species?
A: The rainforest is home to around 10% of the known species on Earth, including jaguars, sloths, and thousands of species of insects and birds.

Q: How old is the rainforest
A: The rainforest is home to around 10% of the known species on Earth, including jaguars, sloths, and thousands of species of insects and birds.

Q: what is the meaning of life
A: These tribes have unique languages, traditions, and knowledge of the ecosystem.

Q: How are you today, mister robot
A: However, many face threats from illegal land encroachment and industrial activities.

Q: What issues does the ranforest face
A: However, many face threats from illegal land encroachment and industrial activities.

Q: Is it illegal to cu

In [224]:
# Short queries with little or no connection to the document, used to see what
# the retriever returns when there is no good match.
questions4 = [
    "Can you fish in spaghetti",
    "mexico",
    "sweden",
    "italy",
    "norway",
    "many fish",
    "many fish skill",
    "working"
]

In [225]:
# Run the no-match queries against the NLTK-based chunks (which include the
# empty chunk). Swap the first argument to stored_texts to compare chunk lists.
RAG_pipeline(sentence_chunks, questions4)


Sample Questions and Answers:

Q: Can you fish in spaghetti
A: 

Q: mexico
A: It spans across nine countries, including Brazil, Peru, and Colombia.

Q: sweden
A: 

Q: italy
A: It spans across nine countries, including Brazil, Peru, and Colombia.

Q: norway
A: It spans across nine countries, including Brazil, Peru, and Colombia.

Q: many fish
A: 

Q: many fish skill
A: These tribes have unique languages, traditions, and knowledge of the ecosystem.

Q: working
A: 



#### The word "wildlife" is not used in the text; the question has to use "species" instead

### 4. Try a different similarity search instead of cosine similarity - do the answers improve?

In [264]:
def euclidean_distance(vec1, vec2):
    """Return the Euclidean distance between ``vec1`` and ``vec2``.

    Args:
        vec1: First vector; must support ``vec1 - vec2``.
        vec2: Second vector.

    Returns:
        ``np.linalg.norm`` of the element-wise difference.
    """
    offset = vec1 - vec2
    return np.linalg.norm(offset)

def get_transformer_embeddings2(texts):
    """Embed ``texts`` with the module-level sentence-transformer ``model``.

    Args:
        texts: The strings to embed; forwarded unchanged to ``model.encode``.

    Returns:
        The value returned by ``model.encode(texts, convert_to_numpy=True)``.
    """
    embeddings = model.encode(texts, convert_to_numpy=True)
    return embeddings

def retrieve_passage2(stored_texts, stored_embeddings, query):
    """Return the stored chunk whose embedding is closest to ``query``.

    Closeness is measured with ``euclidean_distance``; the chunk with the
    smallest distance wins.

    Args:
        stored_texts: Indexable collection of text chunks.
        stored_embeddings: Embeddings for ``stored_texts``, in the same order.
        query: The question to look up.

    Returns:
        The element of ``stored_texts`` at the position of the smallest
        distance.
    """
    # The query is wrapped in a list so it is embedded as a batch of one.
    query_vec = get_transformer_embeddings2([query])

    distances = []
    for candidate_vec in stored_embeddings:
        distances.append(euclidean_distance(query_vec, candidate_vec))

    return stored_texts[np.argmin(distances)]

def answer_question2(stored_texts, stored_embeddings, query):
    """Answer ``query`` with the chunk nearest to it by Euclidean distance.

    This is a thin wrapper: the retrieved passage is returned as-is.

    Args:
        stored_texts: Indexable collection of text chunks.
        stored_embeddings: Embeddings for ``stored_texts``, in the same order.
        query: The question to answer.

    Returns:
        Whatever ``retrieve_passage2`` returns for these arguments.
    """
    return retrieve_passage2(stored_texts, stored_embeddings, query)

In [265]:
def RAG_pipeline2(stored_texts, _questions):
    """Print the Euclidean-distance answer for every question in ``_questions``.

    Args:
        stored_texts: Text chunks to search over.
        _questions: Iterable of question strings.

    Returns:
        None. Results are written to stdout as ``Q:`` / ``A:`` pairs.
    """
    # Embed all chunks once up front; every question reuses the same embeddings.
    chunk_embeddings = model.encode(stored_texts, convert_to_numpy=True)

    print("\nSample Questions and Answers:\n")
    for question in _questions:
        response = answer_question2(stored_texts, chunk_embeddings, question)
        print(f"Q: {question}\nA: {response}\n")

In [266]:
# Run the baseline questions with Euclidean-distance retrieval over the
# NLTK-based chunks, for comparison with the cosine-similarity run.
RAG_pipeline2(sentence_chunks, questions)


Sample Questions and Answers:

Q: What is the Amazon rainforest?
A: The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers.

Q: Which countries does the Amazon span across?
A: It spans across nine countries, including Brazil, Peru, and Colombia.

Q: Why is deforestation a problem in the Amazon?
A: Deforestation is a significant threat to the Amazon, with thousands of square kilometers lost each year due to agriculture, logging, and urbanization.

Q: How does the Amazon rainforest affect global weather patterns?
A: Scientists believe that the Amazon plays a crucial role in global weather patterns by releasing water vapor into the atmosphere, which influences rainfall across South America and even other continents.

Q: What role do indigenous tribes play in the Amazon?
A: Indigenous tribes have lived in the Amazon for thousands of years, relying on its rich biodiversity for food, medicine, and shelter.

Q: What is the 

In [297]:
def euclidean_distance(vec1, vec2):
    """Return the Euclidean distance between ``vec1`` and ``vec2``.

    NOTE(review): this redefines the same-named function from an earlier cell
    and computes the same value.

    Author's experiment notes, kept for context:
        - No normalisation is applied here. For unit-length vectors the squared
          Euclidean distance equals ``2 - 2 * cosine_similarity``, so ranking
          by this distance should match ranking by cosine similarity; the
          author reports not seeing that in practice.
        - The vectorizer's ``max_features`` setting was suspected to play a
          role; this was not verified.
        - An ``np.linalg.multi_dot``-based variant (without length
          normalisation) was tried and reportedly gave the same result here.

    Args:
        vec1: First vector; must support ``vec1 - vec2``.
        vec2: Second vector.

    Returns:
        ``np.linalg.norm`` of the element-wise difference.
    """
    offset = vec1 - vec2
    return np.linalg.norm(offset)

def retrieve_passage3(stored_texts, query, max_features=100):
    """Return the stored chunk most similar to ``query`` under TF-IDF weighting.

    A ``TfidfVectorizer`` is fitted on ``stored_texts`` on every call, the
    query is projected into the same vocabulary, and the chunk with the highest
    cosine similarity is returned.

    Args:
        stored_texts: Indexable collection of text chunks to search over.
        query: The question to look up.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
            Defaults to 100, the value previously hard-coded, so existing
            callers are unaffected; expose it to experiment with the setting.

    Returns:
        The element of ``stored_texts`` with the highest similarity score.
        Ties (including the case where every score is equal) resolve to the
        earliest chunk, because ``argmax`` returns the first maximum.
    """
    # NOTE: refitting per query is wasteful for many questions, but keeps the
    # function self-contained and its signature unchanged.
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(stored_texts)
    query_vector = vectorizer.transform([query])

    # Shape is (1, number_of_chunks): one row because there is one query.
    similarities = cosine_similarity(query_vector, tfidf_matrix)

    # With a single row, the flat argmax index is the chunk index.
    most_similar_index = similarities.argmax()
    return stored_texts[most_similar_index]


def answer_question3(stored_texts, query):
    """Answer ``query`` with the best TF-IDF match among ``stored_texts``.

    This is a thin wrapper: the retrieved passage is returned as-is.

    Args:
        stored_texts: Indexable collection of text chunks.
        query: The question to answer.

    Returns:
        Whatever ``retrieve_passage3`` returns for these arguments.
    """
    return retrieve_passage3(stored_texts, query)

In [295]:
def RAG_pipeline3(stored_texts, _questions):
    """Print the TF-IDF answer for every question in ``_questions``.

    Unlike the embedding-based pipelines, nothing is precomputed here: each
    call to ``answer_question3`` receives the raw chunks.

    Args:
        stored_texts: Text chunks to search over.
        _questions: Iterable of question strings.

    Returns:
        None. Results are written to stdout as ``Q:`` / ``A:`` pairs.
    """
    print("\nSample Questions and Answers:\n")
    for question in _questions:
        response = answer_question3(stored_texts, question)
        print(f"Q: {question}\nA: {response}\n")

In [296]:
# Run the baseline questions with TF-IDF retrieval over the NLTK-based chunks.
RAG_pipeline3(sentence_chunks, questions)


Sample Questions and Answers:

Q: What is the Amazon rainforest?
A: The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers.

Q: Which countries does the Amazon span across?
A: It spans across nine countries, including Brazil, Peru, and Colombia.

Q: Why is deforestation a problem in the Amazon?
A: The Amazon rainforest is the largest tropical rainforest in the world, covering approximately 5.5 million square kilometers.

Q: How does the Amazon rainforest affect global weather patterns?
A: Scientists believe that the Amazon plays a crucial role in global weather patterns by releasing water vapor into the atmosphere, which influences rainfall across South America and even other continents.

Q: What role do indigenous tribes play in the Amazon?
A: Indigenous tribes have lived in the Amazon for thousands of years, relying on its rich biodiversity for food, medicine, and shelter.

Q: What is the importance of the Amazon R

##### It is really hard to tell what makes the model better: you have to read the document and compare the inputs and outputs manually, and it takes a lot of effort to get right