In [1]:
import os
import re
from warnings import filterwarnings
from typing import List
import pandas as pd
from langchain.text_splitter import SpacyTextSplitter
from langchain_community.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [2]:
filterwarnings("ignore")

# The OpenAI API key is only needed for the augmented generation section.
# It is read from the OPENAI_API_KEY environment variable instead of being
# hardcoded here, so the secret never ends up in the saved notebook or in
# version control, and a key that is already configured in the environment
# is not overwritten by a placeholder value.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai_api_key:
    # print (not warnings.warn) because warnings are silenced above
    print("OPENAI_API_KEY is not set - export it before running the "
          "augmented generation section.")

# Loading the data

To get a sense of the text lengths, the number of characters and the number of tokens (words of two or more characters, matched with a basic regex pattern) were added to the dataframe for each article.

In [3]:
data = pd.read_csv("data/medium.csv")

# Length statistics per article: raw character count, and a word count where
# a token is any run of two or more word characters.
data["N_Characters"] = data["Text"].map(len)
data["N_Tokens"] = data["Text"].map(
    lambda text: len(re.findall(r"(?u)\b\w\w+\b", text))
)

data.head()

Unnamed: 0,Title,Text,N_Characters,N_Tokens
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec\n\nWord2vec is one...,10559,1460
1,Hands-on Graph Neural Networks with PyTorch & ...,"In my last article, I introduced the concept o...",829,137
2,How to Use ggplot2 in Python,Introduction\n\nThanks to its strict implement...,5656,935
3,Databricks: How to Save Data Frames as CSV Fil...,Photo credit to Mika Baumeister from Unsplash\...,1784,278
4,A Step-by-Step Implementation of Gradient Desc...,A Step-by-Step Implementation of Gradient Desc...,4797,712


The articles vary in length, ranging from 250 to over 47 000 characters.

In [4]:
# Summary statistics of the numeric columns, one row per column.
data.describe().transpose().round(decimals=2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
N_Characters,1391.0,5566.38,5558.4,250.0,1896.5,3063.0,7627.5,47509.0
N_Tokens,1391.0,877.46,867.84,43.0,302.0,495.0,1193.5,7385.0


Loading data into langchain format:

In [5]:
# Wrap the dataframe for LangChain: the "Text" column is used as the page
# content of each loaded document. Later cells read the article title from
# each document's metadata["Title"].
loader = DataFrameLoader(data_frame=data, page_content_column="Text")
articles = loader.load()

# Indexing

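SpacyTextSplitter is used to divide articles along sentence boundaries. It uses a chunk size of 1000 characters and a chunk overlap of 150 characters (the splitter measures length in characters by default, not in tokens) so that large documents are broken into pieces suitable for embedding and retrieval.\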
HuggingFaceEmbeddings is then used to obtain embeddings for each chunk.\
Finally, Chroma is used to create a vector database (db) from the split documents and their embeddings, persisting it to the specified directory ("db") - not included in repository due to size over 130 MB.

In [6]:
DB_DIR = "db"

text_splitter = SpacyTextSplitter(chunk_size=1000, chunk_overlap=150)
splits = text_splitter.split_documents(articles)
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Building the store embeds every chunk and writes it to DB_DIR. Calling
# Chroma.from_documents again on an existing directory would re-embed the
# whole corpus and add the same chunks a second time, so retrieval would start
# returning duplicates. Reuse the persisted store when it is already there.
# Delete the DB_DIR directory to force a rebuild (e.g. after changing the data,
# the chunking parameters or the embedding model).
if os.path.isdir(DB_DIR) and os.listdir(DB_DIR):
    db = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
else:
    db = Chroma.from_documents(splits,
                               embeddings,
                               persist_directory=DB_DIR)

# Retrieval

Utilities:

In [7]:
def get_retriever(vector_store_database, search_type: str, k: int):
    """
    Build a retriever on top of a vector store.

    Args:
        vector_store_database: Vector database exposing ``as_retriever``.
        search_type: "similarity" for plain vector similarity search, or
                "mmr" to balance similarity against diversity of the
                retrieved chunks.
        k: Number of chunks the retriever returns per query.
    Returns:
        retriever: Whatever ``vector_store_database.as_retriever`` returns
                for the given search type and ``k``.
    """
    retriever_options = {
        "search_type": search_type,
        "search_kwargs": {"k": k},
    }
    return vector_store_database.as_retriever(**retriever_options)


def print_retrieved_fragments(results: list):
    """Pretty-print retrieved fragments: article title, a rule, then the chunk text."""
    title_rule = "-" * 70
    fragment_rule = "#" * 70
    for fragment in results:
        article_title = fragment.metadata['Title']
        print(f"From article: {article_title}")
        print(title_rule)
        print(fragment.page_content)
        # blank line, separator, blank line between consecutive fragments
        print("\n", fragment_rule, "\n")


def get_and_save_retrievals_to_csv(questions: List[str],
                                   path: str,
                                   db,
                                   search_type: str = "similarity",
                                   k: int = 5) -> pd.DataFrame:
    """
    Retrieve article chunks for several questions and save them to a csv file.

    Args:
        questions: List of strings - query questions for retrieval.
        path: Save csv file path.
        db: Vector store database.
        search_type: Use "similarity" for vector similarity search,
                or "mmr" for optimizing both similarity and retrieved
                chunks diversity.
        k: Number of returned chunks per question.
    Returns:
        results_df: pd.DataFrame with one row per retrieved chunk and the
                columns "Title", "Text_chunk" and "Question". The same frame
                is written to ``path`` without the index.
    Raises:
        KeyError: If a retrieved document has no "Title" entry in its metadata.
    """
    column_names = ["Title", "Text_chunk", "Question"]

    # The retriever depends only on db / search_type / k, so it is built once
    # instead of once per question.
    retriever = get_retriever(db, search_type, k)

    # One record per (question, retrieved chunk) pair, in retrieval order.
    records = [
        {
            "Title": doc.metadata["Title"],
            "Text_chunk": doc.page_content,
            "Question": question,
        }
        for question in questions
        for doc in retriever.invoke(question)
    ]

    # Passing the columns explicitly keeps the header even when nothing was retrieved.
    results_df = pd.DataFrame(records, columns=column_names)
    results_df.to_csv(path, index=False)

    return results_df

Saving retrieved chunks with questions to csv file for further analysis or usage:

In [8]:
question_1 = "What is deep learning and how to learn it?"
question_2 = "How to use PCA and what are the benefits?"

# search_type and k are left at the function defaults ("similarity", 5);
# the resulting table is also written to save_path.
save_path = "retrieval_results.csv"
df = get_and_save_retrievals_to_csv(questions=[question_1, question_2],
                                    path=save_path,
                                    db=db)
df

Unnamed: 0,Title,Text_chunk,Question
0,Why Deep Learning Is Not A Silver Bullet For A...,Deep learning is a form of AI that was designe...,What is deep learning and how to learn it?
1,Convolutional Neural Network: A Step By Step G...,You’ll be told further in this tutorial.)\n\n\...,What is deep learning and how to learn it?
2,A deep intuition to deep learning,“Deep Learning is the process of learning the ...,What is deep learning and how to learn it?
3,Convolutional Neural Network: A Step By Step G...,Step 5: Exploring Deep Learning\n\nDeep learni...,What is deep learning and how to learn it?
4,Machine Learning in Energy,Deep learning = a family of machine learning m...,What is deep learning and how to learn it?
5,Identifying the right meaning of the words usi...,Principal Component Analysis (PCA)\n\nPCA is a...,How to use PCA and what are the benefits?
6,Tidying up with PCA: An Introduction to Princi...,Principal component analysis (PCA) is a techni...,How to use PCA and what are the benefits?
7,Dimensionality Reduction toolbox in python,Let’s try to understand more about PCA and how...,How to use PCA and what are the benefits?
8,Principal Component Analysis for Dimensionalit...,Introduction to Principal Component Analysis\n...,How to use PCA and what are the benefits?
9,Principal Component Analysis — Math and Intuit...,"As promised, this is the third and last post o...",How to use PCA and what are the benefits?


Displaying retrieval results for question_1 (What is deep learning and how to learn it?):

In [9]:
# MMR retriever returning 5 chunks. NOTE: this `retriever` object is also the
# one plugged into the RAG chain in the augmented generation section.
retriever = get_retriever(db, search_type="mmr", k=5)

results = retriever.invoke(question_1)
print_retrieved_fragments(results)

From article: Why Deep Learning Is Not A Silver Bullet For Autonomous Vehicles
----------------------------------------------------------------------
Deep learning is a form of AI that was designed to work like the human brain.



Engineers teach it much the same way a human learns.

As an example, imagine you’re trying to create a deep-learning algorithm that can detect a cat in an image.

The question the algorithm attempts to answer has an objectively correct response.

There’s either a cat in the picture, or there isn’t.

In mathematical terms, the problem space has a clear global minima.

Because the requested output is simple, we have a good understanding of how to program this cat-detecting algorithm.

First, you “train” the algorithm with data — tens of thousands of pictures of cats, as well as pictures of not-cats, such as dogs and people and bears.

Soon, the algorithm can look at new, never-before-seen photographs, and decide with a high rate of certainty whether the set of 

# Example usage for augmented generation

Context-based question answering using the gpt-3.5-turbo model.

In [10]:
template = """Use the following pieces of context to answer the question below. 
If you don't know the answer (because there's not a relevant information in the context), 
just say that you don't know, don't try to make it up. 
Analyze the context carefully to give the best possible answer. Your answer should be helpful 
and informative, but concise. Then, give titles the original context texts you finally used. 
Don't give any sources if you didn't find the answer. 
Question: {question}
Context: {context}
Helpful answer: your answer
Sources: numbered list of article titles you used"""

prompt = PromptTemplate.from_template(template)

# initializing an LLM for generating responses
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


def format_docs(docs) -> str:
    """
    Render retrieved chunks as plain text for the prompt.

    Each chunk is prefixed with the title of the article it comes from, so the
    model can cite its sources; chunks are separated by a blank line.

    Args:
        docs: Retrieved documents exposing ``metadata["Title"]`` and ``page_content``.
    Returns:
        A single string holding all chunks.
    """
    return "\n\n".join(
        f"Article title: {doc.metadata['Title']}\n{doc.page_content}"
        for doc in docs
    )


# initializing RAG chain for context-aware generation
# `retriever` is the MMR retriever created in the retrieval section. Its output
# goes through format_docs so that the prompt receives clean text instead of
# the Python repr of a list of Document objects (escaped newlines, full
# metadata dicts).
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [11]:
# Same question text as question_1 used in the retrieval section.
completion = rag_chain.invoke("What is deep learning and how to learn it?")
print(completion)

Deep learning is a form of AI that uses multi-layered neural networks to approximate functions. To learn deep learning, one can start by understanding neural networks and then dive into deep learning concepts through various learning mediums such as blogs, books, videos, or online courses. It is essential to repeat learning steps, test skills, join communities, ask questions, and follow recent research to master deep learning. Persistence is key in overcoming initial difficulties in learning deep learning concepts.

1. Why Deep Learning Is Not A Silver Bullet For Autonomous Vehicles
2. Convolutional Neural Network: A Step By Step Guide
3. Machine Learning in Energy
4. How to build a DIY deep learning framework in NumPy


In [12]:
# Same question text as question_2 used in the retrieval section.
completion = rag_chain.invoke("How to use PCA and what are the benefits?")
print(completion)

To use PCA, you can apply an orthogonal transformation to reduce the dimension of vectors. PCA finds special basis vectors (eigenvectors) to maximize the variance of the reduced-dimension data. The benefits of using PCA include the ability to project high-dimensional vectors into a lower-dimensional space for visualization and keeping the maximum possible variance, which helps in identifying classes in the data. PCA is useful for visualizing data, preparing it for other machine learning algorithms, and selecting important features for a target variable. It is also beneficial for dealing with high-dimensional data sets and simplifying complex data structures.

1. Identifying the right meaning of the words using BERT
2. Tidying up with PCA: An Introduction to Principal Components Analysis
3. Feature Transformation. How to handle different feature types…
4. Principal Components of PCA
5. Principal Component Analysis for Dimensionality Reduction


**Control questions about topics that are not present in the dataset:**

In [13]:
# Off-topic control: the prompt tells the model to say it does not know
# when the retrieved context holds no relevant information.
completion = rag_chain.invoke("What are the most popular home plants?")
print(completion)

I don't know.


In [14]:
# Second off-topic control question.
completion = rag_chain.invoke("What is the best food for a little dog?")
print(completion)

I don't know.
