In [2]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import random as rd
import yaml


from extract import DocumentProcessor
from qa_dataset_manager import QADatasetManager
from rag import rag_manager
from utils import load_config

# Transformers
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder
from FlagEmbedding import FlagReranker

#LLM
from huggingface_hub import InferenceClient

# search
import faiss

In [3]:
# "IPython magic command" to automatically reload any module whose
# implementation has been modified during the execution of the notebook
%load_ext autoreload
%autoreload 2

In [4]:
# Load the Hugging Face API token from the project config file so the
# secret is read at run time instead of being written in the notebook.
config = load_config('../config.yaml')
hugging_face_api_key = config['huggingface']['token_api']

### Extract documents from folders

In [None]:
# Extract the markdown documents found under ../content and store them in "../data/documents.csv"
processor = DocumentProcessor( root_dir='../content', output_path='../data/documents.csv')
processor.process_documents()

Extracting files from folder: account-and-profile
Extracting files from folder: actions
Extracting files from folder: admin
Extracting files from folder: apps
Extracting files from folder: authentication
Extracting files from folder: billing
Extracting files from folder: code-security
Extracting files from folder: codespaces
Extracting files from folder: communities
Extracting files from folder: contributing
Extracting files from folder: copilot
Extracting files from folder: desktop
Extracting files from folder: discussions
Extracting files from folder: education
Extracting files from folder: get-started
Extracting files from folder: github-cli
Extracting files from folder: graphql
Extracting files from folder: index.md
Extracting files from folder: issues
Extracting files from folder: migrations
Extracting files from folder: organizations
Extracting files from folder: packages
Extracting files from folder: pages
Extracting files from folder: pull-requests
Extracting files from folder:

In [15]:
# Split the extracted documents into chunks ("nodes").
dataset_manager = QADatasetManager()
df = pd.read_csv("../data/documents.csv")
data = df['content'].to_list()  # content of each document row, one list entry per row
# chunk_size unit is defined by QADatasetManager.parse_documents (not visible here)
nodes = dataset_manager.parse_documents(texts=data,chunk_size=2048)

The documents have been divided into 6014 chunks


In [38]:
# Persist the nodes; the index-building and evaluation sections reload them from this file.
dataset_manager.save_nodes(nodes=nodes, saving_path='../data/nodes.json')

saved nodes to --> ../data/nodes.json 


### Generate QA Dataset from documents

In [4]:
# Inference client and the model id passed to the QA-generation calls below.
# NOTE(review): create_qa_pairs / create_answers below build their own
# InferenceClient; `inference` itself is only consumed by rag_manager later on.
inference = InferenceClient(token=hugging_face_api_key)
model_zephyr ="HuggingFaceH4/zephyr-7b-beta"

In [5]:
# Reload the extracted documents; `data` holds the 'content' column as a list.
df = pd.read_csv("../data/documents.csv")
data = df['content'].to_list()
df.head()

Unnamed: 0,content,title
0,\n\nChoosing how to unsubscribe\n\nTo unwatch ...,managing-your-subscriptions
1,\n\nDiagnosing why you receive too many notifi...,viewing-your-subscriptions
2,\n\nNotifications and subscriptions\n\nYou can...,about-notifications
3,\n\nNotification delivery options\n\nYou can r...,configuring-notifications
4,\n\nStarting your inbox triage\n\nBefore you s...,customizing-a-workflow-for-triaging-your-notif...


In [6]:
# Dataset manager used by the QA-generation steps below.
dataset_manager = QADatasetManager()

In [6]:
# Inspect the manager's bookkeeping metadata before generating more data.
dataset_manager.metadata

{'max_index': 755,
 'last_updated': '2023-12-23 21:47',
 'creation_date': '2023-12-21 15:40'}

In [7]:
# Generate QA data from the document texts with the Zephyr model
# (2 questions requested per chunk).
# NOTE(review): judging by the log output, progress is periodically saved to
# ../data/qa_dataset_intermed.json -- confirm in QADatasetManager.create_qa_pairs.
dataset_manager.create_qa_pairs(
    texts=data,
    client=InferenceClient(token=hugging_face_api_key),
    model= model_zephyr,
    max_new_tokens=200,
    num_questions_per_chunk=2,
    chunk_size = 2048)

641 - Number of chunks in text 259: 2
642 - Number of chunks in text 1340: 2
643 - Number of chunks in text 281: 1
644 - Number of chunks in text 1126: 4
645 - Number of chunks in text 178: 7
Updating QA dataset
Saved questions to --> ../data/qa_dataset_intermed.json
Updated metadata
646 - Number of chunks in text 711: 2
647 - Number of chunks in text 554: 1
648 - Number of chunks in text 1257: 3
649 - Number of chunks in text 1111: 3
650 - Number of chunks in text 2113: 5
Updating QA dataset
Saved questions to --> ../data/qa_dataset_intermed.json
Updated metadata
651 - Number of chunks in text 946: 1
652 - Number of chunks in text 519: 2
653 - Number of chunks in text 2069: 27
654 - Number of chunks in text 1529: 1
655 - Number of chunks in text 382: 1
Updating QA dataset
Saved questions to --> ../data/qa_dataset_intermed.json
Updated metadata
656 - Number of chunks in text 1539: 1
657 - Number of chunks in text 991: 3
658 - Number of chunks in text 862: 8
659 - Number of chunks in te

In [7]:
# Merge the intermediate questions into the main QA dataset file (see the log line it prints).
dataset_manager.add_qa_to_dataset()

Added questions from ../data/qa_dataset_intermed.json to --> ../data/qa_dataset.json


In [6]:
# Fetch the QA dataset; the next cell reads its 'queries' and 'answers' entries.
qa_dataset = dataset_manager.get_qa_dataset()

In [7]:
# Sanity check: compare the number of queries with the number of answers.
len(qa_dataset['queries']), len(qa_dataset['answers'])

(7160, 7160)

In [8]:
# Generate answers with the Zephyr model.
# NOTE(review): the log ("2 to be answered") suggests only queries that still
# lack an answer are processed -- confirm in QADatasetManager.create_answers.
dataset_manager.create_answers(qa_dataset=qa_dataset,
                               client=InferenceClient(token=hugging_face_api_key),
                               model= model_zephyr,
                               max_new_tokens=500)

2 to be answered
Answering qery b6432350-03b2-4f4f-8664-098d526ba9ef
query:  How can I generate a
Answering qery 8c788494-6cb4-41e6-90ef-facca8249af6
query:  How can a status report be posted without taking any other action on {% data variables.product.prodname_dotcom_the_website %} during a deployment
Updating answers dict
Saved questions to --> ../data/answers_intermed.json


In [9]:
# Merge the intermediate answers into the main QA dataset file (see the log line it prints).
dataset_manager.add_answers_to_dataset()

Added answers from ../data/answers_intermed.json to --> ../data/qa_dataset.json


### Create FAISS indexes

In [7]:
# Inference client and candidate LLM model ids.
# NOTE(review): model_open here ("openchat/openchat_3.5") differs from the id
# used in the evaluation section ("openchat/openchat-3.5-1210").
inference = InferenceClient(token=hugging_face_api_key)
model_zephyr ="HuggingFaceH4/zephyr-7b-beta"
model_mistral = "mistralai/Mistral-7B-v0.1"
model_falcon = "tiiuae/falcon-7b-instruct"
model_open = "openchat/openchat_3.5"

# Sentence-embedding models used to build the indexes below.
model_L6_v2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model_mpnet =  SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Reload the nodes saved in the extraction section.
dataset_manager = QADatasetManager()
nodes = dataset_manager.load_nodes('../data/nodes.json')

Successfuly loaded nodes from ../data/nodes.json


In [6]:
# RAG helper constructed from the loaded nodes and the inference client.
rag_chain = rag_manager(nodes=nodes, client=inference)

##### Creating flat inner-product (FlatIP) indexes

In [168]:
# Build the index and the embeddings with the MiniLM-L6-v2 model.
# NOTE(review): presumably a FAISS IndexFlatIP, going by the variable name -- confirm in rag_manager.create_index.
index_flatIP, embeddings = rag_chain.create_index(model=model_L6_v2)

Embeddings created


In [169]:
# Check the embedding matrix shape: (number of vectors, embedding dimension).
embeddings.shape

(6014, 384)

In [50]:
# Persist the MiniLM-L6-v2 flat index; the evaluation section reloads it from this path.
rag_chain.save_index(index_flatIP, '../data/indexes/L6_V2_index_flatIP')

Successfuly saved index to ../data/indexes/index_flatIP


In [8]:
# Build the index and the embeddings with the mpnet model.
# NOTE: this rebinds `embeddings`; from here on it holds the mpnet vectors,
# not the MiniLM-L6-v2 ones created earlier.
index_flatIP_mpnet, embeddings = rag_chain.create_index(model=model_mpnet)

Embeddings created


In [10]:
# Persist the mpnet flat index; the evaluation section reloads it from this path.
rag_chain.save_index(index_flatIP_mpnet, '../data/indexes/mpnet_index_flatIP')

Successfuly saved index to ../data/indexes/index_flatIP_mpnet


##### Creating Inverted File Index (IVF)

In [174]:
# Build one IVF (inverted file) index per cluster count over the MiniLM-L6-v2 vectors.
#
# The vectors are read back from the MiniLM flat index instead of the shared
# `embeddings` variable: the mpnet cell above rebinds `embeddings`, so on a
# top-to-bottom run it would hold vectors from the wrong model here.
# NOTE(review): assumes index_flatIP is a flat FAISS index that stores the raw
# vectors (reconstruct_n is supported by IndexFlat) -- confirm in rag_manager.create_index.
n_clusters = [50, 100, 200, 500, 1000]
dimension = index_flatIP.d  # loop-invariant, so computed once
embeddings_L6_v2 = index_flatIP.reconstruct_n(0, index_flatIP.ntotal)

for nlist in n_clusters:
    # Coarse quantizer that assigns each vector to one of the nlist clusters.
    quantizer = faiss.IndexFlatL2(dimension)

    indexIVFFlat = faiss.IndexIVFFlat(quantizer, dimension, nlist)
    indexIVFFlat.train(embeddings_L6_v2)
    indexIVFFlat.add(embeddings_L6_v2)
    index_path = "../data/indexes/L6_V2_index_IVFFlat_" + str(nlist)
    rag_chain.save_index(indexIVFFlat, index_path)

Successfuly saved index to ../data/indexes/L6_V2_index_IVFFlat_50
Successfuly saved index to ../data/indexes/L6_V2_index_IVFFlat_100
Successfuly saved index to ../data/indexes/L6_V2_index_IVFFlat_200
Successfuly saved index to ../data/indexes/L6_V2_index_IVFFlat_500
Successfuly saved index to ../data/indexes/L6_V2_index_IVFFlat_1000


In [18]:
# Build one IVF (inverted file) index per cluster count over the mpnet vectors.
#
# The vectors are read back from the mpnet flat index so this cell does not
# depend on which model last wrote the shared `embeddings` variable.
# NOTE(review): assumes index_flatIP_mpnet is a flat FAISS index that stores the
# raw vectors (reconstruct_n is supported by IndexFlat) -- confirm in rag_manager.create_index.
n_clusters = [50, 100, 200, 500, 1000]
dimension = index_flatIP_mpnet.d  # loop-invariant, so computed once
embeddings_mpnet = index_flatIP_mpnet.reconstruct_n(0, index_flatIP_mpnet.ntotal)

for nlist in n_clusters:
    # Coarse quantizer that assigns each vector to one of the nlist clusters.
    quantizer = faiss.IndexFlatL2(dimension)

    indexIVFFlat = faiss.IndexIVFFlat(quantizer, dimension, nlist)
    indexIVFFlat.train(embeddings_mpnet)
    indexIVFFlat.add(embeddings_mpnet)
    index_path = "../data/indexes/mpnet_index_IVFFlat_" + str(nlist)
    rag_chain.save_index(indexIVFFlat, index_path)

Successfuly saved index to ../data/indexes/mpnet_index_IVFFlat_50
Successfuly saved index to ../data/indexes/mpnet_index_IVFFlat_100
Successfuly saved index to ../data/indexes/mpnet_index_IVFFlat_200
Successfuly saved index to ../data/indexes/mpnet_index_IVFFlat_500
Successfuly saved index to ../data/indexes/mpnet_index_IVFFlat_1000


##### Creating Inverted File Index with Product Quantization (IVFPQ)

In [178]:
# Check the element type and the vector dimension of the embeddings currently bound to `embeddings`.
type(embeddings[0][0]), embeddings.shape[1]

(numpy.float32, 384)

In [181]:
# Build one IVFPQ index per value of m over the MiniLM-L6-v2 vectors.
#
# `dimension`, `nlist` and the vectors used to be inherited from whichever cell
# ran last; they are now set explicitly so the "L6_V2" files are always built
# from MiniLM state, regardless of execution order.
# NOTE(review): assumes index_flatIP is a flat FAISS index that stores the raw
# vectors (reconstruct_n is supported by IndexFlat) -- confirm in rag_manager.create_index.
n_centroids = [8, 16, 32, 64, 128]  # values of m: number of PQ sub-quantizers (codes per compressed vector)
dimension = index_flatIP.d          # must be divisible by every m
nlist = 1000                        # same value the IVF loop above ends on, now explicit
bits = 8                            # bits per PQ code
embeddings_L6_v2 = index_flatIP.reconstruct_n(0, index_flatIP.ntotal)

for m in n_centroids:
    quantizer = faiss.IndexFlatL2(dimension)  # same L2 flat coarse quantizer as the IVF indexes
    indexIVFPQ = faiss.IndexIVFPQ(quantizer, dimension, nlist, m, bits)
    indexIVFPQ.train(embeddings_L6_v2)
    indexIVFPQ.add(embeddings_L6_v2)
    index_path = "../data/indexes/L6_V2_index_IVFPQ_" + str(m)
    rag_chain.save_index(indexIVFPQ, index_path)

Successfuly saved index to ../data/indexes/L6_V2_index_IVFPQ_8
Successfuly saved index to ../data/indexes/L6_V2_index_IVFPQ_16
Successfuly saved index to ../data/indexes/L6_V2_index_IVFPQ_32
Successfuly saved index to ../data/indexes/L6_V2_index_IVFPQ_64
Successfuly saved index to ../data/indexes/L6_V2_index_IVFPQ_128


In [22]:
# Build one IVFPQ index per value of m over the mpnet vectors.
#
# `dimension`, `nlist` and the vectors used to be inherited from whichever cell
# ran last; they are now set explicitly so the "mpnet" files are always built
# from mpnet state, regardless of execution order.
# NOTE(review): assumes index_flatIP_mpnet is a flat FAISS index that stores the
# raw vectors (reconstruct_n is supported by IndexFlat) -- confirm in rag_manager.create_index.
n_centroids = [8, 16, 32, 64, 128]  # values of m: number of PQ sub-quantizers (codes per compressed vector)
dimension = index_flatIP_mpnet.d    # must be divisible by every m
nlist = 1000                        # same value the IVF loop above ends on, now explicit
bits = 8                            # bits per PQ code
embeddings_mpnet = index_flatIP_mpnet.reconstruct_n(0, index_flatIP_mpnet.ntotal)

for m in n_centroids:
    quantizer = faiss.IndexFlatL2(dimension)  # same L2 flat coarse quantizer as the IVF indexes
    indexIVFPQ = faiss.IndexIVFPQ(quantizer, dimension, nlist, m, bits)
    indexIVFPQ.train(embeddings_mpnet)
    indexIVFPQ.add(embeddings_mpnet)
    index_path = "../data/indexes/mpnet_index_IVFPQ_" + str(m)
    rag_chain.save_index(indexIVFPQ, index_path)

Successfuly saved index to ../data/indexes/mpnet_index_IVFPQ_8
Successfuly saved index to ../data/indexes/mpnet_index_IVFPQ_16
Successfuly saved index to ../data/indexes/mpnet_index_IVFPQ_32
Successfuly saved index to ../data/indexes/mpnet_index_IVFPQ_64
Successfuly saved index to ../data/indexes/mpnet_index_IVFPQ_128


### Evaluate RAG chain

In [39]:
# Inference client and candidate LLM model ids for the evaluation runs.
inference = InferenceClient(token=hugging_face_api_key)
model_zephyr ="HuggingFaceH4/zephyr-7b-beta"
model_mistral = "mistralai/Mistral-7B-v0.1"
model_falcon = "tiiuae/falcon-7b-instruct"
model_open = "openchat/openchat-3.5-1210"

# Sentence-embedding models; each must be paired with the index built from it.
model_L6_v2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model_mpnet =  SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Two rerankers are loaded; only `reranker` is passed to retrieve_context in the cells below.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
reranker = FlagReranker('BAAI/bge-reranker-base', use_fp16=True) 

# Reload the nodes saved in the extraction section.
dataset_manager = QADatasetManager()
nodes = dataset_manager.load_nodes('../data/nodes.json')

Successfuly loaded nodes from ../data/nodes.json


In [5]:
# Fetch the QA dataset; the next cell reads its 'queries', 'answers' and 'relevant_docs' entries.
qa_dataset = dataset_manager.get_qa_dataset()

In [6]:
# Accepted first words for a question (matched case-insensitively).
starts = ['how', 'when', 'who', 'what']


def _keep_question(question):
    """Return True when a generated question is usable for evaluation.

    A question is kept when it does not mention 'example' or 'context',
    is longer than 20 characters, and begins with one of `starts`.
    """
    lowered = question.lower()
    if 'example' in lowered or 'context' in lowered:
        return False
    if len(question) <= 20:
        return False
    return lowered.startswith(tuple(starts))


filtered_questions = [
    (key, question)
    for key, question in qa_dataset['queries'].items()
    if _keep_question(question)
]

# Split the (key, question) pairs and look up the matching answer and the
# first relevant document for every kept key.
selected_keys = [pair[0] for pair in filtered_questions]
questions = [pair[1] for pair in filtered_questions]
ground_truths = [qa_dataset['answers'].get(key) for key in selected_keys]
relevant_docs = [qa_dataset['relevant_docs'].get(key)[0] for key in selected_keys]

len(questions)

4837

In [7]:
# RAG helper constructed from the loaded nodes and the inference client.
rag_chain = rag_manager(nodes=nodes, client=inference)

In [8]:
# Load the MiniLM-L6-v2 flat index saved in the index-creation section.
index_flatIP = rag_chain.load_index('../data/indexes/L6_V2_index_flatIP')

Successfuly loaded index from ../data/indexes/index_flatIP_L6_V2


In [9]:
# Evaluate the MiniLM-L6-v2 FlatIP index while varying the number of contexts retrieved (k)

k_range = [1, 3, 5, 10, 20]
scores = {}
for k in k_range:
    # Retrieve k contexts per question; retrieve_context also returns the search time.
    contexts, contexts_ids, search_time = rag_chain.retrieve_context(
        index=index_flatIP,
        embedding_model=model_L6_v2,
        queries=questions,
        k=k,
    )
    eval_inputs = dict(
        questions=questions,
        contexts=contexts,
        contexts_ids=contexts_ids,
        relevant_docs=relevant_docs,
    )
    metrics_at_k = rag_chain.evaluate_rag(metrics=['mrr', 'hit'], data=eval_inputs)
    metrics_at_k['search_time'] = search_time
    scores[k] = metrics_at_k

# One column per k, one row per metric.
pd.DataFrame(scores)

Unnamed: 0,1,3,5,10,20
mrr,0.399421,0.469678,0.484615,0.495215,0.499392
hit_score,0.399421,0.559438,0.625388,0.703742,0.764523
search_time,80.47,74.41,75.15,75.19,77.9


In [9]:
# Load the mpnet flat index saved in the index-creation section.
index_flatIP_mpnet = rag_chain.load_index('../data/indexes/mpnet_index_flatIP')

Successfuly loaded index from ../data/indexes/index_flatIP_mpnet


In [17]:
# Evaluate the mpnet-base-v2 FlatIP index while varying the number of contexts retrieved (k)
k_range = [1, 3, 5, 10, 20]
scores = {}
for k in k_range:
    # Retrieve k contexts per question; retrieve_context also returns the search time.
    contexts, contexts_ids, search_time = rag_chain.retrieve_context(
        index=index_flatIP_mpnet,
        embedding_model=model_mpnet,
        queries=questions,
        k=k,
    )
    eval_inputs = dict(
        questions=questions,
        contexts=contexts,
        contexts_ids=contexts_ids,
        relevant_docs=relevant_docs,
    )
    metrics_at_k = rag_chain.evaluate_rag(metrics=['mrr', 'hit'], data=eval_inputs)
    metrics_at_k['search_time'] = search_time
    scores[k] = metrics_at_k

# One column per k, one row per metric.
pd.DataFrame(scores)

Unnamed: 0,1,3,5,10,20
mrr,0.439115,0.522259,0.537168,0.547778,0.55217
hit_score,0.439115,0.625388,0.690924,0.770105,0.829646
search_time,410.45,416.81,407.22,431.2,409.74


In [186]:
# Evaluate the MiniLM-L6-v2 IVF indexes for different numbers of clusters (nlist)
# NOTE(review): nprobe is never set in this cell, so the search uses whatever
# value the loaded index carries (FAISS defaults to 1) unless retrieve_context
# overrides it -- a likely reason the scores are well below the flat index; confirm.

k = 10
n_clusters = [50, 100, 200, 500, 1000]
root_path = "../data/indexes/L6_V2_index_"
scores = {}
for nlist in n_clusters:
    path_suffix = f"IVFFlat_{nlist}"
    indexIVFFlat = rag_chain.load_index(f"{root_path}{path_suffix}", verbose=False)
    contexts, contexts_ids, search_time = rag_chain.retrieve_context(
        index=indexIVFFlat,
        embedding_model=model_L6_v2,
        queries=questions,
        k=k,
    )
    eval_inputs = dict(
        questions=questions,
        contexts=contexts,
        contexts_ids=contexts_ids,
        relevant_docs=relevant_docs,
    )
    metrics_for_index = rag_chain.evaluate_rag(metrics=['mrr', 'hit'], data=eval_inputs)
    metrics_for_index['search_time'] = search_time
    scores[path_suffix] = metrics_for_index

# One column per index, one row per metric.
pd.DataFrame(scores)

Unnamed: 0,IVFFlat_50,IVFFlat_100,IVFFlat_200,IVFFlat_500,IVFFlat_1000
mrr,0.278428,0.270392,0.27953,0.287882,0.294979
hit_score,0.374406,0.363862,0.366963,0.36655,0.35766
search_time,78.08,76.86,75.41,76.45,76.82


In [23]:
# Evaluate the mpnet-base-v2 IVF indexes for different numbers of clusters (nlist)
# NOTE(review): nprobe is never set in this cell, so the search uses whatever
# value the loaded index carries (FAISS defaults to 1) unless retrieve_context
# overrides it -- a likely reason the scores are well below the flat index; confirm.
k = 10
n_clusters = [50, 100, 200, 500, 1000]
root_path = "../data/indexes/mpnet_index_"
scores = {}
for nlist in n_clusters:
    path_suffix = f"IVFFlat_{nlist}"
    indexIVFFlat = rag_chain.load_index(f"{root_path}{path_suffix}", verbose=False)
    contexts, contexts_ids, search_time = rag_chain.retrieve_context(
        index=indexIVFFlat,
        embedding_model=model_mpnet,
        queries=questions,
        k=k,
    )
    eval_inputs = dict(
        questions=questions,
        contexts=contexts,
        contexts_ids=contexts_ids,
        relevant_docs=relevant_docs,
    )
    metrics_for_index = rag_chain.evaluate_rag(metrics=['mrr', 'hit'], data=eval_inputs)
    metrics_for_index['search_time'] = search_time
    scores[path_suffix] = metrics_for_index

# One column per index, one row per metric.
pd.DataFrame(scores)

Unnamed: 0,IVFFlat_50,IVFFlat_100,IVFFlat_200,IVFFlat_500,IVFFlat_1000
mrr,0.361355,0.35108,0.344517,0.349495,0.356847
hit_score,0.484184,0.470746,0.455654,0.445731,0.436841
search_time,390.32,418.47,395.83,399.21,398.94


In [12]:
# Evaluate the MiniLM-L6-v2 IVFPQ indexes for different numbers of PQ sub-quantizers (m)
# (the file suffix is the m value the index was built with, not a cluster count)
k = 10
n_subquantizers = [8, 16, 32, 64, 128]
root_path = "../data/indexes/L6_V2_index_"
scores = {}
for m in n_subquantizers:
    path_suffix = f"IVFPQ_{m}"
    indexIVFPQ = rag_chain.load_index(f"{root_path}{path_suffix}", verbose=False)
    contexts, contexts_ids, search_time = rag_chain.retrieve_context(
        index=indexIVFPQ,
        embedding_model=model_L6_v2,
        queries=questions,
        k=k,
    )
    eval_inputs = dict(
        questions=questions,
        contexts=contexts,
        contexts_ids=contexts_ids,
        relevant_docs=relevant_docs,
    )
    metrics_for_index = rag_chain.evaluate_rag(metrics=['mrr', 'hit'], data=eval_inputs)
    metrics_for_index['search_time'] = search_time
    scores[path_suffix] = metrics_for_index

# One column per index, one row per metric.
pd.DataFrame(scores)

Unnamed: 0,IVFPQ_8,IVFPQ_16,IVFPQ_32,IVFPQ_64,IVFPQ_128
mrr,0.275281,0.286447,0.292997,0.296531,0.294752
hit_score,0.353938,0.355386,0.356626,0.357866,0.357866
search_time,96.02,76.65,71.24,72.1,78.3


In [13]:
# Evaluate the mpnet-base-v2 IVFPQ indexes for different numbers of PQ sub-quantizers (m)
# (the file suffix is the m value the index was built with, not a cluster count)
k = 10
n_subquantizers = [8, 16, 32, 64, 128]
root_path = "../data/indexes/mpnet_index_"
scores = {}
for m in n_subquantizers:
    path_suffix = f"IVFPQ_{m}"
    indexIVFPQ = rag_chain.load_index(f"{root_path}{path_suffix}", verbose=False)
    contexts, contexts_ids, search_time = rag_chain.retrieve_context(
        index=indexIVFPQ,
        embedding_model=model_mpnet,
        queries=questions,
        k=k,
    )
    eval_inputs = dict(
        questions=questions,
        contexts=contexts,
        contexts_ids=contexts_ids,
        relevant_docs=relevant_docs,
    )
    metrics_for_index = rag_chain.evaluate_rag(metrics=['mrr', 'hit'], data=eval_inputs)
    metrics_for_index['search_time'] = search_time
    scores[path_suffix] = metrics_for_index

# One column per index, one row per metric.
pd.DataFrame(scores)

Unnamed: 0,IVFPQ_8,IVFPQ_16,IVFPQ_32,IVFPQ_64,IVFPQ_128
mrr,0.321853,0.340079,0.34635,0.352782,0.355102
hit_score,0.43312,0.433533,0.434774,0.43498,0.435394
search_time,386.25,389.92,409.41,389.71,392.74


In [63]:
# Draw a reproducible random sample of 300 questions (with their ground truths
# and relevant documents) for the slower reranker / LLM evaluations below.
# NOTE: this cell overwrites `questions`, `ground_truths` and `relevant_docs`,
# so running it a second time re-samples from the already reduced lists.
rd.seed(12345)
q_ids = list(range(len(questions)))
rd.shuffle(q_ids)

# The same shuffled positions are applied to all three parallel lists.
sample_ids = q_ids[:300]
questions = [questions[i] for i in sample_ids]
ground_truths = [ground_truths[i] for i in sample_ids]
relevant_docs = [relevant_docs[i] for i in sample_ids]

In [64]:
# Evaluate the MiniLM-L6-v2 FlatIP index with the BAAI reranker while varying the number of contexts retrieved (k)
# NOTE(review): the recorded scores drop as k grows (mrr 0.26 -> 0.12), which is
# unexpected when the reranker only reorders a superset of candidates -- worth
# checking how retrieve_context applies `reranker` / `k_after_rerank`.
k_range = [5, 10, 20]
scores = {}
retrieval_kwargs = dict(
    index=index_flatIP,
    embedding_model=model_L6_v2,
    queries=questions,
    reranker=reranker,
    k_after_rerank=3,
)
for k in k_range:
    contexts, contexts_ids, search_time = rag_chain.retrieve_context(k=k, **retrieval_kwargs)
    eval_inputs = dict(
        questions=questions,
        contexts=contexts,
        contexts_ids=contexts_ids,
        relevant_docs=relevant_docs,
    )
    metrics_at_k = rag_chain.evaluate_rag(metrics=['mrr', 'hit'], data=eval_inputs)
    metrics_at_k['search_time'] = search_time
    scores[k] = metrics_at_k

# One column per k, one row per metric.
pd.DataFrame(scores)

Unnamed: 0,5,10,20
mrr,0.261667,0.188889,0.118889
hit_score,0.403333,0.263333,0.176667
search_time,2080.97,4034.42,7916.01


In [14]:
# End-to-end RAG evaluation with the Zephyr model: retrieve k contexts from the
# MiniLM-L6-v2 flat index, generate an answer per question, then score both the
# retrieval (mrr, hit) and the answers (semantic).
# `scores` is read by the following cells (scores[5] seeds the LLM comparison).
k_range = [1, 3, 5]
scores = {}
for k in k_range:
    answers, contexts, contexts_ids = rag_chain.augmented_retrieval_generation(
        queries=questions,
        index=index_flatIP,
        embedding_model=model_L6_v2,
        llm_model=model_zephyr,  # the comma after this argument was missing (SyntaxError)
        k=k,
    )
    data = {
        "questions": questions,
        "answers": answers,
        "contexts": contexts,
        "contexts_ids": contexts_ids,
        "ground_truths": ground_truths,
        "relevant_docs": relevant_docs
    }
    score_k = rag_chain.evaluate_rag(metrics=['mrr', 'hit', 'semantic'], data=data)
    scores[k] = score_k

Answering query: How can code review assignments be configured to automatically request code owners for review, while still allowing for individual review requests to be made in addition to the team request
Answering query: What is the process for privately collaborating to fix a vulnerability in a temporary private fork, and why might this be necessary
Answering query: What is the process for publishing a paid plan for an app on {% data variables.product.prodname_marketplace %}
Answering query: How can I recover access to a GitHub account that is locked due to two-factor authentication (2FA) if I have lost my 2FA credentials and am unable to recover them? What alternative options are available to me in this situation?
Answering query: How can I configure the `publishConfig` fields in my `package.json` file to limit publishing to a specific registry
Answering query: How can I write detailed logs to a specific directory during the execution of the CLI
Answering query: How can configurat

In [15]:
# Tabulate the scores: one column per k, one row per metric.
pd.DataFrame(scores)

Unnamed: 0,1,3,5
mrr,0.386667,0.441111,0.461944
hit_score,0.386667,0.516667,0.606667
semantic_similarity_avg,5.989561,5.590974,5.559326


In [40]:
# Compare LLMs at k=5 on the same MiniLM-L6-v2 flat index.
# The Zephyr entry reuses scores[5] from the k-sweep cell above, so that cell
# must have been run first.
llm_models = [model_open, model_falcon]
scores_llms = {}
scores_llms[model_zephyr] = scores[5]
for llm in llm_models:
    answers, contexts, contexts_ids = rag_chain.augmented_retrieval_generation(
        queries=questions,
        index=index_flatIP,
        embedding_model=model_L6_v2,
        llm_model=llm,  # the comma after this argument was missing (SyntaxError)
        k=5,
    )
    data = {
        "questions": questions,
        "answers": answers,
        "contexts": contexts,
        "contexts_ids": contexts_ids,
        "ground_truths": ground_truths,
        "relevant_docs": relevant_docs
    }
    score_model = rag_chain.evaluate_rag(metrics=['mrr', 'hit', 'semantic'], data=data)
    scores_llms[llm] = score_model

Answering query: How can code review assignments be configured to automatically request code owners for review, while still allowing for individual review requests to be made in addition to the team request
answer:  The context doesn't give enough information to give an answer.
Answering query: What is the process for privately collaborating to fix a vulnerability in a temporary private fork, and why might this be necessary
answer:  The process for privately collaborating to fix a vulnerability in a temporary private fork is to first create a temporary private fork, then add collaborators to the security advisory, and then collaborate on a patch in private. This might be necessary to keep information about vulnerabilities secure, as integrations, including CI, cannot access temporary private forks.
Answering query: What is the process for publishing a paid plan for an app on {% data variables.product.prodname_marketplace %}
answer:  To publish a paid plan for an app on {% data variable

In [41]:
# Tabulate the scores: one column per LLM, one row per metric.
pd.DataFrame(scores_llms)

Unnamed: 0,HuggingFaceH4/zephyr-7b-beta,openchat/openchat-3.5-1210,tiiuae/falcon-7b-instruct
mrr,0.461944,0.461944,0.461944
hit_score,0.606667,0.606667,0.606667
semantic_similarity_avg,5.559326,4.476016,4.321162
