In [1]:
import pandas as pd
import numpy as np
import random as rd
import yaml


from extract import DocumentProcessor
from qa_dataset_manager import QADatasetManager
from rag import rag_manager

# Transformers
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder

#LLM
from huggingface_hub import InferenceClient

# search
import faiss

In [2]:
# "IPython magic command" to automatically reload any module whose
# implementation has been modified during the execution of the notebook
%load_ext autoreload
%autoreload 2

In [3]:
# Load hugging Face token
with open('../config.yaml', 'r') as config_file:
        config = yaml.safe_load(config_file)
hugging_face_api_key = config['huggingface']['token_api']

### Extract documents from folders

In [None]:
# Extract markdown documents and store them in "data/documents.csv"
processor = DocumentProcessor( root_dir='../content', output_path='../data/documents.csv')
processor.process_documents()

Extracting files from folder: account-and-profile
Extracting files from folder: actions
Extracting files from folder: admin
Extracting files from folder: apps
Extracting files from folder: authentication
Extracting files from folder: billing
Extracting files from folder: code-security
Extracting files from folder: codespaces
Extracting files from folder: communities
Extracting files from folder: contributing
Extracting files from folder: copilot
Extracting files from folder: desktop
Extracting files from folder: discussions
Extracting files from folder: education
Extracting files from folder: get-started
Extracting files from folder: github-cli
Extracting files from folder: graphql
Extracting files from folder: index.md
Extracting files from folder: issues
Extracting files from folder: migrations
Extracting files from folder: organizations
Extracting files from folder: packages
Extracting files from folder: pages
Extracting files from folder: pull-requests
Extracting files from folder:

### Generate QA Dataset from documents

In [4]:
inference = InferenceClient(token=hugging_face_api_key)
model_zephyr ="HuggingFaceH4/zephyr-7b-beta"
model_mistral = "mistralai/Mistral-7B-v0.1"
model_falcon = "tiiuae/falcon-7b-instruct"
model_open = "openchat/openchat_3.5"

In [5]:
df = pd.read_csv("../data/documents.csv")
data = df['content'].to_list()
df.head()

Unnamed: 0,content,title
0,\n\nChoosing how to unsubscribe\n\nTo unwatch ...,managing-your-subscriptions
1,\n\nDiagnosing why you receive too many notifi...,viewing-your-subscriptions
2,\n\nNotifications and subscriptions\n\nYou can...,about-notifications
3,\n\nNotification delivery options\n\nYou can r...,configuring-notifications
4,\n\nStarting your inbox triage\n\nBefore you s...,customizing-a-workflow-for-triaging-your-notif...


In [6]:
dataset_manager = QADatasetManager()

In [6]:
dataset_manager.metadata

{'max_index': 755,
 'last_updated': '2023-12-23 21:47',
 'creation_date': '2023-12-21 15:40'}

In [7]:
dataset_manager.create_qa_pairs(
    texts=data,
    client=InferenceClient(token=hugging_face_api_key),
    model= model_zephyr,
    max_new_tokens=200,
    num_questions_per_chunk=2,
    chunk_size = 2048)

641 - Number of chunks in text 259: 2
642 - Number of chunks in text 1340: 2
643 - Number of chunks in text 281: 1
644 - Number of chunks in text 1126: 4
645 - Number of chunks in text 178: 7
Updating QA dataset
Saved questions to --> ../data/qa_dataset_intermed.json
Updated metadata
646 - Number of chunks in text 711: 2
647 - Number of chunks in text 554: 1
648 - Number of chunks in text 1257: 3
649 - Number of chunks in text 1111: 3
650 - Number of chunks in text 2113: 5
Updating QA dataset
Saved questions to --> ../data/qa_dataset_intermed.json
Updated metadata
651 - Number of chunks in text 946: 1
652 - Number of chunks in text 519: 2
653 - Number of chunks in text 2069: 27
654 - Number of chunks in text 1529: 1
655 - Number of chunks in text 382: 1
Updating QA dataset
Saved questions to --> ../data/qa_dataset_intermed.json
Updated metadata
656 - Number of chunks in text 1539: 1
657 - Number of chunks in text 991: 3
658 - Number of chunks in text 862: 8
659 - Number of chunks in te

In [7]:
dataset_manager.add_qa_to_dataset()

Added questions from ../data/qa_dataset_intermed.json to --> ../data/qa_dataset.json


In [6]:
qa_dataset = dataset_manager.get_qa_dataset()

In [7]:
len(qa_dataset['queries']), len(qa_dataset['answers'])

(7160, 7160)

In [8]:
dataset_manager.create_answers(qa_dataset=qa_dataset,
                               client=InferenceClient(token=hugging_face_api_key),
                               model= model_zephyr,
                               max_new_tokens=500)

2 to be answered
Answering qery b6432350-03b2-4f4f-8664-098d526ba9ef
query:  How can I generate a
Answering qery 8c788494-6cb4-41e6-90ef-facca8249af6
query:  How can a status report be posted without taking any other action on {% data variables.product.prodname_dotcom_the_website %} during a deployment
Updating answers dict
Saved questions to --> ../data/answers_intermed.json


In [9]:
dataset_manager.add_answers_to_dataset()

Added answers from ../data/answers_intermed.json to --> ../data/qa_dataset.json


### Create FAISS indexes

In [38]:
inference = InferenceClient(token=hugging_face_api_key)
model_zephyr ="HuggingFaceH4/zephyr-7b-beta"
model_mistral = "mistralai/Mistral-7B-v0.1"
model_falcon = "tiiuae/falcon-7b-instruct"
model_open = "openchat/openchat_3.5"

In [7]:
nodes = dataset_manager.parse_documents(texts=data,chunk_size=2048)

The documents have been divided into 6014 chunks


In [20]:
rag_chain = rag_manager(nodes=nodes, client=InferenceClient(token=hugging_face_api_key), llm_model=model_zephyr)

In [16]:
model_L6_v2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [49]:
index_flatIP, embeddings = rag_chain.create_index(model=model_L6_v2)

Embeddings created


In [50]:
rag_chain.save_index(index_flatIP, '../data/indexes/index_flatIP')

Successfuly saved index to ../data/indexes/index_flatIP


### Evaluate RAG chain

In [57]:
inference = InferenceClient(token=hugging_face_api_key)
model_zephyr ="HuggingFaceH4/zephyr-7b-beta"
model_mistral = "mistralai/Mistral-7B-v0.1"
model_falcon = "tiiuae/falcon-7b-instruct"
model_open = "openchat/openchat_3.5"
model_L6_v2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)

dataset_manager = QADatasetManager()
df = pd.read_csv("../data/documents.csv")
data = df['content'].to_list()
nodes = dataset_manager.parse_documents(texts=data,chunk_size=2048)

The documents have been divided into 6014 chunks


In [6]:
rag_chain = rag_manager(nodes=nodes, client=InferenceClient(token=hugging_face_api_key), llm_model=model_zephyr)

In [149]:
index = rag_chain.load_index('../data/indexes/index_flatIP_L6_V2')

Successfuly loaded index from ../data/indexes/index_flatIP_L6_V2


In [8]:
queries_test = ["How do you filter notifications?", "What are codespaces ?"]

In [11]:
answers, contexts = rag_chain.augmented_retrieval_generation(queries=queries_test, index=index,embedding_model=model_L6_v2)

totaltime: 0.16382503509521484

Answering query: How do you filter notifications?
Answering query: What are codespaces ?


In [12]:
qa_dataset = dataset_manager.get_qa_dataset()

In [146]:
starts = ['how', 'when', 'who', 'what']
filtered_questions = [
    (key, question) for key, question in qa_dataset['queries'].items()
    if 'example' not in question.lower() and 'context' not in question.lower() and len(question) > 20
    and any(question.lower().startswith(start) for start in starts)
]

# Extracting keys and questions separately
selected_keys = [key for key, _ in filtered_questions]
questions = [question for _, question in filtered_questions]
answers = [qa_dataset['answers'].get(key) for key in selected_keys]
relevant_docs = [qa_dataset['relevant_docs'].get(key)[0] for key in selected_keys]

len(questions)

4837

In [144]:
rd.seed(12345)
q_ids = list(range(len(questions)))
rd.shuffle(q_ids)


questions = np.array(questions)[q_ids][:10].tolist()
ground_truths = np.array(answers)[q_ids][:10].tolist()
relevant_docs = np.array(relevant_docs)[q_ids][:10].tolist()

answers, contexts, contexts_ids = rag_chain.augmented_retrieval_generation(queries=questions, index=index,embedding_model=model_L6_v2, k=10)


data = {
    "questions": questions,
    "answers": answers,
    "contexts": contexts,
    "contexts_ids": contexts_ids,
    "ground_truths": ground_truths,
    "relevant_docs": relevant_docs
}

total search time: 0.1949141025543213

Answering query: How can code review assignments be configured to automatically request code owners for review, while still allowing for individual review requests to be made in addition to the team request
Answering query: What is the process for privately collaborating to fix a vulnerability in a temporary private fork, and why might this be necessary
Answering query: What is the process for publishing a paid plan for an app on {% data variables.product.prodname_marketplace %}
Answering query: How can I recover access to a GitHub account that is locked due to two-factor authentication (2FA) if I have lost my 2FA credentials and am unable to recover them? What alternative options are available to me in this situation?
Answering query: How can I configure the `publishConfig` fields in my `package.json` file to limit publishing to a specific registry
Answering query: How can I write detailed logs to a specific directory during the execution of the 

In [148]:
contexts, contexts_ids, search_time = rag_chain.retrieve_context(index=index, embedding_model=model_L6_v2, queries=questions, k=10)

data = {
    "questions": questions,
    #"answers": answers,
    "contexts": contexts,
    "contexts_ids": contexts_ids,
   # "ground_truths": ground_truths,
    "relevant_docs": relevant_docs
}

total search time: 82.64047455787659

average search time per query: 0.017087084709279467



In [150]:
scores = rag_chain.evaluate_rag(metrics=['mrr', 'hit'], data = data)

In [153]:
k_range = [3,5,10,20]
scores= {}
for k in k_range :
    contexts, contexts_ids, search_time = rag_chain.retrieve_context(index=index, embedding_model=model_L6_v2, queries=questions, k=k)
    data = {
        "questions": questions,
        "contexts": contexts,
        "contexts_ids": contexts_ids,
        "relevant_docs": relevant_docs
    }
    scores_k = rag_chain.evaluate_rag(metrics=['mrr', 'hit'], data = data)
    scores_k['search_time'] = search_time
    scores[k] = scores_k
pd.DataFrame(scores)

KeyboardInterrupt: 