In [3]:
import os
import pandas as pd
import numpy as np
import json
import base64
import yaml
import time 
import faiss
import torch.nn.functional as F
from pathlib import Path
import asyncio
from typing import List
import uuid
import re
from tqdm import tqdm

# Transformers
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from FlagEmbedding import FlagModel, FlagReranker

from llama_index.schema import MetadataMode, TextNode
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.readers.database import DatabaseReader
from llama_index.node_parser import SimpleNodeParser
from llama_index import download_loader


#LLM
from huggingface_hub import InferenceClient

# Embeddings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from huggingface_hub import InferenceClient


# Retrievers
from llama_index.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
)

# Evaluator
from llama_index.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
    RetrieverEvaluator
)

In [4]:
# Read the Hugging Face API token from the YAML file at ../config.yaml.
with Path('../config.yaml').open('r') as config_file:
    config = yaml.safe_load(config_file)
hugging_face_api_key = config['huggingface']['token_api']

In [3]:
MarkdownReader = download_loader("MarkdownReader")
loader = MarkdownReader()

In [4]:
documents = loader.load_data(file=Path('./managing-your-subscriptions.md'))

In [5]:
# Concatenate the text of every loaded document part, in order and without
# any separator, then display the result.
text = "".join(document.get_text() for document in documents)
text

'\n\nChoosing how to unsubscribe\n\nTo unwatch (or unsubscribe from) repositories quickly, navigate to github.com/watching to see all the repositories you\'re following. For more information, see "Unwatching repositories."\n\nTo unsubscribe from multiple notifications at the same time, you can unsubscribe using your inbox or on the subscriptions page. Both of these options offer more context about your subscriptions than the "Watched repositories" page.\n\n\n\nBenefits of unsubscribing from your inbox\n\nWhen you unsubscribe from notifications in your inbox, you have several other triaging options and can filter your notifications by custom filters and discussion types. For more information, see "AUTOTITLE."\n\n\n\nBenefits of unsubscribing from the subscriptions page\n\nWhen you unsubscribe from notifications on the subscriptions page, you can see more of the notifications you\'re subscribed to and sort them by "Most recently subscribed" or "Least recently subscribed".\n\nThe subscrip

In [6]:
# Save the concatenated text for manual inspection.
# The context manager closes the file on exit, so no explicit close() is
# needed; UTF-8 is requested explicitly so that non-ASCII characters do not
# depend on the platform's default encoding.
with open("test_doc.txt", 'w', encoding="utf-8") as output_file:
    output_file.write(text)

In [7]:
def load_docs(file_path: str) -> str:
    """Return the whole text of one markdown file as a single string.

    The file is parsed with the ``MarkdownReader`` loader obtained through
    ``download_loader`` and the text of every document it yields is
    concatenated in order, without any separator.

    Args:
        file_path: Path of the markdown file to read.

    Returns:
        The concatenated text of all parsed documents (``""`` when the loader
        yields nothing).
    """
    # Fetch the loader class, instantiate it and parse the file.
    reader_cls = download_loader("MarkdownReader")
    parsed_documents = reader_cls().load_data(file=Path(file_path))

    return "".join(parsed.get_text() for parsed in parsed_documents)

In [22]:
def explore_folders(root_dir):
    """Recursively load every markdown file found below ``root_dir``.

    Files named ``index.md`` are skipped. Files are visited in the order
    produced by ``os.walk``.

    Args:
        root_dir: Directory to walk.

    Returns:
        A ``(docs, titles)`` tuple of parallel lists: ``docs`` holds the text
        returned by ``load_docs`` for each file, ``titles`` the file name
        without its ``.md`` suffix.
    """
    docs, titles = [], []
    for current_dir, _subdirs, file_names in os.walk(root_dir):
        markdown_names = [
            name for name in file_names
            if name.endswith('.md') and name != 'index.md'
        ]
        for name in markdown_names:
            docs.append(load_docs(file_path=os.path.join(current_dir, name)))
            # Strip the three-character ".md" suffix to get the title.
            titles.append(name[:-3])
    return docs, titles

In [23]:
# Call explore_folders() on every entry of ../content whose name does not end
# with ".md".
# NOTE(review): explore_folders() receives the bare entry name, so os.walk
# resolves it against the current working directory rather than ../content,
# and the returned (docs, titles) tuple is discarded — confirm whether this
# cell is still needed.
for theme_folder in os.listdir('../content'):
    if theme_folder.endswith(".md") != True:
        explore_folders(theme_folder)

In [24]:
# Load only the first entry of ../content as a quick check.
# os.path.join builds the path with the platform's separator instead of a
# hand-written backslash literal (which also contained the invalid escape "\c").
docs, titles = explore_folders(
    os.path.join(os.path.dirname(os.getcwd()), "content", os.listdir('../content')[0])
)

In [25]:
titles

['managing-your-subscriptions',
 'viewing-your-subscriptions',
 'about-notifications',
 'configuring-notifications',
 'customizing-a-workflow-for-triaging-your-notifications',
 'managing-notifications-from-your-inbox',
 'triaging-a-single-notification',
 'about-your-organizations-profile',
 'about-your-profile',
 'managing-your-profile-readme',
 'personalizing-your-profile',
 'pinning-items-to-your-profile',
 'setting-your-profile-to-private',
 'sending-enterprise-contributions-to-your-githubcom-profile',
 'showing-an-overview-of-your-activity-on-your-profile',
 'showing-your-private-contributions-and-achievements-on-your-profile',
 'troubleshooting-commits-on-your-timeline',
 'viewing-contributions-on-your-profile',
 'why-are-my-contributions-not-showing-up-on-my-profile',
 'inviting-collaborators-to-a-personal-repository',
 'maintaining-ownership-continuity-of-your-personal-accounts-repositories',
 'removing-a-collaborator-from-a-personal-repository',
 'removing-yourself-from-a-colla

In [27]:
len(docs)

57

In [34]:
# Build the full corpus: load the markdown documents found below every entry
# of ../content.
folders = os.listdir('../content')
docs = []
titles = []
# The trailing "" keeps a path separator at the end of root_directory.
root_directory = os.path.join(os.path.dirname(os.getcwd()), "content", "")
for folder in folders:
    # Skip markdown files sitting directly in ../content. The test must use the
    # loop variable `folder`, not a name left over from an earlier cell.
    if not folder.endswith(".md"):
        docs__in_folder, titles_in_folder = explore_folders(os.path.join(root_directory, folder))
        docs.extend(docs__in_folder)
        titles.extend(titles_in_folder)

In [36]:
len(titles)

2138

In [40]:
df = pd.DataFrame({'content': docs, 'title': titles})

In [41]:
df

Unnamed: 0,content,title
0,\n\nChoosing how to unsubscribe\n\nTo unwatch ...,managing-your-subscriptions
1,\n\nDiagnosing why you receive too many notifi...,viewing-your-subscriptions
2,\n\nNotifications and subscriptions\n\nYou can...,about-notifications
3,\n\nNotification delivery options\n\nYou can r...,configuring-notifications
4,\n\nStarting your inbox triage\n\nBefore you s...,customizing-a-workflow-for-triaging-your-notif...
...,...,...
2133,\n\nAbout disabling webhooks\n\n{% ifversion f...,disabling-webhooks
2134,\n\nAbout editing webhooks\n\nYou can edit a w...,editing-webhooks
2135,\n\nAbout webhook delivery failures\n\nA webho...,handling-failed-webhook-deliveries
2136,"\n\nIntroduction\n\nWhen you create a webhook,...",handling-webhook-deliveries


In [44]:
# Persist the corpus. index=False keeps the positional index out of the file,
# so reading it back with pd.read_csv does not create an "Unnamed: 0" column.
df.to_csv("../data/documents.csv", index=False)

In [14]:
# Inference client authenticated with the token read from ../config.yaml
# (hugging_face_api_key must already be defined by the config cell).
inference = InferenceClient(token=hugging_face_api_key)
# Model ids passed as the `model` argument of the client's text_generation calls.
model_zephyr ="HuggingFaceH4/zephyr-7b-beta"
model_mistral = "mistralai/Mistral-7B-v0.1"
model_falcon = "tiiuae/falcon-7b-instruct"
model_open = "openchat/openchat_3.5"

NameError: name 'hugging_face_api_key' is not defined

In [4]:
inference.text_generation(prompt="What is football", model=model_zephyr, max_new_tokens=50)

"?\n\nFootball is a sport that is played by two teams of eleven players each. The objective of the game is to score goals by kicking a ball into the opposing team's goal. The team with the most goals at the end"

In [4]:
MiniLM_FR = HuggingFaceEmbedding(model_name="../biencoder-MiniLM-L6-all-v2-mmarcoFR")

In [6]:
sentences = ['This is an example sentence', 'Each sentence is converted']

In [12]:
np.array(MiniLM_FR._get_text_embeddings(sentences)).shape

(2, 384)

In [8]:
model_L6_v2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [7]:
model_L6_v2.encode(sentences).shape

(2, 384)

In [15]:
model_mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [14]:
model_mpnet.encode(sentences).shape

(2, 384)

In [11]:
reranker = FlagReranker('BAAI/bge-reranker-base', use_fp16=True) 
# Setting use_fp16 to True speeds up computation with a slight performance degradation

In [10]:
scores = reranker.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']])
print(scores)

[-8.154391288757324, 6.182114601135254]


In [11]:
reranker.compute_score(['Hello', 'hi'])

6.860960483551025

## Testing Faiss

In [12]:
df = pd.read_csv("../data/documents.csv")
df.head()

Unnamed: 0,content,title
0,\n\nChoosing how to unsubscribe\n\nTo unwatch ...,managing-your-subscriptions
1,\n\nDiagnosing why you receive too many notifi...,viewing-your-subscriptions
2,\n\nNotifications and subscriptions\n\nYou can...,about-notifications
3,\n\nNotification delivery options\n\nYou can r...,configuring-notifications
4,\n\nStarting your inbox triage\n\nBefore you s...,customizing-a-workflow-for-triaging-your-notif...


In [13]:
df.content.values.shape

(2138,)

In [14]:
def create_index(data, text_column, model):
    """Embed one text column and build an inner-product FAISS index over it.

    Args:
        data: DataFrame holding the texts; its index values (cast to int64)
            become the ids stored in the FAISS index.
        text_column: Name of the column of ``data`` to embed.
        model: Object exposing ``encode(list_of_texts)`` and returning a 2-D
            array with one row per text.

    Returns:
        ``(index, embedding)``: the populated ``IndexIDMap`` wrapping an
        ``IndexFlatIP``, and the embeddings exactly as returned by
        ``model.encode`` (not the normalised copy stored in the index).
    """
    texts = data[text_column].to_list()
    embedding = model.encode(texts)
    print("Embeddings created")

    # Work on a float32 copy so the in-place L2 normalisation below leaves the
    # returned ``embedding`` untouched.
    vectors = embedding.copy().astype(np.float32)
    ids = data.index.values.astype(np.int64)
    faiss.normalize_L2(vectors)

    # Inner product over unit-length vectors equals cosine similarity.
    n_dims = embedding.shape[1]
    index = faiss.IndexIDMap(faiss.IndexFlatIP(n_dims))
    index.add_with_ids(vectors, ids)

    return index, embedding

def query_search(index, model, query, k, data=None):
    """Retrieve the ``k`` nearest documents for ``query`` from a FAISS index.

    Args:
        index: FAISS index exposing ``search(vectors, k)``.
        model: Object exposing ``encode(list_of_texts)`` and returning a 2-D
            array.
        query: The query string.
        k: Number of neighbours requested from the index.
        data: DataFrame with ``'title'`` and ``'content'`` columns whose index
            labels match the ids stored in ``index``. Defaults to the
            notebook-level ``df`` so existing calls keep working.

    Returns:
        A list of dicts with keys ``'id'``, ``'title'``, ``'text'`` and
        ``'similarity_score'``, in the order returned by the index. The list
        may hold fewer than ``k`` items when the index finds fewer neighbours.
    """
    # Prefer the explicitly supplied frame; fall back to the global one.
    documents = df if data is None else data

    start = time.perf_counter()
    query_vector = model.encode([query]).astype(np.float32)
    faiss.normalize_L2(query_vector)

    similarities, similarities_ids = index.search(query_vector, k)
    print('totaltime: {}\n'.format(time.perf_counter() - start))

    # Scores are clamped to [0, 1]; this treats the values returned by the
    # index as similarities.
    similarities = np.clip(similarities, 0, 1)

    output = []
    for doc_id, score in zip(similarities_ids[0], similarities[0]):
        # FAISS pads the result with id -1 when it finds fewer than k
        # neighbours; such entries have no matching row and are skipped.
        if doc_id == -1:
            continue
        output.append({
            'id': doc_id,
            'title': documents.loc[doc_id, 'title'],
            'text': documents.loc[doc_id, 'content'],
            'similarity_score': score,
        })

    return output

In [7]:
model_L6_v2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
index_flatIP, embeddings = create_index(data=df, 
                          text_column='content', 
                          model=model_L6_v2)

In [16]:
index_flatIP.ntotal

2138

In [19]:
faiss.write_index(index_flatIP, '../data/index_flatIP')

In [15]:
index_flatIP = faiss.read_index('../data/index_flatIP')

In [16]:
search_results = query_search(index=index_flatIP, 
                  model=model_L6_v2, 
                  query="What are codespaces", 
                  k=10)

totaltime: 0.31176304817199707



In [17]:
search_results

[{'id': 960,
  'title': 'overview',
  'text': '\n\nWhat is a codespace?\n\nA codespace is a development environment that\'s hosted in the cloud. You can customize your project for {% data variables.product.prodname_github_codespaces %} by committing configuration files to your repository (often known as Configuration-as-Code), which creates a repeatable codespace configuration for all users of your project. For more information, see "AUTOTITLE."\n\nEach codespace you create is hosted by {% data variables.product.prodname_dotcom %} in a Docker container, running on a virtual machine. You can choose from a selection of virtual machine types, from 2 cores, 8 GB RAM, and 32 GB storage, up to 32 cores, 64 GB RAM, and 128 GB storage.\n\nBy default, the codespace development environment is created from an Ubuntu Linux image that includes a selection of popular languages and tools, but you can use an image based on a Linux distribution of your choice and configure it for your particular requir

In [23]:
# Testing indexIVflat
dimension = embeddings.shape[1]
nlist = 60  # how many Voronoi cells/partitions
quantizer = faiss.IndexFlatL2(dimension)
indexIVFFlat = faiss.IndexIVFFlat(quantizer, dimension, nlist)

In [24]:
indexIVFFlat.is_trained

False

In [25]:
# We train the flat index using our embeddings
indexIVFFlat.train(embeddings)
indexIVFFlat.is_trained  # check if index is now trained

True

In [26]:
indexIVFFlat.add(embeddings)
indexIVFFlat.ntotal  # number of embeddings indexed

2138

In [27]:
query = "What are codespaces ?"
search_results = query_search(index = indexIVFFlat, 
                              model = model_L6_v2, 
                              query = query, 
                              k=10)

totaltime: 0.028784513473510742



In [28]:
search_results

[{'id': 960,
  'title': 'overview',
  'text': '\n\nWhat is a codespace?\n\nA codespace is a development environment that\'s hosted in the cloud. You can customize your project for {% data variables.product.prodname_github_codespaces %} by committing configuration files to your repository (often known as Configuration-as-Code), which creates a repeatable codespace configuration for all users of your project. For more information, see "AUTOTITLE."\n\nEach codespace you create is hosted by {% data variables.product.prodname_dotcom %} in a Docker container, running on a virtual machine. You can choose from a selection of virtual machine types, from 2 cores, 8 GB RAM, and 32 GB storage, up to 32 cores, 64 GB RAM, and 128 GB storage.\n\nBy default, the codespace development environment is created from an Ubuntu Linux image that includes a selection of popular languages and tools, but you can use an image based on a Linux distribution of your choice and configure it for your particular requir

In [29]:
reranker = FlagReranker('BAAI/bge-reranker-base', use_fp16=True) 

In [30]:
def compute_rerank_score(results, reranker, query):
    """Attach a reranker relevance score to every search result, in place.

    Args:
        results: List of result dicts, each with a ``'text'`` key. A
            ``'rerank_score'`` key is added to (or overwritten in) each dict.
        reranker: Object exposing ``compute_score(pairs)``.
        query: The query the results were retrieved for.

    Returns:
        None. ``results`` is modified in place.
    """
    # Nothing to score: avoid calling the model with an empty batch.
    if not results:
        return

    # Pairs are ordered [query, passage], the same order as the compute_score
    # example calls elsewhere in this notebook.
    pairs = [[query, result['text']] for result in results]
    scores = reranker.compute_score(pairs)

    # compute_score can return a bare number instead of a list when it is
    # given a single pair; wrap it so the loop below works in both cases.
    if not isinstance(scores, (list, tuple, np.ndarray)):
        scores = [scores]

    for result, score in zip(results, scores):
        result['rerank_score'] = score

In [31]:
compute_rerank_score(search_results, reranker, query)

In [32]:
sorted(search_results, key=lambda x: x['rerank_score'], reverse=True)[:5]

[{'id': 960,
  'title': 'overview',
  'text': '\n\nWhat is a codespace?\n\nA codespace is a development environment that\'s hosted in the cloud. You can customize your project for {% data variables.product.prodname_github_codespaces %} by committing configuration files to your repository (often known as Configuration-as-Code), which creates a repeatable codespace configuration for all users of your project. For more information, see "AUTOTITLE."\n\nEach codespace you create is hosted by {% data variables.product.prodname_dotcom %} in a Docker container, running on a virtual machine. You can choose from a selection of virtual machine types, from 2 cores, 8 GB RAM, and 32 GB storage, up to 32 cores, 64 GB RAM, and 128 GB storage.\n\nBy default, the codespace development environment is created from an Ubuntu Linux image that includes a selection of popular languages and tools, but you can use an image based on a Linux distribution of your choice and configure it for your particular requir

In [10]:
def encode_chunk_idx(chunk_num, idx):
    """Build the base64 node id for chunk ``chunk_num`` of text index ``idx``.

    The id is the base64 encoding of the string
    ``"chunk_<chunk_num>_index_<idx>"``.
    """
    raw_id = f"chunk_{chunk_num}_index_{idx}"
    return base64.b64encode(raw_id.encode()).decode()

def decode_chunk_idx(encoded_id):
    """Recover the two integers stored in a node id built by ``encode_chunk_idx``.

    Returns:
        ``(idx, chunk_num)`` — note that this is the reverse of the argument
        order of ``encode_chunk_idx``.
    """
    plain_id = base64.b64decode(encoded_id).decode()

    # The layout is "chunk_<chunk_num>_index_<idx>", so after splitting on "_"
    # the numbers sit at positions 1 and 3.
    fields = plain_id.split('_')
    chunk_num, idx = int(fields[1]), int(fields[3])

    return idx, chunk_num

def get_max_index(corpus_keys):
    """Return the largest text index among the encoded node ids.

    Args:
        corpus_keys: Iterable of node ids as produced by ``encode_chunk_idx``.

    Returns:
        The highest decoded ``idx``, never lower than ``-1``; ``-1`` when
        ``corpus_keys`` is empty.
    """
    # decode_chunk_idx returns (idx, chunk_num); only the first item matters.
    indices = [decode_chunk_idx(encoded_id)[0] for encoded_id in corpus_keys]
    return max([-1, *indices])

In [17]:
def update_qa_pairs(
    qa_dataset,
    texts: List[str],
    client: InferenceClient,
    model: str,
    qa_generate_prompt_tmpl: str,
    max_new_tokens: int = 100,
    num_questions_per_chunk: int = 2,
    chunk_size: int = 1024
):
    """Generate questions for ``texts`` with an LLM and add them to ``qa_dataset``.

    Every text is cut into consecutive chunks of ``chunk_size`` characters.
    For each chunk the prompt template is filled in and sent to the model, and
    the questions parsed from the completion are recorded as relevant to that
    chunk.

    Args:
        qa_dataset: Dict with ``'queries'``, ``'corpus'`` and
            ``'relevant_docs'`` dicts; it is updated in place.
        texts: Texts to process. ``texts[0]`` is numbered with the highest
            text index already present in ``qa_dataset['corpus']`` (0 for an
            empty corpus), so that text is regenerated.
        client: Client exposing ``text_generation(prompt=, model=,
            max_new_tokens=)``.
        model: Model id forwarded to the client.
        qa_generate_prompt_tmpl: Template with ``{context_str}`` and
            ``{num_questions_per_chunk}`` placeholders.
        max_new_tokens: Generation budget forwarded to the client.
        num_questions_per_chunk: Number of questions requested in the prompt.
        chunk_size: Chunk length, in characters.

    Returns:
        None. Results are written into ``qa_dataset`` every fifth text and
        once more after the last text, so nothing generated is left out.
    """
    retries = 3  # attempts per chunk before it is skipped

    pending_nodes = {}
    pending_queries = {}
    pending_relevant_docs = {}

    # An empty corpus yields a max index of -1; clamp to 0 so the first text
    # is not given a negative index.
    index_offset = max(get_max_index(qa_dataset['corpus'].keys()), 0)

    def flush_pending():
        """Move everything accumulated so far into ``qa_dataset``."""
        print("Updating q&a dataset")
        qa_dataset['queries'].update(pending_queries)
        qa_dataset['corpus'].update(pending_nodes)
        qa_dataset['relevant_docs'].update(pending_relevant_docs)
        pending_nodes.clear()
        pending_queries.clear()
        pending_relevant_docs.clear()

    for idx, text in enumerate(texts):
        text_index = idx + index_offset
        num_chunks = (len(text) + chunk_size - 1) // chunk_size
        print(f"Number of chunks in text {text_index}: {num_chunks}")
        for chunk_num in range(num_chunks):
            # Slicing past the end of the string simply stops at the end.
            chunk_text = text[chunk_num * chunk_size:(chunk_num + 1) * chunk_size]

            node_id = encode_chunk_idx(chunk_num, text_index)
            pending_nodes[node_id] = chunk_text

            query = qa_generate_prompt_tmpl.format(
                context_str=chunk_text, num_questions_per_chunk=num_questions_per_chunk
            )

            # Retry the remote call after a pause; a chunk whose call keeps
            # failing is skipped (best effort) but its text stays in the corpus.
            for attempt in range(retries):
                try:
                    response = client.text_generation(prompt=query, model=model, max_new_tokens=max_new_tokens)
                except Exception as e:
                    print(f"Error encountered: {e}")
                    if attempt < retries - 1:
                        print(f"Retrying after 2 minutes ({attempt+1}/{retries})...")
                        time.sleep(120)  # Sleep for 2 minutes before retrying
                    else:
                        print("Maximum retries reached. Skipping this text chunk.")
                else:
                    # Parsing is kept out of the try block so that only
                    # failures of the remote call trigger a retry.
                    for question in _extract_questions(response):
                        question_id = str(uuid.uuid4())
                        pending_queries[question_id] = question
                        pending_relevant_docs[question_id] = [node_id]
                    break

        if idx % 5 == 0:
            flush_pending()
            time.sleep(30)

    # Texts processed after the last periodic flush would otherwise be lost.
    if pending_nodes or pending_queries or pending_relevant_docs:
        flush_pending()


def _extract_questions(response: str) -> List[str]:
    """Split a raw completion into cleaned, de-duplicated questions."""
    # Splitting on "?\n" removes the question mark from every question except
    # possibly the last one.
    cleaned = [
        re.sub(r"^(-{1,})", "", re.sub(r"^\s*\d+\.\s*|\n", "", candidate))
        for candidate in response.split("?\n")
    ]
    # Fragments of 10 characters or fewer are dropped; dict.fromkeys removes
    # duplicates while keeping first-seen order.
    return list(dict.fromkeys(question for question in cleaned if len(question) > 10))

In [14]:
# Same definitions as the earlier inference-client cell, repeated here so this
# section can be run after hugging_face_api_key has been loaded.
inference = InferenceClient(token=hugging_face_api_key)
# Model ids passed as the `model` argument of text-generation calls.
model_zephyr ="HuggingFaceH4/zephyr-7b-beta"
model_mistral = "mistralai/Mistral-7B-v0.1"
model_falcon = "tiiuae/falcon-7b-instruct"
model_open = "openchat/openchat_3.5"

In [18]:
QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
EXACTLY {num_questions_per_chunk} questions for an upcoming \
quiz/examination. 
The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided. 
Questions should be understandable without having access to the context.
Don't hallucinate.
questions:
"""

In [19]:
# Load the existing Q&A dataset from disk, or start from an empty one when the
# file has not been created yet (first run).
qa_dataset_path = Path('../data/qa_dataset.json')
if qa_dataset_path.exists():
    with qa_dataset_path.open('r') as json_file:
        qa_dataset = json.load(json_file)
else:
    qa_dataset = {'queries': {}, 'corpus': {}, 'relevant_docs': {}}

In [2]:
max_index = get_max_index(qa_dataset['corpus'].keys())
max_index

NameError: name 'qa_dataset' is not defined

In [43]:
update_qa_pairs(
    qa_dataset,
    texts=df.loc[max_index:,'content'].to_list(),
    client=InferenceClient(token=hugging_face_api_key),
    model= model_zephyr,
    max_new_tokens=200,
    qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk=2,
    chunk_size = 2048
)

Number of chunks in text 0: 1
Error encountered: 500 Server Error: Internal Server Error for url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta (Request ID: hfw0HZqMFyRBcCmFEXotP)
Retrying after 2 minutes (1/3)...
Updating q&a dataset
Number of chunks in text 1: 1
Number of chunks in text 2: 1
Number of chunks in text 3: 1
Error encountered: 500 Server Error: Internal Server Error for url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta (Request ID: asQbxv3ToKChzqWzqw7kG)
Retrying after 2 minutes (1/3)...
Number of chunks in text 4: 1
Number of chunks in text 5: 1
Updating q&a dataset
Number of chunks in text 6: 1
Number of chunks in text 7: 2


KeyboardInterrupt: 

In [23]:
qa_dataset

{'queries': {'a3eb1dc5-c1c1-404c-aafe-41971e359bfd': 'What are the benefits of unsubscribing from notifications in my inbox versus unsubscribing from notifications on the subscriptions page on GitHub?',
  'fb33a268-26e2-420d-85fe-3222c1cd9286': 'How can I quickly unsubscribe from multiple notifications at the same time on GitHub',
  '99454d3c-9cd0-4681-8f34-68cf44ed14fe': 'How can I manage the notifications I receive for repositories I am watching on GitHub',
  '5c0e0049-bd6a-4c42-8409-3ae7144c7557': 'How can I choose to only receive notifications when participating or mentioned for a repository I am watching on GitHub',
  '0e3c2dd3-d76e-4107-b843-0fa8e5cfb7d8': 'Where can I find the "Manage notifications" dropdown menu on GitHub',
  '8bd92299-8e58-41e6-8673-7491deccafcc': 'How can I unwatch all repositories owned by a specific user or organization on GitHub',
  '0c2ab690-7907-45f2-97bc-b189137f34c9': 'How can I unsubscribe from specific notifications on GitHub',
  '881a403c-746d-4a40-

In [25]:
with open('../data/qa_dataset.json', 'w') as json_file:
    json.dump(qa_dataset, json_file)

In [167]:
qa_test = {'queries': {'q1': 'test 1', 
             'q2': 'test 2'},
 'corpus': {'node_1': 'chunk_1',
            'node_2': 'chunk_2'}, 
'relevant_docs': {'q1': 'text_1',
                  'q2': 'text_2'}}

In [168]:
queries = {'q3': 'question_3'}
node_dict = {'node_3': 'chunk_3'}
relevant_docs = {'q3': 'text_3'}

In [169]:
qa_test['queries'].update(queries)
qa_test['corpus'].update(node_dict)
qa_test['relevant_docs'].update(relevant_docs)

In [170]:
qa_test

{'queries': {'q1': 'test 1', 'q2': 'test 2', 'q3': 'question_3'},
 'corpus': {'node_1': 'chunk_1', 'node_2': 'chunk_2', 'node_3': 'chunk_3'},
 'relevant_docs': {'q1': 'text_1', 'q2': 'text_2', 'q3': 'text_3'}}