In [33]:
import os
import pandas as pd
import numpy as np
import yaml
import torch
import torch.nn.functional as F
from pathlib import Path
#from utils import custom_generate_qa_embedding_pairs, display_results, translate_article
import asyncio

# Transformers
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer


from llama_index.schema import MetadataMode, TextNode
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.readers.database import DatabaseReader
from llama_index.node_parser import SimpleNodeParser
from llama_index import download_loader


#LLM
from huggingface_hub import InferenceClient

# Embeddings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from huggingface_hub import InferenceClient


# Retrievers
from llama_index.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
)

# Evaluator
from llama_index.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
    RetrieverEvaluator
)

In [2]:
# Read the Hugging Face API token from the project's YAML configuration file
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

hugging_face_api_key = config['huggingface']['token_api']

In [3]:
# Fetch the "MarkdownReader" loader class through download_loader and
# instantiate it for the single-file experiment in the following cells.
MarkdownReader = download_loader("MarkdownReader")
loader = MarkdownReader()

In [4]:
# Parse one sample markdown article; the result is an indexable collection of
# document objects whose text is read with get_text() in the next cell.
documents = loader.load_data(file=Path('./managing-your-subscriptions.md'))

In [5]:
# Concatenate the text of every parsed document into a single string.
# str.join builds the result in one pass instead of re-copying the
# accumulated string on every loop iteration.
text = "".join(document.get_text() for document in documents)
text

'\n\nChoosing how to unsubscribe\n\nTo unwatch (or unsubscribe from) repositories quickly, navigate to github.com/watching to see all the repositories you\'re following. For more information, see "Unwatching repositories."\n\nTo unsubscribe from multiple notifications at the same time, you can unsubscribe using your inbox or on the subscriptions page. Both of these options offer more context about your subscriptions than the "Watched repositories" page.\n\n\n\nBenefits of unsubscribing from your inbox\n\nWhen you unsubscribe from notifications in your inbox, you have several other triaging options and can filter your notifications by custom filters and discussion types. For more information, see "AUTOTITLE."\n\n\n\nBenefits of unsubscribing from the subscriptions page\n\nWhen you unsubscribe from notifications on the subscriptions page, you can see more of the notifications you\'re subscribed to and sort them by "Most recently subscribed" or "Least recently subscribed".\n\nThe subscrip

In [6]:
# Persist the concatenated article text for manual inspection.
# The `with` block closes the file on exit, so no explicit close() is needed.
# An explicit encoding keeps the write independent of the platform default
# (e.g. cp1252 on Windows), which may be unable to encode the article text.
with open("test_doc.txt", 'w', encoding="utf-8") as fichier:
    fichier.write(text)

In [7]:
# Loader classes already fetched through download_loader, keyed by loader name.
_LOADER_CLASS_CACHE = {}


def _get_loader_class(loader_name: str):
    """Return the loader class for ``loader_name``, fetching it only on first use."""
    if loader_name not in _LOADER_CLASS_CACHE:
        _LOADER_CLASS_CACHE[loader_name] = download_loader(loader_name)
    return _LOADER_CLASS_CACHE[loader_name]


def load_docs(file_path: str) -> str:
    """Load one markdown file and return its text as a single string.

    The file is parsed with the "MarkdownReader" loader; the text of every
    document it returns is concatenated in order, with no separator.

    Args:
        file_path: Path of the markdown file to read.

    Returns:
        The concatenated text of all documents parsed from the file
        (an empty string when the loader returns no documents).
    """
    # The loader class is resolved once and reused: this function is called
    # once per file, so repeating download_loader on every call is wasted work.
    loader = _get_loader_class("MarkdownReader")()
    documents = loader.load_data(file=Path(file_path))

    # join avoids the quadratic cost of growing a string with repeated `+`.
    return "".join(document.get_text() for document in documents)

In [22]:
def explore_folders(root_dir):
    """Recursively load every markdown article found under ``root_dir``.

    Files named ``index.md`` and files without the ``.md`` suffix are skipped.

    Args:
        root_dir: Directory to walk (sub-directories included).

    Returns:
        A ``(docs, titles)`` tuple of two parallel lists: the text returned by
        ``load_docs`` for each file, and the file name with its ``.md`` suffix
        removed.
    """
    docs, titles = [], []
    for current_dir, _subdirs, names in os.walk(root_dir):
        for name in names:
            # Guard clause: ignore index pages and anything that is not markdown
            if name == 'index.md' or not name.endswith('.md'):
                continue
            docs.append(load_docs(file_path=os.path.join(current_dir, name)))
            titles.append(name[:-3])  # drop the trailing ".md"
    return docs, titles

In [23]:
# Dry run: walk every theme folder under ../content (results are discarded).
# os.listdir returns bare entry names, so each one is joined back onto the
# content directory; a bare name would make os.walk look relative to the
# notebook's working directory and silently find nothing.
for theme_folder in os.listdir('../content'):
    # Top-level .md files are not theme folders
    if not theme_folder.endswith(".md"):
        explore_folders(os.path.join('../content', theme_folder))

In [24]:
# Load only the first listed entry of ../content as a quick check.
# os.path.join uses the platform's separator, so the path no longer relies on
# hard-coded backslashes (and on the invalid "\c" escape sequence).
docs, titles = explore_folders(
    os.path.join(os.path.dirname(os.getcwd()), "content", os.listdir('../content')[0])
)

In [25]:
# Show the titles (file names without ".md") collected by the single-folder load
titles

['managing-your-subscriptions',
 'viewing-your-subscriptions',
 'about-notifications',
 'configuring-notifications',
 'customizing-a-workflow-for-triaging-your-notifications',
 'managing-notifications-from-your-inbox',
 'triaging-a-single-notification',
 'about-your-organizations-profile',
 'about-your-profile',
 'managing-your-profile-readme',
 'personalizing-your-profile',
 'pinning-items-to-your-profile',
 'setting-your-profile-to-private',
 'sending-enterprise-contributions-to-your-githubcom-profile',
 'showing-an-overview-of-your-activity-on-your-profile',
 'showing-your-private-contributions-and-achievements-on-your-profile',
 'troubleshooting-commits-on-your-timeline',
 'viewing-contributions-on-your-profile',
 'why-are-my-contributions-not-showing-up-on-my-profile',
 'inviting-collaborators-to-a-personal-repository',
 'maintaining-ownership-continuity-of-your-personal-accounts-repositories',
 'removing-a-collaborator-from-a-personal-repository',
 'removing-yourself-from-a-colla

In [27]:
# Number of documents currently held in `docs`
len(docs)

57

In [34]:
# Load every article from every theme folder under ../content.
folders = os.listdir('../content')
docs = []
titles = []
# The trailing "" keeps a trailing path separator on root_directory, so plain
# string concatenation with a folder name still yields a valid path.
root_directory = os.path.join(os.path.dirname(os.getcwd()), "content", "")
for folder in folders:
    # Filter on the loop variable itself: top-level .md files are not folders.
    # (Testing a variable left over from another cell would apply one stale
    # answer to every entry.)
    if not folder.endswith(".md"):
        docs_in_folder, titles_in_folder = explore_folders(os.path.join(root_directory, folder))
        # extend appends in place instead of rebuilding the lists each time
        docs.extend(docs_in_folder)
        titles.extend(titles_in_folder)

In [36]:
# Number of titles collected across all content folders
len(titles)

2138

In [40]:
# One row per article: `content` holds the loaded text, `title` the file name
# without ".md". Both lists must have the same length for this to succeed.
df = pd.DataFrame({'content': docs, 'title': titles})

In [41]:
# Display the assembled articles table
df

Unnamed: 0,content,title
0,\n\nChoosing how to unsubscribe\n\nTo unwatch ...,managing-your-subscriptions
1,\n\nDiagnosing why you receive too many notifi...,viewing-your-subscriptions
2,\n\nNotifications and subscriptions\n\nYou can...,about-notifications
3,\n\nNotification delivery options\n\nYou can r...,configuring-notifications
4,\n\nStarting your inbox triage\n\nBefore you s...,customizing-a-workflow-for-triaging-your-notif...
...,...,...
2133,\n\nAbout disabling webhooks\n\n{% ifversion f...,disabling-webhooks
2134,\n\nAbout editing webhooks\n\nYou can edit a w...,editing-webhooks
2135,\n\nAbout webhook delivery failures\n\nA webho...,handling-failed-webhook-deliveries
2136,"\n\nIntroduction\n\nWhen you create a webhook,...",handling-webhook-deliveries


In [44]:
# Save the articles table for later use.
# NOTE(review): with default arguments to_csv also writes the row index as an
# unnamed first column; readers may need index_col=0 — confirm against consumers.
df.to_csv("../data/documents.csv")

In [3]:
# Hugging Face inference client authenticated with the token read from config.yaml
inference = InferenceClient(token=hugging_face_api_key)
# Candidate model identifiers; each can be passed as `model=` to the client
model_zephyr ="HuggingFaceH4/zephyr-7b-beta"
model_mistral = "mistralai/Mistral-7B-v0.1"
model_falcon = "tiiuae/falcon-7b-instruct"
model_open = "openchat/openchat_3.5"

In [4]:
# Smoke-test text generation with the Zephyr model, capped at 50 new tokens
inference.text_generation(prompt="What is football", model=model_zephyr, max_new_tokens=50)

"?\n\nFootball is a sport that is played by two teams of eleven players each. The objective of the game is to score goals by kicking a ball into the opposing team's goal. The team with the most goals at the end"

In [8]:
# Embedding model wrapped for llama_index.
# NOTE(review): model_name looks like a relative local path (one level above the
# notebook) rather than a hub id — confirm the checkpoint is present there.
MiniLM_FR = HuggingFaceEmbedding(model_name="../biencoder-MiniLM-L6-all-v2-mmarcoFR")

In [11]:
# Two sample sentences used to check the output shape of each embedding model
sentences = ['This is an example sentence', 'Each sentence is converted']

In [12]:
# Check the embedding output shape: (number of sentences, embedding dimension).
# Uses the public batch API rather than the underscore-prefixed internal method,
# which is not part of the supported interface.
np.array(MiniLM_FR.get_text_embedding_batch(sentences)).shape

(2, 384)

In [4]:
# Load the all-MiniLM-L6-v2 sentence-transformers model by its model id
model_L6_v2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [7]:
# Shape of the encoded sample sentences: (number of sentences, embedding dimension)
model_L6_v2.encode(sentences).shape

(2, 384)

In [15]:
# Load the all-mpnet-base-v2 sentence-transformers model by its model id
model_mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [14]:
# Shape of the encoded sample sentences: (number of sentences, embedding dimension).
# NOTE(review): this cell's execution count (14) precedes the cell that binds
# model_mpnet (15), so the recorded output may come from an earlier binding;
# all-mpnet-base-v2 is presumably 768-dimensional — re-run and confirm.
model_mpnet.encode(sentences).shape

(2, 384)