In [2]:
import pandas as pd
import json
import yaml
from qa_dataset_manager import QADatasetManager
from extract import DocumentProcessor

#LLM
from huggingface_hub import InferenceClient

In [3]:
# Load hugging Face token
with open('../config.yaml', 'r') as config_file:
        config = yaml.safe_load(config_file)
hugging_face_api_key = config['huggingface']['token_api']

In [2]:
# Extract markdown documents and store them in "data/documents.csv"
processor = DocumentProcessor( root_dir='../content', output_path='../data/documents.csv')
processor.process_documents()

Extracting files from folder: account-and-profile
Extracting files from folder: actions
Extracting files from folder: admin
Extracting files from folder: apps
Extracting files from folder: authentication
Extracting files from folder: billing
Extracting files from folder: code-security
Extracting files from folder: codespaces
Extracting files from folder: communities
Extracting files from folder: contributing
Extracting files from folder: copilot
Extracting files from folder: desktop
Extracting files from folder: discussions
Extracting files from folder: education
Extracting files from folder: get-started
Extracting files from folder: github-cli
Extracting files from folder: graphql
Extracting files from folder: index.md
Extracting files from folder: issues
Extracting files from folder: migrations
Extracting files from folder: organizations
Extracting files from folder: packages
Extracting files from folder: pages
Extracting files from folder: pull-requests
Extracting files from folder:

In [4]:
inference = InferenceClient(token=hugging_face_api_key)
model_zephyr ="HuggingFaceH4/zephyr-7b-beta"
model_mistral = "mistralai/Mistral-7B-v0.1"
model_falcon = "tiiuae/falcon-7b-instruct"
model_open = "openchat/openchat_3.5"

In [5]:
df = pd.read_csv("../data/documents.csv")
data = df['content'].to_list()
df.head()

Unnamed: 0,content,title
0,\n\nChoosing how to unsubscribe\n\nTo unwatch ...,managing-your-subscriptions
1,\n\nDiagnosing why you receive too many notifi...,viewing-your-subscriptions
2,\n\nNotifications and subscriptions\n\nYou can...,about-notifications
3,\n\nNotification delivery options\n\nYou can r...,configuring-notifications
4,\n\nStarting your inbox triage\n\nBefore you s...,customizing-a-workflow-for-triaging-your-notif...


In [6]:
dataset_manager = QADatasetManager()

In [7]:
dataset_manager.create_qa_pairs(
    texts=data,
    client=InferenceClient(token=hugging_face_api_key),
    model= model_zephyr,
    max_new_tokens=200,
    num_questions_per_chunk=2,
    chunk_size = 2048)

Number of chunks in text 1280: 5
Updating QA dataset
Saved questions to --> ../data/qa_dataset_intermed.json
Updated metadata
Number of chunks in text 1768: 1
Number of chunks in text 870: 7
Updating QA dataset
Saved questions to --> ../data/qa_dataset_intermed.json
Updated metadata
Number of chunks in text 2062: 1
Number of chunks in text 707: 1
Number of chunks in text 972: 3
Number of chunks in text 893: 6
Number of chunks in text 1732: 3
Number of chunks in text 1239: 2
Number of chunks in text 602: 5
Number of chunks in text 1876: 1
Number of chunks in text 210: 3
Updating QA dataset
Saved questions to --> ../data/qa_dataset_intermed.json
Updated metadata
Number of chunks in text 346: 3
Number of chunks in text 1935: 1
Updating QA dataset
Saved questions to --> ../data/qa_dataset_intermed.json
Updated metadata
Number of chunks in text 1867: 1
Number of chunks in text 2107: 1
Number of chunks in text 716: 6
Number of chunks in text 1608: 1
Number of chunks in text 2046: 10
Number o

KeyboardInterrupt: 

In [9]:
dataset_manager.qa_intermed['queries']

{'4d1a2b46-b1c1-42ae-a068-e63d13a03fd9': "What is the importance of including a license file when creating a public repository from a fork of someone's project? How can I ensure that I am following open source principles when using GitHub for my organization's development work?",
 '1e6853e3-4a80-4666-a2a4-2144889cc7a1': "What is a fork in the context of GitHub? How can it be used to propose changes to someone else's project",
 'c1b9504f-a5a7-4001-979f-52e40826a2c5': 'How can I create a fork of a repository on GitHub using the web interface',
 '720df159-7cea-4af3-a58a-e693d92e7e2d': 'How can I clone my forked repository to my local computer using the GitHub desktop app',
 '32f9a088-769a-46d4-9c1f-2abeed7ebc2b': 'How can I configure Git to pull changes from the upstream repository into my local clone of the Spoon-Knife repository?',
 'e9495880-65b5-4282-bb10-cc5166a7b70b': 'How can I navigate to my fork of the Spoon-Knife repository on {% ifversion fpt or ghec %}{% data variables.product

In [11]:
len(dataset_manager.qa_intermed['queries'])

476

In [12]:
dataset_manager.add_qa_to_dataset()

Added questions from ../data/qa_dataset_intermed.json to --> ../data/qa_dataset.json


In [13]:
with open('../data/qa_dataset.json', 'r') as json_file:
    qa_dataset = json.load(json_file)
    json_file.close()

In [14]:
len(qa_dataset['queries'])

1230