## Generate Synthetic Dataset

In [10]:
# Enable IPython's autoreload extension so edits to the imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import os
import time
import pandas as pd
from getpass import getpass
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from docu_bot.utils import create_chatopenai_model, create_openai_embeddings
from docu_bot.document_loaders.git_document_loader import GitDocumentLoader
from docu_bot.document_loaders.utils import LoadedRepositoriesAndFiles
from docu_bot.datasets.generate_synthetic_data_ragas import (
    generate_dataset,
    create_generator,
)


In [12]:
# Model identifiers passed to the chat-model and embeddings factories below.
model_type = "gpt-4o-mini"
embedding_model_type = "text-embedding-3-small"

# Prompt for the key interactively so it is never stored in the notebook.
api_key = getpass("Enter your OpenAI API key: ")

In [13]:
# Wrap the chat model and the embeddings model in the ragas adapters.
llm_model = LangchainLLMWrapper(
    create_chatopenai_model(
        model_type=model_type,
        api_key=api_key,
    )
)
embeddings_model = LangchainEmbeddingsWrapper(
    create_openai_embeddings(
        model_type=embedding_model_type,
        api_key=api_key,
    )
)

# Build the synthetic-data generator from the two wrapped models.
generator = create_generator(llm_model, embeddings_model)

# Loader for the DIRAC repository, pinned to the rel-v7r2 branch.
document_loader = GitDocumentLoader(
    repo_path="https://github.com/DIRACGrid/DIRAC.git",
    branch="rel-v7r2",
    loaded_repositories_and_files=LoadedRepositoriesAndFiles(),
)

In [14]:
# Generate the synthetic dataset in one or more rounds and collect each round
# as a pandas DataFrame. Raise the range() bound to produce more rounds.
synthetic_data_list = []
for i in range(1):
    print(f"Generating synthetic data {i}")
    # Convert straight to pandas so `synthetic_data` is only ever bound to the
    # final DataFrame, never to the intermediate dataset object.
    synthetic_data_list.append(
        generate_dataset(generator, document_loader.load(), dataset_size=25).to_pandas()
    )
    # Pause between rounds to stay under the API token rate limit.
    print("Sleeping for 5 seconds to prevent Token Limit Error")
    time.sleep(5)

# ignore_index=True gives the combined frame a fresh 0..n-1 index, so rows
# from different rounds never share an index label.
# NOTE(review): some pandas versions refuse to write a non-default index with
# to_feather; a clean index avoids that when more than one round is run.
synthetic_data = pd.concat(synthetic_data_list, ignore_index=True)

Generating synthetic data 0


Generating Scenarios: 100%|██████████| 1/1 [00:46<00:00, 46.01s/it]     
Generating Samples: 100%|██████████| 25/25 [00:03<00:00,  8.33it/s]


Sleeping for 5 seconds to prevent Token Limit Error


In [15]:
# Display the generated samples for a quick visual sanity check.
synthetic_data

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,Who is Federiko Stagni and what is his role in...,[Main contributors to the source code\n-------...,Federico Stagni is one of the main contributor...,single_hop_specifc_query_synthesizer
1,What is DIRAC and what purpose does it serve i...,[.. -*- mode: rst -*-\n\nDIRAC\n=====\n\n.. im...,"DIRAC is an interware, meaning a software fram...",single_hop_specifc_query_synthesizer
2,What is the role of pylint in the DIRAC coding...,[\n# Contribution Guidelines for DIRAC\n\n\n##...,Your code should not introduce any new pylint ...,single_hop_specifc_query_synthesizer
3,How do you create a Dockerfile for DIRAC in a ...,[# DIRAC in docker containers\n\n[WORK IN PROG...,To create a Dockerfile for DIRAC in a Docker c...,single_hop_specifc_query_synthesizer
4,What is the role of Web_WebApp in the DIRAC se...,[# DIRAC in Docker Compose Setup for Developme...,Web_WebApp is one of the installed components ...,single_hop_specifc_query_synthesizer
5,What is CTA in the context of DIRAC?,[.. image:: _static/DIRAC-logo.png\n :width: ...,"CTA is one of the communities that use DIRAC, ...",single_hop_specifc_query_synthesizer
6,What are the key features of DIRAC as mentione...,[===================\nAdministrator Guide\n===...,DIRAC has been developed with extensibility an...,single_hop_specifc_query_synthesizer
7,What DISET do?,[===================\nTechnology Previews\n===...,DISET is used for encoding and decoding data i...,single_hop_specifc_query_synthesizer
8,How do I use a p12 file with dirac-cert-conver...,[============================\ndirac-cert-conv...,"To use a p12 file with dirac-cert-convert.sh, ...",single_hop_specifc_query_synthesizer
9,What commands are available for managing resou...,[====================================\nAdminis...,The commands available for managing resources ...,single_hop_specifc_query_synthesizer


In [16]:
# Persist the generated dataset as a Feather file in the sibling `datasets`
# directory; os.path.abspath("") resolves to the current working directory.
synthetic_data.to_feather(
    os.path.join(
        os.path.abspath(""),
        "..",
        "datasets",
        "dirac_synthetic_data.feather",
    )
)