In [54]:
import os

def list_all_files(root_dir):
    """Lists all files under all subfolders of a given root directory.

    Files whose name ends with '-000' are skipped.

    Args:
        root_dir: The path to the root directory.

    Returns:
        A list of file paths (each joined onto its containing directory path
        as yielded by os.walk) found under the root directory. Returns an
        empty list, after printing an error message, if the root directory
        doesn't exist. Returns an empty list if no matching files are found.
    """
    # os.walk() ignores directory-listing errors unless an onerror callback is
    # given, so a missing root would otherwise be indistinguishable from an
    # empty one. Check for it explicitly so the user gets the intended message.
    if not os.path.exists(root_dir):
        print(f"Error: Directory '{root_dir}' not found.")
        return []

    def _report_walk_error(error):
        # Called by os.walk with the OSError for any directory it cannot list;
        # report it and let the walk continue with the remaining directories.
        print(f"Error accessing directory: {error}")

    all_files = []
    for dirpath, _dirnames, filenames in os.walk(root_dir, onerror=_report_walk_error):
        all_files.extend(
            os.path.join(dirpath, filename)
            for filename in filenames
            if not filename.endswith('-000')
        )
    return all_files


# Example usage: collect every file below the documentation dump and echo
# the resulting paths, one per line, when anything was found.
root_directory = "./source_of_data/learncontent/azure-functions-fullpage"
all_files_list = list_all_files(root_directory)

if len(all_files_list) > 0:
    print("All files under the root directory:")
    print(*all_files_list, sep="\n")


All files under the root directory:
./source_of_data/learncontent/azure-functions-fullpage\add-bindings-existing-function\cf374970-651e-b078-cf54-7a1117d11405
./source_of_data/learncontent/azure-functions-fullpage\analyze-telemetry-data\1dcc4d8f-79e5-35b9-7fc7-750cab3fc03d
./source_of_data/learncontent/azure-functions-fullpage\bring-dependency-to-functions\350da97f-fa07-6ab9-6df0-548f2c00a651
./source_of_data/learncontent/azure-functions-fullpage\configure-encrypt-at-rest-using-cmk\ed05840e-ea3b-e367-77b5-f80ba8f6d018
./source_of_data/learncontent/azure-functions-fullpage\configure-monitoring\bcf023ad-012c-e840-ef25-dbf13657d751
./source_of_data/learncontent/azure-functions-fullpage\configure-networking-how-to\6152970d-01ac-18a7-1a87-b5e89168f86a
./source_of_data/learncontent/azure-functions-fullpage\consumption-plan\91012c20-a7d6-4442-8c01-d0529994ade0
./source_of_data/learncontent/azure-functions-fullpage\container-concepts\5cae03dd-1d3c-0562-0f98-96e31f469f45
./source_of_data/learnc

In [59]:
# `json` is imported here because this cell runs before the cell that imports
# it for the GPT calls; without it a fresh "Restart & Run All" raises NameError.
import json

# Number of files (taken from the front of the listing) to put in the corpus.
MAX_CORPUS_FILES = 50
CORPUS_PATH = './datasets/learncorpus/corpus.jsonl'

corpus = []
file_list = all_files_list[:MAX_CORPUS_FILES]
for file_path in file_list:
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding (cp1252 on Windows), which can mangle non-ASCII characters.
    with open(file_path, 'r', encoding='utf-8') as file:
        json_content = json.load(file)
    # Each corpus record carries the source document's id, title and content;
    # "metadata" is left empty.
    corpus.append({"_id": json_content['id'], "title": json_content['title'], "text": json_content['content'], "metadata": {}})

# Save the corpus as JSON Lines (one JSON object per line), creating the
# output directory first so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(CORPUS_PATH), exist_ok=True)
with open(CORPUS_PATH, 'w', encoding='utf-8') as f:
    for item in corpus:
        json.dump(item, f)
        f.write('\n')

In [60]:
# Display the in-memory corpus records (list of dicts with "_id", "title",
# "text" and "metadata" keys) to spot-check what was parsed from the files.
corpus

[{'_id': 'cf374970-651e-b078-cf54-7a1117d11405',
  'title': 'Connect functions to other Azure services',
  'text': '# Connect functions to Azure services using bindings (programming-language-csharp)\r\n\r\nWhen you create a function, language-specific trigger code is added in your project from a set of trigger templates. If you want to connect your function to other services by using input or output bindings, you have to add specific binding definitions in your function. To learn more about bindings, see [Azure Functions triggers and bindings concepts](functions-triggers-bindings).\r\n\r\n## Local development\r\n\r\nWhen you develop functions locally, you need to update the function code to add bindings. For languages that use function.json, Visual Studio Code provides tooling to add bindings to a function.\r\n\r\n### Manually add bindings based on examples\r\n\r\nWhen adding a binding to an existing function, you need to add binding-specific attributes to the function definition in co

In [61]:
from pydantic import BaseModel
from openai import AzureOpenAI # verified in Python version 3.12.3, 3.12.4 
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

import json
 
# Token-based authentication: the client is handed a bearer-token provider
# scoped to Cognitive Services rather than a static API key.
credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    credential,
    "https://cognitiveservices.azure.com/.default",
)

# Shared Azure OpenAI client used by the GPT helper defined further down.
client = AzureOpenAI(
    azure_endpoint="https://discovery-eastus.openai.azure.com/",
    api_version="2024-10-21",
    azure_ad_token_provider=token_provider,
)

# One generated question together with its answer.
# NOTE(review): documented with `#` comments rather than a class docstring
# because pydantic may copy a docstring into the JSON schema that is passed
# to the model as `response_format` — confirm before converting.
class QAPair(BaseModel):
    question: str  # the generated question text
    answer: str  # the generated answer text for `question`

# Structured result requested from the model for a single document; used as
# the `response_format` of the chat-completion call in generate_qa_from_gpt.
# NOTE(review): documented with `#` comments rather than a class docstring
# because pydantic may copy a docstring into the JSON schema sent to the
# model — confirm before converting.
class QAExtraction(BaseModel):
    identified_content: list[str]  # issues/questions the model identified in the document
    qa_pairs: list[QAPair]  # question-and-answer pairs generated for the document

def generate_qa_from_gpt(
    content: str,
    model: str = "gpt-4o-mini-2024-07-18-global",
) -> QAExtraction:
    """Generate question-and-answer pairs for one document via Azure OpenAI.

    Sends a fixed system prompt plus the document text to the module-level
    `client` and asks for a response parsed into `QAExtraction`.

    Args:
        content: The document text, sent as the user message.
        model: Model/deployment name passed to the chat-completions call.
            Defaults to the deployment this notebook was written against.

    Returns:
        The parsed `QAExtraction` from the first completion choice.

    Raises:
        ValueError: If the first choice carries no parsed payload (for
            example because the model refused), so callers never receive
            `None` in place of a `QAExtraction`.
    """
    # Plain (non-f) string: the prompt has no placeholders, and an f-string
    # would break as soon as literal braces were added to the instructions.
    system_message = """  
        Given a domain-specific document sourced from Windows public or private knowledge-base, your task is to generate question-and-answer pairs according to the following guidelines:  
    
        1. Thoroughly read and understand the document.  
        
        2. Identify and note ALL potential issues, questions, or common problems that can be inferred from the content and context of the document. These elements will serve as the basis for generating general troubleshooting queries, assistance requests, or informational inquiries relevant to customers.  
        
        3. Create question-and-answer pairs based on the given document and identified content:  
            - **Questions** should represent typical customer inquiries, focusing on general troubleshooting, seeking assistance, or looking for information. They should be straightforward and avoid overly technical terms.  
            - **Answers** should be detailed and informative. Each answer must begin with a reasoning process that provides background and domain knowledge related to the question, explaining the context and how to approach the issue. This helps the support team understand the problem. Follow this with insights and step-by-step instructions to address the issue, and conclude with the final solution based on the reasoning process.  
            - Do not include information that is not present in the document.  
            - Ensure that the question-and-answer pairs comprehensively cover ALL aspects of the document without redundancy. No additional pairs should be generated if they would introduce redundancy.  
        
        4. Respond in JSON format with the following structure:  
            - Include the identified potential issues, questions, or common problems for general troubleshooting queries in a list under the key `identified_content`.  
            - Include the question-and-answer pairs in a list under the key `qa_pairs`, using `question` and `answer` keys for each pair.  
        
        """

    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": content}
        ],
        response_format=QAExtraction,
    )

    message = completion.choices[0].message
    if message.parsed is None:
        # Fail here with a descriptive error instead of handing None to the
        # caller, which would crash later on `.qa_pairs`.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Model returned no parsed QAExtraction (refusal: {refusal!r})"
        )
    return message.parsed

# Build one query record per generated question. Query ids are consecutive
# integers (as strings) across the whole corpus; each record's metadata maps
# the originating corpus document id to the generated answer.
queries = []
for doc in corpus:
    extraction = generate_qa_from_gpt(doc['text'])
    for pair in extraction.qa_pairs:
        relevance = {str(doc['_id']): [{"sentences": pair.answer, "label": ""}]}
        queries.append({"_id": str(len(queries)), "text": pair.question, "metadata": relevance})
# Running count of generated queries (the next unused query id).
index = len(queries)

# Persist the queries as JSON Lines: one serialized record per line.
with open('./datasets/learncorpus/queries.jsonl', 'w') as out:
    out.writelines(json.dumps(record) + '\n' for record in queries)


In [62]:
import csv

# Relevance judgements (qrels) file: one tab-separated row per query linking
# it to the corpus document it was generated from, with a fixed score of 1.
file_name = "./datasets/learncorpus/qrels/test.tsv"

# Nothing else in the notebook creates the qrels directory, so make sure it
# exists before opening the file for writing.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

with open(file_name, 'w', encoding='utf8', newline='') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
    tsv_writer.writerow(["query-id", "corpus-id", "score"])
    for queryItem in queries:
        # The first (and, as built by the query-generation cell, only) key
        # of the metadata dict is the id of the source corpus document.
        corpus_id = next(iter(queryItem['metadata']))
        tsv_writer.writerow([queryItem['_id'], corpus_id, 1])

print(f"TSV file '{file_name}' created successfully.")

TSV file './datasets/learncorpus/qrels/test.tsv' created successfully.
