In [None]:
# Import library

import os
import json
import pandas as pd
import openai
from langchain.text_splitter import TokenTextSplitter, MarkdownHeaderTextSplitter
from datasets import Dataset
from time import sleep
from huggingface_hub import login

In [None]:
# File paths (fill these in before running the notebook)
ROOT_DIR = ""          # directory that is walked recursively for `.md` files
OUTPUT_PATH = ""       # CSV file the generated QnA pairs are written to
SYSTEM_FILE_PATH = ""  # text file whose content is sent as the system message
YOUR_REPO_NAME = ""    # Hugging Face Hub repository the dataset is pushed to

# Login
# Credentials are taken from the environment so that real keys never have to
# be saved inside the notebook. The original placeholder strings remain as
# fallbacks, so replacing them in place keeps working when the variables are
# not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "OPENAI_KEY")
login(token=os.environ.get("HF_TOKEN", "HUGGINGFACE_KEY"))

In [None]:
# Load a text file into memory.
def read_file(filepath):
    """Return the full content of `filepath`, decoded as UTF-8.

    Args:
        filepath: Path of the file to read.

    Returns:
        The file content as a single string.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        content = handle.read()
    return content

# Ask the chat-completion endpoint for a reply to a conversation.
def generate_chatgpt_response(messages, temperature=0.5, model="gpt-4", max_tokens=4096):
    """Send `messages` to `openai.ChatCompletion.create` and return the reply text.

    Args:
        messages: The conversation, passed through unchanged as `messages`.
        temperature: Forwarded as the `temperature` request argument.
        model: Forwarded as the `model` request argument.
        max_tokens: Forwarded as the `max_tokens` request argument.

    Returns:
        The `content` of the message in the first entry of `choices`.
    """
    request_arguments = {
        'model': model,
        'messages': messages,
        'temperature': temperature,
        'max_tokens': max_tokens,
    }
    completion = openai.ChatCompletion.create(**request_arguments)
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

# Process Markdown files
def process_markdown_file(file_path, markdown_splitter, text_splitter):
    """Read one Markdown file and split it into chunks.

    The file is first split with `markdown_splitter.split_text`; every
    resulting section is then passed on its own to
    `text_splitter.split_documents`, and all chunks are returned as one
    flat list in document order.

    Args:
        file_path: Path of the Markdown file to read.
        markdown_splitter: Object providing `split_text(str)`.
        text_splitter: Object providing `split_documents(list)`.

    Returns:
        A flat list of the chunks produced by `text_splitter`.
    """
    # Decode explicitly as UTF-8 (like `read_file` does) so the result does
    # not depend on the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        markdown_document = file.read()
    md_header_splits = markdown_splitter.split_text(markdown_document)
    return [chunk for split in md_header_splits for chunk in text_splitter.split_documents([split])]

# Extracts question and answer pairs from a given text
def extract_qa_pairs(text):
    """Recover QnA pairs line by line from text that is not valid JSON.

    A line whose content starts with `"question": ` opens a pair; the next
    line starting with `"answer": ` completes it. Surrounding spaces, double
    quotes and commas are trimmed from both values.

    Args:
        text: Raw model output.

    Returns:
        A list of dicts, in order of appearance. Each dict has an 'answer'
        key and, when a question line preceded it, a 'question' key.
    """
    question_prefix = '"question": '
    answer_prefix = '"answer": '
    qa_pairs = []
    current_pair = {}
    for raw_line in text.split('\n'):
        # Pretty-printed JSON indents its keys and may carry a trailing '\r';
        # strip first so those lines are still recognised.
        line = raw_line.strip()
        if line.startswith(question_prefix):
            # Slice off the prefix rather than splitting on it, so a value
            # that happens to contain the prefix text is not truncated.
            current_pair['question'] = line[len(question_prefix):].strip(' ",')
        elif line.startswith(answer_prefix):
            current_pair['answer'] = line[len(answer_prefix):].strip(' ",')
            qa_pairs.append(current_pair)
            current_pair = {}
    return qa_pairs

# Parse QnA pairs as JSON
def parse_response(response):
    """Turn a model response into a list of QnA pairs.

    The response is parsed as JSON first. A JSON object with a list under
    'qa_pairs' yields that list, and a top-level JSON list is returned as is.
    Anything else (invalid JSON, or JSON of another shape) falls back to the
    line-based `extract_qa_pairs` instead of raising KeyError/TypeError.

    Args:
        response: Raw text returned by the model.

    Returns:
        A list of QnA pairs.
    """
    try:
        parsed_data = json.loads(response)
    except json.JSONDecodeError:
        return extract_qa_pairs(response)

    if isinstance(parsed_data, dict) and isinstance(parsed_data.get('qa_pairs'), list):
        return parsed_data['qa_pairs']
    if isinstance(parsed_data, list):
        # The pairs were returned without the wrapping object.
        return parsed_data
    # Valid JSON of an unexpected shape: try the line-based extraction.
    return extract_qa_pairs(response)

# Build the chat-style message list for one QnA row
def create_message(row):
    """Convert a QnA row into a two-turn conversation.

    Args:
        row: Mapping-like object indexable by 'question' and 'answer'.

    Returns:
        A list of two dicts: the question with role "user", followed by the
        answer with role "assistant".
    """
    user_turn = {"content": row['question'], "role": "user"}
    assistant_turn = {"content": row['answer'], "role": "assistant"}
    return [user_turn, assistant_turn]


In [None]:
# Main Script Logic
# Splitters: Markdown sections by header level first, then token-based chunks.
HEADERS_TO_SPLIT = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=HEADERS_TO_SPLIT)
text_splitter = TokenTextSplitter(chunk_size=300, chunk_overlap=30)

# Empty result table; the generation step appends the QnA rows to it.
master_df = pd.DataFrame(columns=['question', 'answer', 'raw'])

# Collect the chunks of every `.md` file found anywhere below ROOT_DIR.
all_chunks = []
for dirpath, _dirnames, filenames in os.walk(ROOT_DIR):
    markdown_paths = [os.path.join(dirpath, name) for name in filenames if name.endswith('.md')]
    for markdown_path in markdown_paths:
        all_chunks += process_markdown_file(markdown_path, markdown_splitter, text_splitter)

# Generating QnA pairs with GPT
# Every chunk is sent to the model this many times; nothing is truncated,
# all chunks are processed on each pass.
NUM_GENERATION_PASSES = 3

# The system prompt is the same for every request, so read it only once.
system_prompt = read_file(SYSTEM_FILE_PATH)

# Collect one small frame per request and concatenate once at the end;
# concatenating inside the loop re-copies all previous rows on every request.
qa_frames = []
total_pairs = 0
for pass_index in range(NUM_GENERATION_PASSES):
    for chunk_index, chunk in enumerate(all_chunks):
        conversation = [{'role': 'system', 'content': system_prompt},
                        {'role': 'user', 'content': str(chunk)}]
        response_verification = generate_chatgpt_response(conversation)
        qa_pairs = parse_response(response_verification)
        qa_df = pd.DataFrame(qa_pairs)
        # Keep the source chunk next to every pair generated from it.
        qa_df['raw'] = [chunk] * len(qa_df)
        qa_frames.append(qa_df)
        total_pairs += len(qa_df)
        print(f"pass {pass_index + 1}/{NUM_GENERATION_PASSES}, "
              f"chunk {chunk_index + 1}/{len(all_chunks)}: {total_pairs} QnA pairs so far")

# `master_df` goes first so its column order (question, answer, raw) is kept.
master_df = pd.concat([master_df, *qa_frames], ignore_index=True)

master_df.to_csv(OUTPUT_PATH, index=False, encoding='utf-8')

In [None]:
# Deduplication
# Rows without a question or an answer (the line-based fallback parser can
# emit answer-only pairs) cannot form a conversation, so they are dropped.
# Duplicates are detected on the text columns only: the same pair generated
# on different passes is then removed, and the chunk objects stored in `raw`
# never have to be hashed (they may not be hashable - TODO confirm).
qa_unique = (
    master_df
    .dropna(subset=['question', 'answer'])
    .drop_duplicates(subset=['question', 'answer'])
)
# `assign` returns a new frame (no SettingWithCopyWarning), and building the
# list by hand also works when the frame is empty.
df_deduplicated = qa_unique.assign(
    messages=[create_message(row) for row in qa_unique.to_dict('records')]
)

# Convert data frame to Huggingface dataset format
messages = df_deduplicated['messages'].tolist()
dataset_columns = {'messages': messages, 'chosen': messages}
# This notebook never creates a 'rejected' column; include it only when the
# frame actually has one instead of failing with a KeyError.
if 'rejected' in df_deduplicated.columns:
    dataset_columns['rejected'] = df_deduplicated['rejected'].tolist()
hf_dataset = Dataset.from_dict(dataset_columns)

# Split train and test
SPLIT_SEED = 42  # fixed seed so the train/test split is reproducible
split_dataset = hf_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Push to Hugging Face Hub
split_dataset.push_to_hub(YOUR_REPO_NAME)