## Notebook to clean up raw Markdown files and save them as JSON
- The code turns each heading or question line into a question stored as 'input' and the text that follows it into an answer stored as 'output'
- Outputs the cleaned data as JSON files

In [13]:
import markdown
from bs4 import BeautifulSoup
import glob

# Heading levels treated as glossary terms (assuming terms are in headings).
HEADING_TAGS = ['h1', 'h2', 'h3']

# (question, answer) tuples collected from every markdown file.
pairs = []

# sorted() keeps the pair order reproducible; glob's own ordering depends on the filesystem.
for file_path in sorted(glob.glob("./input/*.md")):
    # Read and render inside the `with`, parse after the handle is closed.
    with open(file_path, 'r', encoding='utf-8') as f:
        html = markdown.markdown(f.read())
    soup = BeautifulSoup(html, 'html.parser')

    for term in soup.find_all(HEADING_TAGS):
        next_sibling = term.find_next_sibling()
        # A heading with no following element, or one directly followed by
        # another heading, has no definition body -- skip it instead of
        # recording the next heading's text as the "answer".
        if next_sibling is None or next_sibling.name in HEADING_TAGS:
            continue

        term_text = term.text.strip().lower()
        output_text = next_sibling.text.strip()
        # Empty terms or empty answers are useless as training pairs.
        if not term_text or not output_text:
            continue

        # NOTE(review): the article "an" is hard-coded, so consonant-initial
        # terms read "What is an bond?" -- left unchanged to keep the dataset
        # wording stable; confirm whether this is intended.
        input_text = f"What is an {term_text}?"
        pairs.append((input_text, output_text))

# Save to JSON for training
import json
with open('./input/investment_glossary.json', 'w', encoding='utf-8') as f:
    json.dump([{'input': i, 'output': o} for i, o in pairs], f, indent=2)


In [9]:
import os
import re
import json
import nltk

# Fetch the 'punkt' tokenizer data before sent_tokenize is used below.
# NOTE(review): recent NLTK releases may look up 'punkt_tab' instead -- confirm
# against the installed NLTK version.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Directory that is scanned for the source .md files.
INPUT_DIR = "./investopedia_terms"
# JSON file that receives the extracted question/answer pairs.
OUTPUT_FILE = "./input/investopedia_data.json"

def extract_qa_pairs_from_file(filepath):
    """Pull question/answer pairs out of a single UTF-8 text file.

    Blank lines are discarded and the remaining lines are whitespace-stripped.
    A line is treated as a question when it ends with "?" and does not contain
    the word "definition" (case-insensitive). The lines that follow it are
    joined with single spaces, one line at a time, until the first sentence
    reported by ``sent_tokenize`` ends in ".", "!" or "?"; that sentence
    becomes the answer. Reaching a line that starts with "##" (or the end of
    the file) first abandons the question without emitting a pair.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list of ``{"input": question, "output": answer}`` dicts in file order.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        content = [raw.strip() for raw in handle if raw.strip()]

    qa_pairs = []
    for idx, question in enumerate(content):
        if not question.endswith("?"):
            continue
        if "definition" in question.lower():
            continue

        answer_parts = []
        for follow_up in content[idx + 1:]:
            # A "##" line ends the search for this question's answer.
            if follow_up.startswith("##"):
                break
            answer_parts.append(follow_up)
            # Only the first tokenized sentence is ever considered.
            leading = sent_tokenize(' '.join(answer_parts))[:1]
            if leading and leading[0].endswith((".", "!", "?")):
                qa_pairs.append({"input": question, "output": leading[0]})
                break  # complete sentence found; move on to the next question
    return qa_pairs


# Collect the question/answer pairs from every .md file in INPUT_DIR.
# os.listdir returns names in arbitrary order, so sort them to make the
# resulting JSON identical from run to run.
all_pairs = []
for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith(".md"):
        continue
    filepath = os.path.join(INPUT_DIR, filename)
    # Extend directly rather than binding a module-level `pairs`, which would
    # overwrite the variable of the same name created by the glossary cell.
    all_pairs.extend(extract_qa_pairs_from_file(filepath))

# Make sure the destination directory exists before writing.
os.makedirs(os.path.dirname(OUTPUT_FILE) or ".", exist_ok=True)
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_pairs, f, indent=2, ensure_ascii=False)

print(f"Saved {len(all_pairs)} pairs to {OUTPUT_FILE}")


[nltk_data] Downloading package punkt to /Users/jorge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Saved 20171 pairs to ./input/investopedia_data.json


In [11]:
## final cleaning
## removing records whose question still contains the markdown heading marker '##'

def clean_json(input_path, output_path):
    """Copy a JSON list of records, dropping those whose 'input' contains '##'.

    Args:
        input_path: Path of the UTF-8 JSON file to read; each record is
            indexed with the key 'input'.
        output_path: Path of the UTF-8 JSON file to write (2-space indent,
            non-ASCII characters written unescaped).
    """
    with open(input_path, "r", encoding="utf-8") as src:
        records = json.load(src)

    # Keep only the records whose question is free of the '##' marker.
    kept = []
    for record in records:
        if '##' in record['input']:
            continue
        kept.append(record)

    with open(output_path, "w", encoding="utf-8") as dst:
        json.dump(kept, dst, indent=2, ensure_ascii=False)

# Filter the extracted pairs file and write the result to a separate "_cln" file.
clean_json("./input/investopedia_data.json", "./input/investopedia_data_cln.json")

In [1]:
# shuffling 

import json
import random

# Fixed seed so the shuffled order is the same on every run.
SHUFFLE_SEED = 42

# Read JSON file.
# NOTE(review): './input/final_data.json' is not written by any cell in this
# notebook -- presumably it is assembled by hand from the earlier outputs;
# confirm and document that step.
# The other cells write UTF-8 with ensure_ascii=False, so decode explicitly
# instead of relying on the platform default encoding.
with open('./input/final_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Check if data is a list
if isinstance(data, list):
    # Randomize order in place with a private, seeded generator so the global
    # random state is left untouched.
    random.Random(SHUFFLE_SEED).shuffle(data)

    # Save the shuffled copy to a new JSON file, using the same UTF-8 /
    # unescaped conventions as the other cells.
    with open('randomized_data.json', 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)
else:
    print("JSON data is not a list")
