In [1]:
import json

# Every distinct URL encountered so far while scanning the crawl log
unique_urls = set()

# Number of records whose URL had already been seen on an earlier line
duplicate_count = 0

# visited.jsonl is read as JSON Lines: one JSON object per line, each with a 'url' key
with open('visited.jsonl', 'r') as file:
    for line in file:
        # A blank line is not valid JSON and would make json.loads raise,
        # so skip it instead of aborting the whole count.
        if not line.strip():
            continue
        data = json.loads(line)
        url = data['url']
        if url in unique_urls:
            duplicate_count += 1
        else:
            unique_urls.add(url)

# Output the result
print(f"Number of duplicate URLs: {duplicate_count}")

Number of duplicate URLs: 0


In [2]:
import json
from markdownify import markdownify as md
import os
import re

# Base directory to save markdown files
base_output_dir = 'markdown_files'

# Regular expressions that pull the title / chapter numbers out of the URL path
title_regex = re.compile(r'/title_(\d+)/')
chapter_regex = re.compile(r'/chapter_(\d+)/')

# visited.jsonl is read as JSON Lines: one JSON object per line, each with
# 'url' and 'html' keys
with open('visited.jsonl', 'r') as file:
    for line in file:
        # A blank line is not valid JSON and would make json.loads raise; skip it.
        if not line.strip():
            continue
        data = json.loads(line)
        html_content = data['html']
        url = data['url']

        # Convert HTML to Markdown
        markdown_content = md(html_content)

        # Extract title and chapter using regex
        title_match = title_regex.search(url)
        chapter_match = chapter_regex.search(url)

        # Layout: <base>/title_N/chapter_M/ when both are present, <base>/title_N/
        # when only the title is, and <base>/ otherwise. A chapter without a
        # title is deliberately ignored, matching the original nesting.
        target_dir = base_output_dir
        if title_match:
            target_dir = os.path.join(target_dir, f"title_{title_match.group(1)}")
            if chapter_match:
                target_dir = os.path.join(target_dir, f"chapter_{chapter_match.group(1)}")

        # Create the directory in every case. Without this, a URL that has no
        # title segment is written straight into base_output_dir, which may not
        # exist yet and would make open() fail with FileNotFoundError.
        os.makedirs(target_dir, exist_ok=True)

        # Replace '/' with '---' in the URL to use as filename
        filename = url.replace('/', '---') + ".md"

        # Save the markdown content to the file within the appropriate directory
        file_path = os.path.join(target_dir, filename)
        with open(file_path, 'w') as md_file:
            md_file.write(markdown_content)

print(f"Markdown files are organized in {base_output_dir}/")

Markdown files are organized in markdown_files/


In [3]:
import os

# Destination folder for the combined per-chapter documents; created up front
# so the writes further down cannot fail on a missing directory
knowledge_base_dir = 'mca_knowledge_base'
os.makedirs(knowledge_base_dir, exist_ok=True)

# Root of the markdown tree that is traversed below
base_output_dir = 'markdown_files'

# Helper that merges the markdown files of one directory into a single file
def combine_markdown_files(directory, output_file):
    """Concatenate the ``.md`` files directly inside ``directory``.

    ``output_file`` is opened for writing (created or truncated) first. The
    entries of ``directory`` are then visited in sorted name order; every
    entry that is a regular file whose path ends in ``.md`` has its full text
    appended, followed by two newline characters. Other entries are skipped.
    Sub-directories are not descended into.
    """
    with open(output_file, 'w') as combined:
        entries = os.listdir(directory)
        entries.sort()
        for entry in entries:
            path = os.path.join(directory, entry)
            if not os.path.isfile(path) or not path.endswith('.md'):
                continue
            with open(path, 'r') as source:
                text = source.read()
            combined.write(text + '\n\n')

# Walk <base>/<title>/<chapter>/ and emit one combined document per chapter
for title_dir_name in os.listdir(base_output_dir):
    title_dir_path = os.path.join(base_output_dir, title_dir_name)
    if not os.path.isdir(title_dir_path):
        # Loose files directly under the base directory are not chapters
        continue

    for chapter_dir_name in os.listdir(title_dir_path):
        chapter_dir_path = os.path.join(title_dir_path, chapter_dir_name)
        if not os.path.isdir(chapter_dir_path):
            # Files that sit directly in a title directory are left out
            continue

        # Output name encodes both directory names: "<title>---<chapter>.md"
        combined_filename = f"{title_dir_name}---{chapter_dir_name}.md"
        combined_filepath = os.path.join(knowledge_base_dir, combined_filename)

        # Merge every markdown file of this chapter directory into that one file
        combine_markdown_files(chapter_dir_path, combined_filepath)

print(f"Combined chapter documents are saved in {knowledge_base_dir}/")

Combined chapter documents are saved in mca_knowledge_base/


In [4]:
import os
import re

# Destination folder for the combined per-part documents; created up front
# so the writes further down cannot fail on a missing directory
knowledge_base_dir = 'mca_knowledge_base2'
os.makedirs(knowledge_base_dir, exist_ok=True)

# Root of the markdown tree that is traversed below
base_output_dir = 'markdown_files'

# Helper that merges the markdown files of one directory into a single file.
# NOTE(review): the per-part loop in this cell does not call this helper.
def combine_markdown_files(directory, output_file):
    """Write the text of every ``.md`` file in ``directory`` to ``output_file``.

    The output file is opened (created or truncated) before the directory is
    listed. Entries are processed in sorted name order; only regular files
    whose path ends in ``.md`` are included, and each file's text is followed
    by two newline characters. Sub-directories are not descended into.
    """
    def _markdown_texts():
        # Yield the content of each qualifying file, in sorted name order
        for name in sorted(os.listdir(directory)):
            candidate = os.path.join(directory, name)
            if os.path.isfile(candidate) and candidate.endswith('.md'):
                with open(candidate, 'r') as handle:
                    text = handle.read()
                yield text

    with open(output_file, 'w') as sink:
        for text in _markdown_texts():
            sink.write(text + '\n\n')

# Captures the digits that follow "part_" in a page filename. Compiled once
# here rather than being looked up again for every file in the loop below.
part_regex = re.compile(r'part_(\d+)')

# Walk <base>/<title>/<chapter>/ and emit one combined document per part
for title_dir_name in os.listdir(base_output_dir):
    title_dir_path = os.path.join(base_output_dir, title_dir_name)
    if not os.path.isdir(title_dir_path):
        continue

    for chapter_dir_name in os.listdir(title_dir_path):
        chapter_dir_path = os.path.join(title_dir_path, chapter_dir_name)
        if not os.path.isdir(chapter_dir_path):
            continue

        # Maps a part number (kept as the matched digit string) to the
        # filenames in this chapter directory that mention that part
        parts_dict = {}
        for filename in os.listdir(chapter_dir_path):
            if not filename.endswith('.md'):
                continue
            # Only regular files can be read back below; a directory whose
            # name happens to end in ".md" would make open() raise.
            if not os.path.isfile(os.path.join(chapter_dir_path, filename)):
                continue
            part_match = part_regex.search(filename)
            if part_match:
                parts_dict.setdefault(part_match.group(1), []).append(filename)

        # Combine files for each part, in sorted filename order
        for part, files in parts_dict.items():
            combined_filename = f"{title_dir_name}---{chapter_dir_name}---part_{part}.md"
            combined_filepath = os.path.join(knowledge_base_dir, combined_filename)
            with open(combined_filepath, 'w') as outfile:
                for filename in sorted(files):
                    filepath = os.path.join(chapter_dir_path, filename)
                    with open(filepath, 'r') as readfile:
                        # Two newlines separate consecutive source documents
                        outfile.write(readfile.read() + '\n\n')

print(f"Combined part documents are saved in {knowledge_base_dir}/")

Combined part documents are saved in mca_knowledge_base2/
