In [None]:
import os
import json
from tqdm import tqdm

# CORD-19 Dataset: https://www.kaggle.com/datasets/allen-institute-for-ai/CORD-19-research-challenge

# Step 1: Download the dataset
# Step 2: Extract the contents of the zip archive
# Step 3: Convert the JSON files into human-readable text
# Step 4: Build a FAISS index over the extracted text
# Step 5: Use the index for retrieval-augmented generation (RAG)

In [None]:
import zipfile
import os
import os.path as path

def extract_zip_with_progress(zip_file_path):
    """Unpack a zip archive next to itself, printing one line per member.

    The destination is a folder located beside the archive and named after it
    without the extension (``data/archive.zip`` -> ``data/archive``). The
    folder is created when it does not exist yet.

    Args:
        zip_file_path: Path of the zip archive to unpack.
    """
    # Destination: <directory of the archive>/<archive name without extension>
    parent_dir, archive_file = path.split(zip_file_path)
    target_dir = path.join(parent_dir, path.splitext(archive_file)[0])

    if not path.exists(target_dir):
        os.makedirs(target_dir)

    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        members = archive.infolist()
        member_count = len(members)

        # Extract members one by one so progress can be reported per file.
        position = 0
        for member in members:
            position += 1
            archive.extract(member, target_dir)
            print(f"Extracting {member.filename} ({position}/{member_count})")

In [None]:
# Unpack archive.zip (resolved against the current working directory) into ./archive/.
extract_zip_with_progress('archive.zip')

In [None]:
def format_authors(authors):
    """Render a list of author records as one human-readable string.

    Each author is rendered as ``"First Middle Last, Suffix, Affiliation, Email"``
    with empty pieces left out; authors are separated by ``"; "``.

    Args:
        authors: Iterable of dicts that may carry ``first``, ``middle`` (a list
            of strings, or a single string), ``last``, ``suffix``,
            ``affiliation`` (a dict) and ``email``. Any key may be missing or
            ``None``. ``None`` instead of a list is treated as "no authors".

    Returns:
        The formatted author string, or ``''`` when there are no authors.
    """
    formatted_authors = []
    for author in authors or []:
        # `.get(key, default)` does not help when the key is present but null,
        # so normalise with `or` before concatenating.
        middle = author.get('middle') or []
        if isinstance(middle, str):
            middle = [middle]
        name_parts = [author.get('first'), *middle, author.get('last')]
        name = ' '.join(filter(None, name_parts))  # Filter out empty parts

        # Joining through filter() avoids a dangling ", " when the name is empty.
        name = ', '.join(filter(None, [name, author.get('suffix')]))

        affiliation_info = author.get('affiliation') or {}
        # NOTE(review): CORD-19 parses appear to store the affiliation under
        # "institution" rather than "name" -- confirm against the dataset's
        # json_schema.txt. "name" still takes priority when present.
        affiliation = (
            affiliation_info.get('name')
            or affiliation_info.get('institution')
            or ''
        )

        author_info = [name, affiliation, author.get('email')]
        formatted_authors.append(', '.join(filter(None, author_info)))  # Only add non-empty parts
    return '; '.join(formatted_authors)  # Use semicolon and space as separator


In [None]:
def insert_citations(text, cite_spans):
    """Replace each cited span of ``text`` with that citation's own text.

    Span offsets always refer to positions in the original ``text``. The spans
    are applied in order of their ``start`` offset, so the caller does not have
    to pre-sort them.

    Args:
        text: The paragraph the span offsets refer to.
        cite_spans: Iterable of dicts with integer ``start``/``end`` offsets
            and a ``text`` replacement string. May be empty or ``None``.

    Returns:
        The text with every span replaced; ``text`` unchanged when there are
        no spans.

    Raises:
        KeyError: If a span lacks ``start``, ``end`` or ``text``.
    """
    if not cite_spans:
        return text

    # Build the result from slices of the original text in one left-to-right
    # pass. This needs no running offset, which is what went wrong for
    # unsorted spans, and avoids re-copying the whole string for every span.
    pieces = []
    cursor = 0
    for cite in sorted(cite_spans, key=lambda span: span['start']):
        pieces.append(text[cursor:cite['start']])
        pieces.append(cite['text'])
        # max() keeps the cursor from moving backwards if spans overlap.
        cursor = max(cursor, cite['end'])
    pieces.append(text[cursor:])
    return ''.join(pieces)

In [None]:
def process_json_to_text(json_data):
    """Convert one parsed-paper JSON document to a human-readable text block.

    The output is laid out as: a ``Title:`` line, an ``Authors:`` line, an
    ``Abstract:`` section, the body paragraphs (each section heading printed
    once, the first time it is seen), and a ``References:`` list with one
    ``"<n>: <title>, <authors>, <venue>, <year>"`` line per bibliography entry.

    Args:
        json_data: Dict that may carry ``metadata`` (with ``title``,
            ``authors`` and possibly ``abstract``), ``abstract``,
            ``body_text`` and ``bib_entries``. Missing or null parts are
            rendered as empty sections instead of raising.

    Returns:
        The assembled text as a single string.
    """
    def _blank_if_none(value, placeholder=''):
        # JSON null would otherwise be formatted as the literal string "None".
        return placeholder if value is None else value

    def _paragraph_text(paragraph):
        # Null-safe wrapper: a present-but-null "text"/"cite_spans" must not
        # reach insert_citations as None.
        return insert_citations(
            paragraph.get('text') or '',
            paragraph.get('cite_spans') or [],
        )

    metadata = json_data.get('metadata') or {}
    text_data = []

    # Title
    title = _blank_if_none(metadata.get('title', 'No Title'), 'No Title')
    text_data.append(f"Title: {title}\n")

    # Authors
    formatted_authors = format_authors(metadata.get('authors') or [])
    text_data.append(f"Authors: {formatted_authors}\n")

    # Abstract
    # NOTE(review): CORD-19 parses appear to keep "abstract" at the top level
    # of the document rather than inside "metadata" -- confirm against the
    # dataset's json_schema.txt. Both locations are checked; "metadata" wins
    # when it has one.
    abstract = metadata.get('abstract') or json_data.get('abstract') or []
    text_data.append("Abstract:\n")
    for paragraph in abstract:
        text_data.append(_paragraph_text(paragraph) + '\n')

    # Body Text
    seen_sections = set()
    for section in json_data.get('body_text') or []:
        section_title = section.get('section', 'No Section Title')
        if section_title not in seen_sections:
            # Emit each heading only once, even when many paragraphs share it.
            text_data.append(f"\n{section_title}:\n")
            seen_sections.add(section_title)
        text_data.append(_paragraph_text(section) + '\n')

    # Bibliography
    text_data.append("\nReferences:\n")
    for key, bib_entry in (json_data.get('bib_entries') or {}).items():
        ref_number = key.replace('BIBREF', '')
        authors = format_authors(bib_entry.get('authors') or [])
        title = _blank_if_none(bib_entry.get('title', 'No Title'), 'No Title')
        year = _blank_if_none(bib_entry.get('year', ''))
        venue = _blank_if_none(bib_entry.get('venue', ''))
        text_data.append(f"{ref_number}: {title}, {authors}, {venue}, {year}\n")

    return ''.join(text_data)

In [None]:
def main(json_dir='archive/document_parses/pdf_json',
         output_dir='archive/document_parses/txt_extract'):
    """Convert every ``.json`` parse in ``json_dir`` to a ``.txt`` file in ``output_dir``.

    Files whose ``.txt`` output already exists are skipped, so an interrupted
    run can simply be started again. Each output is first written to a
    ``.tmp`` file and then renamed, so a run interrupted mid-write cannot
    leave a truncated ``.txt`` that later runs would skip.

    Args:
        json_dir: Directory containing the ``.json`` document parses.
        output_dir: Directory that receives the ``.txt`` files; created when
            missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    json_files = [f for f in os.listdir(json_dir) if f.endswith('.json')]

    for json_file in tqdm(json_files, desc="Converting JSON to Text"):
        # splitext swaps only the trailing extension; str.replace would also
        # rewrite a ".json" occurring earlier in the file name.
        stem = os.path.splitext(json_file)[0]
        output_file = os.path.join(output_dir, stem + '.txt')

        # Skip if the text file already exists
        if os.path.exists(output_file):
            continue

        with open(os.path.join(json_dir, json_file), 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        text_data = process_json_to_text(json_data)

        # Write to a temporary name, then rename into place in one step.
        partial_file = output_file + '.tmp'
        with open(partial_file, 'w', encoding='utf-8') as file:
            file.write(text_data)
        os.replace(partial_file, output_file)

In [None]:
# Run the JSON -> text conversion; outputs that already exist are skipped, so re-running resumes.
main()