In [13]:
import json
import re

# One numeric token as it may appear in a whitespace-separated array dump:
# optional sign, digits with an optional fractional part (or a bare leading
# dot), and an optional exponent.
_NUMBER_TOKEN = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'

# A bracketed run of such tokens separated only by whitespace. The inner group
# is optional so that "[ ]" still collapses to "[]".
_SPACE_SEPARATED_ARRAY = re.compile(
    r'\[\s*((?:' + _NUMBER_TOKEN + r'(?:\s+' + _NUMBER_TOKEN + r')*)?)\s*\]'
)


def _to_json_number(token):
    """Rewrite a single numeric token so that it is a valid JSON number.

    JSON forbids a leading '+', a bare leading dot ('.5') and a trailing dot
    ('1.'); everything else about the token is kept as written.
    """
    sign = '-' if token.startswith('-') else ''
    unsigned = token.lstrip('+-')

    # Split mantissa from exponent so the dot fix-ups only touch the mantissa.
    exp_at = unsigned.lower().find('e')
    if exp_at == -1:
        mantissa, exponent = unsigned, ''
    else:
        mantissa, exponent = unsigned[:exp_at], unsigned[exp_at:]

    if mantissa.startswith('.'):
        mantissa = '0' + mantissa
    if mantissa.endswith('.'):
        mantissa += '0'
    return sign + mantissa + exponent


def fix_array_format(text):
    """Turn whitespace-separated numeric arrays into JSON arrays.

    Every bracketed group in *text* that contains only numbers separated by
    whitespace (including newlines) is rewritten with ", " between the
    numbers, e.g. "[0.1 0.2]" becomes "[0.1, 0.2]". Negative values and
    scientific notation are recognised, and each number is normalised to
    valid JSON syntax ("1." -> "1.0", ".5" -> "0.5", "+2" -> "2").

    Bracketed groups that already contain commas, or anything that is not a
    number, are left untouched.

    Args:
        text: The text to scan.

    Returns:
        A copy of *text* with the matching arrays rewritten.
    """
    def _rewrite(match):
        numbers = [_to_json_number(tok) for tok in match.group(1).split()]
        return '[' + ', '.join(numbers) + ']'

    return _SPACE_SEPARATED_ARRAY.sub(_rewrite, text)

def fix_json_content(content):
    """Repair a dump of loosely formatted JSON-like objects into a JSON array.

    Each "{...}" span found in *content* is cleaned up independently and the
    results are joined into one JSON array string. The clean-up steps are:
    curly double quotes become straight quotes, single-quote delimiters become
    double quotes (apostrophes inside words are kept), whitespace-separated
    numeric arrays get commas, a trailing comma before the closing brace is
    dropped, and a missing comma after a closing "]" is inserted.

    Limitations: objects are located with a non-greedy "{...}" match, so
    nested objects and "}" characters inside string values are not supported.
    The bracket-based fixes are purely textual and will also touch "[...]"
    that happens to appear inside a string value.

    Args:
        content: Raw text containing zero or more "{...}" objects.

    Returns:
        A string of the form "[\\n{...},\\n{...}\\n]". It is intended to be
        valid JSON, but that is not guaranteed; callers should still be
        prepared for json.loads to fail.
    """
    # Split content into individual (flat) JSON objects
    json_objects = re.findall(r'\{.*?\}', content, re.DOTALL)

    fixed_objects = []
    for obj in json_objects:
        # Replace curly double quotes with straight quotes. Escapes are used
        # so the characters cannot be silently normalised away in the source.
        obj = obj.replace('\u201c', '"').replace('\u201d', '"')

        # Turn single-quote delimiters into double quotes. A quote with a word
        # character on both sides is an apostrophe (e.g. don't) and is left
        # alone so the text survives parsing unchanged.
        obj = re.sub(r"(?<!\w)'|'(?!\w)", '"', obj)

        # Fix array formatting
        obj = fix_array_format(obj)

        # Remove a trailing comma before the closing brace
        obj = obj.replace(',\n}', '\n}')
        obj = re.sub(r',(\s*})$', r'\1', obj)

        # Insert a missing comma after an array when another item follows.
        # "]" is excluded from the follow set so nested arrays ("]]") are not
        # turned into "],]".
        obj = re.sub(r'\](?=\s*[^,}\]\s])', '],', obj)

        fixed_objects.append(obj)

    # Join all objects into a single array, separated by commas
    return '[\n' + ',\n'.join(fixed_objects) + '\n]'

# Load the raw, possibly malformed JSON-like text
with open('outputk10.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Repair it into a single JSON array string
fixed_content = fix_json_content(content)

# Parse the repaired text; write it out pretty-printed only if parsing works
try:
    data = json.loads(fixed_content)
except json.JSONDecodeError as e:
    print(f"Error parsing JSON: {e}")
    # Show the reported line with up to two lines of context on either side
    lines = fixed_content.split('\n')
    line_no = e.lineno - 1
    print("\nProblematic area:")
    first = max(0, line_no-2)
    last = min(len(lines), line_no+3)
    for number, text in enumerate(lines[first:last], start=first + 1):
        print(f"Line {number}: {text}")
else:
    with open('output_formatted.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
    print("Successfully formatted JSON")

Successfully formatted JSON
