In [1]:
import re
import os
from pprint import pprint

In [2]:
# Translation table from the single-letter tag of an OEIS "%<tag>" line to the
# field name used in the parsed output. The sequence-term tags (S, T, U) are
# deliberately absent: parse_sequence_file stores them under "sequence".
key_map = dict(
    I="id",
    N="name",
    C="comments",
    D="references",
    H="related_links",
    F="formula",
    e="examples",
    p="maple_program",
    t="mathematica_program",
    o="other_program",
    Y="cross_references",
    K="keywords",
    O="offset",
    A="author",
)

In [3]:
# Matches one line of an OEIS internal-format file:
# "%<tag> <A-number>" optionally followed by " <payload>".
_OEIS_LINE_RE = re.compile(r'%([A-Za-z]) (A\d+)(?: (.+))?')

# Tags whose payload is a chunk of comma-separated sequence terms.
_SEQUENCE_TAGS = frozenset('STU')

# Tags that may appear on several lines; their payloads are joined with a space.
_REPEATED_TAGS = frozenset('CDHFeptoY')

# Tags stored as a single string (a later line overwrites an earlier one).
_SINGLE_TAGS = frozenset('NKOA')


def parse_sequence_file(file_content):
    """Parse the text of one OEIS ``.seq`` file into a flat dictionary.

    Each recognised line has the form ``%<tag> <A-number> <payload>``. Tags are
    translated to output field names through the module-level ``key_map``.

    Args:
        file_content: Full text of the sequence file.

    Returns:
        A dict, with keys in order of first appearance, that may contain:

        * ``"id"`` - the A-number taken from the ``%I`` line.
        * ``"sequence"`` - list of term strings gathered from the ``%S``,
          ``%T`` and ``%U`` lines, in file order.
        * ``"name"``, ``"keywords"``, ``"offset"``, ``"author"`` - the payload
          of the ``%N``, ``%K``, ``%O`` and ``%A`` line respectively.
        * the repeated fields (comments, references, related links, formula,
          examples, programs, cross-references) - every payload line of that
          tag, stripped and joined with a single space.

        Lines with an unknown tag, or with no payload after the A-number
        (other than ``%I``), are ignored. Empty input yields ``{}``.
    """
    parsed_content = {}

    for line in file_content.strip().splitlines():
        match = _OEIS_LINE_RE.match(line)
        if match is None:
            continue
        tag, sequence_id, payload = match.groups()

        if tag == 'I':
            # %I is the only tag whose value is the A-number itself; every
            # other tag carries its value in the payload after the A-number.
            parsed_content[key_map[tag]] = sequence_id
        elif payload is None:
            continue
        elif tag in _SEQUENCE_TAGS:
            # Term lines end with a trailing comma, so drop empty pieces.
            terms = (term.strip() for term in payload.split(','))
            parsed_content.setdefault('sequence', []).extend(
                term for term in terms if term
            )
        elif tag in _REPEATED_TAGS:
            parsed_content.setdefault(key_map[tag], []).append(payload.strip())
        elif tag in _SINGLE_TAGS:
            parsed_content[key_map[tag]] = payload.strip()

    # Collapse every repeated field into one string; the term list stays a list.
    # Only values of existing keys are replaced, so iterating items() is safe.
    for field, value in parsed_content.items():
        if isinstance(value, list) and field != 'sequence':
            parsed_content[field] = " ".join(value)

    return parsed_content

## Set `input_base_dir` to the folder containing the sequence files, then run the cell below to extract the data for all sequences

In [19]:
import os
import json


# Root folder holding the per-prefix subfolders of .seq files, and the root
# folder that receives the mirrored tree of .json files.
input_base_dir = "./oeisdata/seq/"
output_base_dir = "./json_output"

# Loop through each subfolder in the input directory (e.g., A000, A001).
# Sorted so that every run visits the folders in the same order.
for subfolder in sorted(os.listdir(input_base_dir)):
    subfolder_path = os.path.join(input_base_dir, subfolder)

    # Ensure we are only processing directories
    if not os.path.isdir(subfolder_path):
        continue

    # Create the corresponding output subdirectory
    output_subfolder_path = os.path.join(output_base_dir, subfolder)
    os.makedirs(output_subfolder_path, exist_ok=True)

    # Process each .seq file within the subfolder
    for seq_file in sorted(os.listdir(subfolder_path)):
        seq_file_path = os.path.join(subfolder_path, seq_file)

        # Skip anything that is not a regular file with the .seq extension
        if not (os.path.isfile(seq_file_path) and seq_file.endswith(".seq")):
            continue

        json_filename = os.path.splitext(seq_file)[0] + ".json"
        json_file_path = os.path.join(output_subfolder_path, json_filename)

        # Explicit encoding so decoding does not depend on the platform locale.
        # NOTE(review): assumes the .seq files are UTF-8 - confirm against the data.
        with open(seq_file_path, "r", encoding="utf-8") as f:
            parsed_content = parse_sequence_file(f.read())

        # The input handle is already closed here; write the parsed content to
        # a JSON file in the corresponding output subfolder.
        with open(json_file_path, "w", encoding="utf-8") as json_file:
            json.dump(parsed_content, json_file, indent=4)
