## Categorizing entries into simple, medium, and complex buckets

In [11]:
import yaml

# Load the already sampled 100 entries.
# PyYAML assumes UTF-8 for streams without a BOM, so decode the file explicitly
# instead of relying on the platform's locale-dependent default encoding.
with open("sampled_100_entries.yml", "r", encoding="utf-8") as f:
    sampled_data = yaml.safe_load(f)

# Prepare YAML block scalar formatting
class LiteralStr(str):
    """Marker subclass of ``str`` that adds no behaviour of its own.

    It exists only so that a YAML representer can be registered for this type
    separately from plain ``str`` values.
    """

def str_presenter(dumper, data):
    """Represent ``data`` as a YAML string scalar in literal block style.

    Delegates to ``dumper.represent_scalar`` with the standard YAML string tag
    and ``style='|'``, and returns whatever that call returns.
    """
    yaml_str_tag = 'tag:yaml.org,2002:str'
    return dumper.represent_scalar(yaml_str_tag, data, style='|')

# Register str_presenter as the representer for LiteralStr values, so dumping
# them goes through the '|' style it requests. Plain str values are unaffected.
yaml.add_representer(LiteralStr, str_presenter)

# Inclusive line-count cut-offs: up to SIMPLE_MAX_LINES is "simple", up to
# MEDIUM_MAX_LINES is "medium", anything longer is "complex".
SIMPLE_MAX_LINES = 9
MEDIUM_MAX_LINES = 13

# Create buckets
# NOTE(review): `complex` shadows the built-in `complex` type from here on; the
# name is kept because the save step further down refers to it.
simple, medium, complex = [], [], []

# Classify by code line count
for entry in sampled_data:
    # A missing key and an explicit `code: null` are both treated as empty
    # code; `.get("code", "")` alone returns None for the latter and would
    # crash on `.replace` below.
    code = entry.get("code") or ""

    # Expand the two-character sequence backslash + "n" into a real newline,
    # then collapse doubled backslashes into a single one.
    # NOTE(review): because newlines are expanded first, an escaped backslash
    # followed by "n" also ends up as backslash + newline. Confirm the escaping
    # scheme of the input if entries can contain such sequences.
    formatted_code = code.replace("\\n", "\n").replace("\\\\", "\\")

    # Written back onto the entry (in place) as LiteralStr so the representer
    # registered for that type is used when the entry is dumped.
    entry["code"] = LiteralStr(formatted_code)

    # Lines = newline count + 1, so empty code counts as one line and a
    # trailing newline adds one.
    line_count = formatted_code.count("\n") + 1
    if line_count <= SIMPLE_MAX_LINES:
        simple.append(entry)
    elif line_count <= MEDIUM_MAX_LINES:
        medium.append(entry)
    else:
        complex.append(entry)

# Save categorized datasets: one YAML file per bucket, written in the order
# simple -> medium -> complex, keeping each entry's key order (sort_keys=False).
bucket_files = (
    ("simple.yml", simple),
    ("medium.yml", medium),
    ("complex.yml", complex),
)
for out_name, bucket in bucket_files:
    with open(out_name, "w") as f:
        yaml.dump(bucket, f, sort_keys=False)

print("Categorized into simple.yml, medium.yml, and complex.yml.")

Categorized into simple.yml, medium.yml, and complex.yml.


### Number of entries in each file

In [13]:
import yaml

# Display label -> YAML file to inspect
files = dict(Simple="simple.yml", Medium="medium.yml", Complex="complex.yml")

# Report how many top-level entries each file holds
for label in files:
    filename = files[label]
    with open(filename, "r") as f:
        data = yaml.safe_load(f)
    # The file is already closed here; only the parsed data is needed.
    print(f"{label} complexity: {len(data)} code blocks")

Simple complexity: 35 code blocks
Medium complexity: 35 code blocks
Complex complexity: 30 code blocks


In [14]:
import yaml

# Filenames
# NOTE(review): these *_codes.yml files are not written by any cell visible
# here; confirm which step produces them.
files = {
    "Simple": "simple_codes.yml",
    "Medium": "medium_codes.yml",
    "Complex": "complex_codes.yml"
}

# Count entries in each file
for label, filename in files.items():
    # Decode explicitly as UTF-8 (what PyYAML assumes for YAML streams) instead
    # of relying on the platform's locale-dependent default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        # safe_load returns None for an empty document; count that as zero
        # entries so len() does not raise TypeError.
        data = yaml.safe_load(f) or []
    print(f"{label} codes: {len(data)} code blocks")

Simple codes: 35 code blocks
Medium codes: 35 code blocks
Complex codes: 30 code blocks
