In [None]:
import copy
import json
import os
import random

import requests
from datasets import Dataset
from huggingface_hub import notebook_login
from tqdm import tqdm

In [None]:
# Authenticate with the Hugging Face Hub (renders a login widget in the notebook);
# the final cell pushes the dataset to a private Hub repository.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
def micro_scramble(json_str: str, seed: int) -> str:
    """Inject 1-3 small syntax errors into a JSON string.

    Each edit drops or inserts a single structural character (a comma or a
    bracket/brace). Only characters outside string literals are touched, so the
    text of keys and values is left intact and the result differs from the
    input purely in syntax.

    Args:
        json_str: The JSON text to corrupt.
        seed: Seed for a private random generator; the same ``json_str`` and
            ``seed`` always produce the same output.

    Returns:
        The corrupted string. Edits can cancel each other out (an inserted
        comma may be the very comma removed by the next edit), so the edit
        sequence is redrawn up to ``max_attempts`` times until ``json.loads``
        rejects the result. If the input has no structural characters to edit,
        it is returned unchanged.
    """
    # A private generator keeps results reproducible per seed without
    # reseeding the module-level ``random`` state used by other cells.
    rng = random.Random(seed)
    bracket_types = ['{', '[', '}', ']']
    max_attempts = 10

    def structural_positions(s, chars):
        """Return indices of ``chars`` in ``s`` that lie outside string literals."""
        found = []
        in_string = False
        escaped = False
        for i, char in enumerate(s):
            if in_string:
                if escaped:
                    # Character following a backslash is never a terminator.
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == '"':
                    in_string = False
            elif char == '"':
                in_string = True
            elif char in chars:
                found.append(i)
        return found

    def is_valid_json(s):
        """Return True when ``s`` still parses as JSON."""
        try:
            json.loads(s)
        except ValueError:
            return False
        return True

    # Scrambling operations
    def missing_comma(s):
        """Drop the last structural comma."""
        commas = structural_positions(s, [','])
        if commas:
            return s[:commas[-1]] + s[commas[-1] + 1:]
        return s

    def missing_bracket(s):
        """Drop the last structural bracket or brace."""
        brackets = structural_positions(s, bracket_types)
        if brackets:
            return s[:brackets[-1]] + s[brackets[-1] + 1:]
        return s

    def extra_comma(s):
        """Insert a comma in front of a random opening bracket/brace or colon."""
        positions = structural_positions(s, ['{', '[', ':'])
        if positions:
            pos = rng.choice(positions)
            return s[:pos] + ',' + s[pos:]
        return s

    def extra_bracket(s):
        """Insert a random bracket/brace in front of a random existing one."""
        positions = structural_positions(s, bracket_types)
        if positions:
            pos = rng.choice(positions)
            bracket = rng.choice(bracket_types)
            return s[:pos] + bracket + s[pos:]
        return s

    operations = [missing_comma, missing_bracket, extra_comma, extra_bracket]

    scrambled = json_str
    for _ in range(max_attempts):
        # Restart from the pristine input so the result never carries more
        # than three edits.
        scrambled = json_str
        for _ in range(rng.randint(1, 3)):
            # Randomly choose an operation and apply it
            scrambled = rng.choice(operations)(scrambled)
        if not is_valid_json(scrambled):
            break

    return scrambled


In [None]:
# Call the JSONPlaceholder API to collect sample JSON records

apis = ['/posts', '/comments', '/albums', '/photos', '/todos', '/users']
jsons = []
for api in apis:
    # A timeout stops a stalled connection from hanging the notebook.
    response = requests.get(f'https://jsonplaceholder.typicode.com{api}', timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error body: a dict
    # payload would otherwise be added to ``jsons`` as its keys.
    response.raise_for_status()
    jsons.extend(response.json())

In [None]:
# Number of records collected across all endpoints
len(jsons)

5910

In [None]:
# To make the JSONs a bit more varied, randomly nest some inside others
# Also, convert some values into lists

# Seeded private generator so the varied set is identical on every run.
vary_rng = random.Random(0)
# Guard against sampling more indices than there are records.
n_varied = min(200, len(jsons))
nested_idx = vary_rng.sample(range(len(jsons)), n_varied)
list_idx = vary_rng.sample(range(len(jsons)), n_varied)
# Sets give constant-time membership checks inside the loop.
nested_lookup = set(nested_idx)
list_lookup = set(list_idx)
varied_jsons = []

for i in range(len(jsons)):
    # Work on a deep copy: records are drawn with replacement, so editing the
    # original would corrupt `jsons`, alias entries of `varied_jsons`, and
    # compound the changes each time this cell is re-run.
    json1 = copy.deepcopy(vary_rng.choice(jsons))

    if i in nested_lookup:
        # Copy the nested record too; nesting by reference can create a cycle
        # (a record inside itself), which json.dumps refuses to serialise.
        json2 = copy.deepcopy(vary_rng.choice(jsons))
        key = vary_rng.choice(list(json1.keys()))
        json1[key] = json2

    if i in list_lookup:
        key = vary_rng.choice(list(json1.keys()))
        json1[key] = [json1[key]]

    varied_jsons.append(json1)

In [None]:
# Spot-check a single record from the varied set
varied_jsons[80]

{'albumId': 57,
 'id': 2810,
 'title': 'atque et molestiae neque',
 'url': 'https://via.placeholder.com/600/80063c',
 'thumbnailUrl': ['https://via.placeholder.com/150/80063c']}

In [None]:
# Serialise every record once, then corrupt each string using its list index
# as the scrambling seed.
correct_jsons = [json.dumps(record) for record in varied_jsons]
scrambled_jsons = [
    micro_scramble(serialised, index)
    for index, serialised in enumerate(correct_jsons)
]

In [None]:
# Preview a handful of examples instead of dumping the whole list into the output
scrambled_jsons[:5]

['}}{"albumId": 88, "id": 4389, "title": "aperiam neque eius", "url": "https://via.placeholder.com/600/306757", "thumbnailUrl": "https://via.placeholder.com/150/306757"}',
 '{"albumId": 29, "id": 1417, "title": "et exercitationem minus ut animi deserunt a", "url": "https://via.placeholder.com/600/f39e62" "thumbnailUrl": "https://via.placeholder.com/150/f39e62"}',
 '{"albumId": 73, "id": 3608, "title": "nostrum est molestiae iure accusamus eos sed", "url": "https://via.placeholder.com/600/30a48e" "thumbnailUrl": "https://via.placeholder.com/150/30a48e"}',
 '{"albumId": 4, "id": 172, "title": "deserunt commodi et aut et molestiae debitis et sed", "url": "https://via.placeholder.com/600/d611bd", "thumbnailUrl": "https://via.placeholder.com/150/d611bd"',
 '{"albumId",: 9, "id": 448, "title": "et voluptatem animi fuga aut", "url": "https://via.placeholder.com/600/d23a91", "thumbnailUrl": "https://via.placeholder.com/150/d23a91"}',
 '{"albumId": 86, "id": 4299, "title": "deleniti nobis non p

In [None]:
# Build one LLM prompt per (broken, repaired) pair
prompt_template = "Below is a JSON string containing a syntactic error. Return the corrected JSON string.\n\n### Broken JSON:\n{}\n\n### Repaired JSON:\n{}"
# map() over two sequences pairs them element-wise and stops at the shorter one.
prompts = list(map(prompt_template.format, scrambled_jsons, correct_jsons))

In [None]:
# Show one complete prompt (broken input followed by the repaired target)
print(prompts[1])

Below is a JSON string containing a syntactic error. Return the corrected JSON string.

### Broken JSON:
{"albumId": 29, "id": 1417, "title": "et exercitationem minus ut animi deserunt a", "url": "https://via.placeholder.com/600/f39e62" "thumbnailUrl": "https://via.placeholder.com/150/f39e62"}

### Repaired JSON:
{"albumId": 29, "id": 1417, "title": "et exercitationem minus ut animi deserunt a", "url": "https://via.placeholder.com/600/f39e62", "thumbnailUrl": "https://via.placeholder.com/150/f39e62"}


In [None]:
# One column for the full prompt text plus the two raw strings it was built from
prompt_columns = {
    'text': prompts,
    'correct': correct_jsons,
    'scrambled': scrambled_jsons,
}
ds_prompts = Dataset.from_dict(prompt_columns)

In [None]:
# Display the dataset summary (column names and row count)
ds_prompts

Dataset({
    features: ['text', 'correct', 'scrambled'],
    num_rows: 5910
})

In [None]:
# Split into train and validation (the held-out 10% is stored under the "test" key).
# A fixed seed keeps the shuffle, and therefore the split, the same on every run.
ds_prompts = ds_prompts.train_test_split(test_size=0.1, seed=42)

In [None]:
# Publish the dataset to a private HF Hub repository so it can be loaded from Colab
hub_repo_id = "jackhogan/agemo_json_prompts"
ds_prompts.push_to_hub(hub_repo_id, private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/520 [00:00<?, ?B/s]