In [41]:
import re
import os
import json

In [42]:
# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine that has this exact file.
file_path = '/Users/joanvelja/Documents/1000Kelvin/Dataset/my_list_test.txt'
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, so the text is read the same way on every machine.
with open(file_path, 'r', encoding='utf-8') as file:
    file_content = file.read()

In [43]:
def find_json_objects_v2(text):
    """Collect brace-delimited, JSON-like substrings that mention all three keys.

    The text is scanned once while tracking the brace nesting depth. A
    candidate starts at a `{` seen at depth zero. It ends at the `}` that
    brings the depth back to zero, or -- while a candidate is open -- at a
    `</s>` token that does so (the token lowers the depth by one, exactly like
    a closing brace, and is kept inside the extracted slice).

    Args:
        text: The raw text to scan.

    Returns:
        The candidates, in order of appearance, that contain the substrings
        `"instruction":`, `"input":` and `"output":`. The strings are returned
        as found; nothing is parsed or repaired.

    Side effects:
        Prints the number of candidates before and after filtering.
    """
    required_keys = ('"instruction":', '"input":', '"output":')
    token = '</s>'

    candidates = []
    depth = 0    # number of currently open braces
    begin = -1   # index of the outermost '{' of the open candidate

    for pos, ch in enumerate(text):
        if ch == '{':
            depth += 1
            if depth == 1:
                begin = pos
            continue

        closes_with_brace = ch == '}'
        closes_with_token = depth > 0 and text[pos:pos+4] == token
        if not (closes_with_brace or closes_with_token):
            continue
        if depth == 0:
            # A stray '}' outside of any candidate is ignored.
            continue

        depth -= 1
        if depth == 0:
            # A closing brace ends the slice right after itself; the
            # four-character token is included in full.
            stop = pos + 1 if closes_with_brace else pos + 4
            candidates.append(text[begin:stop])
            begin = -1

    print(f"JSON-like strings found before filtering: {len(candidates)}")

    kept = []
    for candidate in candidates:
        if all(key in candidate for key in required_keys):
            kept.append(candidate)

    print(f"Filtered JSON strings: {len(kept)}")

    return kept

In [44]:
from colorama import Fore

def find_json_objects_v3(text, end_token='</s>'):
    """Extract JSON-like records from `text`, repair them, and keep complete ones.

    The text is scanned once while tracking the brace nesting depth. A
    candidate starts at a `{` seen at depth zero and ends either at the `}`
    that brings the depth back to zero or at `end_token` while a candidate is
    open. Every candidate is passed through `correct_json_format` (looked up
    in the notebook namespace at call time); non-empty results are collected.

    Changes relative to the previous implementation:
      * The end token is no longer part of the extracted slice, so it cannot
        end up inside the repaired `"output"` value.
      * The end token closes the candidate regardless of nesting depth.
        Previously it removed a single level only, so a truncated record with
        an extra open brace stayed open and absorbed all following records.

    Args:
        text: The raw text to scan.
        end_token: Marker that terminates a truncated record. Defaults to
            `'</s>'`; pass an empty string to disable token handling.

    Returns:
        The repaired candidates, in order of appearance, that contain the
        substrings `"instruction":`, `"input":` and `"output":`.

    Side effects:
        Prints the number of candidates before and after filtering (in
        addition to whatever `correct_json_format` prints).
    """
    required_keys = ('"instruction":', '"input":', '"output":')

    json_strings = []
    depth = 0          # number of currently open braces
    start_index = -1   # index of the outermost '{' of the open candidate

    def collect(candidate):
        # Repair the candidate and keep it unless the repair left nothing.
        corrected_json_string = correct_json_format(candidate)
        if corrected_json_string:
            json_strings.append(corrected_json_string)

    for i, char in enumerate(text):
        if char == '{':
            depth += 1
            if depth == 1:
                start_index = i

        elif char == '}':
            # A stray '}' outside of any candidate is ignored.
            if depth:
                depth -= 1
                if depth == 0:
                    collect(text[start_index:i + 1])
                    start_index = -1

        elif depth and end_token and text.startswith(end_token, i):
            # The token marks a hard end: drop it from the slice and close
            # every open level at once.
            collect(text[start_index:i])
            depth = 0
            start_index = -1

    print(f"JSON-like strings found before filtering: {len(json_strings)}")

    filtered_json_strings = [
        json_str for json_str in json_strings
        if all(key in json_str for key in required_keys)
    ]

    print(f"Filtered JSON strings: {len(filtered_json_strings)}")

    return filtered_json_strings



def correct_json_format(json_string, verbose=True, end_tokens=('</s>', '<|im_end|>')):
    """Repair quoting and terminators of a JSON-like record written one key per line.

    Each line is checked for the keys `"instruction":`, `"input":` and
    `"output":`. For a line containing one of them:

      * trailing whitespace (including a `'\\r'` left by CRLF input) is removed;
      * on an output line, a trailing end-of-text token is removed first;
      * a missing opening quote is inserted directly after the key;
      * an instruction/input line is made to end with `,` (adding `",` when
        the closing quote is missing too), and an output line is made to end
        with `"}`.

    An instruction or input line that ends with `}` is considered defective
    and is dropped. Lines without any of the keys are passed through
    unchanged, including empty lines.

    Changes relative to the previous implementation:
      * The terminator checks strip the right-hand side of the line
        (previously `lstrip()`), so a line such as `"input": "b", ` or one
        ending in `'\\r'` no longer receives a second `",`.
      * The opening quote is placed right after the key instead of at the
        first `": "` anywhere in the line.
      * End-of-text tokens are removed before the closing `"}` is appended,
        so they no longer end up inside the output value.

    Limitation: the record is expected to hold one key per line; a record
    written on a single line is treated as a defective instruction line.

    Args:
        json_string: The JSON-like text to repair.
        verbose: When true (default), print a trace of every correction and
            of every line that is kept.
        end_tokens: Tokens removed from the end of an output line.

    Returns:
        The kept lines joined with newlines.
    """
    log = print if verbose else (lambda *args, **kwargs: None)

    def log_correction(fixed_line):
        log("corrected line:")
        log("-"*50)
        log(fixed_line)
        log("#"*50)

    # (key, label used in log messages, terminator character, terminator name)
    field_rules = (
        ('"instruction":', "Instruction", ',', "comma"),
        ('"input":', "Input", ',', "comma"),
        ('"output":', "Output", '}', "brace"),
    )

    corrected_lines = []
    for line in json_string.split('\n'):
        for key, label, closer, closer_name in field_rules:
            if key not in line:
                continue

            # Trailing blanks or a '\r' must not hide the real line ending.
            line = line.rstrip()
            is_output = closer == '}'

            if is_output:
                # Must happen before the terminator check: once '"}' has been
                # appended the line no longer ends with the token.
                for token in end_tokens:
                    if token and line.endswith(token):
                        log("Output line has EoT token")
                        log(line)
                        line = line[:-len(token)].rstrip()
                        log_correction(line)

            head, _, value = line.partition(key)
            if not value.lstrip().startswith('"'):
                log(f"{label} line is missing a quote")
                log(line)
                line = head + key + '"' + value.lstrip()
                log_correction(line)

            properly_closed = line.endswith('"}') if is_output else line.endswith(',')
            if not properly_closed:
                log(f"{label} line has missing quote/{closer_name} pair")
                if not is_output and line.endswith('}'):
                    log("Defective line. Discarding.")
                    break
                log(line)
                line += closer if line.endswith('"') else '"' + closer
                log_correction(line)
        else:
            # Reached only when the line was not discarded.
            log("submitting line :", line)
            corrected_lines.append(line)

    return '\n'.join(corrected_lines)

In [45]:
# Extract, repair and filter the JSON-like records contained in the text
# loaded from `file_path`. The per-line trace below is printed by
# correct_json_format, which find_json_objects_v3 calls for every candidate.
json_like_strings = find_json_objects_v3(file_content)

submitting line : {"instruction": "How does NSTO optimize structures for 3D printing infill to save materials while ensuring structural strength?",
submitting line : "input": "NSTO has been demonstrated for a wide range of personal fabrication applications, including 3D printing infill, which aims to save materials while ensuring structural strength. NSTO optimizes structures for 3D printing infill by significantly improving optimization efficiency from the resolution aspect to ensure optimal performance and expanding the solution space purely under the physical constraints of multiple subtasks through self-supervision.",
submitting line : "output": "NSTO optimizes structures for 3D printing infill by saving materials while ensuring structural strength through significant improvements in optimization efficiency from the resolution aspect, which ensures optimal performance. This framework expands the solution space purely under the physical constraints of multiple subtasks through self-

In [46]:
# Display the full list of repaired strings returned by find_json_objects_v3.
json_like_strings

['{"instruction": "How does NSTO optimize structures for 3D printing infill to save materials while ensuring structural strength?",\n"input": "NSTO has been demonstrated for a wide range of personal fabrication applications, including 3D printing infill, which aims to save materials while ensuring structural strength. NSTO optimizes structures for 3D printing infill by significantly improving optimization efficiency from the resolution aspect to ensure optimal performance and expanding the solution space purely under the physical constraints of multiple subtasks through self-supervision.",\n"output": "NSTO optimizes structures for 3D printing infill by saving materials while ensuring structural strength through significant improvements in optimization efficiency from the resolution aspect, which ensures optimal performance. This framework expands the solution space purely under the physical constraints of multiple subtasks through self-supervision, allowing for a wider range of solutio

In [None]:
import json

string_original = r"""{"instruction": "Explain the role of the stress, \(\sigma\), in calculating the body heat-flux density, \(q_{el}\), in the FRAM process.",
"input": "The body heat-flux density, \(q_{el}\), in the FRAM process can be calculated using equation (8), where \(\sigma\) is the stress. This stress is a measure of the internal resistance of a material to deformation, and it plays a crucial role in determining the amount of heat generated during the FRAM process.",
"output": "In equation (8), the stress, \(\sigma\), is multiplied with the equivalent plastic strain rate, \(\hat{\varpi}_{el}\), to obtain the component of heat that is generated due to plastic deformation. This component of heat is known as the plastic work, and it is a measure of the energy required to deform the material. The higher the stress, the greater the plastic work, and hence, the greater the body heat-flux density, \(q_{el}\). This is because the material requires more energy to deform at higher stress levels. Therefore, the stress, \(\sigma\), is a critical parameter in determining the body heat-flux density, \(q_{el}\), in the FRAM process."}"""

# Turn every backslash into two so that sequences such as \( or \sigma are
# read by the JSON parser as a literal backslash followed by the character.
string_original = '\\\\'.join(string_original.split('\\'))

# Parse the escaped text, then show the resulting object.
json_object = json.loads(string_original)
print(json_object)


In [None]:
# Scratch cell (never executed in this saved state): repr of `j` with its first
# and last character removed. `j` is not defined in this cell.
# NOTE(review): `j` appears to be the loop variable of a later cell -- confirm
# or delete this cell.
repr(j)[1:-1]

In [None]:
# Scratch cell (never executed in this saved state): repr of `j`, which is not
# defined in this cell.
# NOTE(review): depends on a variable left over from another cell -- confirm
# or delete this cell.
repr(j)

In [47]:
# Parse every repaired string into a Python object; strings that still fail to
# parse are reported and skipped.
json_objects = []

i = 0  # stays defined even when there is nothing to process
for i, j in enumerate(json_like_strings, start=1):
    print(f"processing {i}-th JSON-like string....")
    print(j)
    # Double every backslash so that sequences such as \( or \sigma survive
    # parsing as literal text. This also turns an escaped quote (\") into a
    # literal backslash followed by a bare quote, so strings that contain one
    # are likely to be rejected below.
    var = j.replace('\\', '\\\\')
    try:
        var_dict = json.loads(var)
    except json.JSONDecodeError as err:
        # Only parsing failures are skipped; anything else (for example an
        # interrupt) propagates instead of being silently swallowed.
        print("JSON-like string is not valid JSON")
        print(err)
        print("#"*50)
        continue
    json_objects.append(var_dict)
    print("JSON-like string converted to JSON object")
    print("#"*50)


processing 1-th JSON-like string....
{"instruction": "How does NSTO optimize structures for 3D printing infill to save materials while ensuring structural strength?",
"input": "NSTO has been demonstrated for a wide range of personal fabrication applications, including 3D printing infill, which aims to save materials while ensuring structural strength. NSTO optimizes structures for 3D printing infill by significantly improving optimization efficiency from the resolution aspect to ensure optimal performance and expanding the solution space purely under the physical constraints of multiple subtasks through self-supervision.",
"output": "NSTO optimizes structures for 3D printing infill by saving materials while ensuring structural strength through significant improvements in optimization efficiency from the resolution aspect, which ensures optimal performance. This framework expands the solution space purely under the physical constraints of multiple subtasks through self-supervision, allo

In [48]:
# Number of strings that json.loads accepted in the parsing loop.
len(json_objects)

37

In [49]:
# Persist the parsed objects as one JSON array in the working directory.
with open('my_list_test.json', 'w') as out_file:
    json.dump(json_objects, out_file)

In [26]:
# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine that has this exact file.
file_path = '/Users/joanvelja/Documents/1000Kelvin/Dataset/my_list_first62.txt'
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, so the text is read the same way on every machine.
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

In [32]:
from colorama import Fore

def find_json_objects_v4(text, end_token='<|im_end|>'):
    """Extract JSON-like records from `text`, repair them, and keep complete ones.

    The text is scanned once while tracking the brace nesting depth. A
    candidate starts at a `{` seen at depth zero and ends either at the `}`
    that brings the depth back to zero or at `end_token` while a candidate is
    open. Every candidate is passed through `correct_json_format` (looked up
    in the notebook namespace at call time); non-empty results are collected.

    Changes relative to the previous implementation:
      * The end token is no longer part of the extracted slice, so it cannot
        end up inside the repaired `"output"` value.
      * The end token closes the candidate regardless of nesting depth.
        Previously it removed a single level only, so a truncated record with
        an extra open brace stayed open and absorbed all following records.
      * The token and its length are no longer hard-coded in two places.

    Args:
        text: The raw text to scan.
        end_token: Marker that terminates a truncated record. Defaults to
            `'<|im_end|>'`; pass an empty string to disable token handling.

    Returns:
        The repaired candidates, in order of appearance, that contain the
        substrings `"instruction":`, `"input":` and `"output":`.

    Side effects:
        Prints the number of candidates before and after filtering (in
        addition to whatever `correct_json_format` prints).
    """
    required_keys = ('"instruction":', '"input":', '"output":')

    json_strings = []
    depth = 0          # number of currently open braces
    start_index = -1   # index of the outermost '{' of the open candidate

    def collect(candidate):
        # Repair the candidate and keep it unless the repair left nothing.
        corrected_json_string = correct_json_format(candidate)
        if corrected_json_string:
            json_strings.append(corrected_json_string)

    for i, char in enumerate(text):
        if char == '{':
            depth += 1
            if depth == 1:
                start_index = i

        elif char == '}':
            # A stray '}' outside of any candidate is ignored.
            if depth:
                depth -= 1
                if depth == 0:
                    collect(text[start_index:i + 1])
                    start_index = -1

        elif depth and end_token and text.startswith(end_token, i):
            # The token marks a hard end: drop it from the slice and close
            # every open level at once.
            collect(text[start_index:i])
            depth = 0
            start_index = -1

    print(f"JSON-like strings found before filtering: {len(json_strings)}")

    filtered_json_strings = [
        json_str for json_str in json_strings
        if all(key in json_str for key in required_keys)
    ]

    print(f"Filtered JSON strings: {len(filtered_json_strings)}")

    return filtered_json_strings



def correct_json_format(json_string, verbose=True, end_tokens=('</s>', '<|im_end|>')):
    """Repair quoting and terminators of a JSON-like record written one key per line.

    Empty lines are dropped. Each remaining line is checked for the keys
    `"instruction":`, `"input":` and `"output":`. For a line containing one
    of them:

      * trailing whitespace (including a `'\\r'` left by CRLF input) is removed;
      * on an output line, a trailing end-of-text token is removed first;
      * a missing opening quote is inserted directly after the key;
      * an instruction/input line is made to end with `,` (adding `",` when
        the closing quote is missing too), and an output line is made to end
        with `"}`.

    An instruction or input line that ends with `}` is considered defective
    and is dropped. Lines without any of the keys are passed through
    unchanged.

    Changes relative to the previous implementation:
      * The end-of-text token is removed before the closing `"}` is appended.
        Previously the removal ran afterwards, when the line already ended
        with `}`, so it could never match and the token stayed in the value.
      * The terminator checks strip the right-hand side of the line
        (previously `lstrip()`), so a line such as `"input": "b", ` or one
        ending in `'\\r'` no longer receives a second `",`.
      * The opening quote is placed right after the key instead of at the
        first `": "` anywhere in the line.

    Limitation: the record is expected to hold one key per line; a record
    written on a single line is treated as a defective instruction line.

    Args:
        json_string: The JSON-like text to repair.
        verbose: When true (default), print a trace of every correction and
            of every line that is kept.
        end_tokens: Tokens removed from the end of an output line.

    Returns:
        The kept lines joined with newlines.
    """
    log = print if verbose else (lambda *args, **kwargs: None)

    def log_correction(fixed_line):
        log("corrected line:")
        log("-"*50)
        log(fixed_line)
        log("#"*50)

    # (key, label used in log messages, terminator character, terminator name)
    field_rules = (
        ('"instruction":', "Instruction", ',', "comma"),
        ('"input":', "Input", ',', "comma"),
        ('"output":', "Output", '}', "brace"),
    )

    corrected_lines = []
    for line in json_string.split('\n'):
        # If line is empty, skip it
        if not line:
            continue

        for key, label, closer, closer_name in field_rules:
            if key not in line:
                continue

            # Trailing blanks or a '\r' must not hide the real line ending.
            line = line.rstrip()
            is_output = closer == '}'

            if is_output:
                # Must happen before the terminator check: once '"}' has been
                # appended the line no longer ends with the token.
                for token in end_tokens:
                    if token and line.endswith(token):
                        log("Output line has EoT token")
                        log(line)
                        line = line[:-len(token)].rstrip()
                        log_correction(line)

            head, _, value = line.partition(key)
            if not value.lstrip().startswith('"'):
                log(f"{label} line is missing a quote")
                log(line)
                line = head + key + '"' + value.lstrip()
                log_correction(line)

            properly_closed = line.endswith('"}') if is_output else line.endswith(',')
            if not properly_closed:
                log(f"{label} line has missing quote/{closer_name} pair")
                if not is_output and line.endswith('}'):
                    log("Defective line. Discarding.")
                    break
                log(line)
                line += closer if line.endswith('"') else '"' + closer
                log_correction(line)
        else:
            # Reached only when the line was not discarded.
            log("submitting line :", line)
            corrected_lines.append(line)

    return '\n'.join(corrected_lines)

In [33]:
# Extract, repair and filter the JSON-like records in `content`, using
# '<|im_end|>' as the end-of-text marker (see find_json_objects_v4).
json62 = find_json_objects_v4(content)

submitting line : {"instruction": "What is the relationship between the network frequency features and the structural performance at various resolutions in NSTO?",
submitting line : "input": "In NSTO, the network frequency hyperparameter o has a positive relationship with the structure details, resulting in compliance change. The stiffness matrix assembly time was 0.486s, and the linear elasticity solving time was 0.160s (with AMGX [NAC*15]), taking up around 82.04% of the computation time.",
submitting line : "output": "The relationship between network frequency features and the structural performance at various resolutions in NSTO is that an increase in the network frequency hyperparameter o leads to a higher level of structure details, which in turn affects the compliance change of the structure. This implies that the structural performance at different resolutions is influenced by the network frequency features, and optimizing these features can help in achieving better structural 

In [36]:
# Parse every repaired string into a Python object; strings that still fail to
# parse are reported and skipped.
json_objects62 = []

i = 0  # stays defined even when there is nothing to process
for i, j in enumerate(json62, start=1):
    print(f"processing {i}-th JSON-like string....")
    print(j)
    # Double every backslash so that sequences such as \( or \sigma survive
    # parsing as literal text. This also turns an escaped quote (\") into a
    # literal backslash followed by a bare quote, so strings that contain one
    # are likely to be rejected below.
    var = j.replace('\\', '\\\\')
    try:
        var_dict = json.loads(var)
    except json.JSONDecodeError as err:
        # Only parsing failures are skipped; anything else (for example an
        # interrupt) propagates instead of being silently swallowed.
        print("JSON-like string is not valid JSON")
        print(err)
        print("#"*50)
        continue
    json_objects62.append(var_dict)
    print("JSON-like string converted to JSON object")
    print("#"*50)


processing 1-th JSON-like string....
{"instruction": "What is the relationship between the network frequency features and the structural performance at various resolutions in NSTO?",
"input": "In NSTO, the network frequency hyperparameter o has a positive relationship with the structure details, resulting in compliance change. The stiffness matrix assembly time was 0.486s, and the linear elasticity solving time was 0.160s (with AMGX [NAC*15]), taking up around 82.04% of the computation time.",
"output": "The relationship between network frequency features and the structural performance at various resolutions in NSTO is that an increase in the network frequency hyperparameter o leads to a higher level of structure details, which in turn affects the compliance change of the structure. This implies that the structural performance at different resolutions is influenced by the network frequency features, and optimizing these features can help in achieving better structural performance. Addi

In [37]:
# Number of strings that json.loads accepted in the parsing loop.
len(json_objects62)

40

In [38]:
# Persist the parsed objects as one JSON array in the working directory.
with open('json_objects62.json', 'w') as out_file:
    json.dump(json_objects62, out_file)

In [50]:
import glob

# Concatenate every *.json file of the working directory into one file,
# making sure the content of each input ends with a newline.
# NOTE(review): the inputs written by this notebook each hold a single JSON
# array, so every line of the merged file is a whole array rather than one
# record -- confirm that the consumer of the .jsonl file expects this.
with open("merged_file.jsonl", "w") as outfile:
    for filename in glob.glob("*.json"):
        print(f"merging {filename}...")
        # Tracked per file: previously the check after the loop read `line`,
        # which was undefined for an empty first file and stale (left over
        # from the previous file) for any later empty file.
        last_line = ''
        with open(filename) as infile:
            for last_line in infile:
                outfile.write(last_line)
        if last_line and not last_line.endswith('\n'):
            outfile.write('\n')

merging json_objects62.json...
merging my_list_test.json...


In [58]:
# Convert every my_list_first*.txt file of the working directory into a JSON
# file of the same base name holding the records that could be parsed.
for f in glob.glob("my_list_first*.txt"):
    print("Opening...", f)
    with open(f, 'r', encoding='utf-8') as file:
        content = file.read()

    print("Previewing content...")
    print(content[:2000])
    print("#"*1000)
    print("Finding JSON objects...")
    json_strings = find_json_objects_v4(content)
    print("Converting JSON-like strings to JSON objects...")
    json_objects = []
    skipped = 0
    for j in json_strings:
        # Double every backslash so that sequences such as \( or \sigma
        # survive parsing as literal text.
        var = j.replace('\\', '\\\\')
        try:
            var_dict = json.loads(var)
        except json.JSONDecodeError:
            # Only parsing failures are skipped, and they are counted so the
            # loss is visible instead of silent.
            skipped += 1
            continue
        json_objects.append(var_dict)
    print(f"Parsed {len(json_objects)} records, skipped {skipped} invalid ones")
    print("Writing JSON objects to file...")
    # splitext removes only the final extension, and the handle gets its own
    # name so the loop variable `f` is no longer rebound to a file object.
    output_path = os.path.splitext(f)[0] + '.json'
    with open(output_path, 'w') as out_file:
        json.dump(json_objects, out_file)

Opening... my_list_first1028.txt
Previewing content...
: {"instruction": "What are the main sources of equipment-induced defects in Laser Powder Bed Fusion additive manufacturing?",
"input": "In Laser Powder Bed Fusion additive manufacturing, equipment-induced defects can be generated by improper or sub-optimal setting and calibration of the MAM equipment. These defects can arise from four main sources: beam scanning and deflection system, powder handling and deposition system, insufficient baseplate thickness, and build chamber environmental control.",
"output": "In Laser Powder Bed Fusion additive manufacturing, equipment-induced defects can arise from four main sources: beam scanning and deflection system, powder handling and deposition system, insufficient baseplate thickness, and build chamber environmental control. These defects are caused by improper or sub-optimal setting and calibration of the MAM equipment."}<|im_end|>
: {"instruction": "What are the expected stress levels wh

In [59]:
import glob

# Concatenate every *.json file of the working directory into one file,
# making sure the content of each input ends with a newline.
# NOTE(review): the inputs written by this notebook each hold a single JSON
# array, so every line of the merged file is a whole array rather than one
# record -- confirm that the consumer of the .jsonl file expects this.
# NOTE(review): the pattern also picks up json_objects62.json and
# my_list_test.json; json_objects62.json and my_list_first62.json were both
# derived from my_list_first62.txt, so those records are likely merged twice.
with open("merged_file_xxl.jsonl", "w") as outfile:
    for filename in glob.glob("*.json"):
        print(f"merging {filename}...")
        # Tracked per file: previously the check after the loop read `line`,
        # which was undefined for an empty first file and stale (left over
        # from the previous file) for any later empty file.
        last_line = ''
        with open(filename) as infile:
            for last_line in infile:
                outfile.write(last_line)
        if last_line and not last_line.endswith('\n'):
            outfile.write('\n')

merging my_list_first1257.json...
merging my_list_first62.json...
merging my_list_first473.json...
merging my_list_first1112.json...
merging my_list_first214.json...
merging my_list_first2685.json...
merging my_list_first264.json...
merging my_list_first1381.json...
merging json_objects62.json...
merging my_list_first2058.json...
merging my_list_first1481.json...
merging my_list_first1028.json...
merging my_list_test.json...
merging my_list_first94.json...
