## **Triplet visualizer**

##### **Install the required packages**

In [None]:
!pip install gradio
!pip install markdown2

##### **Import the required packages**

In [1]:
import gradio as gr
import json
import random
import markdown2
import re
import io

##### **Define the functions needed for the GUI**

In [2]:
# Function to fix LaTeX formatting in a string
def fix_latex_formatting(text):
    r"""Rewrite two LaTeX conventions found in ``text`` and return the result.

    Two substitution passes run in sequence:

    1. A single LaTeX command wrapped in plain parentheses, optionally with
       one brace argument (``(\alpha)``, ``(\text{x})``), gets two
       backslashes inserted in front of each parenthesis.
    2. Every brace group containing only ASCII letters (``{abc}``) becomes
       ``{\text{abc}}``. This pass also sees the output of the first one.
    """
    math_mode_re = re.compile(r'\((\\[a-zA-Z]+(?:\{[^\}]*\})?)\)')
    text_subscript_re = re.compile(r'\{([a-zA-Z]+)\}')

    # Callables are used as replacements so the backslashes are inserted
    # verbatim instead of being parsed as a replacement template.
    with_delimiters = math_mode_re.sub(
        lambda found: '\\\\(' + found.group(1) + '\\\\)',
        text,
    )
    return text_subscript_re.sub(
        lambda found: '{\\text{' + found.group(1) + '}}',
        with_delimiters,
    )

# Function to fix LaTeX formatting in a dictionary
def fix_latex_in_dict(input_dict):
    """Apply ``fix_latex_formatting`` to every string value, in place.

    Nested dictionaries are processed recursively. Values of any other type
    (numbers, lists, ``None``, ...) are left untouched. The same dictionary
    object that was passed in is returned.
    """
    # Only values of existing keys are reassigned, so the dict keeps its size
    # and iterating over it while updating is safe.
    for key in input_dict:
        current = input_dict[key]
        if isinstance(current, dict):
            fix_latex_in_dict(current)
        elif isinstance(current, str):
            input_dict[key] = fix_latex_formatting(current)
    return input_dict

# Function to read a JSONL file
def read_jsonl_file(filepath):
    """Load entries from a JSONL file and reset the module-level feedback state.

    Every non-blank line is parsed as JSON. A line holding a JSON array
    contributes one entry per element; any other JSON value (typically an
    object) contributes a single entry. Blank lines and lines that are not
    valid JSON are reported on stdout and skipped.

    Side effects:
        Rebinds the globals ``current_jsonl_data`` (list of entries in file
        order) and ``feedback_data`` (``"entry_<index>"`` -> ``None``). The
        key format matches the lookup done by ``download_updated_jsonl``.

    Args:
        filepath: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A ``(error, entries)`` tuple. ``error`` is always ``None``;
        ``entries`` is the list also stored in ``current_jsonl_data``.

    Raises:
        OSError: If the file cannot be opened.
    """
    global feedback_data, current_jsonl_data
    current_jsonl_data = []
    feedback_data = {}

    with open(filepath, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():  # Skip empty lines
                print(f"Line {line_number} is empty or whitespace.")
                continue
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON decode error on line {line_number}: {e}")
                continue

            # Iterating a JSON object would yield its keys, not the entry,
            # so only arrays are unpacked element by element.
            items = parsed if isinstance(parsed, list) else [parsed]
            for item in items:
                # Key feedback by position: entries may be dicts (unhashable)
                # and the download step looks feedback up by index.
                feedback_data[f"entry_{len(current_jsonl_data)}"] = None
                current_jsonl_data.append(item)

    return None, current_jsonl_data

# Function to get random entries from a list of entries, and convert markdown to HTML
def get_random_entries(entries, n):
    """Sample ``n`` entries at random and render their fields from Markdown to HTML.

    Each sampled entry is either a dict or a value whose string form is a
    JSON object. It must provide the ``"instruction"``, ``"input"`` and
    ``"output"`` fields, each of which is passed through
    ``markdown2.markdown``.

    Args:
        entries: Sequence of entries to sample from.
        n: Number of entries to draw; converted with ``int()``.

    Returns:
        ``(None, converted)`` on success, where ``converted`` is a list of
        dicts holding the three rendered fields. On failure returns
        ``(message, None)``: when ``n`` is negative or larger than
        ``len(entries)``, or when a sampled entry is not valid JSON, is not
        a JSON object, or lacks a required field.
    """
    required_fields = ("instruction", "input", "output")

    n = int(n)
    if n < 0:
        return f"Requested number of entries ({n}) must not be negative.", None
    if len(entries) < n:
        return f"Requested number of entries ({n}) exceeds the available entries ({len(entries)}).", None

    converted_entries = []
    for entry in random.sample(entries, n):
        if not isinstance(entry, dict):
            # Non-dict entries are expected to be JSON text; report bad data
            # through the error slot instead of raising into the UI.
            try:
                entry = json.loads(str(entry))
            except json.JSONDecodeError as exc:
                return f"Could not parse entry as JSON: {exc}", None
        if not isinstance(entry, dict):
            return "Entry is not a JSON object with instruction/input/output fields.", None

        missing = [field for field in required_fields if field not in entry]
        if missing:
            return f"Entry is missing required field(s): {', '.join(missing)}.", None

        # Convert markdown to HTML
        converted_entries.append(
            {field: markdown2.markdown(entry[field]) for field in required_fields}
        )
    return None, converted_entries

# Function to interface with the Gradio interface
def interface(file, n):
    """Read ``file`` and return ``n`` randomly sampled, HTML-rendered entries.

    Chains ``read_jsonl_file`` and ``get_random_entries``. If either step
    reports a truthy error, that error is returned in place of the entries;
    otherwise the sampled entries are returned.
    """
    read_error, all_entries = read_jsonl_file(file)
    if not read_error:
        sample_error, sampled = get_random_entries(all_entries, n)
        return sample_error if sample_error else sampled
    return read_error


# Function to update the output
def update_output(file, n):
    """Build the HTML shown in the results pane for ``n`` random entries.

    Args:
        file: The uploaded JSONL file as handed over by the file input, or
            ``None`` when nothing has been uploaded.
        n: Requested number of entries. ``None`` (e.g. an emptied number
            field) and values below 1 are rejected with a message.

    Returns:
        A string: either a user-facing message (missing file, invalid ``n``,
        an error reported by ``interface``, or no entries) or the entries
        rendered as HTML followed by a MathJax ``<script>`` tag.
    """
    if file is None:
        return "Please upload a file."
    # Guard None explicitly: comparing None with an int raises TypeError.
    if n is None or n < 1:
        return "Please enter a valid number of entries (greater than 0)."

    entries_html = interface(file, n)
    if isinstance(entries_html, str):  # If an error message is returned
        return entries_html
    # Checked before the script tag is appended; afterwards the string is
    # never empty, so an `or` fallback at the end could not trigger.
    if not entries_html:
        return "No entries found."

    formatted_entries = []
    for entry in entries_html:
        entry = fix_latex_in_dict(entry)
        # NOTE(review): "{entry_id}" below is emitted literally. These are
        # plain strings, and the sampled entries carry no identifier at this
        # point. No `feedback` JS function is defined in the visible code
        # either, so the buttons are placeholders until the source index is
        # threaded through.
        formatted_entry = "<div><strong>Instruction:</strong> " + entry["instruction"] + \
                          "<strong>Input:</strong> " + entry["input"] + \
                          "<strong>Output:</strong> " + entry["output"] + "</div><hr>" + \
                          "<button onclick='feedback(\"{entry_id}\", \"thumbs_up\")'>👍</button>" + \
                          "<button onclick='feedback(\"{entry_id}\", \"thumbs_down\")'>👎</button>"
        formatted_entries.append(formatted_entry)

    # Join entries and add the MathJax script
    all_entries_html = "<br>".join(formatted_entries)
    return all_entries_html + """
        <script type="text/javascript" async
        src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.2.0/es5/tex-mml-chtml.js">
        </script>
        """

def handle_feedback(entry_id, feedback_type):
    """Record ``feedback_type`` for ``entry_id`` in the global ``feedback_data``.

    Args:
        entry_id: Key under which the feedback is stored.
        feedback_type: Value to store, e.g. ``"thumbs_up"`` or
            ``"thumbs_down"``.

    Returns:
        The confirmation string ``"Feedback received!"``.
    """
    # feedback_data is only created once a file has been read; fall back to a
    # fresh dict so feedback arriving earlier does not raise NameError.
    store = globals().setdefault("feedback_data", {})
    store[entry_id] = feedback_type
    return "Feedback received!"

def download_updated_jsonl():
    """Serialise the loaded entries, leaving out those marked ``"thumbs_down"``.

    Feedback is looked up in the global ``feedback_data`` under the key
    ``"entry_<index>"``, where ``index`` is the entry's position in the
    global ``current_jsonl_data``. Each kept entry becomes one
    ``json.dumps`` line.

    Returns:
        A ``(buffer, filename)`` tuple: an ``io.BytesIO`` holding the
        encoded lines and the fixed name ``"updated_file.jsonl"``. If no
        file has been loaded yet, the buffer is empty.
    """
    # Both globals only exist after a file has been read; use empty defaults
    # so calling this earlier yields an empty download instead of NameError.
    entries = globals().get("current_jsonl_data") or []
    feedback = globals().get("feedback_data") or {}

    kept_entries = [
        entry
        for index, entry in enumerate(entries)
        if feedback.get(f"entry_{index}") != "thumbs_down"
    ]
    jsonl_str = "\n".join(json.dumps(entry) for entry in kept_entries)

    # NOTE(review): this is wired to a single gr.File output. Confirm that
    # component accepts a (BytesIO, filename) tuple; it may expect a path to
    # a file on disk instead.
    return io.BytesIO(jsonl_str.encode()), "updated_file.jsonl"

##### **Create the GUI**

In [4]:
# Build the Gradio UI: a row of controls followed by an HTML pane for results.
with gr.Blocks() as demo:
    gr.Markdown("## JSONL File Processor")
    gr.Markdown("Upload a JSONL file and display a specified number of random entries from the file.")

    # Controls: file upload, sample size, and the three action buttons.
    with gr.Row():
        file_input = gr.File(label="Upload JSONL file")
        n_input = gr.Number(label="Number of Entries", value=1, step=1, 
                            info="Enter the number of random entries to display")
        submit_button = gr.Button("Show Random Entries")
        # NOTE: no click handler is attached to this button; its wiring is
        # the disabled line further down.
        feedback_button = gr.Button("Submit Feedback")
        download_button = gr.Button("Download Feedback-Based JSON")
        
    # Pane that receives the string returned by update_output.
    output = gr.HTML(label="Random Entries Display")

    submit_button.click(fn=update_output, inputs=[file_input, n_input], outputs=output)

    # TODO: feedback wiring is disabled. The draft below passes plain strings
    # as inputs/outputs rather than components.
    #feedback_button.click(fn=handle_feedback, inputs=["entry_id", "feedback_type"], outputs="text")
    # The gr.File created inline here is the component that receives the
    # return value of download_updated_jsonl.
    download_button.click(fn=download_updated_jsonl, inputs=[], outputs=gr.File(label="Download Updated JSONL"))

##### **Launch!**

The GUI opens below and also prints a link that you can open in a new browser tab. Click the link, upload the attached file `merged_file_xl.jsonl`, and visualize the triplets. You can sample one triplet at a time or change the number of triplets sampled per draw.

In [5]:
# Start the app; share=True requests a public share link in addition to the
# local URL (both are printed in the cell output).
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://80a6c2a4e7f1378292.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Traceback (most recent call last):
  File "/Users/joanvelja/anaconda3/envs/nlp1/lib/python3.11/site-packages/gradio/queueing.py", line 427, in call_prediction
    output = await route_utils.call_process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/joanvelja/anaconda3/envs/nlp1/lib/python3.11/site-packages/gradio/route_utils.py", line 234, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/joanvelja/anaconda3/envs/nlp1/lib/python3.11/site-packages/gradio/blocks.py", line 1487, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/joanvelja/anaconda3/envs/nlp1/lib/python3.11/site-packages/gradio/blocks.py", line 1109, in call_function
    prediction = await anyio.to_thread.run_sync(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/joanvelja/anaconda3/envs/nlp1/lib/python3.11/site-packages/anyio/to_thread.py", line 3