# Dataset reviewer
Notebook that allows reviewing examples from the created HuggingFace dataset

In [None]:
import os
from pathlib import Path

import datasets

# Set the correct path to your dataset (the directory that
# `datasets.load_from_disk` should read from):
dataset_path = '' # SET PATH HERE
dataset_path = str(dataset_path)  # normalise, so a pathlib.Path works as well
display(dataset_path)

# Fail early with an actionable message instead of a bare `assert`: asserts
# are stripped when Python runs with -O and carry no explanation.
# NOTE: os.path.exists('') is False, so the unset placeholder above is caught
# here (Path('') would resolve to '.' and slip through).
if not os.path.exists(dataset_path):
    raise FileNotFoundError(
        f"Dataset path {dataset_path!r} does not exist; "
        "set `dataset_path` to the saved dataset directory."
    )

# Load the dataset and show its summary.
dataset = datasets.load_from_disk(dataset_path)
display(dataset)

# Only the 'train' split is reviewed.
train_ds = dataset['train']
display(train_ds)

# Convert to pandas; `df` is consumed by the reviewer cell below.
df = train_ds.to_pandas()
display(df)

In [None]:
import difflib
import html
import ipywidgets as widgets
from IPython.display import display, clear_output


def print_diffs(str_a, str_b):
    """Return an HTML fragment showing a character-level diff of two strings.

    The inputs are compared element by element with ``difflib.ndiff``.
    Unchanged characters are emitted as HTML-escaped text; each consecutive
    run of removed characters (present only in ``str_a``) is wrapped in a red
    ``<span>`` and each run of added characters (present only in ``str_b``)
    in a green one. ``?`` hint entries produced by ``ndiff`` are ignored.

    Args:
        str_a: The "before" string.
        str_b: The "after" string.

    Returns:
        A string of HTML-escaped text and coloured ``<span>`` elements.
    """
    fragments = []   # finished pieces of the output, in order
    run_chars = []   # characters of the change run currently being collected
    run_op = ''      # '+' or '-' of the current/most recent change run

    def emit_run():
        # Close the pending change run (if any) as a single coloured span.
        if not run_chars:
            return
        color = "red" if run_op == '-' else "green"
        fragments.append(
            f"<span style='color:{color};'>{html.escape(''.join(run_chars))}</span>"
        )
        run_chars.clear()

    for entry in difflib.ndiff(str_a, str_b):
        # Each entry has the form "<op> <char>".
        marker, char = entry[0], entry[2]

        if marker == '?':
            # Intraline hint entries carry no content of their own.
            continue

        if marker == ' ':
            # An unchanged character ends any pending change run.
            emit_run()
            fragments.append(html.escape(char))
        elif marker in '+-':
            # Switching between removal and addition starts a new span.
            if marker != run_op:
                emit_run()
            run_chars.append(char)
            run_op = marker

    # Flush a change run that reaches the end of the input.
    emit_run()

    return ''.join(fragments)


class DataReviewer:
    """Interactive widget for paging through dataset rows one at a time.

    Each row is rendered into ``self.output`` as a header (page title,
    section, revision id, edit comment and a Wikipedia diff link) followed by
    expanded accordion sections for the prompt, the chosen answer, the
    rejected answer and a coloured character-level diff (rejected -> chosen).

    Navigation with the Previous/Next buttons is clamped to the valid row
    range, and the buttons are disabled at the first/last row.

    Typical use::

        reviewer = DataReviewer(df)
        reviewer.create_widgets()
        display(reviewer.output)
        reviewer.display_row()
    """

    def __init__(self, df, start_row=0):
        """Store the data to review and the starting position.

        Args:
            df: DataFrame with one example per row. Rows are read by position
                (``iloc``) and must provide the columns ``page_title``,
                ``section``, ``rev_id``, ``comment``, ``prompt``, ``chosen``
                and ``rejected``.
            start_row: Positional index of the first row to show.
        """
        self.output = widgets.Output()
        # Buttons are created by create_widgets() (or lazily by display_row()).
        self.prev_button = None
        self.next_button = None

        self.df = df
        self.df_len = df.shape[0]
        self.current_index = start_row

    @staticmethod
    def _show_section(title, body_html):
        """Display ``body_html`` inside an expanded accordion titled ``title``.

        ``body_html`` is inserted verbatim into a ``<pre>`` element, so the
        caller is responsible for HTML-escaping it.
        """
        accordion = widgets.Accordion(children=[widgets.HTML(f'<pre>{body_html}</pre>')])
        accordion.set_title(0, title)
        accordion.selected_index = 0
        display(accordion)

    def display_row(self):
        """Render the row at ``current_index`` into ``self.output``.

        If ``current_index`` is at or beyond the end of the data, a short
        message is printed instead of a row.
        """
        if self.prev_button is None or self.next_button is None:
            # Building the HBox from missing buttons would fail, so create
            # them here when create_widgets() has not been called yet.
            self.create_widgets()

        with self.output:
            clear_output(wait=True)

            if self.current_index >= self.df_len:
                print("No more rows to review or end of dataset reached.")
                return

            row = self.df.iloc[self.current_index]
            display(f'{row["page_title"]} - {row["section"]} <{row["rev_id"]}>')
            display(f'{row["comment"]}')
            display(f'https://en.wikipedia.org/w/index.php?diff=prev&oldid={row["rev_id"]}')

            prompt = row['prompt']
            # The text is read from the second message of each list.
            chosen = row['chosen'][1]['content']
            rejected = row['rejected'][1]['content']

            for title, text in [('Prompt', prompt), ('Chosen', chosen), ('Rejected', rejected)]:
                # Escape so markup inside the text is shown literally.
                self._show_section(title, html.escape(text))

            # print_diffs already returns escaped HTML with coloured spans.
            self._show_section('Diff', print_diffs(rejected, chosen))

            display(f'{self.current_index}/{self.df_len}')

            # Grey out the button that cannot move any further.
            self.prev_button.disabled = self.current_index <= 0
            self.next_button.disabled = self.current_index >= self.df_len - 1
            display(widgets.HBox([self.prev_button, self.next_button]))

    def _move(self, step):
        """Shift ``current_index`` by ``step``, clamped to the valid range.

        The row is re-rendered only when the index actually changes, so
        stepping past either end is a no-op rather than wrapping to the other
        end or landing on a page without navigation buttons.
        """
        if self.df_len <= 0:
            return  # nothing to navigate
        target = min(max(self.current_index + step, 0), self.df_len - 1)
        if target == self.current_index:
            return
        self.current_index = target
        self.display_row()

    def prev_row(self, b):
        """Button callback: go to the previous row (stays on the first row).

        Args:
            b: The object passed by the button click; unused.
        """
        self._move(-1)

    def next_row(self, b):
        """Button callback: go to the next row (stays on the last row).

        Args:
            b: The object passed by the button click; unused.
        """
        self._move(1)

    def create_widgets(self):
        """Create the Previous/Next buttons and attach their click handlers."""
        self.prev_button = widgets.Button(description="Previous", button_style='danger')
        self.next_button = widgets.Button(description="Next", button_style='success')

        self.prev_button.on_click(self.prev_row)
        self.next_button.on_click(self.next_row)

# Review a random sample of at most 1000 rows. Capping the sample size at
# len(df) avoids the ValueError that DataFrame.sample raises when more rows
# are requested than exist (sampling is without replacement).
sample_size = min(1000, len(df))
df_random = df.sample(n=sample_size, random_state=None)

# Build the reviewer and its navigation buttons, starting at the first row.
reviewer = DataReviewer(df_random, start_row=0)
reviewer.create_widgets()

# Show the output area first, then render the first row into it.
display(reviewer.output)
reviewer.display_row()