In this notebook we use the issues identified by running the cleanlab_find_issue notebook to clean the datasets. We use the qsl library to manually check through the label issues.

First we import the necessary libraries.

In [None]:
# Run this cell only once per kernel session: it changes the working directory
# to the parent directory, which contains the necessary modules. Running it
# again would move up one more level.
%cd ..

In [None]:
import io
import numpy as np
import pandas as pd
import qsl
import PIL
import datasets
import cleanlab

Now load the dataset.

In [None]:
# Name of the dataset split to clean; it is interpolated into the
# tmp/wv_datalab_* input and output paths used by this notebook.
SPLIT = "validation"

In [None]:
# Load the dataset stored in the "data" sub-folder of this split's Datalab
# directory (presumably written when the Datalab was saved -- TODO confirm).
ds = datasets.load_from_disk(f"tmp/wv_datalab_{SPLIT}/data")

Then load the previously computed cleanlab issues.

In [None]:
# Restore the Datalab object (with its previously computed issues) that was
# saved for this split. `Datalab.load` is a static method returning the saved
# object, so it is called on the class directly instead of first building a
# throwaway `Datalab(ds)` instance that would be discarded immediately.
lab = cleanlab.Datalab.load(f'tmp/wv_datalab_{SPLIT}')

In [None]:
# Per-example table of "label" issues held by the loaded Datalab.
label_issues = lab.get_issues("label")
# Keep only the rows whose `is_label_issue` flag is set.
label_issues_df = label_issues.loc[label_issues["is_label_issue"]]

In [None]:
# Convert the whole dataset to a DataFrame so that its columns can be merged
# onto the flagged rows by index.
extra_info_df = ds.to_pandas()

In [None]:
# Left merge on the index: every flagged row is kept and gains the dataset
# columns of the row with the same index.
final_df = label_issues_df.merge(
    extra_info_df,
    how="left",
    left_index=True,
    right_index=True,
)

In [None]:
# A bare "import PIL" does not guarantee that the Image submodule is loaded,
# so import it explicitly before using PIL.Image below.
import PIL.Image

labels = ["No Person", "Person"]


def _decode_bgr(image_bytes):
    """Decode encoded image bytes into an (H, W, 3) array with reversed channels.

    Args:
        image_bytes: Raw bytes of an encoded image file.

    Returns:
        A numpy array of shape (height, width, 3) whose last axis is ordered
        blue, green, red.
    """
    with PIL.Image.open(io.BytesIO(image_bytes)) as img:
        # Force exactly three RGB channels first: grayscale or palette images
        # would otherwise decode to a 2-D array and fail on the channel index,
        # and CMYK images would be silently mis-coloured.
        rgb = np.asarray(img.convert("RGB"))
    # Reverse the channel axis (RGB -> BGR); presumably the order the qsl
    # labeler expects for numpy targets -- TODO confirm against the qsl docs.
    return rgb[:, :, [2, 1, 0]]


params = dict(
    # One required image-level field, "clean_label", with three choices.
    config={
        "image": [
            {
                "name": "clean_label",
                "options": [{"name": "Person"}, {"name": "No Person"}, {"name": "Exclude"}],
                "required": True,
            }
        ]
    },
    # One item per flagged row: the decoded image plus the metadata that is
    # read back when the cleaned labels are exported.
    items=[
        {
            "target": _decode_bgr(row["image"]["bytes"]),
            "metadata": {
                "Original Label": row["given_label"],
                "Suggested Label": row["predicted_label"],
                "Label Score": row["label_score"],
                "Filename": row["filename"],
            },
        }
        for _, row in final_df.iterrows()
    ],
    maxCanvasSize=224,
)

labeller = qsl.MediaLabeler(**params)

In [None]:
# Show the labelling widget in the cell output so the flagged images can be
# reviewed by hand.
display(labeller)

After manually looking through the dataset, we save the cleaned labels to disk.

In [None]:
def str_label_to_int(label):
    """Map a single-choice label selection to its integer code.

    Args:
        label: The selection as a one-element list, e.g. ``["Person"]``.

    Returns:
        1 for ``["Person"]``, 0 for ``["No Person"]``, -1 for ``["Exclude"]``.

    Raises:
        ValueError: If ``label`` is not equal to one of the three
            one-element lists above.
    """
    known_codes = (("Person", 1), ("No Person", 0), ("Exclude", -1))
    for choice, code in known_codes:
        # Compare against a one-element list: only that exact shape matches.
        if label == [choice]:
            return code
    raise ValueError(f"Unknown label: {label}")

In [None]:
def _item_to_record(item):
    """Flatten one labeller item into a row of the exported table."""
    return {
        "filename": item["metadata"]["Filename"],
        # The selection is converted to its integer code; an unrecognised
        # selection raises ValueError.
        "clean_label": str_label_to_int(item["labels"]["image"]["clean_label"]),
        "original_label": item["metadata"]["Original Label"],
    }


# One record per item held by the labeller, in the labeller's item order.
output = list(map(_item_to_record, labeller.items))

In [None]:
# Tabulate the exported records and write them as CSV for this split,
# leaving out the DataFrame index column.
output_df = pd.DataFrame(data=output)
output_df.to_csv(path_or_buf=f"tmp/wv_datalab_{SPLIT}_cleaned.csv", index=False)