# DIGI405 Lab 4.3: Annotation

Use this notebook to annotate an unlabelled sample of tweets from the [Cardiff NLP tweet_eval dataset](https://huggingface.co/datasets/cardiffnlp/tweet_eval) [(Barbieri et al., 2020)](https://arxiv.org/pdf/2010.12421).

* There are 45 tweets in the sample, and you will choose one of three labels for each: "positive", "negative", or "neutral".
* For each tweet, you must also provide a score (0-100%) that indicates your confidence that the label is a good fit for the tweet.

➡️ Run all of the cells below to load the dataset and display the interface. Then start annotating!

✳️ Do not discuss your annotations with others. After this exercise, your tutor will collate the class annotations and measure agreement using the [Krippendorff's alpha metric](https://www.k-alpha.org/).


In [None]:
import os
import pandas as pd
import zipfile

import ipywidgets as widgets
from ipywidgets import Layout
from ipywidgets import AppLayout, Button, ButtonStyle
from ipywidgets import HTML as WidgetHTML
from IPython.display import display, clear_output
from IPython.display import HTML

### Important - enter your UC ID in the cell below 👇

In [None]:
# This ID becomes part of the output file name: df_labelled_<annotator>.csv
annotator = "abc123" # Enter your UC user ID here

### Now run the cells to load the data and begin annotating

In [None]:
# Load the unlabelled sample of tweets that will be annotated below
df = pd.read_csv("/srv/source-data/sample_tweets_unlabelled.csv")

In [None]:
# Preview the loaded data
df

In [None]:
# @title
# Sentiment labels offered to the annotator
label_list = ["NEGATIVE", "NEUTRAL", "POSITIVE"]

In [None]:
# @title
# Instructions shown to the annotator inside the interface.
# NOTE: this f-string reads `annotator`, so the user ID cell must be run first.
instruction_text = f"""The goal for this task is to identify the sentiment of each tweet as either POSITIVE, NEGATIVE, or NEUTRAL.

Steps:
(1) Select the sentiment that you feel best represents this tweet from the options POSITIVE, NEGATIVE, or NEUTRAL
(2) Choose a confidence score to represent how confident you are that this label is a good match for the tweet

Your annotations will be saved to a new CSV file called 'df_labelled_{annotator}.csv'
"""

In [None]:
# @title
# Interface

# Output area for status and error messages; displayed before the annotation UI
debug_output = widgets.Output()
display(debug_output)

# Secondary output area, cleared each time an item is displayed
out = widgets.Output()

# Dropdown choices: a placeholder entry followed by the real labels
label_options = ["Select Label", *label_list]

# Annotated rows accumulate here and are written to a per-annotator CSV file
labelled_file_name = f"df_labelled_{annotator}.csv"
labelled_df = pd.DataFrame(columns=df.columns)

# Panel that shows the text of the item being annotated.
# Only genuine `Layout` traits are passed here. Font size, corner radius and
# background colour are not Layout properties, so ipywidgets ignored them and
# emitted a deprecation warning for unrecognised arguments. Text styling is
# applied through the inline HTML assigned to `value` when an item is shown.
text_display = widgets.HTML(
    value="",  # Will be populated with text
    layout=widgets.Layout(
        width="100%",
        padding="20px",
        margin="10px 0",
        border="1px solid #ddd",
    ),
)

label_heading = widgets.HTML(value="<b>Select label:</b>")

# Sentiment choice; the first option is the "Select Label" placeholder
label_dropdown = widgets.Dropdown(
    options=label_options,
    description="Label:",
    layout=widgets.Layout(width="60%"),
)

# Confidence choices from "100%" down to "0%" in steps of ten
confidence_dropdown = widgets.Dropdown(
    options=[f"{percent}%" for percent in range(100, -1, -10)],
    description="Confidence:",
    value="0%",  # Default value
    layout=widgets.Layout(width="60%"),
)

# Newlines become <br> so the instructions keep their line breaks in HTML
formatted_instructions = "<br>".join(instruction_text.split("\n"))

label_help = widgets.HTML(
    value=f"""<div style="margin-top: 10px; padding: 10px; background-color: #f1f1f1; border-radius: 5px;">
        <p><b>Instructions:</b></p>
        <p>{formatted_instructions}</p>
    </div>"""
)

# Progress bar for tracking annotation progress; its range spans every row of df
progress_bar = widgets.IntProgress(
    value=0,
    min=0,
    max=len(df),
    description="Progress:",
    bar_style="info",
    style={"bar_color": "green"},
    orientation="horizontal",
    layout=widgets.Layout(
        width="95%",
        align_items="flex-start",
        margin="20px 0px 0px -15px",  # Top, Right, Bottom, Left
        padding="0px",
    ),
)

# Update the dataframes
def update_dataframe(row_index):
    """Record the current dropdown selections for one row and save them to CSV.

    The label and confidence currently selected in the dropdowns are written
    to ``df`` and mirrored in ``labelled_df``: the row is added the first time
    it is annotated and updated in place afterwards. ``labelled_df`` is then
    written to ``labelled_file_name`` and a confirmation is printed to
    ``debug_output``.

    Args:
        row_index: Index label of the row in ``df`` being annotated.
    """
    global labelled_df

    label = label_dropdown.value
    confidence = confidence_dropdown.value

    # Update the main dataframe
    df.loc[row_index, "label"] = label
    df.loc[row_index, "confidence"] = confidence

    if row_index in labelled_df.index:
        # Update existing entry
        labelled_df.loc[row_index, "label"] = label
        labelled_df.loc[row_index, "confidence"] = confidence
    else:
        row_to_add = df.loc[row_index:row_index].copy()
        if labelled_df.empty:
            # Concatenating onto an empty frame is deprecated in pandas and
            # raises a FutureWarning, so the first row simply becomes the frame.
            labelled_df = row_to_add
        else:
            labelled_df = pd.concat([labelled_df, row_to_add], ignore_index=False)

    # Save the labelled dataframe to CSV
    labelled_df.to_csv(labelled_file_name, index=False)

    with debug_output:
        print(f"✅ Saved to {labelled_file_name} - Total labelled: {len(labelled_df)}")

# Make sure both annotation columns exist before any labels are recorded
for annotation_column in ("label", "confidence"):
    if annotation_column not in df.columns:
        df[annotation_column] = None

# Stylesheet for elements carrying the "custom-button" class:
# white bold 14px text, outer margin and 5px rounded corners
custom_css = "\n".join([
    "",
    ".custom-button {",
    "    color: white !important;",
    "    font-weight: bold !important;",
    "    font-size: 14px !important;",
    "    margin: 5px 5px 20px 0px !important;",
    "    border-radius: 5px !important;",
    "}",
    "",
])

def update_stats():
    """Build an HTML widget summarising the annotation counts.

    Returns:
        A ``widgets.HTML`` listing how many rows of ``df`` carry each label in
        ``label_list``, plus how many rows are still unlabelled.
    """
    label_counts = df["label"].value_counts().to_dict()

    # One list item per label; the "Select Label" placeholder is never reported
    items_html = "".join(
        f"<li>{name}: {label_counts.get(name, 0)}</li>"
        for name in label_list
        if name != "Select Label"
    )
    remaining = int(df["label"].isnull().sum())

    stats_html = f"""<div style="margin-top: 15px; padding: 10px; background-color: #e9f7f2; border-radius: 5px;">
        <p><b>Annotation stats:</b></p>
        <ul>
            {items_html}
            <li>Remaining: {remaining}</li>
        </ul>
    </div>"""
    return widgets.HTML(value=stats_html)

stats_widget = update_stats()

def display_interface(row_index):
    """Show the item at ``row_index`` in the centre pane of the app layout.

    Renders the row's "text" value, restores any label and confidence already
    stored for it in ``df`` (or resets both dropdowns to their defaults), sets
    the progress bar to the item's 1-based position and rebuilds the centre
    pane.

    Args:
        row_index: Index label of the row in ``df`` to display.
    """
    with out:
        clear_output(wait=True)

    # Render the content of the "text" column for this row
    tweet_text = df.loc[row_index, "text"]
    text_display.value = f"""
    <div style="padding: 15px; font-size: 20px; line-height: 1.5;">
        {tweet_text}
    </div>
    """

    saved_label = df.loc[row_index, "label"]
    if pd.isnull(saved_label):
        # Nothing stored yet: fall back to the placeholder and lowest confidence
        label_dropdown.value = label_options[0]
        confidence_dropdown.value = "0%"
    else:
        # Restore the annotation recorded earlier for this row
        label_dropdown.value = saved_label
        confidence_dropdown.value = df.loc[row_index, "confidence"]

    progress_bar.value = row_index + 1

    heading = widgets.HTML(value=f"<h3>Item #{row_index + 1} of {len(df)}</h3>")
    controls = widgets.VBox([
        label_heading,
        label_dropdown,
        confidence_dropdown,
        label_help,
    ])
    app_layout.center.children = [heading, text_display, controls]

# Column-oriented flex layout for form items
form_item_layout = widgets.Layout(
    display="flex",
    flex_flow="column",
    justify_content="flex-start",
    padding="10px 0 0 2px",  # Top, Right, Bottom, Left
)

# Navigation buttons
left_button = widgets.Button(
    description="Previous",
    style=widgets.ButtonStyle(button_color="blue"),
    layout=widgets.Layout(width="100%"),
)
next_button = widgets.Button(
    description="Next",
    style=widgets.ButtonStyle(button_color="green"),
    layout=widgets.Layout(width="100%"),
)

# Inject the stylesheet into the page and tag the Next button with its class
display(HTML(f"<style>{custom_css}</style>"))
next_button.add_class("custom-button")

# Item-number input for jumping straight to a given item.
# BoundedIntText is used because plain IntText has no `min`/`max` traits, so
# the bounds passed to it were silently ignored. The upper bound is kept at
# least 1 so the widget can still be built for an empty dataframe.
jump_to = widgets.BoundedIntText(
    value=1,
    min=1,
    max=max(len(df), 1),
    description="Go to item:",
    layout=widgets.Layout(width="150px"),
)

jump_button = widgets.Button(
    description="Jump",
    style=widgets.ButtonStyle(button_color="orange"),
    layout=widgets.Layout(width="80px"),
)

def _restore_saved_annotations(saved_df):
    """Copy previously saved labels back onto ``df`` and return the labelled rows.

    The CSV is written without its index, so saved rows cannot be matched to
    ``df`` by index. They are matched on the "text" column instead; duplicate
    texts are assigned in order of appearance. Rows whose label is not in
    ``label_list`` are skipped, and an unrecognised confidence falls back to
    "0%", so the dropdowns can always display the restored values.

    Args:
        saved_df: DataFrame read from a previously saved annotation file.

    Returns:
        A copy of the rows of ``df`` that now carry a label, indexed like ``df``.
    """
    rows_by_text = {}
    for idx, text in df["text"].items():
        rows_by_text.setdefault(text, []).append(idx)

    known_labels = set(label_list)
    known_confidences = set(confidence_dropdown.options)

    for _, saved in saved_df.iterrows():
        candidates = rows_by_text.get(saved.get("text"))
        label = saved.get("label")
        if not candidates or label not in known_labels:
            continue
        idx = candidates.pop(0)
        confidence = saved.get("confidence")
        df.loc[idx, "label"] = label
        df.loc[idx, "confidence"] = confidence if confidence in known_confidences else "0%"

    return df[df["label"].notnull()].copy()


# Load existing labelled data if available, so a restarted session resumes
# with df, labelled_df and the stats panel all reflecting the saved work
if os.path.exists(labelled_file_name):
    try:
        labelled_df = _restore_saved_annotations(pd.read_csv(labelled_file_name))
        stats_widget.value = update_stats().value
        with debug_output:
            print(f"Loaded existing labelled data with {len(labelled_df)} entries")
    except Exception as e:
        with debug_output:
            print(f"Error loading existing labelled file: {str(e)}")

# Left side-bar: navigation buttons, jump control, stats and progress
left_sidebar_content = widgets.VBox(
    [
        left_button,
        next_button,
        widgets.HTML("<hr>"),
        widgets.HBox([jump_to, jump_button]),
        widgets.HTML("<hr>"),
        stats_widget,
        progress_bar,
    ],
    layout=widgets.Layout(padding="10px"),
)

# App layout: sidebar on the left, item display in the centre
app_layout = widgets.AppLayout(
    header=None,
    left_sidebar=left_sidebar_content,
    center=widgets.VBox([widgets.HTML(), text_display]),
    right_sidebar=None,
    pane_widths=[25, 75, 0],
    justify_items="left",
)

# Inject the stylesheet and tag both navigation buttons with its class
display(HTML(f"<style>{custom_css}</style>"))
for styled_button in (left_button, next_button):
    styled_button.add_class("custom-button")

# Start at the first row that has no label yet; if every row is labelled,
# start again from the first row
null_rows = df[df["label"].isnull()]
if null_rows.empty:
    # All rows are already annotated
    current_row_index = 0
    print("\n⚠️ All items have been annotated!\n")
else:
    # There are rows that still need annotation
    current_row_index = null_rows.index[0]

def on_previous_button_clicked(b):
    """Step back to the previous item, if there is one.

    Any error raised while displaying the item is printed to ``debug_output``
    and the position is restored.

    Args:
        b: The button instance passed by the click callback (unused).
    """
    global current_row_index

    with debug_output:
        if current_row_index <= 0:
            return  # Already at the first item

        current_row_index -= 1
        try:
            display_interface(current_row_index)
        except Exception as e:
            current_row_index += 1  # Revert if there's an error
            print(f"Error: {str(e)}")
            import traceback
            traceback.print_exc()

def on_next_button_clicked(b):
    """Save the current annotation and advance to the next item.

    Refuses to proceed while the label dropdown still shows the
    "Select Label" placeholder. Otherwise the annotation is saved, the stats
    panel is refreshed, and either the next item or, after the last row, a
    completion message is shown. Any error is printed to ``debug_output`` and
    the position is restored to where it was before the click.

    Args:
        b: The button instance passed by the click callback (unused).
    """
    global current_row_index

    # Clear previous output
    with debug_output:
        clear_output(wait=True)

    # Validation check
    if label_dropdown.value == "Select Label":
        with debug_output:
            print("\n" + "!" * 50)
            print("⚠️  ERROR: Please select a label before proceeding.")
            print("!" * 50 + "\n")
        return  # Stop further execution

    # Remember the starting position: an error can occur before or after the
    # index is advanced, so blindly decrementing would move the cursor back one
    # item (or to -1) when saving itself fails.
    starting_index = current_row_index

    try:
        # Save current item's data
        update_dataframe(current_row_index)

        # Update stats
        stats_widget.value = update_stats().value

        # Check for end of df
        if current_row_index >= len(df) - 1:
            with debug_output:
                print(f"\n✨ ANNOTATION COMPLETE! Take a break! ☕")

            # Save the final data
            labelled_df.to_csv(labelled_file_name, index=False)

            # Clear the display to indicate completion
            app_layout.center.children = [
                widgets.HTML(value="""
                <div style="text-align: center; margin-top: 100px;">
                    <h1 style="color: green;">✅ Annotation Complete!</h1>
                    <p>You've reached the end of the dataset.</p>
                </div>
                """)
            ]
            return

        # Move to next item
        current_row_index += 1
        display_interface(current_row_index)

        with debug_output:
            print(f"✅ Continue...")
    except Exception as e:
        current_row_index = starting_index  # Restore the exact prior position
        with debug_output:
            print(f"Error navigating to next item: {str(e)}")
            import traceback
            traceback.print_exc()

def on_jump_clicked(b):
    """Jump to the item number entered in ``jump_to``.

    The entered number is 1-based. An out-of-range number is reported in
    ``debug_output`` rather than silently ignored, and an error while
    displaying the target item restores the previous position, matching the
    Previous and Next handlers.

    Args:
        b: The button instance passed by the click callback (unused).
    """
    global current_row_index

    # Clear previous output
    with debug_output:
        clear_output(wait=True)

    target_idx = jump_to.value - 1
    if not 0 <= target_idx < len(df):
        with debug_output:
            print(f"⚠️ Item number must be between 1 and {len(df)}.")
        return

    previous_index = current_row_index
    try:
        current_row_index = target_idx
        display_interface(current_row_index)
    except Exception as e:
        current_row_index = previous_index  # Revert if there's an error
        with debug_output:
            print(f"Error jumping to item: {str(e)}")
            import traceback
            traceback.print_exc()

# Connect each button to its click handler
left_button.on_click(on_previous_button_clicked)
next_button.on_click(on_next_button_clicked)
jump_button.on_click(on_jump_clicked)

# Render the interface, then populate it with the starting item
display(app_layout)
display_interface(current_row_index)

In [None]:
# Show every column and row of the annotations collected so far
pd.options.display.max_columns = None
pd.options.display.max_rows = None
labelled_df