In [1]:
import re
import pandas as pd
from datasets import Dataset, Value

In [3]:
# Source Google Sheet, fetched through its xlsx export endpoint.
sheet_id = "1qZc2b8wWlIRhxDr6Z9r6tzOcLsBrLXmKBvj4zp7-qHU"
export_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"

# Worksheet tabs to read; each tab name is later stored in the `annotator` column.
annotator_tabs = [
    "langdon",
    "David",
    "Joon",
    "Wes",
    "Anna",
    "Julia",
    "Reece",
    "Tobasum",
    "Qiushi",
    "Rachel",
    "Sophie",
    "Scott",
    "Van",
    "Adithya",
    "Jesse",
    "Kevin",
    "Saeyi",
]

# Passing a list of tab names yields a dict of {tab name: DataFrame}.
sheets = pd.read_excel(export_url, sheet_name=annotator_tabs)

# Tag every row with the tab it came from before stacking the tabs together.
dfs = []
for tab, frame in sheets.items():
    frame["annotator"] = tab
    dfs.append(frame)

# Spreadsheet column name -> column name used in the exported dataset.
column_renames = {
    "response": "candidate",
    "reference_answer": "reference",
    "chunk_text": "text",
    "score (1-4)": "score",
}

stacked = pd.concat(dfs, ignore_index=True)
renamed = stacked.drop(columns=["annotator_id"]).rename(columns=column_renames)

# Rows without a score are discarded.
scored = renamed.dropna(subset=["score"])

# Binary label: 1 when score >= 2, otherwise 0.
df = scored.assign(labels=(scored["score"] >= 2).astype(int))

df.to_csv("../../data/cri_annotations.csv", index=False)

In [4]:
# Cast every object-dtype column to pandas' dedicated string dtype in one call;
# columns of any other dtype are left as they are.
string_casts = {
    col: pd.StringDtype()
    for col, dtype in df.dtypes.items()
    if dtype == "object"
}
df = df.astype(string_casts)

df.dtypes

question_id     string[python]
chunk_header    string[python]
text            string[python]
question        string[python]
reference       string[python]
candidate       string[python]
score                  float64
annotator       string[python]
labels                   int64
dtype: object

In [5]:
# Build a Hugging Face Dataset from the frame; the pandas index is not kept as a column.
ds = Dataset.from_pandas(df=df, preserve_index=False)
ds

Dataset({
    features: ['question_id', 'chunk_header', 'text', 'question', 'reference', 'candidate', 'score', 'annotator', 'labels'],
    num_rows: 490
})

In [6]:
# Write the dataset to disk next to the CSV export.
hf_output_path = "../../data/cri_annotations.hf"
ds.save_to_disk(hf_output_path)

Saving the dataset (0/1 shards):   0%|          | 0/490 [00:00<?, ? examples/s]