In [None]:
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


#### Step 1: Data Loading

In [None]:
## from huggingface datasets
def splits_to_frame(dataset_dict):
    """Stack every split of a loaded Hugging Face dataset into one DataFrame.

    Parameters
    ----------
    dataset_dict : mapping of split name -> dataset
        The object returned by ``load_dataset``; each value must offer
        ``to_pandas()``.

    Returns
    -------
    pandas.DataFrame
        Rows of all splits, in the order the splits are listed, with a fresh
        0..n-1 index.
    """
    split_frames = [dataset_dict[name].to_pandas() for name in dataset_dict.keys()]
    # ignore_index gives the combined frame unique row labels instead of
    # carrying over each split's own labels.
    return pd.concat(split_frames, ignore_index=True)


## onestop qa
ds_onestop = load_dataset("iastate/onestop_english")
df_onestop = splits_to_frame(ds_onestop)

## cefr english
ds_cefr_eng = load_dataset("edesaras/CEFR-Sentence-Level-Annotations")
df_cefr_eng = splits_to_frame(ds_cefr_eng)

## cefr german (currently disabled; kept as a comment so the cell does not
## echo a stray string as its output)
# ds_cefr_ger = load_dataset("EliasAhl/german-cefr")
# df_cefr_ger = splits_to_frame(ds_cefr_ger)

Using the latest cached version of the dataset since iastate/onestop_english couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\lenao\.cache\huggingface\datasets\iastate___onestop_english\default\0.0.0\748c4f3fb4668e53f291756363d1876ebbdca007 (last modified on Sun Apr  6 16:11:30 2025).


#### Step 2: Unifying difficulty levels

In [17]:
'''
onestop difficulty levels: 1 (ele), 2 (int), 3 (adv), with 1 being the easiest and 3 being the hardest
cefr_en levels: 1-6, with 1 (corresponding to A1) being the easiest and 6 (corresponding to C2) being the hardest
    average level between 2 annotations is taken
cefr_ger levels: A1, A2, B1, B2, C1, C2, with A1 being the easiest and C2 being the hardest
'''
# NOTE(review): the onestop mapping further down in this notebook is keyed
# 0/1/2, not 1/2/3 as written above -- confirm which numbering is correct.

# first, average the two annotations in cefr_en
annotator_columns = ["Annotator I", "Annotator II"]
mean_annotation = df_cefr_eng[annotator_columns].mean(axis=1)
# NOTE(review): round() resolves exact .5 ties to the nearest even integer
# (1.5 -> 2, 2.5 -> 2), so adjacent-level disagreements are not all rounded
# in the same direction.
df_cefr_eng["avg_level"] = mean_annotation.round().astype(int)


In [None]:
# Normalize column names and drop columns that are no longer needed.
# Frames are reassigned (no inplace=True) and the drop tolerates columns that
# are already gone, so this cell can be re-run without raising.
df_cefr_eng = (
    df_cefr_eng
    .rename(columns={"avg_level": "level"})
    # both raw annotator columns are superseded by the averaged "level"
    .drop(columns=["Annotator I", "Annotator II"], errors="ignore")
)
df_onestop = df_onestop.rename(columns={"label": "level"})

# The German dataset is not loaded at the moment (its loading code is
# disabled), so touching df_cefr_ger here would raise NameError.
#df_cefr_ger = df_cefr_ger.rename(columns={"cefrLevel": "level"}).drop(columns=["prompt"])

In [None]:
# Lookup tables translating each corpus' own difficulty code into the shared
# "[Level: <band>-<n>]" tag.

# onestop qa levels
normalized_levels_onestop = {
    0: "[Level: elementary-1]",
    1: "[Level: intermediate-1]",
    2: "[Level: advanced-1]",
}

# english cefr levels
normalized_levels_cefr_eng = {
    1: "[Level: elementary-1]",
    2: "[Level: elementary-2]",
    3: "[Level: intermediate-1]",
    4: "[Level: intermediate-2]",
    5: "[Level: advanced-1]",
    6: "[Level: advanced-2]",
}
'''
normalized_levels_cefr_ger = {
    "A1": "[Level: elementary-1]", "A2": "[Level: elementary-2]", "B1": "[Level: intermediate-1]", "B2": "[Level: intermediate-2]", "C1": "[Level: advanced-1]", "C2": "[Level: advanced-2]" # german cefr levels
}
'''

In [None]:
# Attach the unified level tag to both English frames under the SAME column
# name ("input"); with different names the tag would end up split across two
# half-empty columns once the frames are concatenated.
# The dict is indexed inside a lambda (rather than passed to .map directly) so
# that a level without a mapping raises KeyError instead of silently becoming NaN.
df_onestop["input"] = df_onestop["level"].map(lambda x: normalized_levels_onestop[x])
df_cefr_eng["input"] = df_cefr_eng["level"].map(lambda x: normalized_levels_cefr_eng[x])
#df_cefr_ger["input"] = df_cefr_ger["level"].map(lambda x: normalized_levels_cefr_ger[x])

In [None]:
# Build one combined frame per language: English is onestop stacked on top of
# the CEFR sentences, renumbered with a fresh 0..n-1 index.
df_eng = pd.concat(
    objs=[df_onestop, df_cefr_eng],
    ignore_index=True,
)
# German stays disabled for now.
#df_ger = df_cefr_ger.copy()

In [None]:
# Persist the combined English data as an Excel workbook (path is relative to
# the notebook's working directory); the German export stays disabled.
df_eng.to_excel(
    "../data/eng_data.xlsx"
)
#df_ger.to_excel("../data/ger_data.xlsx")