# BasqueTrivia Examples and Statistics

In [38]:
from datasets import load_dataset

# Load the test split of the "en" and "eu" configurations of HiTZ/BasqueTrivia.
# The generator is consumed in order, so "en" is loaded first, then "eu".
basquetrivia_en, basquetrivia_eu = (
    load_dataset("HiTZ/BasqueTrivia", config, split="test")
    for config in ("en", "eu")
)


In [39]:
# Display the first record of the "en" test split to see the fields each example carries.
basquetrivia_en[0]

{'id': 0,
 'category': 'Geografia eta Historia',
 'group': 'Gai orokorrak',
 'difficulty': 3,
 'question': 'Who was imprisoned in 1964?',
 'answer': 0,
 'candidates': ['Nelson Mandela', 'Mumia Abu Jamal', 'Charles Ghankay']}

## Examples

In [40]:
# Extract one example for each (category, group) combination
import pandas as pd
def extract_examples(basquetrivia):
    """Return the first example seen for every (category, group) pair.

    Args:
        basquetrivia: Iterable of example mappings; each one must provide
            the "category" and "group" keys.

    Returns:
        A pandas DataFrame with one row per distinct (category, group) pair,
        sorted by category and then by group. Row labels are the positions
        in which the pairs were first encountered. When the input contains
        no examples, an empty DataFrame is returned.
    """
    examples = []
    seen = set()
    for ex in basquetrivia:
        category_group = (ex["category"], ex["group"])
        if category_group not in seen:
            seen.add(category_group)
            examples.append(ex)

    examples_df = pd.DataFrame(examples)
    if not examples:
        # An empty frame has no "category"/"group" columns, so sorting on
        # them would raise KeyError; return it untouched instead.
        return examples_df
    # order by category and group
    return examples_df.sort_values(["category", "group"])

In [41]:
# Disable column-width truncation so long questions and candidate lists are
# shown in full in the rendered tables.
pd.set_option('display.max_colwidth', None)
# One example per (category, group) pair from the "en" split.
examples_en = extract_examples(basquetrivia_en)
examples_en


Unnamed: 0,id,category,group,difficulty,question,answer,candidates
10,16,Euskara eta literatura,Euskal gaiak,1,"What does the ""Karmel"" magazine specialize in?",1,"[Bertsolarism, Basque culture in the past and the present, The life of the Carmelites]"
3,3,Euskara eta literatura,Gai orokorrak,3,How many members make up the Euskararen Gizarte Erakundeen Kontseilua?,1,"[40, 46, 50]"
2,2,Geografia eta Historia,Euskal gaiak,1,Where's Atxondo?,0,"[In Biscay, In Gipuzkoa, In Navarre]"
0,0,Geografia eta Historia,Gai orokorrak,3,Who was imprisoned in 1964?,0,"[Nelson Mandela, Mumia Abu Jamal, Charles Ghankay]"
9,15,Gizartea eta ohiturak,Euskal gaiak,3,Which of the following is a Basque Government institution?,2,"[IKA, AEK, HABE]"
13,29,Gizartea eta ohiturak,Gai orokorrak,3,"In the first general laws we know (those of the king of Babylon in the eighteenth century BC), was there any distinction between women and men?",2,"[No, they both had the same rights and obligations., Yes, women were subject to men and had no rights., Yes, but women also had certain rights and obligations.]"
5,8,Kirola eta aisialdia,Euskal gaiak,1,Where was Julian Retegi born?,1,"[Areso, Eratsun, Eraso]"
8,13,Kirola eta aisialdia,Gai orokorrak,2,Where's the O'Connell Bridge?,1,"[In London, In Dublin, In Milan]"
11,21,Kultura eta artea,Euskal gaiak,3,Who built the Gaztelu Berria or Château-Neuf in Bayonne?,0,"[The English, The French, The Spanish]"
6,9,Kultura eta artea,Gai orokorrak,2,When did the Titanic Belfast Museum open?,0,"[In 2012, In 2005, In 2002]"


In [42]:
# One example per (category, group) pair from the "eu" split, displayed as a table.
examples_eu = extract_examples(basquetrivia_eu)
examples_eu

Unnamed: 0,id,category,group,difficulty,question,answer,candidates
10,16,Euskara eta literatura,Euskal gaiak,1,"Zertaz ari da ""Karmel"" aldizkaria?",1,"[Bertsolaritzaz, Lehengo eta gaurko euskal kulturaz, Karmeldarren bizimoduaz]"
3,3,Euskara eta literatura,Gai orokorrak,3,Zenbat kidek osatzen dute Euskararen Gizarte Erakundeen Kontseilua?,1,"[40k, 46k, 50ek]"
2,2,Geografia eta Historia,Euskal gaiak,1,Non dago Atxondo?,0,"[Bizkaian, Gipuzkoan, Nafarroan]"
0,0,Geografia eta Historia,Gai orokorrak,3,Nor kartzelaratu zuten 1964an?,0,"[Nelson Mandela, Mumia Abu Jamal, Charles Ghankay]"
9,15,Gizartea eta ohiturak,Euskal gaiak,3,Hauetako zein dago Eusko Jaurlaritzaren menpe?,2,"[IKA, AEK, HABE]"
13,29,Gizartea eta ohiturak,Gai orokorrak,3,"Ezagutzen dugun lehenengo legedi orokorrean (K.a. XVIII. mendeko Babiloniako erregearena), bereizketarik al zegoen emakume eta gizonezkoen artean?",2,"[Ez, biek eskubide eta betebehar beretsuak zituzten, Bai, emakumeak gizonaren menpean bizi behar zuen eta ez zuen inolako eskubiderik, Bai, baina emakumeek ere bazituzten zenbait eskubide eta betebehar]"
5,8,Kirola eta aisialdia,Euskal gaiak,1,Non jaio zen Julian Retegi?,1,"[Areson, Eratsunen, Erason]"
8,13,Kirola eta aisialdia,Gai orokorrak,2,Non dago O'Connell zubia?,1,"[Londresen, Dublinen, Milanen]"
11,21,Kultura eta artea,Euskal gaiak,3,Nortzuek eraiki zuten Baionako Gaztelu Berria?,0,"[Ingelesek, Frantsesek, Espainolek]"
6,9,Kultura eta artea,Gai orokorrak,2,Noiz ireki zuten Titanic Belfast Museoa?,0,"[2012an, 2005ean, 2002an]"


## Statistics

In [51]:
# Calculate statistics from the dataset, grouped the same way the examples
# above were extracted (by category and group):
# - number of examples, overall and per difficulty
# - average question length and average candidate-answer length

from collections import defaultdict
def calculate_statistics(basquetrivia):
    """Summarise the dataset per (category, group) pair.

    Args:
        basquetrivia: Iterable of example mappings providing the "category",
            "group", "difficulty", "question" and "candidates" keys, where
            "candidates" is a sequence of answer strings.

    Returns:
        A pandas DataFrame sorted by category and then group, with one row
        per pair and these columns:
          - "category", "group": the pair itself
          - "items": number of examples in the pair
          - one column per difficulty value: number of examples with it
          - "question": mean ``len()`` of the questions, rounded to 1 decimal
          - "candidates": mean ``len()`` of the candidate answers, rounded
            to 1 decimal and averaged over the candidates actually present
            (omitted for a pair that has no candidates at all)
        When the input contains no examples, an empty DataFrame is returned.
    """
    # category -> group -> running totals, all gathered in a single pass.
    # Nested dicts (rather than tuple keys) keep rows grouped by category in
    # first-seen order, which determines the row labels of the result.
    totals = defaultdict(lambda: defaultdict(lambda: {
        "items": 0,
        "difficulty": defaultdict(int),
        "question_len": 0,
        "candidates_len": 0,
        "candidates_count": 0,
    }))
    for ex in basquetrivia:
        acc = totals[ex["category"]][ex["group"]]
        acc["items"] += 1
        acc["difficulty"][ex["difficulty"]] += 1
        acc["question_len"] += len(ex["question"])
        for candidate in ex["candidates"]:
            acc["candidates_len"] += len(candidate)
            acc["candidates_count"] += 1

    # Flatten the accumulators into one record per (category, group) pair.
    rows = []
    for category, groups in totals.items():
        for group, acc in groups.items():
            row = {"category": category, "group": group, "items": acc["items"]}
            # Difficulty columns, in the order the difficulties were first seen.
            row.update(acc["difficulty"])
            row["question"] = round(acc["question_len"] / acc["items"], 1)
            # Divide by the real number of candidates instead of assuming
            # three per question; skip the column when there are none to
            # avoid a division by zero.
            if acc["candidates_count"]:
                row["candidates"] = round(
                    acc["candidates_len"] / acc["candidates_count"], 1
                )
            rows.append(row)

    stats_df = pd.DataFrame(rows)
    if not rows:
        # An empty frame has no "category"/"group" columns to sort on.
        return stats_df
    return stats_df.sort_values(["category", "group"])

# Per-(category, group) statistics for the "en" split.
stats_en = calculate_statistics(basquetrivia_en)


In [52]:
# Display the statistics table computed for the "en" split.
stats_en

Unnamed: 0,category,group,items,3,2,1,question,candidates
5,Euskara eta literatura,Euskal gaiak,305,112,103,90,55.9,16.9
4,Euskara eta literatura,Gai orokorrak,310,111,108,91,53.3,15.7
1,Geografia eta Historia,Euskal gaiak,300,80,110,110,48.9,12.8
0,Geografia eta Historia,Gai orokorrak,300,80,110,110,43.0,10.3
14,Gizartea eta ohiturak,Euskal gaiak,289,78,108,103,60.2,16.1
15,Gizartea eta ohiturak,Gai orokorrak,298,79,109,110,51.1,18.0
8,Kirola eta aisialdia,Euskal gaiak,296,80,109,107,47.5,11.7
9,Kirola eta aisialdia,Gai orokorrak,303,80,110,113,43.3,10.3
11,Kultura eta artea,Euskal gaiak,295,80,110,105,43.5,11.5
10,Kultura eta artea,Gai orokorrak,286,80,108,98,40.7,9.1


In [53]:
# Per-(category, group) statistics for the "eu" split.
stats_eu = calculate_statistics(basquetrivia_eu)


In [54]:
# Display the statistics table computed for the "eu" split.
stats_eu

Unnamed: 0,category,group,items,3,2,1,question,candidates
5,Euskara eta literatura,Euskal gaiak,305,112,103,90,51.0,16.7
4,Euskara eta literatura,Gai orokorrak,310,111,108,91,51.1,16.0
1,Geografia eta Historia,Euskal gaiak,300,80,110,110,44.4,12.1
0,Geografia eta Historia,Gai orokorrak,300,80,110,110,43.7,11.0
14,Gizartea eta ohiturak,Euskal gaiak,289,78,108,103,53.7,14.9
15,Gizartea eta ohiturak,Gai orokorrak,298,79,109,110,50.4,18.6
8,Kirola eta aisialdia,Euskal gaiak,296,80,109,107,42.6,10.7
9,Kirola eta aisialdia,Gai orokorrak,303,80,110,113,43.0,10.5
11,Kultura eta artea,Euskal gaiak,295,80,110,105,39.5,10.2
10,Kultura eta artea,Gai orokorrak,286,80,108,98,38.1,9.6
