# BasqueTrivia Examples and Statistics

In [1]:
from datasets import load_dataset

# English ("en") configuration of HiTZ/BasqueTrivia, test split.
basquetrivia_en = load_dataset("HiTZ/BasqueTrivia", "en", split="test")

# Basque ("eu") configuration of the same dataset, test split.
basquetrivia_eu = load_dataset("HiTZ/BasqueTrivia", "eu", split="test")


In [2]:
# Inspect the first record to see which fields each example carries.
basquetrivia_en[0]

{'id': 0,
 'category': 'Geografia eta Historia',
 'group': 'Gai orokorrak',
 'difficulty': 3,
 'question': 'Who was imprisoned in 1964?',
 'answer': 0,
 'candidates': ['Nelson Mandela', 'Mumia Abu Jamal', 'Charles Ghankay']}

## Examples

In [9]:
# Extract one example for each (category, group) combination
import pandas as pd
def extract_examples(basquetrivia):
    """Return the first example found for every (category, group) pair.

    Parameters
    ----------
    basquetrivia : iterable of dict
        Records exposing at least the ``"category"`` and ``"group"`` keys.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (category, group) pair, holding the first record
        encountered for that pair, sorted by category and then by group.
        The index keeps the first-seen position of each pair (it is not reset
        after sorting). An empty input yields an empty DataFrame.
    """
    first_by_pair = {}
    for ex in basquetrivia:
        # setdefault only stores the record the first time a pair is seen;
        # dicts preserve insertion order, so first-seen order is retained.
        first_by_pair.setdefault((ex["category"], ex["group"]), ex)

    if not first_by_pair:
        # An empty frame has no "category"/"group" columns, so sorting on them
        # would raise KeyError; return the empty frame as-is instead.
        return pd.DataFrame()

    examples_df = pd.DataFrame(list(first_by_pair.values()))
    # order by category and group
    return examples_df.sort_values(["category", "group"])

In [12]:
# Display dataframes without truncating long text columns.
# NOTE: this is a global pandas option, so it also applies to cells run afterwards.
pd.set_option('display.max_colwidth', None)
examples_en = extract_examples(basquetrivia_en)
examples_en


Unnamed: 0,id,category,group,difficulty,question,answer,candidates
10,16,Euskara eta literatura,Euskal gaiak,1,"What does the ""Karmel"" magazine specialize in?",1,"[Bertsolarism, Basque culture in the past and the present, The life of the Carmelites]"
3,3,Euskara eta literatura,Gai orokorrak,3,How many members make up the Euskararen Gizarte Erakundeen Kontseilua?,1,"[40, 46, 50]"
2,2,Geografia eta Historia,Euskal gaiak,1,Where's Atxondo?,0,"[In Biscay, In Gipuzkoa, In Navarre]"
0,0,Geografia eta Historia,Gai orokorrak,3,Who was imprisoned in 1964?,0,"[Nelson Mandela, Mumia Abu Jamal, Charles Ghankay]"
9,15,Gizartea eta ohiturak,Euskal gaiak,3,Which of the following is a Basque Government institution?,2,"[IKA, AEK, HABE]"
13,29,Gizartea eta ohiturak,Gai orokorrak,3,"In the first general laws we know (those of the king of Babylon in the eighteenth century BC), was there any distinction between women and men?",2,"[No, they both had the same rights and obligations., Yes, women were subject to men and had no rights., Yes, but women also had certain rights and obligations.]"
5,8,Kirola eta aisialdia,Euskal gaiak,1,Where was Julian Retegi born?,1,"[Areso, Eratsun, Eraso]"
8,13,Kirola eta aisialdia,Gai orokorrak,2,Where's the O'Connell Bridge?,1,"[In London, In Dublin, In Milan]"
11,21,Kultura eta artea,Euskal gaiak,3,Who built the Gaztelu Berria or Château-Neuf in Bayonne?,0,"[The English, The French, The Spanish]"
6,9,Kultura eta artea,Gai orokorrak,2,When did the Titanic Belfast Museum open?,0,"[In 2012, In 2005, In 2002]"


In [11]:
# Same extraction, applied to the Basque ("eu") dataset.
examples_eu = extract_examples(basquetrivia_eu)
examples_eu

Unnamed: 0,id,category,group,difficulty,question,answer,candidates
10,16,Euskara eta literatura,Euskal gaiak,1,"Zertaz ari da ""Karmel"" aldizkaria?",1,"[Bertsolaritzaz, Lehengo eta gaurko euskal kul..."
3,3,Euskara eta literatura,Gai orokorrak,3,Zenbat kidek osatzen dute Euskararen Gizarte E...,1,"[40k, 46k, 50ek]"
2,2,Geografia eta Historia,Euskal gaiak,1,Non dago Atxondo?,0,"[Bizkaian, Gipuzkoan, Nafarroan]"
0,0,Geografia eta Historia,Gai orokorrak,3,Nor kartzelaratu zuten 1964an?,0,"[Nelson Mandela, Mumia Abu Jamal, Charles Ghan..."
9,15,Gizartea eta ohiturak,Euskal gaiak,3,Hauetako zein dago Eusko Jaurlaritzaren menpe?,2,"[IKA, AEK, HABE]"
13,29,Gizartea eta ohiturak,Gai orokorrak,3,Ezagutzen dugun lehenengo legedi orokorrean (K...,2,"[Ez, biek eskubide eta betebehar beretsuak zit..."
5,8,Kirola eta aisialdia,Euskal gaiak,1,Non jaio zen Julian Retegi?,1,"[Areson, Eratsunen, Erason]"
8,13,Kirola eta aisialdia,Gai orokorrak,2,Non dago O ́Connell zubia?,1,"[Londresen, Dublinen, Milanen]"
11,21,Kultura eta artea,Euskal gaiak,3,Nortzuek eraiki zuten Baionako Gaztelu Berria?,0,"[Ingelesek, Frantsesek, Espainolek]"
6,9,Kultura eta artea,Gai orokorrak,2,Noiz ireki zuten Titanic Belfast Museoa?,0,"[2012an, 2005ean, 2002an]"


## Statistics

In [30]:
# Calculate statistics from the dataset:
# number of examples per category, group and difficulty, similar to how the examples above were extracted;
# question length and candidate answer lengths

# Category Group Difficulty Items Question Candidate 1 Candidate 2 Candidate 3
# Geografia eta Historia Global Difficult 5169 100 15 15 15

from collections import defaultdict
def calculate_statistics(basquetrivia):
    """Compute example counts and average text lengths in a single pass.

    Parameters
    ----------
    basquetrivia : iterable of dict
        Records with ``"category"``, ``"group"``, ``"difficulty"``,
        ``"question"`` (str) and ``"candidates"`` (at least three strings).
        The input is consumed only once, so one-shot iterables work too.

    Returns
    -------
    stats : defaultdict
        ``stats[category][key]`` is a count. The inner mapping holds both the
        group values and the difficulty values as keys, so a group equal to a
        difficulty value would share (and inflate) the same counter.
    stats_len : defaultdict
        ``stats_len[category][group][field]`` is the average ``len()`` of
        ``field`` over the examples of that category and group, where
        ``field`` is ``"question"``, ``"candidate_0"``, ``"candidate_1"`` or
        ``"candidate_2"``.

    Raises
    ------
    IndexError
        If an example has fewer than three candidates.
    """
    # number of examples per category and group / per category and difficulty
    stats = defaultdict(lambda: defaultdict(int))
    # summed lengths first; turned into averages once all examples are counted
    stats_len = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for ex in basquetrivia:
        category, group = ex["category"], ex["group"]
        stats[category][group] += 1
        stats[category][ex["difficulty"]] += 1

        totals = stats_len[category][group]
        totals["question"] += len(ex["question"])
        candidates = ex["candidates"]
        totals["candidate_0"] += len(candidates[0])
        totals["candidate_1"] += len(candidates[1])
        totals["candidate_2"] += len(candidates[2])

    # divide by number of examples
    for category, groups in stats_len.items():
        for group, lengths in groups.items():
            n_examples = stats[category][group]
            # Only existing keys are reassigned, so the dict size is stable
            # while it is being iterated.
            for key, total in lengths.items():
                lengths[key] = total / n_examples

    return stats, stats_len

# Counts and average lengths for the English dataset.
stats_en, stats_len_en = calculate_statistics(basquetrivia_en)


In [31]:
# Per-category counts; each inner dict mixes group names (str keys) with difficulty levels (int keys).
stats_en

defaultdict(<function __main__.calculate_statistics.<locals>.<lambda>()>,
            {'Geografia eta Historia': defaultdict(int,
                         {'Gai orokorrak': 300,
                          3: 160,
                          'Euskal gaiak': 300,
                          1: 220,
                          2: 220}),
             'Zinema eta ikuskizunak': defaultdict(int,
                         {'Euskal gaiak': 298,
                          1: 219,
                          3: 159,
                          2: 219,
                          'Gai orokorrak': 299}),
             'Euskara eta literatura': defaultdict(int,
                         {'Gai orokorrak': 310,
                          3: 223,
                          'Euskal gaiak': 305,
                          1: 181,
                          2: 211}),
             'Zientzia eta teknologia': defaultdict(int,
                         {'Gai orokorrak': 296,
                          3: 158,
                      

In [32]:
# Average len() of the question and of each of the three candidates, per category and group.
stats_len_en

defaultdict(<function __main__.calculate_statistics.<locals>.<lambda>()>,
            {'Geografia eta Historia': defaultdict(<function __main__.calculate_statistics.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'Gai orokorrak': defaultdict(int,
                                      {'question': 43.0,
                                       'candidate_0': 10.163333333333334,
                                       'candidate_1': 10.4,
                                       'candidate_2': 10.326666666666666}),
                          'Euskal gaiak': defaultdict(int,
                                      {'question': 48.92,
                                       'candidate_0': 12.506666666666666,
                                       'candidate_1': 12.383333333333333,
                                       'candidate_2': 13.57})}),
             'Zinema eta ikuskizunak': defaultdict(<function __main__.calculate_statistics.<locals>.<lambda>.<locals>.<lambda>()>,
       