In [6]:
import torch
import random
import numpy as np
import pandas as pd
import nltk 
nltk.download('punkt_tab')

def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

train_data  = pd.read_parquet("dataset/train.parquet")
test_data   = pd.read_parquet("dataset/validation.parquet")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
questions_fi = test_data.loc[(test_data["lang"] == "fi")]
questions_fi

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
311,Missä maassa Jack Churchill syntyi?,"Churchill was born at Colombo, British Ceylon ...",fi,True,22,"Colombo, British Ceylon",
312,Mikä on yleisin uskonto maailmassa?,The five largest religious groups by world pop...,fi,True,130,Christianity,
313,Kuka oli Glee sarjan pääosassa?,Rachel Barbra Berry (Lea Michele) is the lead ...,fi,True,0,Rachel Barbra Berry,
314,Milloin Killzone-sarjan peli julkaistiin ensim...,Killzone is a series of first-person shooter a...,fi,True,404,November 2004,
315,Milloin Pennsylvania liitty USA?,The state is one of the 13 original founding s...,fi,True,404,"December 12, 1787",
...,...,...,...,...,...,...,...
2723,Kuinka Ateenan tyrannia loppui?,"Athens' competition for Greek hegemony, i.e. s...",fi,True,-1,In the 510s BC.,510-luvulla eaa.
2724,Kuinka monessa maassa WWF toimii?,WWF (World Wide Fund for Nature) is an interna...,fi,False,-1,more than 100,yli 100
2725,Kuinka monta kertaa Jari-Matti Latvala on voit...,out and the car fell over the railing into the...,fi,False,-1,18,18
2726,Kuinka monta kertaa Katharine Hepburn oli uran...,"Katharine Houghton Hepburn (May 12, 1907 Hartf...",fi,True,32,12,12


In [11]:
train_data.loc[(train_data["lang"] == "fi")][["question","context","answerable"]]

Unnamed: 0,question,context,answerable
9137,Mitkä olivat Rooman alkuvaiheet?,"In historiography, ancient Rome is Roman civil...",True
9138,Kuka oli toisen maailmansodan jälkeisen sosial...,Rákosi had difficulty managing the economy and...,True
9139,Mikä oli roomalaisten antama nimi nykyisen Unk...,Hungary in its modern (post-1946) borders roug...,True
9140,Kuinka monta ihmistä menehtyi Suezin kriisin a...,"On 25 January 1952, British forces attempted t...",True
9141,Millä vuosikymmenellä Yhdysvaltojen varhaishis...,The history of the United States began with th...,True
...,...,...,...
15171,Minä vuonna Italia haki paikkaansa Euroopan va...,di Diano National Park in Campania. In ancient...,False
15172,Minä vuonna Kellopeliappelsiini julkaistiin?,"Thiamylal or ""thioseconal"" (brand name ""Surita...",False
15173,Minä vuonna Tom Fletcher on syntynyt?,Thomas Michael Fletcher (born 17 July 1985) is...,True
15174,Minä vuonna William R. Catton Jr. kuoli?,"William Robert Catton Jr. (January 15, 1926 – ...",True


## 1-. [Naive Bayes Classifier] - [Add 1 smoothing] - Finnish

In [37]:
#Input: training data
def train_naive_bayes_fi(train_data):
    log_p_c = 0
    log_p_w_c = 0

    num_answerable_questions    = train_data.loc[(train_data["answerable"])]["answerable"].count()
    num_unanswerable_questions  = train_data.loc[(train_data["answerable"]) == False]["answerable"].count()
    num_questions               = len(train_data)

    #Tokenize and use bag of words method. One count for answerable, another for unanswerable

    #Answerable
    tokens_ans = train_data.loc[(train_data["lang"] == "fi") & (train_data["answerable"])]["question"].apply(lambda row: nltk.word_tokenize(row, language="finnish")).to_list()
    tokens_ans = [item for sublist in tokens_ans for item in sublist]


    counts_answerable = {}
    for token in tokens_ans:
        if token in counts_answerable:
            counts_answerable[token] += 1
        else: 
            counts_answerable[token] = 1
    V_ans = list(counts_answerable.keys())

    #Unanswerable
    tokens_unans = train_data.loc[(train_data["lang"] == "fi") & (train_data["answerable"])== False]["question"].apply(lambda row: nltk.word_tokenize(row, language="finnish")).to_list()
    tokens_unans = [item for sublist in tokens_unans for item in sublist]


    counts_unanswerable = {}
    for token in tokens_ans:
        if token in counts_unanswerable:
            counts_unanswerable[token] += 1
        else: 
            counts_unanswerable[token] = 1
    V_unans = list(counts_unanswerable.keys())

    V = V_ans + [item for item in V_unans if item not in V_ans]

    #Calculate log probabilities
    log_p_c = {"answerable":np.log(num_answerable_questions/num_questions),"unanswerable":np.log(num_unanswerable_questions/num_questions)}


    #Calculating the sum of the denominator for the calculation of log odds
    general_counts_ans = {}
    for w in V:
        if w in counts_answerable:
            general_counts_ans[w] = counts_answerable[w]
        else:
            general_counts_ans[w] = 1
    log_sum_ans = sum(general_counts_ans.values())

    general_counts_unans = {}
    for w in V:
        if w in counts_unanswerable:
            general_counts_unans[w] = counts_unanswerable[w] + 1
        else:
            general_counts_unans[w] = 1
    log_sum_unans = sum(general_counts_unans.values())

    #Calculation of P(w|answerable)
    log_p_w_c_ans = {}
    for w in V:
        log_p_w_c_ans[w] = np.log(counts_answerable[w] + 1) - log_sum_ans 

    #Calculation of P(w|unanswerable)
    log_p_w_c_unans = {}
    for w in V:
        log_p_w_c_unans[w] = np.log(counts_unanswerable[w] + 1) - log_sum_unans 

    log_p_w_c = {"answerable":log_p_w_c_ans,"unanswerable":log_p_w_c_unans}

    model = [V,log_p_c,log_p_w_c]

    return model


model = train_naive_bayes_fi(train_data)

def naive_bayes_fi(model,text):
    text_tokens = nltk.word_tokenize(text, language="finnish")
    V           =  model[0]
    log_p_c     =  model[1]
    log_p_w_c   =  model[2]

    sum_ans = log_p_c["answerable"]
    for word in text_tokens:
        if word in V:
            sum_ans += log_p_w_c["answerable"][word]
    
    sum_unans = log_p_c["unanswerable"]
    for word in text_tokens:
        if word in V:
            sum_unans += log_p_w_c["unanswerable"][word]
    
    return (sum_ans >= sum_unans)

In [64]:
def evaluate(test_data,model):
    test = test_data.loc[(test_data["lang"] == "fi")][["question","answerable"]]
    test['eval'] = test['question'].apply(lambda x: naive_bayes_fi(model, x))
    test['is_correct'] = test['answerable'] == test['eval']

    num_quest = len(test)
    num_correct = test["is_correct"].sum()

    return num_correct/num_quest

evaluate(test_data,model)

0.7424242424242424

## 2.- [BiLSTM] - [BPEmb] - Finnish

In [39]:
from bpemb import BPEmb
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences


bpemb_fi = BPEmb(lang="fi", vs=100000, dim=300)
train_fi = train_data.loc[(train_data["lang"] == "fi")]

# Sample question and context
question_fi = train_fi["question"].to_list()
context_fi  = train_fi["context"].to_list()
labels_fi   = np.array(train_fi["answerable"].to_list()).astype('int')

print(len(question_fi))

# Encode questions and contexts
question_ids = [bpemb_fi.encode_ids(text) for text in question_fi]
context_ids = [bpemb_fi.encode_ids(text) for text in context_fi]

print("Length of question_ids:", len(question_ids))
print("Length of context_ids:", len(context_ids))
print("Length of labels_fi:", len(labels_fi))


# Determine maximum sequence lengths
max_len_question = max(len(seq) for seq in question_ids)
max_len_context = max(len(seq) for seq in context_ids)

# Pad sequences
question_padded = pad_sequences(question_ids, maxlen=max_len_question, padding='post')
context_padded = pad_sequences(context_ids, maxlen=max_len_context, padding='post')

# Split data
from sklearn.model_selection import train_test_split

print(len(question_padded),len(context_padded),len(labels_fi))

X_question_train, X_question_val, X_context_train, X_context_val, y_train, y_val = train_test_split(
    question_padded, context_padded, labels_fi, test_size=0.1, random_state=42
)

#Build the model
vocab_size = bpemb_fi.vocab_size
embedding_dim = bpemb_fi.dim
embedding_matrix = bpemb_fi.vectors

# Question Input
question_input = Input(shape=(max_len_question,), name='question_input')
question_embedding = Embedding(input_dim=vocab_size,
                               output_dim=embedding_dim,
                               weights=[embedding_matrix],
                               input_length=max_len_question,
                               trainable=False)(question_input)
question_lstm = Bidirectional(LSTM(128))(question_embedding)

# Context Input
context_input = Input(shape=(max_len_context,), name='context_input')
context_embedding = Embedding(input_dim=vocab_size,
                              output_dim=embedding_dim,
                              weights=[embedding_matrix],
                              input_length=max_len_context,
                              trainable=False)(context_input)
context_lstm = Bidirectional(LSTM(128))(context_embedding)
combined = Concatenate()([question_lstm, context_lstm])

# Dense Layers
dense = Dense(64, activation='relu')(combined)
output = Dense(1, activation='sigmoid')(dense)

model_fi = Model(inputs=[question_input, context_input], outputs=output)

# Compile Model
model_fi.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model_fi.fit(
    {'question_input': X_question_train, 'context_input': X_context_train},
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(
        {'question_input': X_question_val, 'context_input': X_context_val},
        y_val
    )
)




2126
Length of question_ids: 2126
Length of context_ids: 2126
Length of labels_fi: 2126
2126 2126 2126
Epoch 1/10




[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 354ms/step - accuracy: 0.8242 - loss: 0.3702 - val_accuracy: 0.9390 - val_loss: 0.2208
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 335ms/step - accuracy: 0.9530 - loss: 0.1636 - val_accuracy: 0.9484 - val_loss: 0.1943
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 331ms/step - accuracy: 0.9606 - loss: 0.1225 - val_accuracy: 0.9437 - val_loss: 0.1807
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 325ms/step - accuracy: 0.9735 - loss: 0.0734 - val_accuracy: 0.9437 - val_loss: 0.1859
Epoch 5/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 326ms/step - accuracy: 0.9803 - loss: 0.0516 - val_accuracy: 0.9484 - val_loss: 0.1874
Epoch 6/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 332ms/step - accuracy: 0.9970 - loss: 0.0147 - val_accuracy: 0.9437 - val_loss: 0.2334
Epoch 7/10
[1m60/60[0m [32m━━━

<keras.src.callbacks.history.History at 0x155890c7fe0>

## 

In [42]:
from sklearn.metrics import classification_report

test_fi = test_data.loc[(test_data["lang"] == "fi")]

question_fi = test_fi["question"].to_list()
context_fi  = test_fi["context"].to_list()
labels_fi   = np.array(test_fi["answerable"].to_list()).astype('int')


# Encode questions and contexts
question_ids = [bpemb_fi.encode_ids(text) for text in question_fi]
context_ids = [bpemb_fi.encode_ids(text) for text in context_fi]


# Determine maximum sequence lengths
max_len_question = max(len(seq) for seq in question_ids)
max_len_context = max(len(seq) for seq in context_ids)

# Pad sequences
question_padded = pad_sequences(question_ids, maxlen=max_len_question, padding='post')
context_padded = pad_sequences(context_ids, maxlen=max_len_context, padding='post')


# Predict on validation data
predictions = model_fi.predict({'question_input': question_padded, 'context_input': context_padded})
predicted_labels = (predictions > 0.5).astype('int').flatten()



[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 72ms/step


In [48]:
test_labels  = np.array(test_fi["answerable"].to_list()).astype('int')
predicted_labels = np.array(predicted_labels).astype('int')

same_positions = sum([1 for a, b in zip(test_labels, predicted_labels) if a == b])
percentage = (same_positions / len(predicted_labels)) * 100

print(f"Success percentaje of this model is {percentage}")

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,