### Installation

In [3]:
!pip install datasets



In [4]:
!pip install googletrans

Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans)
  Downloading hstspreload-2024.9.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googletrans)
  Downloading h11-0.9.0-py2.py3-none-any.whl.metadata (8.1 kB)
Collectin

### Library import

In [12]:
import pandas as pd
from datasets import load_dataset
import re
import nltk
nltk.download('averaged_perceptron_tagger')
from collections import Counter
from googletrans import Translator
from nltk import word_tokenize, pos_tag
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, classification_report

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Dataset Loading

In [2]:
dataset = load_dataset("coastalcph/tydi_xor_rc", split=['train', 'validation'])

train = pd.DataFrame(dataset[0])
validation = pd.DataFrame(dataset[1])

df = pd.concat([train, validation], ignore_index=True)
df.to_excel("tydi_xor_re.xlsx", index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.85k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/6.87M [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15326 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3028 [00:00<?, ? examples/s]

In [3]:
df.head()

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
0,উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...,WikiLeaks () is an international non-profit or...,bn,True,182,2006,
1,দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয় ?,The war in Europe concluded with an invasion o...,bn,True,48,Germany,
2,মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন...,Same-sex marriage in the United States expande...,bn,False,-1,no,
3,আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃ...,The exact number of Arab casualties is unknown...,bn,True,39,unknown,
4,বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে ?,"As Thomas Hall (2000) notes, ""The Sung Empire ...",bn,True,1219,17th century,


### Finnish Language

loading finnish language

In [4]:
finnish_train = train[train['lang'] == 'fi']
finnish_validation = validation[validation['lang'] == 'fi']


In [5]:
finnish_train.head()
finnish_validation.head()

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
311,Missä maassa Jack Churchill syntyi?,"Churchill was born at Colombo, British Ceylon ...",fi,True,22,"Colombo, British Ceylon",
312,Mikä on yleisin uskonto maailmassa?,The five largest religious groups by world pop...,fi,True,130,Christianity,
313,Kuka oli Glee sarjan pääosassa?,Rachel Barbra Berry (Lea Michele) is the lead ...,fi,True,0,Rachel Barbra Berry,
314,Milloin Killzone-sarjan peli julkaistiin ensim...,Killzone is a series of first-person shooter a...,fi,True,404,November 2004,
315,Milloin Pennsylvania liitty USA?,The state is one of the 13 original founding s...,fi,True,404,"December 12, 1787",


Number of finnish samples

In [6]:
num_train_samples = len(finnish_train)
num_validation_samples = len(finnish_validation)

print(f"Number of Finnish samples in the training set: {num_train_samples}")
print(f"Number of Finnish samples in the validation set: {num_validation_samples}")


Number of Finnish samples in the training set: 2126
Number of Finnish samples in the validation set: 528


5 Most common word in the question of trainning data

In [7]:
finnish_questions = ' '.join(finnish_train['question'].tolist())

question_words = re.findall(r'\b\w+\b', finnish_questions.lower())

word_counts = Counter(question_words)

most_common_words = word_counts.most_common(5)

for word, count in most_common_words:
    print(f"Word: '{word}', Occurrences: {count}")


Word: 'on', Occurrences: 642
Word: 'mikä', Occurrences: 328
Word: 'milloin', Occurrences: 287
Word: 'vuonna', Occurrences: 227
Word: 'kuka', Occurrences: 215


In [8]:
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fi-en")

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/832k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


parts of speech of translated finnish words

In [9]:
translated_words = [translator(word)[0]['translation_text'] for word, _ in most_common_words]

pos_tags = pos_tag(translated_words)

for word, pos in pos_tags:
    print(f"Word: '{word}', POS Tag: {pos}")


Word: 'is', POS Tag: VBZ
Word: 'what', POS Tag: WP
Word: 'when', POS Tag: WRB
Word: 'year', POS Tag: NN
Word: 'who', POS Tag: WP


## Model

In [29]:
train_fi = train[train['lang'] == 'fi']
validation_fi = validation[validation['lang'] == 'fi']


In [28]:
def translate_to_english(question):
    return translator(question)[0]['translation_text']

train_fi['translated_question'] = train_fi['question'].apply(translate_to_english)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_fi['translated_question'] = train_fi['question'].apply(translate_to_english)


In [33]:
train_fi.head()

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang,predicted,predicted_sim
9137,Mitkä olivat Rooman alkuvaiheet?,"In historiography, ancient Rome is Roman civil...",fi,True,0,"In historiography, ancient Rome is Roman civil...",,False,False
9138,Kuka oli toisen maailmansodan jälkeisen sosial...,Rákosi had difficulty managing the economy and...,fi,True,187,Mátyás Rákosi,,False,False
9139,Mikä oli roomalaisten antama nimi nykyisen Unk...,Hungary in its modern (post-1946) borders roug...,fi,True,286,Pannonia,,False,False
9140,Kuinka monta ihmistä menehtyi Suezin kriisin a...,"On 25 January 1952, British forces attempted t...",fi,True,131,deaths of 41 Egyptians,,False,False
9141,Millä vuosikymmenellä Yhdysvaltojen varhaishis...,The history of the United States began with th...,fi,True,87,"15,000 BC",,False,False


Similarity base class

In [16]:
def rule_based_classifier_similarity(context, question):
    context_words = set(context.lower().split())
    question_words = set(question.lower().split())

    overlap = len(context_words.intersection(question_words))

    if overlap > 3:
        return True
    return False


In [21]:
train_fi['predicted_sim'] = train_fi.apply(lambda row: rule_based_classifier_similarity(row['context'], row['question']), axis=1)

validation_fi['predicted_sim'] = validation_fi.apply(lambda row: rule_based_classifier_similarity(row['context'], row['question']), axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_fi['predicted_sim'] = train_fi.apply(lambda row: rule_based_classifier_similarity(row['context'], row['question']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_fi['predicted_sim'] = validation_fi.apply(lambda row: rule_based_classifier_similarity(row['context'], row['question']), axis=1)


In [22]:

# Evaluate accuracy on training data
train_accuracy = accuracy_score(train_fi['answerable'], train_fi['predicted_sim'])
print(f"Training Accuracy: {train_accuracy}")

# Evaluate accuracy on validation data
validation_accuracy = accuracy_score(validation_fi['answerable'], validation_fi['predicted_sim'])
print(f"Validation Accuracy: {validation_accuracy}")


Training Accuracy: 0.12935089369708372
Validation Accuracy: 0.2821969696969697


Cosine vector

In [13]:
def rule_based_classifier_cosine(context, question):
    texts = [context, question]
    vectorizer = TfidfVectorizer().fit_transform(texts)
    vectors = vectorizer.toarray()

    similarity_score = cosine_similarity(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1))[0][0]

    if similarity_score > 0.3:
        return True
    return False


In [24]:
train_fi['predicted_cos'] = train_fi.apply(lambda row: rule_based_classifier_cosine(row['context'], row['question']), axis=1)

validation_fi['predicted_cos'] = validation_fi.apply(lambda row: rule_based_classifier_cosine(row['context'], row['question']), axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_fi['predicted_cos'] = train_fi.apply(lambda row: rule_based_classifier_cosine(row['context'], row['question']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_fi['predicted_cos'] = validation_fi.apply(lambda row: rule_based_classifier_cosine(row['context'], row['question']), axis=1)


In [25]:
train_accuracy = accuracy_score(train_fi['answerable'], train_fi['predicted_cos'])
print(f"Training Accuracy: {train_accuracy}")

train_report = classification_report(train_fi['answerable'], train_fi['predicted_cos'])
print(f"Training Classification Report:\n{train_report}")

validation_accuracy = accuracy_score(validation_fi['answerable'], validation_fi['predicted_cos'])
print(f"Validation Accuracy: {validation_accuracy}")

validation_report = classification_report(validation_fi['answerable'], validation_fi['predicted_cos'])
print(f"Validation Classification Report:\n{validation_report}")


Training Accuracy: 0.12464722483537159
Training Classification Report:
              precision    recall  f1-score   support

       False       0.12      1.00      0.22       263
        True       1.00      0.00      0.00      1863

    accuracy                           0.12      2126
   macro avg       0.56      0.50      0.11      2126
weighted avg       0.89      0.12      0.03      2126

Validation Accuracy: 0.2803030303030303
Validation Classification Report:
              precision    recall  f1-score   support

       False       0.28      1.00      0.44       148
        True       0.00      0.00      0.00       380

    accuracy                           0.28       528
   macro avg       0.14      0.50      0.22       528
weighted avg       0.08      0.28      0.12       528



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
train_fi.head()

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang,predicted,predicted_sim,predicted_cos
9137,Mitkä olivat Rooman alkuvaiheet?,"In historiography, ancient Rome is Roman civil...",fi,True,0,"In historiography, ancient Rome is Roman civil...",,False,False,False
9138,Kuka oli toisen maailmansodan jälkeisen sosial...,Rákosi had difficulty managing the economy and...,fi,True,187,Mátyás Rákosi,,False,False,False
9139,Mikä oli roomalaisten antama nimi nykyisen Unk...,Hungary in its modern (post-1946) borders roug...,fi,True,286,Pannonia,,False,False,False
9140,Kuinka monta ihmistä menehtyi Suezin kriisin a...,"On 25 January 1952, British forces attempted t...",fi,True,131,deaths of 41 Egyptians,,False,False,False
9141,Millä vuosikymmenellä Yhdysvaltojen varhaishis...,The history of the United States began with th...,fi,True,87,"15,000 BC",,False,False,False
