In [1]:
import re
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
# Read both dataset splits from their parquet files (train first, then validation).
dataset_train, dataset_validation = (
    pd.read_parquet(parquet_path)
    for parquet_path in ("dataset/train.parquet", "dataset/validation.parquet")
)

In [3]:
# Column labels of the training split.
print(dataset_train.columns)

# Last expression of the cell: the "lang" column of every training row.
dataset_train.loc[:, "lang"]

Index(['question', 'context', 'lang', 'answerable', 'answer_start', 'answer',
       'answer_inlang'],
      dtype='object')


0        bn
1        bn
2        bn
3        bn
4        bn
         ..
15321    te
15322    te
15323    te
15324    te
15325    te
Name: lang, Length: 15326, dtype: object

## Week 36

### a)

In [4]:
def summarize_statistics(df, languages=("fi", "ja", "ru")):
    """Print summary statistics of a question-answering split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "lang", "answerable", "answer_start" and
        "context". "answerable" is used directly as a boolean row mask.
    languages : iterable of str, optional
        Values of the "lang" column to report on. The default keeps the three
        languages the report was originally hard-coded for.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    A language without rows (or without answerable rows) is reported with
    ``nan`` for its ratio and means instead of raising ``ZeroDivisionError``.
    """
    print(f'Total number of rows: {len(df)}')
    print(f'Labels are: {df.keys()}')

    languages = tuple(languages)

    # Character length of every context, computed once and reused below.
    context_lengths = df["context"].str.len()

    num_questions = {}
    num_questions_ans = {}
    answerable_ratio = {}
    mean_ans_start = {}
    mean_context_len = {}

    for lang in languages:
        in_lang = df["lang"] == lang
        in_lang_answerable = in_lang & df["answerable"]
        answerable_rows = df.loc[in_lang_answerable]

        total = len(df.loc[in_lang])
        answered = len(answerable_rows)

        num_questions[lang] = total
        num_questions_ans[lang] = answered
        # Guard the ratio: a language that is absent from df has total == 0.
        answerable_ratio[lang] = answered / total if total else float("nan")
        # float(...) turns the numpy scalar into a plain Python float so the
        # dictionaries print as bare numbers; empty selections give nan.
        mean_ans_start[lang] = float(answerable_rows["answer_start"].mean())
        mean_context_len[lang] = float(context_lengths.loc[in_lang_answerable].mean())

    print(f'Number of questions per language: {num_questions}')
    print(f'Number of answerable questions per language: {num_questions_ans}')
    print(f'Percentaje of answerable questions: {answerable_ratio}')
    print(f'Average start of the answer in context in answerable questions: {mean_ans_start}')
    print(f'Average length of context in answerable questions: {mean_context_len}')

    # Statistics over every row of df, regardless of language or answerability.
    mean = float(context_lengths.mean())
    variance = float(context_lengths.var())

    print(f'Mean length of context: {mean}')
    print(f'Variance in length of context: {variance}')
    
# Print the same report for both splits, each under its own banner line.
for banner, split in (
    ("----------------------------[Train]----------------------------", dataset_train),
    ("----------------------------[Validation]----------------------------", dataset_validation),
):
    print(banner)
    summarize_statistics(split)


----------------------------[Train]----------------------------
Total number of rows: 15326
Labels are: Index(['question', 'context', 'lang', 'answerable', 'answer_start', 'answer',
       'answer_inlang'],
      dtype='object')
Number of questions per language: {'fi': 2126, 'ja': 2301, 'ru': 1983}
Number of answerable questions per language: {'fi': 1872, 'ja': 1929, 'ru': 1756}
Percentaje of answerable questions: {'fi': 0.8805268109125117, 'ja': 0.8383311603650587, 'ru': 0.8855269793242562}
Average start of the answer in context in answerable questions: {'fi': 166.58226495726495, 'ja': 167.73613271124935, 'ru': 203.73234624145786}
Average length of context in answerable questions: {'fi': 614.7708333333334, 'ja': 619.270088128564, 'ru': 627.6714123006834}
Mean length of context: 613.0135064596111
Variance in length of context: 164329.30794154384
----------------------------[Validation]----------------------------
Total number of rows: 3028
Labels are: Index(['question', 'context', 'lan

### b)

In [5]:
# Preprocessing: put every question in lowercase.
# `.str.lower()` is the vectorized form of calling `str.lower` on each row and
# propagates missing values instead of raising on them.

dataset_train["question"] = dataset_train["question"].str.lower()
dataset_validation["question"] = dataset_validation["question"].str.lower()




In [10]:
#Takes a long time to execute

from transformers import AutoTokenizer, MarianMTModel


# Hugging Face checkpoint ids, one Marian model per source language
# (the "-en" suffix indicates English as the target language).
model_name_fi = 'Helsinki-NLP/opus-mt-fi-en'
model_name_ja = 'Helsinki-NLP/opus-mt-ja-en'
model_name_ru = 'Helsinki-NLP/opus-mt-ru-en'


# Translation models, loaded from the checkpoints above.
model_fi = MarianMTModel.from_pretrained(model_name_fi)
model_ja = MarianMTModel.from_pretrained(model_name_ja)
model_ru = MarianMTModel.from_pretrained(model_name_ru)


# Each translation model gets the tokenizer of its own checkpoint;
# tokenizer_en is used later to split the translated questions into tokens.
tokenizer_fi = AutoTokenizer.from_pretrained(model_name_fi)
tokenizer_ja = AutoTokenizer.from_pretrained(model_name_ja)
tokenizer_ru = AutoTokenizer.from_pretrained(model_name_ru)
tokenizer_en = AutoTokenizer.from_pretrained("bert-base-cased")

# Example sentence; not referenced by any of the cells visible here.
sample_text = "Miten voin käyttää tätä käännösmallia?"

def translate(text, tokenizer, model):
    """Translate one string with the given tokenizer/model pair.

    The text is wrapped in a one-element list and tokenized with
    ``return_tensors="pt"``; the encoding is passed to ``model.generate`` and
    the generated ids are decoded with special tokens skipped.

    Returns the first (and only) decoded string.
    """
    encoded = tokenizer([text], return_tensors="pt")
    output_ids = model.generate(**encoded)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]




# Translate the training questions of each language, one translate() call per
# question, using that language's tokenizer/model pair.
questions_fi = dataset_train.loc[dataset_train["lang"] == "fi", "question"].apply(
    lambda question: translate(question, tokenizer_fi, model_fi)
)
questions_ja = dataset_train.loc[dataset_train["lang"] == "ja", "question"].apply(
    lambda question: translate(question, tokenizer_ja, model_ja)
)
questions_ru = dataset_train.loc[dataset_train["lang"] == "ru", "question"].apply(
    lambda question: translate(question, tokenizer_ru, model_ru)
)







In [16]:
# One token list per translated question, produced by the bert-base-cased tokenizer.
tokenized_questions_fi = [tokenizer_en.tokenize(question) for question in questions_fi]
tokenized_questions_ja = [tokenizer_en.tokenize(question) for question in questions_ja]
tokenized_questions_ru = [tokenizer_en.tokenize(question) for question in questions_ru]

def count_word_frequency(sequences):
    """Count how often each item occurs across a collection of sequences.

    Parameters
    ----------
    sequences : iterable of iterables
        For example a list of token lists.

    Returns
    -------
    dict
        Maps each item to its number of occurrences, ordered from most to
        least frequent. Items with equal counts keep first-seen order.
    """
    counts = Counter(word for row in sequences for word in row)
    # most_common() sorts by count, descending, with a stable sort.
    return dict(counts.most_common())

# Token frequency table for the translated questions of each language.
word_count_fi, word_count_ja, word_count_ru = (
    count_word_frequency(token_lists)
    for token_lists in (
        tokenized_questions_fi,
        tokenized_questions_ja,
        tokenized_questions_ru,
    )
)

### c)

In [17]:

from transformers import AutoTokenizer, MarianMTModel


# Hugging Face checkpoint ids, one Marian model per source language
# (the "-en" suffix indicates English as the target language).
model_name_fi = 'Helsinki-NLP/opus-mt-fi-en'
model_name_ja = 'Helsinki-NLP/opus-mt-ja-en'
model_name_ru = 'Helsinki-NLP/opus-mt-ru-en'


# Translation models, loaded from the checkpoints above.
model_fi = MarianMTModel.from_pretrained(model_name_fi)
model_ja = MarianMTModel.from_pretrained(model_name_ja)
model_ru = MarianMTModel.from_pretrained(model_name_ru)


# Each translation model gets the tokenizer of its own checkpoint.
tokenizer_fi = AutoTokenizer.from_pretrained(model_name_fi)
tokenizer_ja = AutoTokenizer.from_pretrained(model_name_ja)
tokenizer_ru = AutoTokenizer.from_pretrained(model_name_ru)


def translate(text, tokenizer, model):
    """Translate one string with the given tokenizer/model pair.

    The text is wrapped in a one-element list and tokenized with
    ``return_tensors="pt"``; the encoding is passed to ``model.generate`` and
    the generated ids are decoded with special tokens skipped.

    Returns the first (and only) decoded string.
    """
    encoded = tokenizer([text], return_tensors="pt")
    output_ids = model.generate(**encoded)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]




# Translate the validation questions of each language. This rebinds
# questions_fi / questions_ja / questions_ru, which section b) bound to the
# training-split translations.
questions_fi = dataset_validation.loc[dataset_validation["lang"] == "fi", "question"].apply(
    lambda question: translate(question, tokenizer_fi, model_fi)
)
questions_ja = dataset_validation.loc[dataset_validation["lang"] == "ja", "question"].apply(
    lambda question: translate(question, tokenizer_ja, model_ja)
)
questions_ru = dataset_validation.loc[dataset_validation["lang"] == "ru", "question"].apply(
    lambda question: translate(question, tokenizer_ru, model_ru)
)

In [21]:

# Contexts of the validation rows, split by language.
context_fi = dataset_validation.loc[dataset_validation["lang"] == "fi", "context"]
context_ja = dataset_validation.loc[dataset_validation["lang"] == "ja", "context"]
context_ru = dataset_validation.loc[dataset_validation["lang"] == "ru", "context"]

# Pair each translated question with its context; concat aligns the two Series on their index.
X_fi = pd.concat((questions_fi, context_fi), axis="columns")
X_ja = pd.concat((questions_ja, context_ja), axis="columns")
X_ru = pd.concat((questions_ru, context_ru), axis="columns")

# Stack the three per-language frames row-wise.
X = pd.concat((X_fi, X_ja, X_ru), axis="index")

def rule_based_predictor(row):
    """Dispatch a question/context pair to the rule for its question word.

    Parameters
    ----------
    row : mapping with "question" and "context" entries
        For example one row of ``X``.

    Returns
    -------
    The result of the selected ``answers_*`` helper applied to the context,
    or ``False`` when the question contains none of the handled question
    words.

    Notes
    -----
    Question words are matched case-insensitively as whole words, checked in
    the order what, where, who, how, why, when; the first match wins.

    NOTE(review): the ``answers_*`` helpers are not defined in this cell; they
    must exist in the notebook namespace before a matching question is
    dispatched, otherwise the call raises ``NameError``.
    """
    question = row["question"]
    context  = row["context"]

    # Whole-word matching: a plain substring test treats "show" as a "how"
    # question and "whole" as a "who" question.
    words = set(re.findall(r"[a-z]+", question.lower()))

    if "what" in words:
        return answers_what(context)
    if "where" in words:
        return answers_where(context)
    # "whom"/"whose" are kept in the who family; the former substring test
    # matched them as well.
    if words & {"who", "whom", "whose"}:
        return answers_who(context)
    if "how" in words:
        return answers_how(context)
    if "why" in words:
        return answers_why(context)
    if "when" in words:
        return answers_when(context)

    # No rule for this question type; yes/no questions ("Is ...?", "Are ...?")
    # have no handler and end up here too.
    return False
        



In [19]:
# Remove pandas' column/row display limits (None = no limit) so the whole
# Series is rendered; the setting stays active for later cells.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
display(questions_fi)

311              What country was Jack churchill born in?
312        what is the most common religion in the world?
313                  Who was the star of the glee series?
314     When was the Killzone series game first released?
315                        When did Pennsylvania join us?
316                   Where did Richard attenborough die?
317                                When was atheism born?
318          where does digital television become a name?
319                 Have Finnish dialects always existed?
320                 Who played the lead in the glee show?
321        What is the most famous song of jaco pastorus?
322     Are the peaks of the Andes completely covered ...
323             What town was my mario capollini born in?
324     When did the silver arrow manga begin publishing?
325     With which dynasty did the so-called Mandarins...
326                    Where did nominalism get its name?
327          Which grain is the basic ingredient in ouzo?
328           

In [20]:
# Render the Finnish context Series (context_fi) as the cell's rich output.
display(context_fi)

311     Churchill was born at Colombo, British Ceylon ...
312     The five largest religious groups by world pop...
313     Rachel Barbra Berry (Lea Michele) is the lead ...
314     Killzone is a series of first-person shooter a...
315     The state is one of the 13 original founding s...
316     In June 2012, shortly before her 90th birthday...
317     In early ancient Greek, the adjective ' (, fro...
318     Digital television (DTV) is the transmission o...
319     Westrobothnian () is a number of closely relat...
320     Rachel Barbra Berry (Lea Michele) is the lead ...
321     Birdland marked the peak of Weather Report's c...
322     The climate in the Andes varies greatly depend...
323     Cipollini was born in Lucca, Tuscany. He came ...
324     Takahashi was born September 18, 1953, in Higa...
325     China has had civil servants since at least th...
326     The term 'nominalism' stems from the Latin "no...
327     Ouzo (, ) is a dry anise-flavoured aperitif th...
328     The br