In [1]:
import torch
import random
import numpy as np
import pandas as pd
from janome.tokenizer import Tokenizer


import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jljubas/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
def enforce_reproducibility(seed=42):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed handed to Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices).

    Side effects:
        Switches cuDNN to deterministic mode and turns its autotuning
        benchmark off.
    """
    # Python / NumPy global generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators, CPU first and then every CUDA device
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN settings. Atomic operations stay outside our control: the order
    # of parallel operations is not known, so there is currently no simple
    # way to enforce determinism for them.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

# Seed all generators before anything else runs.
enforce_reproducibility()

# Train / validation splits, read from parquet files relative to the notebook.
dataset_train = pd.read_parquet("dataset/train.parquet")
dataset_validation = pd.read_parquet("dataset/validation.parquet")

# Flat stream of Finnish training-question tokens, each question wrapped in
# '<s>' ... '</s>' markers.
questions_fi = [
    token
    for question in dataset_train.loc[dataset_train["lang"] == "fi", "question"]
    for token in ['<s>'] + nltk.word_tokenize(question) + ['</s>']
]


# Japanese validation questions; the first one is displayed as the cell output.
a = dataset_validation.loc[dataset_validation["lang"] == "ja", "question"].to_list()
a[0]

'月の地表の温度は何度'

## NOT SURE.-[ 1-gram model ] - [ No smoothing ] - Finnish

In [4]:
def calc_unigram_probabilities(train_dataset):
    """Estimate maximum-likelihood unigram probabilities for Finnish questions.

    Args:
        train_dataset: pandas DataFrame with a "lang" and a "question" column.
            Only rows whose "lang" equals "fi" are used.

    Returns:
        dict mapping each lower-cased token to ``count(token) / total_tokens``.
        Empty when the frame contains no Finnish rows.
    """
    # Select rows from the argument itself. The previous version read the
    # notebook-global `dataset_train` and silently ignored the parameter.
    fi_questions = train_dataset.loc[train_dataset["lang"] == "fi", "question"]

    tokens = [
        token.lower()
        for question in fi_questions
        for token in nltk.word_tokenize(question, language='finnish')
    ]

    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1

    total_tokens = len(tokens)
    return {token: count / total_tokens for token, count in counts.items()}

# Fit the Finnish unigram model on the training frame and bind it to the
# notebook-level name `model`.
model = calc_unigram_probabilities(dataset_train)


def unigram_model_fi(model,text):
    """Return the log-probability of a Finnish text under a unigram model.

    Args:
        model: mapping from lower-cased token to its probability.
        text: raw text to score.

    Returns:
        Sum of natural-log token probabilities. Tokens missing from ``model``
        contribute ``log(1e-6)``. Returns 0.0 when the text has no tokens.
    """
    # Small fixed probability used for tokens the model has never seen.
    OOV_default_prob = 1e-6

    log_sentence_probability = 0.0
    # Tokenize with the Finnish setting so scoring splits text the same way
    # as the code that builds the model.
    for token in nltk.word_tokenize(text, language='finnish'):
        key = token.lower()
        token_prob = model[key] if key in model else OOV_default_prob
        log_sentence_probability += np.log(token_prob)

    return log_sentence_probability


# Score one sample question; the returned log-probability is the cell output.
unigram_model_fi(model,'Missä maassa Jack Churchill syntyi?')


-46.74348697097472

# Finnish

## FI-1.- [ 2-gram model ] - [ No smoothing ] - Finnish

In [5]:
def calc_bigram_probabilities(train_dataset):
    """Estimate unsmoothed (maximum-likelihood) bigram probabilities for Finnish.

    Args:
        train_dataset: pandas DataFrame with a "lang" and a "question" column.
            Only rows whose "lang" equals "fi" are used.

    Returns:
        dict mapping ``(first, second)`` lower-cased token pairs to
        ``count(first, second) / count(first)``. Every question is wrapped in
        '<s>' / '</s>' markers before counting.
    """
    # Select rows from the argument itself. The previous version read the
    # notebook-global `dataset_train` and silently ignored the parameter.
    fi_questions = train_dataset.loc[train_dataset["lang"] == "fi", "question"]

    tokens = [
        token.lower()
        for question in fi_questions
        for token in ['<s>'] + nltk.word_tokenize(question, language='finnish') + ['</s>']
    ]

    unigram_counts = {}
    for token in tokens:
        unigram_counts[token] = unigram_counts.get(token, 0) + 1

    # Pairs are taken over the flattened token stream, so the ('</s>', '<s>')
    # pair joining two consecutive questions is counted as well.
    bigram_counts = {}
    for bigram in zip(tokens, tokens[1:]):
        bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

    return {
        bigram: count / unigram_counts[bigram[0]]
        for bigram, count in bigram_counts.items()
    }

# Rebind the notebook-level `model` to the unsmoothed Finnish bigram model.
model = calc_bigram_probabilities(dataset_train)

def bigram_model_fi(model, text):
    """Return the log-probability of a Finnish text under a bigram model.

    Args:
        model: mapping from ``(first, second)`` token pairs to probabilities.
        text: raw text to score.

    Returns:
        Sum of natural-log bigram probabilities over the lower-cased tokens,
        padded with '<s>' and '</s>'. Pairs missing from ``model`` contribute
        ``log(1e-6)``.
    """
    # Fallback probability for pairs the model does not contain.
    OOV_default_prob = 1e-6

    words = [word.lower() for word in nltk.word_tokenize(text, language='finnish')]
    padded = ['<s>', *words, '</s>']

    log_sentence_probability = 0.0
    for pair in zip(padded, padded[1:]):
        pair_prob = model[pair] if pair in model else OOV_default_prob
        log_sentence_probability += np.log(pair_prob)

    return log_sentence_probability

# Score one sample question; the returned log-probability is the cell output.
bigram_model_fi(model,'Missä maassa Jack Churchill syntyi?')


-46.34585503574918

## FI-2.- [ 2-gram model ] - [ Laplace smoothing ] - Finnish

In [6]:
def calc_bigram_probabilities(train_dataset):
    """Estimate Laplace (add-one) smoothed bigram probabilities for Finnish.

    Args:
        train_dataset: pandas DataFrame with a "lang" and a "question" column.
            Only rows whose "lang" equals "fi" are used.

    Returns:
        dict holding an entry for every ``(first, second)`` pair of vocabulary
        tokens, valued ``(count(first, second) + 1) / (count(first) + V)``
        where ``V`` is the vocabulary size ('<s>' and '</s>' included).
    """
    # Select rows from the argument itself. The previous version read the
    # notebook-global `dataset_train` and silently ignored the parameter.
    fi_questions = train_dataset.loc[train_dataset["lang"] == "fi", "question"]

    tokens = [
        token.lower()
        for question in fi_questions
        for token in ['<s>'] + nltk.word_tokenize(question, language='finnish') + ['</s>']
    ]

    unigram_counts = {}
    for token in tokens:
        unigram_counts[token] = unigram_counts.get(token, 0) + 1

    vocab_size = len(unigram_counts)

    # Pairs are taken over the flattened token stream, so the ('</s>', '<s>')
    # pair joining two consecutive questions is counted as well.
    bigram_counts = {}
    for bigram in zip(tokens, tokens[1:]):
        bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

    # The table is materialised for all V * V pairs, seen or not, so callers
    # can look any in-vocabulary pair up directly.
    return {
        (first, second): (bigram_counts.get((first, second), 0) + 1) / (first_count + vocab_size)
        for first, first_count in unigram_counts.items()
        for second in unigram_counts
    }

# Rebind the notebook-level `model` to the Laplace-smoothed Finnish bigram model.
model = calc_bigram_probabilities(dataset_train)

def bigram_model_ru(model, text):
    """Return the log-probability of a text under a bigram model.

    NOTE(review): despite the ``_ru`` suffix, this definition tokenizes with
    ``language='finnish'``.

    Args:
        model: mapping from ``(first, second)`` token pairs to probabilities.
        text: raw text to score.

    Returns:
        Sum of natural-log bigram probabilities over the lower-cased tokens,
        padded with '<s>' and '</s>'. Pairs missing from ``model`` contribute
        ``log(1e-6)``.
    """
    sequence = ['<s>']
    sequence.extend(piece.lower() for piece in nltk.word_tokenize(text, language='finnish'))
    sequence.append('</s>')

    # Fallback probability for pairs the model does not contain.
    OOV_default_prob = 1e-6
    log_sentence_probability = 0.0

    for position in range(len(sequence) - 1):
        key = (sequence[position], sequence[position + 1])
        if key not in model:
            log_sentence_probability += np.log(OOV_default_prob)
        else:
            log_sentence_probability += np.log(model[key])

    return log_sentence_probability

# Score one sample question; the returned log-probability is the cell output.
bigram_model_ru(model,'Missä maassa Jack Churchill syntyi?')


-57.188383868337404

# Russian

## RU-1.- [ 2-gram model ] - [ No smoothing ] - Russian

In [7]:
def calc_bigram_probabilities(train_dataset):
    """Estimate unsmoothed (maximum-likelihood) bigram probabilities for Russian.

    Args:
        train_dataset: pandas DataFrame with a "lang" and a "question" column.
            Only rows whose "lang" equals "ru" are used.

    Returns:
        dict mapping ``(first, second)`` lower-cased token pairs to
        ``count(first, second) / count(first)``. Every question is wrapped in
        '<s>' / '</s>' markers before counting.
    """
    # Select rows from the argument itself. The previous version read the
    # notebook-global `dataset_train` and silently ignored the parameter.
    ru_questions = train_dataset.loc[train_dataset["lang"] == "ru", "question"]

    tokens = [
        token.lower()
        for question in ru_questions
        for token in ['<s>'] + nltk.word_tokenize(question, language='russian') + ['</s>']
    ]

    unigram_counts = {}
    for token in tokens:
        unigram_counts[token] = unigram_counts.get(token, 0) + 1

    # Pairs are taken over the flattened token stream, so the ('</s>', '<s>')
    # pair joining two consecutive questions is counted as well.
    bigram_counts = {}
    for bigram in zip(tokens, tokens[1:]):
        bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

    return {
        bigram: count / unigram_counts[bigram[0]]
        for bigram, count in bigram_counts.items()
    }

# Rebind the notebook-level `model` to the unsmoothed Russian bigram model.
model = calc_bigram_probabilities(dataset_train)

def bigram_model_ru(model, text):
    """Return the log-probability of a Russian text under a bigram model.

    Args:
        model: mapping from ``(first, second)`` token pairs to probabilities.
        text: raw text to score.

    Returns:
        Sum of natural-log bigram probabilities over the lower-cased tokens,
        padded with '<s>' and '</s>'. Pairs missing from ``model`` contribute
        ``log(1e-6)``.
    """
    # Fallback probability for pairs the model does not contain.
    OOV_default_prob = 1e-6

    lowered = [word.lower() for word in nltk.word_tokenize(text, language='russian')]
    padded = ['<s>', *lowered, '</s>']

    log_sentence_probability = 0.0
    for left, right in zip(padded[:-1], padded[1:]):
        pair = (left, right)
        pair_prob = model[pair] if pair in model else OOV_default_prob
        log_sentence_probability += np.log(pair_prob)

    return log_sentence_probability

# Score one sample question; the returned log-probability is the cell output.
bigram_model_ru(model,'Где вручается Шно́белевская премия ?')


-31.599046311471977

## RU-2.- [ 2-gram model ] - [ Laplace smoothing ] - Russian

In [8]:
def calc_bigram_probabilities(train_dataset):
    """Estimate Laplace (add-one) smoothed bigram probabilities for Russian.

    Args:
        train_dataset: pandas DataFrame with a "lang" and a "question" column.
            Only rows whose "lang" equals "ru" are used.

    Returns:
        dict holding an entry for every ``(first, second)`` pair of vocabulary
        tokens, valued ``(count(first, second) + 1) / (count(first) + V)``
        where ``V`` is the vocabulary size ('<s>' and '</s>' included).
    """
    # Select rows from the argument itself. The previous version read the
    # notebook-global `dataset_train` and silently ignored the parameter.
    ru_questions = train_dataset.loc[train_dataset["lang"] == "ru", "question"]

    tokens = [
        token.lower()
        for question in ru_questions
        for token in ['<s>'] + nltk.word_tokenize(question, language='russian') + ['</s>']
    ]

    unigram_counts = {}
    for token in tokens:
        unigram_counts[token] = unigram_counts.get(token, 0) + 1

    vocab_size = len(unigram_counts)

    # Pairs are taken over the flattened token stream, so the ('</s>', '<s>')
    # pair joining two consecutive questions is counted as well.
    bigram_counts = {}
    for bigram in zip(tokens, tokens[1:]):
        bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

    # The table is materialised for all V * V pairs, seen or not, so callers
    # can look any in-vocabulary pair up directly.
    return {
        (first, second): (bigram_counts.get((first, second), 0) + 1) / (first_count + vocab_size)
        for first, first_count in unigram_counts.items()
        for second in unigram_counts
    }

# Rebind the notebook-level `model` to the Laplace-smoothed Russian bigram model.
model = calc_bigram_probabilities(dataset_train)

def bigram_model_ru(model, text):
    """Return the log-probability of a Russian text under a bigram model.

    Args:
        model: mapping from ``(first, second)`` token pairs to probabilities.
        text: raw text to score.

    Returns:
        Sum of natural-log bigram probabilities over the lower-cased tokens,
        padded with '<s>' and '</s>'. Pairs missing from ``model`` contribute
        ``log(1e-6)``.
    """
    sequence = ['<s>']
    for piece in nltk.word_tokenize(text, language='russian'):
        sequence.append(piece.lower())
    sequence.append('</s>')

    # Fallback probability for pairs the model does not contain.
    OOV_default_prob = 1e-6
    log_sentence_probability = 0.0

    previous = sequence[0]
    for current in sequence[1:]:
        key = (previous, current)
        if key in model:
            log_sentence_probability += np.log(model[key])
        else:
            log_sentence_probability += np.log(OOV_default_prob)
        previous = current

    return log_sentence_probability

# Score one sample question; the returned log-probability is the cell output.
bigram_model_ru(model,'Где вручается Шно́белевская премия ?')


-48.2739011526146

# Japanese

## JA-1.- [ 2-gram model ] - [ No smoothing ] - Japanese

In [9]:

def calc_bigram_probabilities(train_dataset):
    """Estimate unsmoothed (maximum-likelihood) bigram probabilities for Japanese.

    Args:
        train_dataset: pandas DataFrame with a "lang" and a "question" column.
            Only rows whose "lang" equals "ja" are used.

    Returns:
        dict mapping ``(first, second)`` lower-cased token pairs to
        ``count(first, second) / count(first)``. Tokens are the ``surface``
        strings produced by the Janome tokenizer, and every question is
        wrapped in '<s>' / '</s>' markers before counting.
    """
    tokenizer = Tokenizer()

    # Build the row mask from the argument itself. The previous version took
    # the mask from the notebook-global `dataset_train` while indexing
    # `train_dataset`, which is only right when both are the same frame.
    ja_questions = train_dataset.loc[train_dataset["lang"] == "ja", "question"]

    tokens = [
        token.lower()
        for question in ja_questions
        for token in ['<s>'] + [piece.surface for piece in tokenizer.tokenize(question)] + ['</s>']
    ]

    unigram_counts = {}
    for token in tokens:
        unigram_counts[token] = unigram_counts.get(token, 0) + 1

    # Pairs are taken over the flattened token stream, so the ('</s>', '<s>')
    # pair joining two consecutive questions is counted as well.
    bigram_counts = {}
    for bigram in zip(tokens, tokens[1:]):
        bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

    return {
        bigram: count / unigram_counts[bigram[0]]
        for bigram, count in bigram_counts.items()
    }

# Rebind the notebook-level `model` to the unsmoothed Japanese bigram model.
model = calc_bigram_probabilities(dataset_train)

def bigram_model_ja(model, text):
    """Return the log-probability of a Japanese text under a bigram model.

    Args:
        model: mapping from ``(first, second)`` token pairs to probabilities,
            keyed by lower-cased token strings.
        text: raw text to score.

    Returns:
        Sum of natural-log bigram probabilities over the lower-cased Janome
        ``surface`` tokens, padded with '<s>' and '</s>'. Pairs missing from
        ``model`` contribute ``log(1e-6)``.
    """
    tokenizer = Tokenizer()

    # Lower-case the surfaces: the model is built from lower-cased tokens, so
    # without this any token containing upper-case characters never matches.
    surfaces = [token.surface.lower() for token in tokenizer.tokenize(text)]
    tokens = ['<s>'] + surfaces + ['</s>']

    # Fallback probability for pairs the model does not contain.
    OOV_default_prob = 1e-6
    log_sentence_probability = 0.0

    for bigram in zip(tokens, tokens[1:]):
        bigram_prob = model[bigram] if bigram in model else OOV_default_prob
        log_sentence_probability += np.log(bigram_prob)

    return log_sentence_probability

# Score one sample question; the returned log-probability is the cell output.
bigram_model_ja(model,'月の地表の温度は何度')

-83.92396327421085

## JA-2.- [ 2-gram model ] - [ Laplace smoothing ] - Japanese

In [12]:
def calc_bigram_probabilities(train_dataset):
    """Estimate Laplace (add-one) smoothed bigram probabilities for Japanese.

    Args:
        train_dataset: pandas DataFrame with a "lang" and a "question" column.
            Only rows whose "lang" equals "ja" are used.

    Returns:
        dict holding an entry for every ``(first, second)`` pair of vocabulary
        tokens, valued ``(count(first, second) + 1) / (count(first) + V)``
        where ``V`` is the vocabulary size ('<s>' and '</s>' included).
        Tokens are lower-cased Janome ``surface`` strings.
    """
    tokenizer = Tokenizer()

    # Build the row mask from the argument itself. The previous version took
    # the mask from the notebook-global `dataset_train` while indexing
    # `train_dataset`, which is only right when both are the same frame.
    ja_questions = train_dataset.loc[train_dataset["lang"] == "ja", "question"]

    tokens = [
        token.lower()
        for question in ja_questions
        for token in ['<s>'] + [piece.surface for piece in tokenizer.tokenize(question)] + ['</s>']
    ]

    unigram_counts = {}
    for token in tokens:
        unigram_counts[token] = unigram_counts.get(token, 0) + 1

    vocab_size = len(unigram_counts)

    # Pairs are taken over the flattened token stream, so the ('</s>', '<s>')
    # pair joining two consecutive questions is counted as well.
    bigram_counts = {}
    for bigram in zip(tokens, tokens[1:]):
        bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

    # The table is materialised for all V * V pairs, seen or not, so callers
    # can look any in-vocabulary pair up directly.
    return {
        (first, second): (bigram_counts.get((first, second), 0) + 1) / (first_count + vocab_size)
        for first, first_count in unigram_counts.items()
        for second in unigram_counts
    }

# Rebind the notebook-level `model` to the Laplace-smoothed Japanese bigram model.
model = calc_bigram_probabilities(dataset_train)

def bigram_model_ru(model, text):
    """Return the log-probability of a text under a bigram model.

    NOTE(review): despite the ``_ru`` suffix, this definition tokenizes with
    the Janome ``Tokenizer``; the name is kept so existing callers still work.

    Args:
        model: mapping from ``(first, second)`` token pairs to probabilities,
            keyed by lower-cased token strings.
        text: raw text to score.

    Returns:
        Sum of natural-log bigram probabilities over the lower-cased Janome
        ``surface`` tokens, padded with '<s>' and '</s>'. Pairs missing from
        ``model`` contribute ``log(1e-6)``.
    """
    tokenizer = Tokenizer()

    # Use each token's surface string, lower-cased. The previous version put
    # the tokenizer's token objects into the bigrams, which can never equal
    # the string keys of the model, so every pair fell back to the OOV value.
    surfaces = [token.surface.lower() for token in tokenizer.tokenize(text)]
    tokens = ['<s>'] + surfaces + ['</s>']

    # Fallback probability for pairs the model does not contain.
    OOV_default_prob = 1e-6
    log_sentence_probability = 0.0

    for bigram in zip(tokens, tokens[1:]):
        bigram_prob = model[bigram] if bigram in model else OOV_default_prob
        log_sentence_probability += np.log(bigram_prob)

    return log_sentence_probability

# Score one sample question; the returned log-probability is the cell output.
# NOTE(review): this calls `bigram_model_ja`, which is not defined in this
# cell (the scorer defined just above is named `bigram_model_ru`), so it
# resolves to the `bigram_model_ja` defined in the JA-1 cell — confirm that
# is the intended scorer.
bigram_model_ja(model,'月の地表の温度は何度')

-67.80043695279065