In [6]:
import numpy as np
import itertools
import os
import re
from scipy.spatial.distance import cosine
import unidecode as un
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [7]:
# N-gram order used throughout the notebook (3 = character trigrams).
n = 3
# Symbol set: a space followed by the 26 lowercase ASCII letters (27 symbols).
alphabet = " abcdefghijklmnopqrstuvwxyz"
# Every possible length-n string over `alphabet`, in itertools.product order.
N_GRAMS = ["".join(p) for p in itertools.product(alphabet, repeat=n)]
# Number of distinct n-grams, i.e. len(alphabet) ** n (27 ** 3 = 19683 here).
print(len(N_GRAMS))
# 21 three-letter language codes; not referenced by the other cells visible here.
langLabels = ["bul", "ces", "dan", "nld", "deu", "eng", "est", "fin", "fra", "ell", "hun", "ita", "lav", "lit", "pol", "por", "ron", "slk", "slv", "spa", "swe"]

19683


In [8]:
def count_ngrams(lines, n):
    """Tally every run of ``n`` consecutive characters found in ``lines``.

    Each line is scanned on its own, so no n-gram spans two lines. For a
    positive ``n``, a line shorter than ``n`` contributes nothing.

    Args:
        lines: Iterable of strings to scan.
        n: Window length in characters.

    Returns:
        dict mapping each n-gram string to the number of times it occurred.
    """
    tally = {}
    for text in lines:
        symbols = list(text)
        window_count = len(symbols) - n + 1
        for start in range(window_count):
            window = ''.join(symbols[start:start + n])
            # .get supplies 0 the first time a window is seen.
            tally[window] = tally.get(window, 0) + 1
    return tally

def preprocess_text(text):
    """Reduce ``text`` to lowercase ASCII letters and spaces.

    The text is passed through ``unidecode`` and lowercased; afterwards every
    character that is not an ASCII letter or a space is deleted (not replaced
    by a space), which also removes digits, punctuation, tabs and newlines.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    ascii_lowered = un.unidecode(text).lower()
    return re.sub(r'[^a-zA-Z ]', '', ascii_lowered)


def preprocess_and_count_ngrams(source_dir, target_dir, n, limit=1000):
    """Clean the leading lines of every language file and count their n-grams.

    For each ``*.txt`` file in ``source_dir`` whose first three characters are
    alphabetic, up to ``limit`` lines are read, cleaned with
    ``preprocess_text``, written to a same-named file in ``target_dir`` (one
    cleaned line per row) and counted with ``count_ngrams``. Counts of files
    that share the same three-character prefix are summed.

    Side effects: ``target_dir`` is created when missing; when it already
    exists, every regular file directly inside it is deleted first.

    Args:
        source_dir: Directory holding the raw text files.
        target_dir: Directory receiving the cleaned copies.
        n: N-gram length forwarded to ``count_ngrams``.
        limit: Maximum number of lines read per file; ``None`` reads the
            whole file. Files with fewer lines are processed in full.

    Returns:
        dict mapping language code (first three filename characters) to a
        dict of n-gram -> count.
    """
    if os.path.exists(target_dir):
        for stale_name in os.listdir(target_dir):
            stale_path = os.path.join(target_dir, stale_name)
            # Only plain files are removed; os.remove fails on directories.
            if os.path.isfile(stale_path):
                os.remove(stale_path)
    else:
        os.makedirs(target_dir)

    language_ngram_counts = {}

    # Sorted so the key order of the result does not depend on the
    # arbitrary order in which os.listdir returns entries.
    for filename in sorted(os.listdir(source_dir)):
        if not (filename.endswith('.txt') and filename[:3].isalpha()):
            continue
        language_code = filename[:3]  # Assuming first 3 letters are language code

        with open(os.path.join(source_dir, filename), 'r', encoding='utf-8') as source_file:
            # islice stops quietly at end of file, whereas calling next()
            # `limit` times raises StopIteration on files shorter than `limit`.
            lines = [preprocess_text(raw_line)
                     for raw_line in itertools.islice(source_file, limit)]

        ngram_counts = count_ngrams(lines, n)

        with open(os.path.join(target_dir, filename), 'w', encoding='utf-8') as cleaned_file:
            cleaned_file.write('\n'.join(lines))

        # Merge into any counts already collected for this language.
        merged = language_ngram_counts.setdefault(language_code, {})
        for ngram, count in ngram_counts.items():
            merged[ngram] = merged.get(ngram, 0) + count

    return language_ngram_counts

# Raw training corpora are read from `source_directory`; their cleaned copies
# are written to `target_directory` (existing files there are deleted first).
source_directory = "training_languages"
target_directory = "prep_training"

# Per-language n-gram counts of the training set: {language_code: {ngram: count}}.
# Uses the function's default `limit`, so only the leading lines of each file count.
training_language_ngram_stats = preprocess_and_count_ngrams(source_directory, target_directory, n)
# NOTE: prints the complete nested dict, which produces a very long output.
print("Training: ",training_language_ngram_stats)

# Same pipeline for the test corpora. Despite the `prep_` prefix,
# `prep_source_directory` is the raw input directory and
# `prep_target_directory` receives the cleaned copies.
prep_source_directory = "testing_languages"
prep_target_directory = "prep_testing"
testing_language_ngram_stats = preprocess_and_count_ngrams(prep_source_directory, prep_target_directory, n)
print("\n Testing: ", testing_language_ngram_stats)

Training:  {'bul': {' bi': 89, 'bia': 50, 'iak': 109, 'akh': 114, 'kha': 115, 'ha ': 148, 'a p': 375, ' po': 459, 'pob': 8, 'obe': 24, 'bed': 12, 'edi': 97, 'di ': 72, 'i v': 130, ' v ': 278, 'v k': 41, ' ka': 256, 'kat': 186, 'ate': 71, 'teg': 9, 'ego': 27, 'gor': 14, 'ori': 137, 'rii': 91, 'iia': 492, 'iat': 278, 'ata': 468, 'ta ': 511, 'a n': 422, ' na': 776, 'nai': 51, 'aid': 16, 'ido': 5, 'dob': 38, 'obr': 48, 'bra': 61, 'ra ': 103, 'a b': 139, ' br': 38, 'bri': 30, 'rit': 72, 'ita': 71, 'tan': 68, 'ans': 56, 'nsk': 57, 'ska': 109, 'ka ': 151, 'a g': 116, ' gr': 56, 'gru': 7, 'rup': 6, 'upa': 7, 'pa ': 19, 'a i': 292, ' i ': 445, 'i s': 259, ' sp': 80, 'spe': 39, 'pec': 18, 'ech': 78, 'che': 482, 'hel': 27, 'eli': 115, 'li ': 186, 'i n': 166, 'aig': 4, 'igo': 6, 'gol': 35, 'oli': 78, 'lia': 101, 'iam': 76, 'ama': 116, 'mat': 55, 'nag': 21, 'agr': 9, 'gra': 79, 'rad': 60, 'ada': 24, 'da ': 474, 'na ': 729, 'a s': 528, ' sn': 15, 'sno': 25, 'nos': 57, 'osh': 57, 'shc': 460, 'hch': 4

In [9]:
def create_item_memory(d, alphabet, seed=1):
    """Build the item memory: one random bipolar vector per symbol.

    Args:
        d: Dimensionality of each vector.
        alphabet: Iterable of symbols; each one becomes a key of the result.
        seed: Seed of the private random generator. The default of 1 yields
            the same vectors as calling ``np.random.seed(1)`` and then
            ``np.random.choice([-1, 1], d)`` once per symbol, in order.

    Returns:
        dict mapping each symbol to a length-``d`` NumPy array whose entries
        are drawn from {-1, 1}.
    """
    # A private RandomState keeps the result reproducible without reseeding
    # NumPy's global generator as a hidden side effect of this call.
    rng = np.random.RandomState(seed)
    return {char: rng.choice([-1, 1], d) for char in alphabet}

def encode_ngram(ngram, item_memory):
    """Bind the item vectors of an n-gram's symbols by element-wise product.

    Args:
        ngram: String (sequence of symbols) to encode.
        item_memory: dict mapping symbol -> NumPy vector; all vectors are
            expected to share one length.

    Returns:
        The element-wise product of the symbols' vectors. When ``ngram`` is
        empty or contains a symbol missing from ``item_memory``, an all-zero
        vector of the item-vector length is returned instead.
    """
    encodable = bool(ngram) and all(symbol in item_memory for symbol in ngram)
    if encodable:
        factors = [item_memory[symbol] for symbol in ngram]
        return np.prod(factors, axis=0)

    # Fallback: the dimensionality is taken from any stored item vector.
    dimension = len(next(iter(item_memory.values())))
    return np.zeros(dimension)

def construct_language_centroid(ngram_stats, item_memory):
    """Sum the encoded n-grams of one language, weighted by their counts.

    Args:
        ngram_stats: dict mapping n-gram string -> occurrence count.
        item_memory: dict mapping symbol -> NumPy vector, forwarded to
            ``encode_ngram``.

    Returns:
        Float NumPy vector with the same length as the item vectors, holding
        the count-weighted sum of all encoded n-grams.
    """
    dimension = len(next(iter(item_memory.values())))
    accumulator = np.zeros(dimension)
    for ngram, occurrences in ngram_stats.items():
        vector = encode_ngram(ngram, item_memory)
        # Anything that is not a NumPy array is skipped.
        if not isinstance(vector, np.ndarray):
            continue
        accumulator += vector * occurrences
    return accumulator

def classify_language(query_vector, centroids):
    """Pick the language whose centroid is most cosine-similar to the query.

    Args:
        query_vector: Vector to classify.
        centroids: dict mapping language label -> centroid vector.

    Returns:
        The label with the highest cosine similarity (1 - cosine distance);
        on ties the label that comes first in ``centroids`` wins.
    """
    def similarity_to(lang):
        # scipy's `cosine` is a distance, so it is flipped into a similarity.
        return 1 - cosine(query_vector, centroids[lang])

    return max(centroids, key=similarity_to)

# Dimensionality of the item vectors and of the resulting centroids.
d = 100
#d = 1000
# Author's note: accuracy = 0.95 (presumably obtained with d = 1000 - confirm)

# Item memory: one random vector of -1/1 entries per alphabet symbol.
H = create_item_memory(d, alphabet)


# One centroid per training language, built from its n-gram counts.
language_centroids = {language: construct_language_centroid(ngram_stats, H) 
                      for language, ngram_stats in training_language_ngram_stats.items()}


# The test data is encoded the same way, so each test language is
# represented by a single aggregated query vector.
test_language_centroids = {language: construct_language_centroid(ngram_stats, H) 
                           for language, ngram_stats in testing_language_ngram_stats.items()}


# Map each test language to the training language with the most similar centroid.
predicted_languages = {language: classify_language(query_vector, language_centroids)
                       for language, query_vector in test_language_centroids.items()}

print(predicted_languages)


{'bul': 'bul', 'ces': 'slk', 'dan': 'dan', 'deu': 'deu', 'ell': 'ell', 'eng': 'eng', 'est': 'est', 'fin': 'fin', 'fra': 'fra', 'hun': 'hun', 'ita': 'ita', 'lav': 'lav', 'lit': 'lit', 'nld': 'nld', 'pol': 'pol', 'por': 'por', 'ron': 'por', 'slk': 'slk', 'slv': 'slv', 'spa': 'spa', 'swe': 'swe'}


In [10]:
# Each test language contributes exactly one sample, so the true label of a
# sample is its key in the test statistics and the prediction is looked up by it.
true_labels = [language for language in testing_language_ngram_stats.keys()]
predicted_labels = [predicted_languages[language] for language in testing_language_ngram_stats.keys()]


# Rows are true languages, columns predicted ones; both follow the key order
# of `language_centroids`.
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=list(language_centroids.keys()))
print("Confusion Matrix:\n", conf_matrix)


# Fraction of test languages that were classified correctly.
accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)


# F1 averaged over the classes, weighted by each class's number of true samples.
f1 = f1_score(true_labels, predicted_labels, average='weighted')
print("F1-Score:", f1)


Confusion Matrix:
 [[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]
Accuracy: 0.9047619047619048
F1-Sco

## Questions

## Question 1: 
What will be the size of the n-gram input vector in conventional (local) representation?
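19683 (= $27^3$: with a 27-symbol alphabet of the space plus the letters a–z, every possible trigram gets its own dimension)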

## Question 2: 
Identify difficulties of working with conventional representations of n-grams in the machine learning context
Conventional (local) representations of n-grams produce very high-dimensional vectors, which often leads to computational challenges and sparsity issues. Most elements are zero, which increases the computational resources the models need and hinders effective learning.

Working with such high-dimensional vectors often leads to computational challenges and sparsity issues, where most elements of the vectors are zero, hindering effective learning and increasing the computational resources needed for machine learning models.