# Language Identification

In [None]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m890.9/897.5 kB[0m [31m67.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


## Tokenize data in the proper language


In [None]:
import re

def clean_text(text):
  """Normalise raw text before tokenization.

  The text is lower-cased, every character that is neither a word character
  nor whitespace is deleted, each run of whitespace is collapsed into a single
  space, and leading/trailing whitespace is trimmed.
  """
  lowered = text.lower()
  without_symbols = re.sub(r'[^\w\s]', '', lowered)
  single_spaced = re.sub(r'\s+', ' ', without_symbols)
  return single_spaced.strip()


In [None]:
import os
import sys
from sacremoses import MosesTokenizer

def tokenize_files(directory):
    """
    Tokenize every ``.txt`` file found directly inside ``directory``.

    The language is the file name without its extension, lower-cased
    (``English.txt`` -> ``english``). It is passed as ``lang`` to
    ``MosesTokenizer`` and used as the key of the returned dictionary.
    The file content is normalised with ``clean_text`` before tokenization.

    NOTE(review): sacremoses language codes are usually ISO 639-1 codes such
    as ``en``; confirm that full language names behave as intended.

    Args:
        directory: Path of the folder holding one ``<language>.txt`` file per language.

    Returns:
        dict mapping each language to the list of tokens read from its file.
        A file that cannot be read or tokenized is reported on stdout and skipped.

    Raises:
        FileNotFoundError: if ``directory`` is not an existing directory.
    """
    # Check if the directory exists
    if not os.path.isdir(directory):
        raise FileNotFoundError(f"The directory '{directory}' does not exist.")

    tokenized_data = {}

    # Iterate over .txt files
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(directory, filename)

        # Strip only the trailing extension: str.replace would also delete any
        # ".txt" occurring inside the name itself.
        language = os.path.splitext(filename)[0].lower()
        try:
            tokenizer = MosesTokenizer(lang=language)  # Initialize tokenizer based on language
            with open(file_path, "r", encoding="utf-8") as f:
                content = clean_text(f.read())
            tokens = tokenizer.tokenize(content)  # Tokenization
            tokenized_data[language] = tokens  # Store tokens in dictionary

            # sys.getsizeof(list) only measures the list object itself, not the
            # strings it references, so the tokens are added explicitly.
            data_size = sys.getsizeof(tokens) + sum(sys.getsizeof(token) for token in tokens)

            print(f"Tokenization completed ({language}): {filename}")
            print(f"Number of tokens in {language}: {len(tokens)} tokens")
            print(f"Size of the data in {language} : {data_size} bytes")
            print("")
        except Exception as e:
            # Best effort: one unreadable file must not abort the other languages.
            print(f"Error with {filename} ({language}): {e}")

    return tokenized_data

# Folder expected to contain one "<language>.txt" corpus per language.
directory_path = "/content/data"
# Maps each language to its token list; tokenize_files raises FileNotFoundError
# when the folder does not exist.
tokenized_results = tokenize_files(directory_path)

FileNotFoundError: The directory '/content/data' does not exist.

## Split Data
We split each language's tokens into training, test and held-out sets: the first 80% of the tokens are used for training, the next 10% for testing and the remaining 10% as held-out data.

In [None]:
def create_sets(tokenized_data):
  """Split each language's token list into training, test and held-out parts.

  For every language the first 80% of the tokens form the training set, the
  following 10% the test set, and whatever remains the held-out set.

  Returns a tuple ``(training_data, test_data, heldout_data)`` of dictionaries
  keyed by language.
  """
  training_data, test_data, heldout_data = {}, {}, {}

  for language in tokenized_data:
    tokens = tokenized_data[language]
    train_end = int(len(tokens) * 0.8)
    test_end = int(len(tokens) * 0.1 + train_end)

    training_data[language] = tokens[:train_end]
    test_data[language] = tokens[train_end:test_end]
    heldout_data[language] = tokens[test_end:]

  return training_data, test_data, heldout_data

# Split the tokenized corpora once, then expose each part under its own name.
sets = create_sets(tokenized_results)
training_data, test_data, heldout_data = sets




NameError: name 'tokenized_results' is not defined

In [None]:
def calculate_oov_percentage(training_data, data):
    """
    Compute, per language, the share of tokens that never occur in the training data.

    Args:
        training_data: dict mapping a language to its list of training tokens.
        data: dict mapping a language to the tokens to evaluate
            (for example the held-out or the test set).

    Returns:
        dict mapping each language of ``data`` to its out-of-vocabulary rate,
        as a percentage rounded to two decimals. A language without tokens
        gets 0; a language missing from ``training_data`` is compared against
        an empty vocabulary.
    """
    oov_percentages = {}

    for language, tokens in data.items():
        # A set makes each membership test constant-time.
        training_vocab = set(training_data.get(language) or ())
        tokens = tokens or []

        oov_count = sum(1 for token in tokens if token not in training_vocab)
        oov_percentage = (oov_count / len(tokens)) * 100 if tokens else 0

        oov_percentages[language] = round(oov_percentage, 2)

    return oov_percentages

    print(calculate_oov_percentage(training_data, heldout_data))
    print(calculate_oov_percentage(training_data, test_data))

## Probabilities of n-grams

First, we create n-grams (characters or tokens).


In [None]:
def extract_charac_ngrams(tokens, n):
  """Collect every run of ``n`` consecutive characters found inside each token.

  N-grams never span two tokens, and a token shorter than ``n`` contributes
  nothing. The n-grams are returned as one flat list, in order of appearance.
  """
  ngrams = []
  for token in tokens:
    window_count = len(token) - n + 1
    for start in range(window_count):
      ngrams.append(token[start:start + n])
  return ngrams



In [None]:
def extract_tokens_ngrams(tokens, n):
  """Slide a window of ``n`` tokens over ``tokens``.

  Returns the list of windows, each one as a tuple, in order of appearance.
  The list is empty when there are fewer than ``n`` tokens.
  """
  window_count = len(tokens) - n + 1
  return [tuple(tokens[start:start + n]) for start in range(window_count)]


Now, we compute the probabilities and return a dictionary of `n-grams`.


In [None]:
from collections import Counter

def compute_ngram_probabilities(extract_ngrams, tokens, n):
    """
    Estimate n-gram probabilities from ``tokens`` by relative frequency.

    ``extract_ngrams(tokens, k)`` must return the list of k-grams of ``tokens``.
    For ``n == 1`` each probability is the n-gram count divided by the total
    number of n-grams. Otherwise it is the n-gram count divided by the count of
    its history (the n-gram without its last element), or 0 when that history
    was never extracted.

    Returns a tuple ``(probabilities, ngram_counts, total_ngrams)``: the
    probability dictionary, the ``Counter`` of n-grams and their total number.
    """
    ngram_counts = Counter(extract_ngrams(tokens, n))
    total_ngrams = sum(ngram_counts.values())

    if n == 1:
        probabilities = {}
        for ngram, count in ngram_counts.items():
            probabilities[ngram] = count / total_ngrams
        return probabilities, ngram_counts, total_ngrams

    history_counts = Counter(extract_ngrams(tokens, n - 1))
    probabilities = {
        # P(last element | history)
        ngram: (count / history_counts[ngram[:-1]] if ngram[:-1] in history_counts else 0)
        for ngram, count in ngram_counts.items()
    }
    return probabilities, ngram_counts, total_ngrams



In [None]:
def get_most_common(ngram_counts, total_ngrams, top_n=5):
    """Return the ``top_n`` most frequent n-grams (5 by default).

    Each entry is a tuple ``(ngram, count, count / total_ngrams)``; entries are
    ordered from the most to the least frequent.
    """
    ranked = []
    for ngram, count in ngram_counts.most_common(top_n):
        ranked.append((ngram, count, count / total_ngrams))
    return ranked


In [None]:
def get_least_common(ngram_counts, total_ngrams, top_n=5):
    """Return the ``top_n`` least frequent n-grams (5 by default).

    Each entry is a tuple ``(ngram, count, count / total_ngrams)``. Entries
    keep the order of ``Counter.most_common()``, so the rarest n-gram comes
    last. A non-positive ``top_n`` yields an empty list.
    """
    # Guard needed because a slice starting at -0 would return *every* n-gram.
    if top_n <= 0:
        return []

    rarest = ngram_counts.most_common()[-top_n:]
    return [(ngram, count, count / total_ngrams) for ngram, count in rarest]


In [None]:
def process_tokenized_data(tokenized_data):
    """
    Build character n-gram statistics for every language of ``tokenized_data``.

    For each language (dictionary key) the character unigram, bigram and
    trigram probabilities are computed with ``compute_ngram_probabilities``.

    Returns a dictionary keyed by language whose values hold the three
    probability tables under ``'unigram'``, ``'bigram'`` and ``'trigram'``,
    plus the most and least common trigrams.
    """
    results = {}

    for language in tokenized_data:
        tokens = tokenized_data[language]

        # Only the probability table is kept for unigrams and bigrams.
        unigram_probs = compute_ngram_probabilities(extract_charac_ngrams, tokens, 1)[0]
        bigram_probs = compute_ngram_probabilities(extract_charac_ngrams, tokens, 2)[0]
        trigram_stats = compute_ngram_probabilities(extract_charac_ngrams, tokens, 3)
        trigram_probs, trigram_counts, total_trigrams = trigram_stats

        summary = {}
        summary['unigram'] = unigram_probs
        summary['bigram'] = bigram_probs
        summary['trigram'] = trigram_probs
        summary['most_common_trigrams'] = get_most_common(trigram_counts, total_trigrams)
        summary['least_common_trigrams'] = get_least_common(trigram_counts, total_trigrams)
        results[language] = summary

    return results

# Character n-gram statistics for every language of the training set.
ngram_results = process_tokenized_data(training_data)


# Print the most and least common character trigrams of each language.
# NOTE: the loop variable `filename` holds the language key of `ngram_results`.
for filename, ngrams in ngram_results.items():
    print(f"Data Language: {filename}")
    print(f"Most Common: {ngrams['most_common_trigrams']}")
    print(f"Least Common: {list(ngrams['least_common_trigrams'])}")
    print("-")

Data Language: english
Most Common: [('the', 21738, 0.02821563191177845), ('and', 12975, 0.016841375658079187), ('ing', 9499, 0.012329574364246181), ('her', 9441, 0.012254291143578082), ('hat', 5583, 0.0072466589825862125)]
Least Common: [('ieg', 1, 1.2979865632430974e-06), ('870', 1, 1.2979865632430974e-06), ('shh', 1, 1.2979865632430974e-06), ('zyb', 1, 1.2979865632430974e-06), ('ghh', 1, 1.2979865632430974e-06)]
-
Data Language: french
Most Common: [('ent', 5877, 0.011949469420785204), ('que', 5189, 0.01055058649386667), ('les', 4208, 0.008555958366966843), ('ait', 3614, 0.0073482018864586916), ('ant', 3558, 0.007234339322639741)]
Least Common: [('wys', 1, 2.0332600681955427e-06), ('iqû', 1, 2.0332600681955427e-06), ('qûr', 1, 2.0332600681955427e-06), ('tga', 1, 2.0332600681955427e-06), ('fûm', 1, 2.0332600681955427e-06)]
-
Data Language: czech
Most Common: [('sta', 2408, 0.0051783275808307255), ('byl', 2400, 0.005161123834715009), ('ost', 2266, 0.004872961087276755), ('pro', 2151, 

## Add-less-than-one smoothing and EM algorithm

In [None]:
def compute_smoothed_ngram_probabilities(extract_ngrams, train_tokens, n, lambda_value):
    """Compute add-less-than-one (add-lambda) smoothed n-gram probabilities.

        P'(w | h) = (c(h, w) + lambda) / (c(h) + lambda * |V|)

    where ``c(h, w)`` is the count of the n-gram, ``c(h)`` the count of its
    history (the n-gram without its last element) and ``|V|`` the vocabulary
    size, i.e. the number of distinct unigrams. Unigrams have an empty
    history, so ``c(h)`` is then the total number of unigrams.

    Args:
        extract_ngrams: callable ``(tokens, k) -> list of k-grams``.
        train_tokens: training tokens the counts are taken from.
        n: order of the n-grams (1, 2, 3, ...).
        lambda_value: additive smoothing constant.

    Returns:
        Dictionary with the observed n-grams as keys and their smoothed
        probabilities as values.
    """
    ngram_counts = Counter(extract_ngrams(train_tokens, n))

    if n == 1:
        # Empty history: normalise by the total number of unigrams rather than
        # by a count of "0-grams", which does not equal that total.
        vocabulary_size = len(ngram_counts)
        denominator = sum(ngram_counts.values()) + lambda_value * vocabulary_size
        return {
            ngram: (count + lambda_value) / denominator
            for ngram, count in ngram_counts.items()
        }

    history_counts = Counter(extract_ngrams(train_tokens, n - 1))
    # |V| is the number of possible next elements, not the number of distinct n-grams.
    vocabulary_size = len(set(extract_ngrams(train_tokens, 1)))

    probabilities = {}
    for ngram, count_h_w in ngram_counts.items():
        count_h = history_counts[ngram[:-1]]
        probabilities[ngram] = (count_h_w + lambda_value) / (count_h + lambda_value * vocabulary_size)

    return probabilities

In [None]:
def compute_interpolated_probabilities(extract_ngrams, train_tokens, lambdas, lambda_value):
  """Compute linearly interpolated trigram probabilities.

  Each training trigram gets

      lambdas[3] * P3 + lambdas[2] * P2 + lambdas[1] * P1 + lambdas[0] / |V|

  where P3, P2 and P1 are the add-less-than-one smoothed trigram, bigram and
  unigram probabilities and |V| is the number of distinct unigrams.

  Args:
      extract_ngrams: callable ``(tokens, k) -> list of k-grams``.
      train_tokens: training tokens.
      lambdas: four interpolation weights, indexed by n-gram order
          (index 0 is the uniform term).
      lambda_value: additive smoothing constant.

  Returns:
      Dictionary with the training trigrams as keys and their interpolated
      probabilities as values.
  """
  trigram_probs = compute_smoothed_ngram_probabilities(extract_ngrams, train_tokens, 3, lambda_value)
  bigram_probs = compute_smoothed_ngram_probabilities(extract_ngrams, train_tokens, 2, lambda_value)
  unigram_probs = compute_smoothed_ngram_probabilities(extract_ngrams, train_tokens, 1, lambda_value)

  # The uniform term spreads its mass over the vocabulary, not over the trigrams.
  V = len(unigram_probs)
  uniform_prob = 1 / V if V else 0

  probabilities = {}
  for trigram, p3 in trigram_probs.items():
      # Slicing keeps the key type (tuple for tokens, str for characters), so the
      # lower-order lookups hit the keys the extractor really produced.
      p2 = bigram_probs.get(trigram[-2:], 0)
      p1 = unigram_probs.get(trigram[-1:], 0)

      probabilities[trigram] = lambdas[3] * p3 + lambdas[2] * p2 + lambdas[1] * p1 + lambdas[0] * uniform_prob

  return probabilities

In [None]:
def optimize_lambdas_with_em(extract_ngrams, train_tokens, heldout_tokens, lambda_value, epsilon=0.03, max_iter=1000):
    """Tune the four interpolation weights with the EM algorithm.

    The weights are indexed by n-gram order: index 0 is the uniform model,
    1 the unigram, 2 the bigram and 3 the trigram model, all estimated on
    ``train_tokens`` with add-less-than-one smoothing. At each iteration every
    held-out trigram distributes one unit of expected count between the four
    models in proportion to ``lambda_j * P_j``; the normalised expected counts
    become the new weights.

    Args:
        extract_ngrams: callable ``(tokens, k) -> list of k-grams``.
        train_tokens: tokens used to estimate the component models.
        heldout_tokens: tokens used to tune the weights.
        lambda_value: additive smoothing constant.
        epsilon: the loop stops once no weight moves by ``epsilon`` or more.
        max_iter: maximum number of EM iterations.

    Returns:
        List of four floats summing to 1. The initial ``[0.25] * 4`` is
        returned when the held-out data contains no usable trigram.
    """
    lambdas = [0.25, 0.25, 0.25, 0.25]

    # The component models do not depend on the weights: build them once.
    unigram_probs = compute_smoothed_ngram_probabilities(extract_ngrams, train_tokens, 1, lambda_value)
    bigram_probs = compute_smoothed_ngram_probabilities(extract_ngrams, train_tokens, 2, lambda_value)
    trigram_probs = compute_smoothed_ngram_probabilities(extract_ngrams, train_tokens, 3, lambda_value)
    uniform_prob = 1 / len(unigram_probs) if unigram_probs else 0

    # Probability of each held-out trigram under every component, ordered like
    # `lambdas`. Slicing keeps the key type (tuple for tokens, str for characters).
    component_probs = [
        (
            uniform_prob,
            unigram_probs.get(trigram[-1:], 0),
            bigram_probs.get(trigram[-2:], 0),
            trigram_probs.get(trigram, 0),
        )
        for trigram in extract_ngrams(heldout_tokens, 3)
    ]

    for iteration in range(max_iter):
        expected_counts = [0, 0, 0, 0]

        for probs in component_probs:
            weighted = [lambdas[i] * probs[i] for i in range(4)]
            p_lambda = sum(weighted)
            if p_lambda == 0:
                # No component explains this trigram: it carries no information.
                continue
            for i in range(4):
                expected_counts[i] += weighted[i] / p_lambda
        if iteration % 10 == 0:
            print(f"Iteration {iteration+1}, Expected Counts: {expected_counts}")

        total_count = sum(expected_counts)
        if total_count == 0:
            # Nothing to learn from (e.g. empty held-out data): keep the current weights.
            break

        new_lambdas = [ec / total_count for ec in expected_counts]

        print(f"New Lambdas: {new_lambdas}")

        converged = all(abs(new_lambdas[i] - lambdas[i]) < epsilon for i in range(4))
        lambdas = new_lambdas
        if converged:
            print(f"Convergence atteinte après {iteration+1} itérations.")
            break

    return lambdas

In [None]:

# Additive constant of the add-less-than-one smoothing; passed as `lambda_value`
# to the EM optimisation and to the interpolated models below.
lambda_value = 0.2


## Probabilities of our data

In [None]:
# Token-trigram models: for each language, tune the interpolation weights
# against the held-out tokens, then build the interpolated trigram
# probabilities from the training tokens.
training_trigram_probs = {}
for language, tokens in training_data.items():
    lambdas = optimize_lambdas_with_em(extract_tokens_ngrams, tokens, heldout_data.get(language), lambda_value)
    trigram_probs = compute_interpolated_probabilities(extract_tokens_ngrams, tokens, lambdas, lambda_value)
    training_trigram_probs[language] = trigram_probs


# Same procedure with character n-grams instead of token n-grams.
training_trigram_charac_probs = {}
for language, tokens in training_data.items():
    lambdas = optimize_lambdas_with_em(extract_charac_ngrams, tokens, heldout_data.get(language), lambda_value)
    trigram_probs = compute_interpolated_probabilities(extract_charac_ngrams, tokens, lambdas, lambda_value)
    training_trigram_charac_probs[language] = trigram_probs

Iteration 1, Expected Counts: [10055.25, 10055.25, 10055.25, 10055.25]
New Lambdas: [0.25, 0.25, 0.25, 0.25]
Convergence atteinte après 1 itérations.
Iteration 1, Expected Counts: [5864.5, 5864.5, 5864.5, 5864.5]
New Lambdas: [0.25, 0.25, 0.25, 0.25]
Convergence atteinte après 1 itérations.
Iteration 1, Expected Counts: [5055.25, 5055.25, 5055.25, 5055.25]
New Lambdas: [0.25, 0.25, 0.25, 0.25]
Convergence atteinte après 1 itérations.
Iteration 1, Expected Counts: [22774.25, 22774.25, 22774.25, 22774.25]
New Lambdas: [0.25, 0.25, 0.25, 0.25]
Convergence atteinte après 1 itérations.
Iteration 1, Expected Counts: [15113.75, 15113.75, 15113.75, 15113.75]
New Lambdas: [0.25, 0.25, 0.25, 0.25]
Convergence atteinte après 1 itérations.
Iteration 1, Expected Counts: [14331.25, 14331.25, 14331.25, 14331.25]
New Lambdas: [0.25, 0.25, 0.25, 0.25]
Convergence atteinte après 1 itérations.


## Compute Cross Entropy of trigram

In [None]:
import numpy as np

def compute_cross_entropy(trigrams, trigram_probs):
  """Average negative log2-probability of ``trigrams`` under ``trigram_probs``.

  A trigram missing from ``trigram_probs`` is given the floor probability
  1e-10. The lower the returned value, the better the model fits the
  trigrams. An empty ``trigrams`` sequence raises ``ZeroDivisionError``.
  """
  sample_size = len(trigrams)
  log_prob_total = 0
  for trigram in trigrams:
    log_prob_total += np.log2(trigram_probs.get(trigram,1e-10))
  return -log_prob_total / sample_size



In [None]:
# Cross-entropy of every test set under every training language, keyed by
# (test language, training language).
cross_entropies = {}

for language_train in training_data.keys():
    # The trigram model only depends on the training language, so it is built
    # once per training language instead of once per (training, test) pair.
    trigram_probs, _, _ = compute_ngram_probabilities(extract_tokens_ngrams, training_data[language_train], 3)
    for language, test_tokens in test_data.items():
        trigrams = extract_tokens_ngrams(test_tokens, 3)
        cross_entropies[language, language_train] = compute_cross_entropy(trigrams, trigram_probs)

for languages in cross_entropies.keys():
    print(f"Language Training : {languages[1]}, Language test : {languages[0]}\n Proba Cross-Entropy :{cross_entropies.get(languages)}")
    print("\n")


Language Training : english, Language test : english
 Proba Cross-Entropy :27.749919845556107


Language Training : english, Language test : french
 Proba Cross-Entropy :33.03314596554471


Language Training : english, Language test : czech
 Proba Cross-Entropy :33.2192809488651


Language Training : french, Language test : english
 Proba Cross-Entropy :33.21781047076802


Language Training : french, Language test : french
 Proba Cross-Entropy :29.52630606214211


Language Training : french, Language test : czech
 Proba Cross-Entropy :33.2192809488651


Language Training : czech, Language test : english
 Proba Cross-Entropy :33.219280948863776


Language Training : czech, Language test : french
 Proba Cross-Entropy :33.21928094886473


Language Training : czech, Language test : czech
 Proba Cross-Entropy :31.253510329428686




From this output, we can see that the cross-entropy is lowest when the test language matches the training language; we will use this for the implementation of ``identify_language()``.

## Identify Language

In [None]:
def softmax(scores):
    """Turn cross-entropy scores into probabilities (lower score, higher probability).

    Each score ``H`` is weighted by ``e^(-H)`` and the weights are normalised
    to sum to 1.

    Args:
        scores: dict mapping a label (here a language) to its score.

    Returns:
        dict mapping every label of ``scores`` to its probability, in the same
        order. An empty input gives an empty dict.
    """
    if not scores:
        return {}

    values = np.array(list(scores.values()), dtype=float)
    # Shifting by the best (smallest) score leaves the ratios unchanged but
    # prevents every weight from underflowing to 0 when the scores are large.
    exp_scores = np.exp(-(values - values.min()))  # e^(-(H - H_min))
    sum_exp_scores = np.sum(exp_scores)
    probabilities = {lang: exp_scores[i] / sum_exp_scores for i, lang in enumerate(scores)}
    return probabilities

In [None]:
def identify_tokens_language(text):
    """Rank the known languages for ``text`` using token-trigram probabilities.

    The text is normalised with ``clean_text`` and tokenized, then its token
    trigrams are scored against each model of ``training_trigram_probs`` by
    cross-entropy and the scores are turned into probabilities with ``softmax``.

    Returns:
        A list of ``(language, probability)`` pairs sorted from the most to
        the least likely, or ``None`` (after printing a message) when the text
        has fewer than 3 tokens.
    """
    # The training corpora went through clean_text (lower-case, no punctuation);
    # the query must be normalised the same way or its trigrams never match.
    tokenizer = MosesTokenizer(lang='en')  # Default to English tokenizer
    tokens_text = tokenizer.tokenize(clean_text(text))

    if len(tokens_text) < 3:
        print("Text too short, please send at least 3 words")
        return None

    trigrams_text = extract_tokens_ngrams(tokens_text, 3)

    language_scores = {
        language: compute_cross_entropy(trigrams_text, trigram_probs)
        for language, trigram_probs in training_trigram_probs.items()
    }

    language_probabilities = softmax(language_scores)
    sorted_languages = sorted(language_probabilities.items(), key=lambda x: x[1], reverse=True)

    return sorted_languages


def identify_charac_language(text):
    """Rank the known languages for ``text`` using character-trigram probabilities.

    The text is normalised with ``clean_text`` and tokenized, then the
    character trigrams of its tokens are scored against each model of
    ``training_trigram_charac_probs`` by cross-entropy and the scores are
    turned into probabilities with ``softmax``.

    Returns:
        A list of ``(language, probability)`` pairs sorted from the most to
        the least likely, or ``None`` (after printing a message) when the text
        has fewer than 3 tokens or no token of at least 3 characters.
    """
    # The training corpora went through clean_text (lower-case, no punctuation);
    # the query must be normalised the same way or its trigrams never match.
    tokenizer = MosesTokenizer(lang='en')  # Default to English tokenizer
    tokens_text = tokenizer.tokenize(clean_text(text))

    if len(tokens_text) < 3:
        print("Text too short, please send at least 3 words")
        return None

    trigrams_text = extract_charac_ngrams(tokens_text, 3)

    if not trigrams_text:
        # Every token is shorter than 3 characters: there is nothing to score,
        # and the cross-entropy would divide by zero.
        print("Text too short, please send words of at least 3 characters")
        return None

    language_scores = {
        language: compute_cross_entropy(trigrams_text, trigram_probs)
        for language, trigram_probs in training_trigram_charac_probs.items()
    }

    language_probabilities = softmax(language_scores)
    sorted_languages = sorted(language_probabilities.items(), key=lambda x: x[1], reverse=True)

    return sorted_languages





Neither character-trigram nor token-trigram probabilities are very reliable on their own, but combining the two reduces the number of wrong predictions.

In [None]:
def identify_language(text, min_confidence=0.34):
    """Identify the language of ``text`` from token trigrams, falling back to character trigrams.

    The token-trigram ranking is used when its best probability reaches
    ``min_confidence``; otherwise the character-trigram ranking is returned.

    Args:
        text: the text to classify.
        min_confidence: smallest top probability for which the token-based
            ranking is trusted. The default 0.34 sits just above the uniform
            1/3 obtained with three languages when no token trigram is known.

    Returns:
        A list of ``(language, probability)`` pairs sorted from the most to
        the least likely, or ``None`` when the text is too short to be scored.
    """
    tokens_prob = identify_tokens_language(text)
    # None (text too short) or an empty ranking cannot be indexed: let the
    # character model decide; it returns None itself for a too-short text.
    if not tokens_prob or tokens_prob[0][1] < min_confidence:
        return identify_charac_language(text)
    return tokens_prob

In [None]:

# One sample sentence per language; each is run through the combined
# identifier and through both single-model identifiers.
samples = [
    ("Czech", "Jsem test a pracuji"),
    ("English", "I am a test and I am working"),
    ("French", "Je suis un test et je fonctionne"),
]

for sample_index, (language_name, text) in enumerate(samples):
    print(f"----- Test for {language_name} Language -----")
    print(f"Test text : {text}")
    print("\n")
    print(f"Both trigrams : {identify_language(text)}")
    print("\n")
    print(f"Tokens : {identify_tokens_language(text)}")
    print(f"Charac : {identify_charac_language(text)}")
    # Blank lines between samples, a closing rule after the last one.
    if sample_index < len(samples) - 1:
        print("\n")
    else:
        print("------------------------------------")

----- Test for Czech Language -----
Test text : Jsem test a pracuji


Both trigrams : [('czech', np.float64(0.795612974933282)), ('french', np.float64(0.18468449617678023)), ('english', np.float64(0.01970252888993776))]


Tokens : [('english', np.float64(0.3333333333333333)), ('french', np.float64(0.3333333333333333)), ('czech', np.float64(0.3333333333333333))]
Charac[('czech', np.float64(0.795612974933282)), ('french', np.float64(0.18468449617678023)), ('english', np.float64(0.01970252888993776))]


----- Test for English Language -----
Test text : I am a test and I am working


Both trigrams : [('english', np.float64(0.983146832569954)), ('french', np.float64(0.010652516948764498)), ('czech', np.float64(0.006200650481281525))]


Tokens : [('english', np.float64(0.3333333333333333)), ('french', np.float64(0.3333333333333333)), ('czech', np.float64(0.3333333333333333))]
Charac[('english', np.float64(0.983146832569954)), ('french', np.float64(0.010652516948764498)), ('czech', np.float64