In [38]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

class BERTCardinalConverter:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('NbAiLab/nb-bert-base')
        self.model = BertForMaskedLM.from_pretrained('NbAiLab/nb-bert-base')

    def predict_masked_token(self, sentence):
        # Tokenize input
        inputs = self.tokenizer(sentence, return_tensors='pt')
        mask_token_index = torch.where(inputs["input_ids"] == self.tokenizer.mask_token_id)[1]

        # Predict all tokens
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Extract the predictions for the masked token
        logits = outputs.logits
        mask_token_logits = logits[0, mask_token_index, :]
        top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

        predictions = [self.tokenizer.decode([token]).strip() for token in top_5_tokens]
        return predictions

# Example usage
converter = BERTCardinalConverter()
masked_sentence = "Eiendommen var på flere 100 mål, men …flere ett [MASK] mål, "
predictions = converter.predict_masked_token(masked_sentence)
print(predictions)


['##oo', '##ue', '100', '000', '##o']


In [47]:
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Function to convert numbers to words (specific to Norwegian)
def number_to_words(number):
    small_numbers = ["null", "en", "to", "tre", "fire", "fem", "seks", "syv", "åtte", "ni", "ti", "elleve", "tolv", "tretten", "fjorten", "femten", "seksten", "sytten", "atten", "nitten"]
    tens = ["", "", "tjue", "tretti", "førti", "femti", "seksti", "sytti", "åtti", "nitti"]

    if 0 <= number < 20:
        return small_numbers[number]
    elif 20 <= number < 100:
        return tens[number // 10] + (small_numbers[number % 10] if number % 10 != 0 else "")
    elif 100 <= number < 1000:
        if number % 100 == 0:
            return small_numbers[number // 100] + " hundre"
        else:
            return small_numbers[number // 100] + " hundre " + number_to_words(number % 100)
    elif 1000 <= number < 1000000:
        if number % 1000 == 0:
            return number_to_words(number // 1000) + " tusen"
        elif number // 1000 == 1:
            return "ett tusen " + number_to_words(number % 1000)
        else:
            return number_to_words(number // 1000) + " tusen " + number_to_words(number % 1000)
    elif 1000000 <= number < 1000000000:
        if number % 1000000 == 0:
            return number_to_words(number // 1000000) + " million" + ("er" if number // 1000000 > 1 else "")
        else:
            return number_to_words(number // 1000000) + " million" + ("er" if number // 1000000 > 1 else "") + " " + number_to_words(number % 1000000)
    elif 1000000000 <= number < 1000000000000:
        if number % 1000000000 == 0:
            return number_to_words(number // 1000000000) + " milliard" + ("er" if number // 1000000000 > 1 else "")
        else:
            return number_to_words(number // 1000000000) + " milliard" + ("er" if number // 1000000000 > 1 else "") + " " + number_to_words(number % 1000000000)
    else:
        return str(number)  # For simplicity, handle larger numbers as is


# Function to normalize measurements
def normalize_measurement(token):
    measurement_patterns = {
        "kg": "kilogram",
        "g": "gram",
        "m": "meter",
        "cm": "centimeter",
        "mm": "millimeter",
        "km": "kilometer",
        "l": "liter",
        "ml": "milliliter"
    }
    match = re.match(r"(\d+)([a-zA-Z]+)", token)
    if match:
        number, unit = match.groups()
        if unit in measurement_patterns:
            return number_to_words(int(number)) + " " + measurement_patterns[unit]
    return token

# Function to normalize dates
def normalize_date(token):
    date_patterns = [
        r"(\d{2})/(\d{2})/(\d{4})",  # dd/mm/yyyy
        r"(\d{4})-(\d{2})-(\d{2})"   # yyyy-mm-dd
    ]
    for pattern in date_patterns:
        match = re.match(pattern, token)
        if match:
            groups = match.groups()
            if len(groups) == 3:
                day, month, year = groups if pattern == date_patterns[0] else (groups[2], groups[1], groups[0])
                month_names = ["januar", "februar", "mars", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "desember"]
                return f"{number_to_words(int(day))} {month_names[int(month) - 1]} {number_to_words(int(year))}"
    return token

# Function to normalize time
def normalize_time(token):
    time_patterns = [
        r"(\d{1,2}):(\d{2})"  # hh:mm
    ]
    for pattern in time_patterns:
        match = re.match(pattern, token)
        if match:
            hours, minutes = match.groups()
            return f"{number_to_words(int(hours))} {number_to_words(int(minutes))}"
    return token


# Custom function to normalize text
def normalize_text(text):
    # Tokenize text using NLTK
    tokens = word_tokenize(text)
    
    normalized_tokens = []
    for token in tokens:
        # Check if the token is a number
        if token.isdigit():
            normalized_number = number_to_words(int(token))
            normalized_tokens.append(normalized_number)
        elif re.match(r"(\d+)([a-zA-Z]+)", token):
            normalized_tokens.append(normalize_measurement(token))
        elif re.match(r"(\d{2})/(\d{2})/(\d{4})", token) or re.match(r"(\d{4})-(\d{2})-(\d{2})", token):
            normalized_tokens.append(normalize_date(token))
        elif re.match(r"(\d{1,2}):(\d{2})", token):
            normalized_tokens.append(normalize_time(token))
        else:
            normalized_tokens.append(token)
    
    # Reconstruct the normalized tokens back into a sentence
    normalized_text = ' '.join(normalized_tokens)
    return normalized_text

# Example usage
input_text = "Jeg har 1200 epler og 500 appelsiner. Møtet er kl 14:30 den 01/01/2020. Han veier 70kg og er 180cm høy."
normalized_text = normalize_text(input_text)
print(f"Original: {input_text}")
print(f"Normalized: {normalized_text}")


Original: Jeg har 1200 epler og 500 appelsiner. Møtet er kl 14:30 den 01/01/2020. Han veier 70kg og er 180cm høy.
Normalized: Jeg har ett tusen to hundre epler og fem hundre appelsiner . Møtet er kl fjorten tretti den en januar to tusen tjue . Han veier sytti kilogram og er en hundre åtti centimeter høy .


[nltk_data] Downloading package punkt to /home/lemoi18/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
%pip install inflect

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting inflect
  Downloading inflect-7.3.1-py3-none-any.whl.metadata (21 kB)
Collecting more-itertools>=8.5.0 (from inflect)
  Downloading more_itertools-10.3.0-py3-none-any.whl.metadata (36 kB)
Collecting typeguard>=4.0.1 (from inflect)
  Downloading typeguard-4.3.0-py3-none-any.whl.metadata (3.7 kB)
Collecting typing-extensions>=4.10.0 (from typeguard>=4.0.1->inflect)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading inflect-7.3.1-py3-none-any.whl (34 kB)
Downloading more_itertools-10.3.0-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading typeguard-4.3.0-py3-none-any.whl (35 kB)
Downloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Installing collected packages: typing-extensions, more-itertools, typeguard, inflect
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.8.0
    Uninstalling