# Test Env For Testing the G2P Model


In [1]:
import os
import re

#### Normalizeding
import sys
sys.path.append('/home/lemoi18/NB-PL-BERT/NbConverters')  # Add the path to your NBtext_normalize.py
from NBtext_normalize import normalize_text
###### noot used for now

import phonetisaurus
import logging
from convert_pa import nofabet_to_ipa, nofabet_to_sampa, sampa_to_ipa

import re

def apply_custom_adjustments(text):
    # List of century indicators
    centuries = ['atten', 'nitten', 'tjue', 'tjueen', 'tjueto','tjuetre','tjuefire']
    
    # List of decade words
    decades = ['ti', 'tjue', 'tretti', 'førti', 'femti', 'seksti', 'sytti', 'åtti', 'nitti', 'hundre']
    
    # Create regex patterns
    century_pattern = r'|'.join(centuries)
    decade_pattern = r'|'.join(decades)
    
    # Pattern to match phrases like 'nitten nitti - årene'
    decade_phrase_pattern = rf'\b({century_pattern})\b \b({decade_pattern})\b - årene'
    
    # Replacement pattern to convert to 'nitten nitti-tallet'
    replacement_pattern = r'\1 \2-tallet'

    custom_replacements = {
        r'e - post': 'e-post',  # Correcting 'e - post' back to 'e-post'
        r'e - posten': 'e-posten',
        r'«\s+': '«',  # Remove spaces after '«'
        r'\s+»': '»',  # Remove spaces before '»'
        decade_phrase_pattern: replacement_pattern,  # Correcting 'nitten nitti - årene' to 'nitten nitti-tallet'
        r'\s+-\s+': '-',  # Remove spaces around hyphens
    }

    # Apply each custom replacement
    for pattern, replacement in custom_replacements.items():
        text = re.sub(pattern, replacement, text)

    # Remove spaces around punctuation (.,!?;) but keep a space after commas
    text = re.sub(r'\s*([.!?;])\s*', r'\1', text)
    
    # Ensure a space after commas if not present
    text = re.sub(r',\s*', ', ', text)

    # Remove comma if it's the last character
    text = re.sub(r',\s*$', '', text)

    # Handle unmatched quotation marks
    num_left_quotes = text.count('«')
    num_right_quotes = text.count('»')

    if num_left_quotes != num_right_quotes:
        if num_left_quotes > num_right_quotes:
            text = remove_unmatched_quotes(text, '«')
        elif num_right_quotes > num_left_quotes:
            text = remove_unmatched_quotes(text, '»')
    
    return text

def remove_unmatched_quotes(text, quote_char):
    """Remove unmatched quotation marks from the text."""
    if quote_char == '«':
        if text.count('«') > text.count('»'):
            text = re.sub(r'«([^»]*)$', r'\1', text)  # Remove the last unmatched '«'
    elif quote_char == '»':
        if text.count('»') > text.count('«'):
            text = re.sub(r'^([^«]*)»', r'\1', text)  # Remove the first unmatched '»'
    return text

# Function to normalize text using custom rules
def normalize_text_with_custom_rules(text):
    normalized_text = normalize_text(text)  # Perform initial normalization
    normalized_text = apply_custom_adjustments(normalized_text)  # Apply custom adjustments
    return normalized_text



###### Written
###### Spoken 
def transcribe_words(words, dialect='e', style="written"):
    transcriptions = phonetisaurus.predict(words, model_path="/home/lemoi18/G2P-no/models/nb_e_spoken.fst")
    return transcriptions

def format_transcription(pronunciation):
    return " ".join(pronunciation)

def transcribe(text, dialect='e', style="spoken"):
    words = text.split()
    transcriptions = transcribe_words(words, dialect=dialect, style=style)
    return [(word, format_transcription(pron)) for word, pron in transcriptions]
    
def phonimize_IPA(text):
    tokens = re.findall(r"[\w']+|[.,!?;:]", text)
    transcriptions = transcribe(' '.join([t for t in tokens if re.match(r"[\w']+", t)]))

    transcription = []
    result_index = 0

    for token in tokens:
        if re.match(r"[.,!?;:]", token):  # Punctuation check
            if transcription:
                transcription[-1] += token
        else:  # It's a word
            word, phonetic = transcriptions[result_index]
            # Convert phonetic transcription to IPA
            ipa_transcription = nofabet_to_ipa(phonetic)
            # Remove syllable breaks (.)
            ipa_transcription = ipa_transcription.replace('.', '')
            transcription.extend(ipa_transcription.split())
            result_index += 1

    ps = ' '.join(transcription)
    return ps


def phonimize_NOFAB(text):
    

    tokens = re.findall(r"[\w']+|[.,!?;:]", text)
    transcriptions = transcribe(' '.join([t for t in tokens if re.match(r"[\w']+", t)]))
        
    transcription = []
    result_index = 0
        
    for token in tokens:
        if re.match(r"[.,!?;:]", token):
            transcription += token
        else:  # It's a word
            if result_index < len(transcriptions):
                word, phonetic = transcriptions[result_index]
                transcription.extend(phonetic.split())
                result_index += 1
            else:
                print(f"Warning: result_index {result_index} is out of range for transcriptions.")
                break  # or handle the error accordingly
        
    ps = transcription
    ps = ' '.join(transcription)
    return ps

def get_word_window(text, index, window_size=5):
    """Get a window of words around the specified index in the text."""
    words = text.split()
    start = max(0, index - window_size)
    end = min(len(words), index + window_size + 1)
    return ' '.join(words[start:end])


def process_train_listNormalize(input_file_path, output_file_path, k=100):
    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        for i, line in enumerate(infile):
            # Split the line into audio_path, text, and speaker_nr
            columns = line.split('|')

            # Normalize the second column (text)
            original_text = columns[1].strip()
            
            phonimize_text = normalize_text_with_custom_rules(original_text)

            # Reconstruct the line with the normalized text
            columns[1] = phonimize_text

            # Write the modified line to the output file in the same format
            outfile.write('|'.join(columns))

            # Stop after processing `k` lines (if required)
            if (i + 1) >= k:
                break



def process_train_listPhonminze(input_file_path, output_file_path):
    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        lines = infile.readlines()

        for line in lines:
            # Split the line into audio_path, text, and speaker_nr
            columns = line.split('|')

            # Normalize the second column (text)
            original_text = columns[1].strip()
            
            phonimize_text = phonimize_NOFAB(original_text)

            # Reconstruct the line with the normalized text
            columns[1] = phonimize_text

            # Write the modified line to the output file in the same format
            outfile.write('|'.join(columns))

            

# Example usage:
#process_train_listPhonminze('normalized.txt', 'phonimized.txt')
#process_train_listNormalize('outputbooks.txt' , 'normalized.txt')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import pandas as pd
import re

import pandas as pd
import difflib
from IPython.display import Markdown, display

import pandas as pd
import difflib

def get_word_window(tokens, index, window_size=5):
    """Get a window of words around the specified index in the token list."""
    start = max(0, index - window_size)
    end = min(len(tokens), index + window_size + 1)
    return ' '.join(tokens[start:end])

def find_real_changes(original_tokens, normalized_tokens):
    """Find the real changes between two token lists, ignoring shifts in position."""
    diff = list(difflib.ndiff(original_tokens, normalized_tokens))
    changes = []
    
    orig_index = 0  # Index for original tokens
    norm_index = 0  # Index for normalized tokens
    
    for d in diff:
        if d.startswith('- ') and orig_index < len(original_tokens):  # Word removed from original
            changes.append((orig_index, original_tokens[orig_index], 'removed'))
            orig_index += 1
        elif d.startswith('+ ') and norm_index < len(normalized_tokens):  # Word added in normalized
            changes.append((norm_index, normalized_tokens[norm_index], 'added'))
            norm_index += 1
        elif d.startswith('  '):  # No change, increment both indices
            if orig_index < len(original_tokens):
                orig_index += 1
            if norm_index < len(normalized_tokens):
                norm_index += 1
    
    return changes

def process_train_listNormalize_with_context(input_file_path, output_file_path, k=100):
    original_texts = []
    normalized_texts = []
    contexts = []

    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        for i, line in enumerate(infile):
            # Split the line into audio_path, text, and speaker_nr
            columns = line.split('|')

            # Normalize the second column (text)
            original_text = columns[1].strip()
            phonimize_text = normalize_text_with_custom_rules(original_text)

            # Tokenize both original and normalized texts
            original_tokens = original_text.split()
            normalized_tokens = phonimize_text.split()

            # Find real changes using diff logic
            changes = find_real_changes(original_tokens, normalized_tokens)

            # Collect the context around each change
            for index, changed_word, change_type in changes:
                # Get 5-word context around the changed word in both original and normalized text
                original_context = get_word_window(original_tokens, index)
                normalized_context = get_word_window(normalized_tokens, index)

                # Collect the data for the markdown table
                original_texts.append(original_context)
                normalized_texts.append(normalized_context)
                contexts.append(f"Change at word {index+1}: '{changed_word}' ({change_type})")

            # Write the modified line to the output file in the same format
            columns[1] = phonimize_text
            outfile.write('|'.join(columns))

            # Stop after processing `k` lines (if required)
            if (i + 1) >= k:
                break

    # Create a DataFrame for comparison
    df = pd.DataFrame({
        "Original Text (Context)": original_texts,
        "Normalized Text (Context)": normalized_texts,
        "Change Description": contexts
    })

    return df


# Call the function and create the comparison DataFrame
df = process_train_listNormalize_with_context('outputbooks.txt' , 'normalized.txt', k=5000)

# Convert DataFrame to markdown format
markdown_table = df.to_markdown(index=True)
display(Markdown(markdown_table))


In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load a Norwegian BERT model
model_name = "NbAiLab/nb-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Reconstruct words by merging sub-word tokens
def reconstruct_words(tokens):
    words = []
    current_word = ""
    for token in tokens:
        if token.startswith("##"):  # Handling sub-word tokens
            current_word += token[2:]
        else:
            if current_word:
                words.append(current_word)
            current_word = token
    if current_word:
        words.append(current_word)
    return words

# Define a pipeline for NER
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Input text containing a date
text = "Lars-Erik Moi har et møte den 23. juni 2023 i Oslo."

# Get NER results
ner_results = ner_pipeline(text)

# Extract tokens and labels
tokens = [entity['word'] for entity in ner_results]
labels = [entity['entity'] for entity in ner_results]

# Reconstruct full words from subword tokens
reconstructed_tokens = reconstruct_words(tokens)

# Print reconstructed words along with their labels
for i, word in enumerate(reconstructed_tokens):
    print(f"Entity: {word}, Label: {labels[i]}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at NbAiLab/nb-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entity: Lars, Label: LABEL_0
Entity: -, Label: LABEL_1
Entity: Erik, Label: LABEL_0
Entity: Moi, Label: LABEL_0
Entity: har, Label: LABEL_1
Entity: et, Label: LABEL_1
Entity: møte, Label: LABEL_1
Entity: den, Label: LABEL_1
Entity: 23, Label: LABEL_0
Entity: ., Label: LABEL_1
Entity: juni, Label: LABEL_1
Entity: 2023, Label: LABEL_0
Entity: i, Label: LABEL_1
Entity: Oslo, Label: LABEL_1
Entity: ., Label: LABEL_1


In [None]:
import os
import re
import sys
import asyncio
import aiofiles
import nest_asyncio

sys.path.append('/home/lemoi18/NB-PL-BERT/NbConverters')  # Add the path to your NBtext_normalize.py
from NBtext_normalize import normalize_text

import phonetisaurus
from convert_pa import nofabet_to_ipa, nofabet_to_sampa, sampa_to_ipa

nest_asyncio.apply()  # To allow asyncio.run() to be called from a running event loop

async def transcribe_words_async(words, dialect='e', style="written"):
    loop = asyncio.get_event_loop()
    transcriptions = await loop.run_in_executor(None, phonetisaurus.predict, words, "/home/lemoi18/G2P-no/models/nb_e_written.fst")
    return transcriptions

def format_transcription(pronunciation):
    return "".join(pronunciation)

async def transcribe_async(text, dialect='e', style="written"):
    words = text.split()
    transcriptions = await transcribe_words_async(words, dialect=dialect, style=style)
    return [(word, format_transcription(pron)) for word, pron in transcriptions]

async def phonimize_NOFAB_async(text):
    tokens = re.findall(r"[\w']+|[.,!?;:]", text)
    transcriptions = await transcribe_async(' '.join([t for t in tokens if re.match(r"[\w']+", t)]))
        
    transcription = []
    result_index = 0
        
    for token in tokens:
        if re.match(r"[.,!?;:]", token):
            transcription.append(token)
        else:  # It's a word
            if result_index < len(transcriptions):
                word, phonetic = transcriptions[result_index]
                transcription.extend(phonetic.split())
                result_index += 1
            else:
                print(f"Warning: result_index {result_index} is out of range for transcriptions.")
                break  # or handle the error accordingly
        
    ps = ' '.join(transcription)
    return ps

async def process_train_listPhonminze_async(input_file_path, output_file_path):
    async with aiofiles.open(input_file_path, 'r', encoding='utf-8') as infile, \
               aiofiles.open(output_file_path, 'w', encoding='utf-8') as outfile:
        lines = await infile.readlines()

        for line in lines:
            # Split the line into audio_path, text, and speaker_nr
            columns = line.split('|')

            # Normalize the second column (text)
            original_text = columns[1].strip()
            
            phonimize_text = await phonimize_NOFAB_async(original_text)

            # Reconstruct the line with the normalized text
            columns[1] = phonimize_text

            # Write the modified line to the output file in the same format
            await outfile.write('|'.join(columns))

# Example usage:
if __name__ == "__main__":
    asyncio.run(process_train_listPhonminze_async('normalized.txt', 'phonimized.txt'))

In [6]:
import pyparsing as pp
import re

# Helper functions for normalization
def normalize_number(num_str):
    # Only return "tall_" prefix if the number is standalone (not part of a date or measurement)
    return f"tall_{num_str}"

def normalize_date(date_str):
    # Normalize Norwegian dates (e.g., "23rd October" -> "23. oktober" or similar)
    if "/" in date_str:
        # Handle slash-separated dates (dd/mm/yyyy)
        return re.sub(r"(\d{1,2})/(\d{1,2})/(\d{4})", r"\1.\2.\3", date_str)
    return re.sub(r'(\d{1,2})\.\s*(\w+)', r'\1. \2', date_str)  # Ensure proper spacing for dates like "2. juni"

def normalize_measurement(measure_str):
    # Normalize Norwegian measurements (e.g., "10kg" -> "10 kilo")
    measurement_units = {
        "kg": "kilo",
        "cm": "centimeter",
        "m": "meter",
        "g": "gram",
        "km": "kilometer",
        "mm": "millimeter",
        "l": "liter",
        "dl": "desiliter",
    }
    num, unit = re.match(r"(\d+)([a-zA-Z]+)", measure_str).groups()
    return f"{num} {measurement_units.get(unit, unit)}"

def normalize_abbreviation(abbr_str):
    # Normalize Norwegian abbreviations (e.g., "NB" -> "Norsk Bokmål", "kr" -> "kroner")
    abbreviations = {
        "NB": "Norsk Bokmål",
        "NN": "Nynorsk",
        "kr": "kroner",
        "mrd": "milliarder",
        "t": "tonn",
    }
    return abbreviations.get(abbr_str, abbr_str)  # Use the abbreviation if no mapping exists

# Main normalization function
def normalize_text_with_custom_rules(text):
    # Define grammars for different types of data in Norwegian

    # Grammar for standalone numbers (integers and floats)
    number = pp.Combine(pp.Word(pp.nums) + pp.Optional(pp.oneOf(",.") + pp.Word(pp.nums)))

    # Grammar for dates in formats like "2. juni" or "23/10/2024"
    date = pp.Combine(pp.Word(pp.nums) + "." + pp.White() + pp.Word(pp.alphas)) | pp.Regex(r"\d{1,2}/\d{1,2}/\d{4}")

    # Grammar for measurements like "10kg", "5cm", "3.5m" (with Norwegian decimal separators)
    measurement = pp.Combine(pp.Word(pp.nums + ",.") + pp.Word(pp.alphas))

    # Grammar for common Norwegian abbreviations
    abbreviation = pp.Word(pp.alphas, exact=2) | pp.Word(pp.alphas, exact=3)

    # Set parse actions for each pattern
    number.setParseAction(lambda t: normalize_number(t[0]))
    date.setParseAction(lambda t: normalize_date(t[0]))
    measurement.setParseAction(lambda t: normalize_measurement(t[0]))
    abbreviation.setParseAction(lambda t: normalize_abbreviation(t[0]))

    # Combine all patterns into one parser
    parser = number | date | measurement | abbreviation

    # Normalize the text using the parser
    normalized_text = parser.transformString(text)

    return normalized_text

# Example usage
text_to_normalize = "Hendelsen skjer 2. juni, og jeg kjøpte 5kg epler den 23/10/2024. Det kostet 10kr."
normalized = normalize_text_with_custom_rules(text_to_normalize)
print(normalized)


Hendelsen skjer tall_2. juni, og jeg kjøpte tall_5kg epler den tall_23/tall_10/tall_2024. Det kostet tall_10kroner.


In [18]:
# Step 1: Extract word combinations using pyparsing
import pyparsing as pp
import re

# Helper function to extract only valid word combinations
import pyparsing as pp
import re

# Helper function to extract only valid word combinations
def extract_word_combinations(text):
    word_combinations = []

    # Define parsers for specific word combinations
    
    # Date parser (Norwegian date format and slash-separated dates)
    date = pp.Combine(pp.Word(pp.nums) + "." + pp.White() + pp.Word(pp.alphas)) | pp.Regex(r"\d{1,2}/\d{1,2}/\d{4}")

    # Measurement parser (numbers followed by units like 'kg', 'cm')
    measurement = pp.Combine(pp.Word(pp.nums + ",.") + pp.Word(pp.alphas))

    # Abbreviation parser: Only specific known abbreviations (like 'kr', 'mrd') 
    abbreviation = pp.Combine(pp.Word("kr") | pp.Word("mrd"))

    # Standalone number parser (to match numbers not part of dates or measurements)
    number = pp.Combine(pp.Word(pp.nums) + pp.Optional(pp.oneOf(",.") + pp.Word(pp.nums)))

    # Combine all parsers into one
    combined_parser = date | measurement | abbreviation | number

    # Apply the parser with WordBoundary to ensure standalone matches
    standalone_parser = pp.WordStart() + combined_parser + pp.WordEnd()

    # Collect word combinations for conversion
    for match, start, end in standalone_parser.scanString(text):
        word_combinations.append(match[0])

    # Remove duplicates by converting to a set and back to a list (maintain original order)
    word_combinations = list(dict.fromkeys(word_combinations))

    return word_combinations

# Example usage to check what we are extracting
text_to_normalize = "Hendelsen skjer 2. juni, og jeg kjøpte 5kg epler den 23/10/2024. Det kostet 10kr."
word_combinations = extract_word_combinations(text_to_normalize)
print("Extracted word combinations:", word_combinations)



Extracted word combinations: ['5kg']


In [None]:
def get_all_text_in_dict(input_file_path, output_file_path, k=100):
    original_texts = {}
    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        for i, line in enumerate(infile):
            # Split the line into audio_path, text, and speaker_nr
            columns = line.split('|')

            # Normalize the second column (text)
            original_text = columns[1].strip()
            

            # Stop after processing `k` lines (if required)
            if (i + 1) >= k:
                break
            original_texts


    return original_texts


# Call the function and create the comparison DataFrame
df = get_all_text_in_dict('outputbooks.txt' , 'normalized.txt', k=5000)

# Convert DataFrame to markdown format
markdown_table = df.to_markdown(index=True)
display(Markdown(markdown_table))

In [11]:
import tiktoken

# Updated GPT-4o mini pricing constants
COST_PER_1M_INPUT_TOKENS = 0.150  # Uncached input cost
COST_PER_1M_OUTPUT_TOKENS = 0.600  # Uncached output cost

BATCH_API_COST_PER_1M_INPUT_TOKENS = 0.075  # Batch API input cost
BATCH_API_COST_PER_1M_OUTPUT_TOKENS = 0.300  # Batch API output cost

# Function to calculate token usage and estimate costs
def get_all_text_in_dict(input_file_path, k=100, model="gpt-4o-mini", use_batch_api=False):
    """
    Reads a file line by line, extracts text, and estimates tokens using tiktoken.
    Calculates the input/output tokens and estimates the cost for OpenAI API usage.

    Params:
    - input_file_path: Path to the input file with lines of text.
    - k: Maximum number of lines to process.
    - use_batch_api: Set to True if using Batch API pricing.
    
    Returns:
    - original_texts: A dictionary where the key is the line number and the value is the text.
    - token_info: A dictionary with total input and output tokens and estimated cost.
    """
    original_texts = {}
    total_input_tokens = 0
    total_output_tokens = 0

    # Get tokenizer for the specified model
    encoding = tiktoken.encoding_for_model(model)

    with open(input_file_path, 'r', encoding='utf-8') as infile:
        for i, line in enumerate(infile):
            # Split the line into components (assuming '|' delimiter)
            columns = line.split('|')

            # Ensure there's at least two columns in each line (one for text)
            if len(columns) >= 2:
                original_text = columns[1].strip()  # Extract the text (2nd column)

                # Calculate input tokens using tiktoken for the text
                input_tokens = encoding.encode(original_text)
                num_input_tokens = len(input_tokens)
                total_input_tokens += num_input_tokens

                # Assume a 1:1 ratio for input to output tokens (modify if output varies)
                num_output_tokens = num_input_tokens
                total_output_tokens += num_output_tokens

                # Store the original text with line number as the key
                original_texts[i + 1] = original_text

                # Stop after processing `k` lines if limit is reached
                if (i + 1) >= k:
                    break

    # Choose pricing based on whether Batch API is used
    input_cost_per_million = BATCH_API_COST_PER_1M_INPUT_TOKENS if use_batch_api else COST_PER_1M_INPUT_TOKENS
    output_cost_per_million = BATCH_API_COST_PER_1M_OUTPUT_TOKENS if use_batch_api else COST_PER_1M_OUTPUT_TOKENS

    # Calculate cost for input and output tokens
    input_cost = (total_input_tokens / 1_000_000) * input_cost_per_million
    output_cost = (total_output_tokens / 1_000_000) * output_cost_per_million
    total_cost = input_cost + output_cost

    # Token and cost summary
    token_info = {
        'total_input_tokens': total_input_tokens,
        'total_output_tokens': total_output_tokens,
        'input_cost': input_cost,
        'output_cost': output_cost,
        'total_cost': total_cost
    }

    return original_texts, token_info

# Example usage
input_file = 'outputbooks.txt'
original_texts, token_info = get_all_text_in_dict(input_file, k=5000, use_batch_api=True)

# Display token and cost analysis
print("Total Input Tokens:", token_info['total_input_tokens'])
print("Total Output Tokens:", token_info['total_output_tokens'])
print("Estimated Input Cost: $", round(token_info['input_cost'], 2))
print("Estimated Output Cost: $", round(token_info['output_cost'], 2))
print("Estimated Total Cost: $", round(token_info['total_cost'], 2))


Total Input Tokens: 174462
Total Output Tokens: 174462
Estimated Input Cost: $ 0.01
Estimated Output Cost: $ 0.05
Estimated Total Cost: $ 0.07


In [14]:
!openai migrate


/bin/bash: line 1: openai: command not found


In [9]:

from openai import OpenAI


In [11]:
import os
from openai import OpenAI

def normalize_texts_with_gpt_and_save(texts, output_file, client, model="gpt-4o-mini", batch_size=5):
    """
    Sends batches of texts to GPT-4 mini for normalization and saves the results to a text file.
    
    Params:
    - texts: List of text strings to normalize.
    - output_file: File path to save the normalized results.
    - client: The OpenAI client instance.
    - model: The model to use (e.g., 'gpt-4o-mini').
    - batch_size: Number of texts to send in each API call.
    
    Returns:
    - None
    """
    
    # Prepare output file to save results
    with open(output_file, 'w', encoding='utf-8') as outfile:
        # Process texts in batches
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            
            # Join the batch with a separator (e.g., "----") to distinguish between texts
            prompt = "\n----\n".join(batch)  
            print(prompt)
            # Make the API call to GPT-4 mini once per batch
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": """
                    
I want you to act as a text-normalizer for Norwegian. I will provide multiple instances containing dates, times, numbers, or other abbreviations, and you will convert each of them into their full, orthographically correct Norwegian forms. Please convert all numbers to their written form and do not include any explanations or additional information in your responses.

For example:
Input:
Hendelsen skjer 2. juni, og jeg kjøpte 5kg epler den 23/10/2024. Det kostet 10kr.
----
Andelen empatiske eller veldig empatiske svar var 45,1 prosent hos ChatGPT, mot 4,6 prosent hos legene.
----
foretrakk dommerpanelet ChatGPT sitt svar i 78,6 prosent av tilfellene, og andelen vurderinger beskrevet som gode og veldig gode var 78,5 prosent for ChatGPT mot 22,1 prosent for legene.

Output:
Hendelsen skjer den andre juni, og jeg kjøpte fem kilo epler den tjue tredje oktober to tusen og tjue fire. Det kostet ti kroner.
----
Andelen empatiske eller veldig empatiske svar var førtifem komma én prosent hos ChatGPT, mot fire komma seks prosent hos legene.
----
Dommerpanelet foretrakk ChatGPT sitt svar i syttiatte komma seks prosent av tilfellene, og andelen vurderinger beskrevet som gode og veldig gode var syttiatte komma fem prosent for ChatGPT mot tjue to komma én prosent for legene."
"""},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=2000,  # Adjust based on expected output size
                temperature=0.0  # Set to 0 for deterministic outputs
            )
            
            # Extract the response text and split based on the separator "----"
            normalized_text = response.choices[0].message.content
            print("Response from model:\n", normalized_text)

            normalized_texts = normalized_text.split("\n----\n")
            
            # Write each normalized text to the output file
            outfile.write(normalized_text.strip() + "\n")

            
    print(f"Results saved to {output_file}")

# Example usage
texts = [
    "Hendelsen skjer 2. juni, og jeg kjøpte 5kg epler den 23/10/2024. Det kostet 10kr.",
    "Andelen empatiske eller veldig empatiske svar var 45,1 prosent hos ChatGPT, mot 4,6 prosent hos legene.",
    "foretrakk dommerpanelet ChatGPT sitt svar i 78,6 prosent av tilfellene, og andelen vurderinger beskrevet som gode og veldig gode var 78,5 prosent for ChatGPT mot 22,1 prosent for legene.",
]

normalize_texts_with_gpt_and_save(texts, 'normalized_output.txt', client)


Hendelsen skjer 2. juni, og jeg kjøpte 5kg epler den 23/10/2024. Det kostet 10kr.
----
Andelen empatiske eller veldig empatiske svar var 45,1 prosent hos ChatGPT, mot 4,6 prosent hos legene.
----
foretrakk dommerpanelet ChatGPT sitt svar i 78,6 prosent av tilfellene, og andelen vurderinger beskrevet som gode og veldig gode var 78,5 prosent for ChatGPT mot 22,1 prosent for legene.
Response from model:
 Hendelsen skjer den andre juni, og jeg kjøpte fem kilo epler den tjue tredje oktober to tusen og tjue fire. Det kostet ti kroner.
----
Andelen empatiske eller veldig empatiske svar var førtifem komma én prosent hos ChatGPT, mot fire komma seks prosent hos legene.
----
Dommerpanelet foretrakk ChatGPT sitt svar i syttiatte komma seks prosent av tilfellene, og andelen vurderinger beskrevet som gode og veldig gode var syttiatte komma fem prosent for ChatGPT mot tjue to komma én prosent for legene.
Results saved to normalized_output.txt


In [17]:
import os
from openai import OpenAI

import os
from openai import OpenAI

import re
import os
from openai import OpenAI

# A function to detect if a line needs normalization (dates, numbers, abbreviations, etc.)
def needs_normalization(text):
    # Regular expression to detect numbers, dates, abbreviations, etc.
    pattern = r'\b\d{1,2}(?:\.|/)\d{1,2}(?:\.|/)\d{2,4}|\d+(?:kr|kg|%)|\b\d+\b'
    
    # If the pattern is found in the text, normalization might be needed
    return re.search(pattern, text) is not None

def get_all_text_in_dict(input_file_path, output_file_path, client, model="gpt-4o-mini", k=10):
    """
    Reads the input file, normalizes the second column (text) using GPT-4 mini if needed, and saves the results to the output file.
    
    Params:
    - input_file_path: Path to the input file containing audio_path, text, and speaker_nr.
    - output_file_path: Path to the output file where normalized text will be saved.
    - client: OpenAI client instance for making API requests.
    - model: The model to use for normalization (default: 'gpt-4o-mini').
    - k: Maximum number of lines to process.
    
    Returns:
    - processed_texts: A dictionary of the processed texts where keys are audio_path, and values are dictionaries with 'original_text', 'normalized_text', and 'speaker_nr'.
    """
    processed_texts = {}
    
    # Open the input and output files
    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        
        for i, line in enumerate(infile):
            # Split the line into audio_path, text, and speaker_nr
            columns = line.split('|')
            if len(columns) < 3:
                continue  # Skip lines that do not have all columns
            
            audio_path = columns[0].strip()
            original_text = columns[1].strip()
            speaker_nr = columns[2].strip()
            
            # Check if the text needs normalization
            if needs_normalization(original_text):
                # If normalization is needed, call the OpenAI API
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": "I want you to act as a text-normalizer for Norwegian. I will provide dates, times, numbers, or other abbreviations, and you will convert them into their full, orthographically correct Norwegian forms. Please convert all numbers to their written form and do not include any explanations or additional information in your responses."},
                        {"role": "user", "content": original_text}
                    ],
                    max_tokens=2000,
                    temperature=0.0
                )
                
                # Extract the normalized text
                normalized_text = response.choices[0].message.content.strip()
            else:
                # If no normalization is needed, keep the original text
                normalized_text = original_text
            
            # Write the normalized (or original) text back to the output file and save to dictionary
            outfile.write(f"{audio_path}|{normalized_text}|{speaker_nr}\n")
            
            # Store the processed result in the dictionary
            processed_texts[audio_path] = {
                'original_text': original_text,
                'normalized_text': normalized_text,
                'speaker_nr': speaker_nr
            }
            
            # Stop after processing `k` lines (if required)
            if (i + 1) >= k:
                break

    return processed_texts

# Initialize the OpenAI client
client = OpenAI(api_key='')

# Example usage
input_file_path = 'outputbooks.txt'
output_file_path = 'normalized_output.txt'
processed_data = get_all_text_in_dict(input_file_path, output_file_path, client)

# Print the returned data for verification
for audio_path, data in processed_data.items():
    print(f"Audio Path: {audio_path}")
    print(f"Original Text: {data['original_text']}")
    print(f"Normalized Text: {data['normalized_text']}")
    print(f"Speaker Number: {data['speaker_nr']}")
    print("-" * 50)


Audio Path: 34_Dr_Chat_GPT_med_legelisens_133.wav
Original Text: Det var mange som ble overrasket over resultatene til ChatGPT i denne studien.
Normalized Text: Det var mange som ble overrasket over resultatene til ChatGPT i denne studien.
Speaker Number: 6
--------------------------------------------------
Audio Path: 34_Dr_Chat_GPT_med_legelisens_135.wav
Original Text: foretrakk dommerpanelet ChatGPT sitt svar i 78,6 prosent av tilfellene, og andelen vurderinger beskrevet som gode og veldig gode var 78,5 prosent for ChatGPT mot 22,1 prosent for legene.
Normalized Text: Foretrakk dommerpanelet ChatGPT sitt svar i åttiseks komma seks prosent av tilfellene, og andelen vurderinger beskrevet som gode og veldig gode var åttiseks komma fem prosent for ChatGPT mot toogtyve komma en prosent for legene.
Speaker Number: 6
--------------------------------------------------
Audio Path: 34_Dr_Chat_GPT_med_legelisens_136.wav
Original Text: Men det aller mest oppsiktsvekkende var at forskjellen var 

In [22]:
from pyparsing import Word, nums, Combine, Suppress, Optional, alphas, alphanums, Literal

# Define the pyparsing grammar for numbers, dates, and percentages
def create_parser():
    # Basic number (integer and decimal)
    number = Word(nums)
    decimal_number = Combine(Word(nums) + "." + Word(nums))
    
    # Date patterns (e.g., 23/10/2024, 2. juni)
    day = Word(nums, max=2)
    month = Word(alphas, max=10) | Word(nums, max=2)
    year = Word(nums, min=4, max=4)
    date = Combine(day + Suppress("/") + month + Suppress("/") + year) | Combine(day + Suppress(".") + month)
    
    # Number with units (e.g., kg, kr)
    number_with_unit = Combine(number + Optional(Literal("kg") | Literal("kr") | Literal("%")))
    
    # Full parser: detect any of the patterns
    parser = number | decimal_number | date | number_with_unit
    return parser

# Check if the text contains any patterns that require normalization
def needs_normalization(text, parser):
    # Try parsing the text using the defined parser
    try:
        matches = parser.searchString(text)
        return len(matches) > 0
    except Exception:
        return False

def get_all_text_in_dict(input_file_path, output_file_path, client, model="gpt-4o-mini", k=10000):
    """
    Reads the input file, normalizes the second column (text) using GPT-4 mini if needed, and saves the results to the output file.
    
    Params:
    - input_file_path: Path to the input file containing audio_path, text, and speaker_nr.
    - output_file_path: Path to the output file where normalized text will be saved.
    - client: OpenAI client instance for making API requests.
    - model: The model to use for normalization (default: 'gpt-4o-mini').
    - k: Maximum number of lines to process.
    
    Returns:
    - processed_texts: A dictionary of the processed texts where keys are audio_path, and values are dictionaries with 'original_text', 'normalized_text', and 'speaker_nr'.
    """
    processed_texts = {}
    
    # Initialize the parser
    parser = create_parser()
    
    # Open the input and output files
    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        
        for i, line in enumerate(infile):
            # Split the line into audio_path, text, and speaker_nr
            columns = line.split('|')
            if len(columns) < 3:
                continue  # Skip lines that do not have all columns
            
            audio_path = columns[0].strip()
            original_text = columns[1].strip()
            speaker_nr = columns[2].strip()
            
            # Check if the text needs normalization using pyparsing
            if needs_normalization(original_text, parser):
                # If normalization is needed, call the OpenAI API
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": "I want you to act as a text-normalizer for Norwegian. I will provide dates, times, numbers, or other abbreviations, and you will convert them into their full, orthographically correct Norwegian forms. Please convert all numbers to their written form and do not include any explanations or additional information in your responses."},
                        {"role": "user", "content": original_text}
                    ],
                    max_tokens=2000,
                    temperature=0.0
                )
                
                # Extract the normalized text
                normalized_text = response.choices[0].message.content.strip()
            else:
                # If no normalization is needed, keep the original text
                normalized_text = original_text
            
            # Write the normalized (or original) text back to the output file and save to dictionary
            outfile.write(f"{audio_path}|{normalized_text}|{speaker_nr}\n")
            
            # Store the processed result in the dictionary
            processed_texts[audio_path] = {
                'original_text': original_text,
                'normalized_text': normalized_text,
                'speaker_nr': speaker_nr
            }
            
            # Stop after processing `k` lines (if required)
            if (i + 1) >= k:
                break

    return processed_texts


# Initialize the OpenAI client
client = OpenAI()

# Example usage
input_file_path = 'outputbooks.txt'
output_file_path = 'normalized_output.txt'
processed_data = get_all_text_in_dict(input_file_path, output_file_path, client)

# Print the returned data for verification
# Print only the first 100 items for verification
for idx, (audio_path, data) in enumerate(processed_data.items()):
    if idx >= 100:
        break
    print(f"Audio Path: {audio_path}")
    print(f"Original Text: {data['original_text']}")
    print(f"Normalized Text: {data['normalized_text']}")
    print(f"Speaker Number: {data['speaker_nr']}")
    print("-" * 50)



Audio Path: 34_Dr_Chat_GPT_med_legelisens_133.wav
Original Text: Det var mange som ble overrasket over resultatene til ChatGPT i denne studien.
Normalized Text: Det var mange som ble overrasket over resultatene til ChatGPT i denne studien.
Speaker Number: 6
--------------------------------------------------
Audio Path: 34_Dr_Chat_GPT_med_legelisens_135.wav
Original Text: foretrakk dommerpanelet ChatGPT sitt svar i 78,6 prosent av tilfellene, og andelen vurderinger beskrevet som gode og veldig gode var 78,5 prosent for ChatGPT mot 22,1 prosent for legene.
Normalized Text: Foretrakk dommerpanelet ChatGPT sitt svar i åttiseks komma seks prosent av tilfellene, og andelen vurderinger beskrevet som gode og veldig gode var åttiseks komma fem prosent for ChatGPT mot toogtyve komma en prosent for legene.
Speaker Number: 6
--------------------------------------------------
Audio Path: 34_Dr_Chat_GPT_med_legelisens_136.wav
Original Text: Men det aller mest oppsiktsvekkende var at forskjellen var 