In [16]:
# Author: Keegan Reeve

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string

import os

/kaggle/input/gutenberg-helsinkiin-edited/Helsinkiin_edited.txt


# Step 1

First, I'll form a set of unique words from the text (ignoring 'words' composed of punctuation marks and white space).


# Step 2

Then,  I'll form a matching set of phonetic transcriptions, each word to each transcription, separating dipthongs ('ai', '') into groups and separating each character otherwise to form a sequence like so: 'tuuli\tT UU L I'. I'll use the classic transcriptions for the rest (that is, vowel sequences ending in a mid or low vowel). I've crossreferenced the list '/yi, öi, äi, ui, oi, ai, äy, au, yö, öy, uo, ou, ie, ei, eu, iu, ey, iy/ (Wiik 1965; Karlsson 1983).' here: https://forum.unilang.org/viewtopic.php?t=51820.

# Step 3
Write the word-tab-transcription sequences to a text file 

## Step 1

In [17]:
filename = './gutenberg-helsinkiin-edited/Helsinkiin_edited.txt'

In [3]:
#### Change this to make a new file using only the last half of the book

In [18]:
try:
    with open(filename, 'r') as file:
        text = file.read()
except FileNotFoundError:
    print("File not found.")

'''
Prompt:

ignoring punctuation, make a set of all unique words in a string

'''
def find_unique(text):
    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text_without_punctuation = text.translate(translator)
    
    # Split the text into words
    words = text_without_punctuation.split()
    
    # Convert words to lowercase and create a set
    unique_words = set(word.lower() for word in words)
    
    return unique_words

words = find_unique(text)

In [19]:
'''
Prompt:
write a python function to randomly sample a set
'''

import random

def random_sample(input_set, sample_size):
    # Convert the set to a list to allow indexing
    set_list = list(input_set)
    
    # Check if the sample size is greater than the set size
    if sample_size > len(set_list):
        raise ValueError("Sample size cannot be greater than the size of the set.")
    
    # Randomly sample elements from the set list
    sampled_elements = random.sample(set_list, sample_size)
    
    return sampled_elements

print(random_sample(words, 20))

['pääsee', 'kuudennelle', 'haihduttaa', 'nuoria', 'portterin', 'olekaan', 'kalkkoivat', 'herättää', 'obtain', 'ajetaanko', 'tuutingit', 'liköörilasia', 'niinkin', 'ovat', 'muutamasta', 'savonlinnassa', 'ylioppilailta', 'hellyyttä', 'references', 'soittivat']


## Step 2

In [20]:
'''
Prompt:

write a python function to remove guillemets from a set of strings in python

'''

def rm_guillemets(input_set):
    # Set to store modified strings
    modified_set = set()

    # Iterate through the set of strings
    for string_value in input_set:
        # Remove both left-pointing and right-pointing guillemets
        modified_string = string_value.replace("«", "").replace("»", "")
        # Add the modified string to the set
        modified_set.add(modified_string)

    return modified_set

print("\n\nSample from the unmodified set of words:\n")
print(*random_sample(words, 10), sep="\n")

mod_words = rm_guillemets(words)
print("\n\nSample from the modified set of words:\n")
print(*random_sample(mod_words, 10), sep="\n")



Sample from the unmodified set of words:

menevän
luokalla
kilinä
vaatteisiin
ulommas
kanssamme
vaikea
naurahti
päässä
restrictions


Sample from the modified set of words:

isvoshik
helistivät
viiksiään
matalapohjaisista
country
kylmää
nenäliinansa
nojasi
tanssittu
itse


In [27]:
def set2list(input_set):
    # Convert the set to a list
    output_list = list(input_set)
    return output_list

mod_words_s = mod_words
mod_words_l = set2list(mod_words_s)
print(random_sample(mod_words_l, 10))

['laskeutuisiko', 'haihduttaakseen', 'kaikkia', 'ulommaksikin', 'bufettineidiltä', 'kihlauksensa', 'pekkaa', 'välähdyksen', 'asettaa', 'kulma']


If you want to get rid of all non-Latin characters altogether, run the next cell:

In [30]:
latin_characters = set('abcdefghijklmnopqrstuvwxyzäöABCDEFGHIJKLMNOPQRSTUVWXYZÄÖ')

valid_strings = []

for string in mod_words_l:
    mod_string = ''.join(char if char in latin_characters else char for char in string)
    valid_strings.append(mod_string)

mod_words_l = valid_strings
print(random_sample(mod_words_l, 10))

['innostui', 'koridooriin', 'jääkään', 'ahtaassa', 'ryyppy', 'täydeltä', 'musta', 'sanomattoman', 'kihlakortteja', 'ruotsalaisiin']


In [31]:
'''
Prompt:
write a python function to iterate over a set of strings, and produce a list of sequences of characters corresponding to each of the strings in the set,
 and separate the characters with a space unless the two characters are in a defined list of character-combinations,
 namely, the following: [yi, öi, äi, ui, oi, ai, äy, au, yö, öy, uo, ou, ie, ei, eu, iu, ey, iy].

'''

def generate_sequences(input_set, char_combinations):
    # List to store sequences for each string
    sequences_list = []

    # Function to generate sequences for a single string
    def generate_sequence(string_value):
        sequence = ""
        for i, char in enumerate(string_value):
            # Append current character to sequence
            sequence += char
            if i < len(string_value) - 1:
                # Check if the current character and the next one form a defined combination
                current_next_combination = char + string_value[i + 1]
                if current_next_combination in char_combinations:
                    # If it's a defined combination, don't add space
                    continue
                else:
                    # If not, add a space
                    sequence += " "
        return sequence

    # Generate sequences for each string in the input set
    for string_value in input_set:
        sequences_list.append(generate_sequence(string_value))

    return sequences_list

diphthongs = ['yi', 'öi', 'äi', 'ui', 'oi', 'ai', 'äy', 'au', 'yö', 'öy', 'uo', 'ou', 'ie', 'ei', 'eu', 'iu', 'ey', 'iy']
long_vowels = ['ää', 'aa', 'ee', 'ii', 'öö', 'oo', 'yy', 'uu']
char_combs = diphthongs + long_vowels
pronounce_dict_transcrs = generate_sequences(mod_words_l, char_combs)

print(*random_sample(pronounce_dict_transcrs, 20), sep="\n")

t u n k i
t ää l l ä k ö
j oi l l e k ui l l e
l iu k ui
v a l o
p ää t t ä n ee t
t u l l e s s aa n k aa n
m i l l oi n k a
r a k a s t u n ee t
k a s v oi l t aa n
s a m a t
p u h u a
v i h a m ie h e t
r uo t s a l ai s e t
t uu h ei t t e n
au t i o l l e
t oi s e l l a k i n
t o t i h uo n ee s t a
s a l o n g i n
t e h t aa n h oi t a j a n


In [33]:
'''
Prompt:

write a python function to go through a list strings and capitalize the latin characters in the strings
include ä and ö characters

'''

def cap_lat_chars(string_list):
    # List to store modified strings
    modified_list = []

    # Define Latin characters including ä and ö
    latin_characters = set('abcdefghijklmnopqrstuvwxyzäöABCDEFGHIJKLMNOPQRSTUVWXYZÄÖ')

    # Iterate through the list of strings
    for string_value in string_list:
        # Capitalize Latin characters in the string
        modified_string = ''.join(char.upper() if char in latin_characters else char for char in string_value)
        # Add the modified string to the list
        modified_list.append(modified_string)

    return modified_list

edited_dict_transcrs = cap_lat_chars(pronounce_dict_transcrs)

print(*random_sample(edited_dict_transcrs, 20), sep="\n")

V Ä S Y M Y S T Ä
L U K K O
H UO N EE S T A
H U R J AU T T II N
I S O N
W O R L D
A S E N T OO N
T A P AU S
P I T K ÄÄ N
K Ä V E L I
K Ä R E L L Ä
K A H V I A
K A R V A N S A
V Ä L T T YI S I
T U L I S E N
I L T AA
N E N Ä N S Ä
K O M E A V A R T A L OI N E N
L E P ÄÄ M ÄÄ N
L A S K E V A T


## Optional step: matching this transcription to the Hungarian XPF transcription style  ː
_This step is necessary if using an Hungarian XPF acoustic model trained on a slightly different phonology and different transcription style._

_Optional step extends up to Step 3, where the pronouncing dictionary is produced as output in a MFA-friendly format._

Thus, I'll be modifying my transcription above in order to use an Hungarian acoustic model to align Finnish. This is because I couldn't find a usable Finnish aligner, but I did manage to find an Hungarian one.

phone set: aː b bː c cː d dː eː f fː h hː i iː j jː k kː l lː m mː n nː o oː p pː r rː s sː t ts tsː tʃ tʃː tː u uː v vː y yː z zː ø øː ɑ ɛ ɟ ɟː ɡ ɡː ɲ ɲː ʃ ʃː ʒ

In [34]:
'''
Prompt:

in a list of strings, replace any capital letter X with "K S" including the space
'''

modified_list = []

# Iterate through each string in the input list
for string in edited_dict_transcrs:
    # Replace all occurrences of 'X' with "K S"
    modified_string = string.replace('X', 'K S')
    # Add the modified string to the modified_list
    modified_list.append(modified_string)
    
edited_dict_transcrs = modified_list

In [44]:
fin_chars = "A AA B C D E EE F G H I II J K L M N O OO P Q R S T U UU V W Y yy Z Ä ÄÄ Ö ÖÖ"
hun_chars = "ɑ aː b k d ɛ eː f g h i iː j k l m n o oː p k r s t u uː v v y yː z ɑ aː ø øː" # vowel length is more important than quality my purpose

fin_list = fin_chars.split()
hun_list = hun_chars.split()

fin_monophthongs = "A E I O U Y Ä Ö"
fin_monophthong_list = fin_monophthongs.split()
hun_fin_monos = "ɑ ɛ i o u y ɑ ø"
hun_fin_mono_list = hun_fin_monos.split()
print(hun_fin_mono_list)

['ɑ', 'ɛ', 'i', 'o', 'u', 'y', 'ɑ', 'ø']


In [41]:
'''
Prompt:
write python code to go through a list of strings containing words separated by spaces and replace each word that is present in list1 by its corresponding word in another list2
'''

replacement_dict = dict(zip(fin_list, hun_list))

# Function to replace words in a single string based on the replacement dictionary
def replace_words_in_string(string, replacements):
    # Split the string into words
    words = string.split()
    # Replace each word if it is in the replacements dictionary
    replaced_words = [replacements.get(word, word) for word in words]
    # Join the replaced words back into a single string
    return ' '.join(replaced_words)

# Process each string in the input list
hun_modified_dict = [replace_words_in_string(string, replacement_dict) for string in edited_dict_transcrs]
print(*random_sample(hun_modified_dict, 20), sep="\n")

v ɑ s t AU s t aː n
n AI n ɛ n
s EU r aː v ɑ k s i
v aː h t OI s ɛ n
r EI
ɛ r i l l ɛ n i
j aː k aː n
m uː t t UI
t ɑ r t t u v ɑ t
n ɑ k ÖI s i ɑ
p ɛ h m OI n ɛ n
s uː d ɛ l l ɑ
uː d ɛ s t ɑ
i k k u n ɑ s s ɑ
d ɛ s k r i b ɛ d
ɑ r k uː s
h u r j AU t t iː n
k i r j OI t ɑ
ɑ s i ɑ n
l ɑ h t ɛ ɑ


In [45]:
'''
Prompt:

go through a list of strings containing segments separated by spaces and separate each character with spaces if the segment is a combination of the vowels
'''

def process_string(string):
    # Split the string into segments
    segments = string.split()
    # Process each segment
    processed_segments = []
    for segment in segments:
        # Check if all characters in the segment are vowels
        if all(char in fin_monophthong_list for char in segment):
            # Separate each character with spaces
            processed_segment = ' '.join(segment)
        else:
            # Leave the segment as is
            processed_segment = segment
        # Add the processed segment to the list
        processed_segments.append(processed_segment)
    # Join the processed segments back into a single string
    return ' '.join(processed_segments)

# Process each string in the input list
sepd_diphths = [process_string(string) for string in hun_modified_dict]
print(*random_sample(sepd_diphths, 20), sep="\n")

h o t ɛ l l iː n
o n h ɑ n
k ɑ p i n eː n s ɑ
r ɑ h O I l l ɑ
n O U k k i m ɑ s s ɑ
t O I s ɛ l t ɑ
l I E h u t t i
ɛ t Ä I s Y Y d ɛ n
s uː t t u m u s t ɑ
s aː d ɑ k s eː n
v ɑ l i n s ɑ
s uː t t U I
”
p ɛ r f o r m
s i v u l t ɑ
l A I t u r i s t aː n
v h ɛ r ɛ
t O I s i l l ɛ
d i s t r i b u t o r
l i s ɑ k s i


In [46]:
edited_dict_transcrs = [replace_words_in_string(string, replacement_dict) for string in sepd_diphths]
print(*random_sample(edited_dict_transcrs, 20), sep="\n")

i
t ɑ u t ɛ j ɑ
v aː t i v ɑ i n ɛ n
ɑ j ɑ t u k s i aː n
k o l i s t iː n
l ɑ n t eː n
l u ɛ n t o i h i n
r u o k i ɑ
m i ɛ l uː m m i n
v ɑ r m ɑ l t ɑ
k o p y
v i h ɛ l l y k s ɛ s s ɑ
l i v ɑ h t i
y l i o p p i l ɑ s h u o n eː l l ɑ
f r eː
ɑ s t u ɛ s s aː n
p u o l i h ɑ m ɑ r ɑ
t ɑ v o i t t ɛ l ɛ v i ɑ
ɑ i o t t iː n
s uː n t aː n


## Step 3

In [47]:
'''
Prompt:

using python, write a tab-delimited file where the first column is a set of strings and the second column is a list of strings.
Name this tab-delimited file "fin_custom.dict"
'''

import csv

def w_dict(lst1, lst2, filename):
    # Zip the set and list columns together
    rows = zip(lst1, lst2)

    # Open the file in write mode with tab delimiter
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter='\t')

        # Write rows
        writer.writerows(rows)

w_dict(mod_words_l, edited_dict_transcrs, "fin_custom.dict")