###  _uroman_ package examples
by Ulf Hermjakob, June 28, 2024 (uroman version 1.3.1.1)

In [2]:
# Prerequisite (run once, outside this cell): pip install uroman

import uroman as ur

# Create a single Uroman instance up front; this loads the uroman data,
# and the instance is reused by every romanization call in the cells below.
uroman = ur.Uroman()

In [3]:
# Romanize sample strings written in several different scripts,
# printing each original next to its romanization.

samples = ('Νεπάλ', 'नेपाल', 'نیپال', '三万一')
for text in samples:
    romanized = uroman.romanize_string(text)
    print(text, romanized)

Νεπάλ Nepal
नेपाल nepaal
نیپال nipal
三万一 31000


In [4]:
# Romanize one and the same string under different language codes
# (the last entry, None, passes no language code at all).

s = 'Игорь'
lang_codes = ('rus', 'ukr', None)
for lang_code in lang_codes:
    romanized = uroman.romanize_string(s, lcode=lang_code)
    print(s, lang_code, romanized)

Игорь rus Igor
Игорь ukr Yhor
Игорь None Igor


In [5]:
# Romanizing a string in different output formats (string, JSONL)

import sys

s = 'ایران'
lcode = 'fas'

# Write the sample string to a small input file. The encoding is spelled out
# because the platform default (e.g. cp1252 on Windows) cannot encode Persian
# text and would raise UnicodeEncodeError.
input_filename = 'in.txt'
with open(input_filename, 'w', encoding='utf-8') as f:
    f.write(s + '\n')
print(f"Input string:\n{s}\n")

# Each romanization output format, paired with the file it is written to.
format_output_list = ((ur.RomFormat.STR,     'str.txt'),
                      (ur.RomFormat.EDGES,   'edges.jsonl'),
                      (ur.RomFormat.ALTS,    'alts.jsonl'),
                      (ur.RomFormat.LATTICE, 'lattice.jsonl'))

for rom_format, output_filename in format_output_list:
    # Romanize the input file into output_filename using the current format.
    uroman.romanize_file(input_filename=input_filename,
                         output_filename=output_filename,
                         lcode=lcode,
                         rom_format=rom_format)
    # Echo the result file under a header naming the format.
    # NOTE(review): assumes uroman writes its output files as UTF-8 — confirm.
    with open(output_filename, encoding='utf-8') as f:
        sys.stdout.write(f"Output format: {rom_format}\n{f.read()}\n")

Input string:
ایران

Output format: str
iran

Output format: edges
[[0, 2, "i", "rom"][2, 3, "r", "rom"][3, 4, "a", "rom"][4, 5, "n", "rom"]]

Output format: alts
[[0, 2, "i", "rom"][0, 2, "ai", "rom-alt"][2, 3, "r", "rom"][3, 4, "a", "rom"][4, 5, "n", "rom"]]

Output format: lattice
[[0, 2, "i", "rom"][0, 2, "ai", "rom-alt"][0, 1, "a", "rom"][1, 2, "i", "rom"][1, 2, "y", "rom-alt"][2, 3, "r", "rom"][3, 4, "a", "rom"][4, 5, "n", "rom"]]



In [7]:
# A file with text from different scripts and languages (marked by line-initial ::lcode <lcode>)
# File location: https://github.com/isi-nlp/uroman/blob/master/uroman/mini-test/multi-script.txt
# NOTE(review): the local path used below ('test/...') differs from the repository
# path in the URL above ('mini-test/...') — confirm where the file is expected to be.

input_filename  = 'test/multi-script.txt'

# Decode explicitly as UTF-8 (presumably how the test file is stored) instead of
# relying on the platform's default encoding, which may not handle this
# multi-script text.
with open(input_filename, encoding='utf-8') as f:
    print(f.read())

::lcode deu Grüße aus Bordeaux
::lcode tur İstanbul, Türkiye'de yer alan şehir ve ülkenin 81 ilinden biri.
::lcode eng ⠠⠺⠑⠀⠓⠕⠇⠙⠀⠘⠮⠀⠞⠗⠥⠹⠎⠀⠞⠕⠀⠆⠀⠎⠑⠇⠋⠤⠑⠧⠊⠙⠢⠞⠂⠀⠞⠀⠁⠇⠇⠀⠍⠑⠝⠀⠜⠑⠀⠉⠗⠂⠞⠫⠀⠑⠟⠥⠁⠇⠂⠀⠞⠀⠮⠽⠀⠜⠑⠀⠑⠝⠙⠪⠫⠀⠃⠽⠀⠸⠮⠀⠠⠉⠗⠑⠁⠞⠕⠗⠀⠾⠀⠉⠻⠞⠁⠔⠀⠥⠝⠁⠇⠊⠑⠝⠁⠃⠇⠑⠀⠠⠐⠗⠎⠂⠀⠞⠀⠁⠍⠰⠛⠀⠘⠮⠀⠜⠑⠀⠠⠇⠊⠋⠑⠂⠀⠠⠇⠊⠃⠻⠞⠽⠀⠯⠀⠮⠀⠏⠥⠗⠎⠥⠊⠞⠀⠷⠀⠠⠓⠁⠏⠏⠊⠰⠎⠲
::lcode ell Το Λος Άντζελες (στα ισπανικά Los Angeles = Οι Άγγελοι) ή στην Αμερικανική αργκό L.A., ελ έι) είναι η δεύτερη μεγαλύτερη πόλη των Ηνωμένων Πολιτειών από άποψη πληθυσμού, καθώς και ένα από τα σημαντικότερα οικονομικά, πολιτιστικά επιστημονικά και ψυχαγωγικά κέντρα του κόσμου.
::lcode rus Герма́ния (нем. Deutschland), официальное название — Федерати́вная Респу́блика Герма́ния (нем. Bundesrepublik Deutschland), ФРГ (нем. BRD) — государство в Западной Европе. Площадь территории — 357 021 км². Численность населения по переписи 2011 года — более 80 миллионов человек. [2][6].
::lcode ukr Володи́мир Олекса́ндрович Зеле́нський (нар. 25 січня 1978, Кривий Ріг) — український державний діяч, політик, шо

In [8]:
# Romanization of a file with text from different scripts and languages
# How many US states can you identify in the romanized Hindi (::lcode hin) sentence below?
# Note: input_filename is the multi-script file path set in the previous cell.

output_filename = 'test/multi-script.uroman.txt'

uroman.romanize_file(input_filename=input_filename, output_filename=output_filename)

# The romanized text still contains non-ASCII characters (e.g. '—', '²'), so read
# it back with an explicit encoding rather than the platform default.
# NOTE(review): assumes uroman writes its output file as UTF-8 — confirm.
with open(output_filename, encoding='utf-8') as f:
    print(f.read())

::lcode deu Gruesse aus Bordeaux
::lcode tur Istanbul, Tuerkiye'de yer alan shehir ve uelkenin 81 ilinden biri.
::lcode eng We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.
::lcode ell To Los Andzeles (sta ispanika Los Angeles = Oi Angeloi) e sten Amerikanike arngo L.A., el ei) einai e deutere megalutere pole ton Enomenon Politeion apo apopse plethysmou, kathos kai ena apo ta semandikotera oikonomika, politistika epistemonika kai psychagogika kendra tou kosmou.
::lcode rus Germaniya (nem. Deutschland), ofitsialnoye nazvaniye — Federativnaya Respublika Germaniya (nem. Bundesrepublik Deutschland), FRG (nem. BRD) — gosudarstvo v Zapadnoy Yevrope. Ploshchad territorii — 357 021 km². Chislennost naseleniya po perepisi 2011 goda — boleye 80 millionov chelovek. [2][6].
::lcode ukr Volodymyr Oleksandrovych Zelensky (nar. 25 sichnya 197