# Test IPA Template Extraction

In [1]:
# Register tools: expose the parent directory on the import path
# (presumably where the local `extract` module lives; verify against the repo layout).
import os
import sys

# Resolve once, then append only when missing, so re-running this cell
# does not stack duplicate entries on sys.path.
project_root = os.path.abspath("../")
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from collections import Counter

import regex as re

from tqdm import tqdm

from extract import iterate_dump

In [3]:
# Location of the French Wiktionary dump (bz2-compressed XML, per the file name),
# given relative to the current working directory; it is fed to `iterate_dump` below.
path = "../frwiktionary-latest-pages-articles.xml.bz2"

In [4]:
# Partial matching for relevant templates: recognizes the opening of a
# `{{pron|`, `{{phon|` or `{{phono|` template. Only the braces, the template
# name and the first pipe are checked; arguments and closing braces are not.
filter_r = re.compile(r"\{\{(?:pron|phon|phono)\|")

In [5]:
# Walk through every page of the dump and persist the template-bearing lines
with open("phon.txt", "w", encoding="utf-8", newline="\n") as file:
    for title, text in tqdm(iterate_dump(path)):

        # Each page gets its own header, so a kept line can be traced back to its source
        file.write(f"\n>>> {title} <<<\n")

        # Emit, newline-terminated, every line holding at least one template match
        file.writelines(
            f"{line}\n" for line in text.split("\n") if filter_r.search(line)
        )

3820284it [05:15, 12122.03it/s]


In [8]:
# Read the extracted file back into memory, one entry per line.
# With newline="\n" each row ends with at most one "\n", which is dropped here.
with open("phon.txt", "r", encoding="utf-8", newline="\n") as file:
    lines = [row[:-1] if row.endswith("\n") else row for row in file]

In [9]:
# Preview the first 50 extracted lines (prints nothing when there are none)
preview = lines[:50]
if preview:
    print("\n".join(preview))


>>> accueil <<<
'''accueil''' {{pron|a.kœj|fr}} {{m}}
* {{pron|a.kœj|fr}}

>>> lire <<<
'''lire''' {{pron|liʁ|fr}} {{conjugaison|fr}} {{conjugaison|fr|grp=3}}
'''lire''' {{pron|liʁ|fr}} {{f}}
'''lire''' {{pron|liʁ|fro}}
'''lire''' {{pron||af}}

>>> encyclopédie <<<
'''encyclopédie''' {{pron|ɑ̃.si.klɔ.pe.di|fr}} {{f}}
'''encyclopédie''' {{pron|ɑ̃.si.klɔ.pe.di|fr}}

>>> manga <<<
'''manga''' {{pron|mɑ̃.ɡa|fr}} {{m}}
'''manga''' {{pron|ˈmæŋ.ɡə|en}}
'''manga''' {{pron|ˈmaŋ.ɡa|es}} {{f}}
'''manga''' {{pron|ˈmaŋ.ɡa|es}} {{f}} {{pluriel ?|es}}
'''manga''' {{pron||it}} {{m}} {{invariable}}
'''manga''' {{pron||mg}}
'''manga''' {{pron||pap}} {{f}}
'''manga''' {{pron||pt}} {{f}}
'''manga''' {{pron||pt}} {{f}}
'''manga''' {{pron||sv}} {{c}}

>>> ouvrage <<<
'''ouvrage''' {{pron|u.vʁaʒ|fr}} {{m}}
'''ouvrage''' {{pron|u.vʁaʒ|fr}}
* {{pron|u.vʁaʒ|fr}}

>>> siège <<<
'''siège''' {{pron|sjɛʒ|fr}} {{m}}
'''siège''' {{pron|sjɛʒ|fr}}

>>> chaise <<<
: De ''[[chaire#fr|chaire]]'' par [[assibilation]] dial

In [10]:
# Collect every extracted line that also uses a `{{prononcé|` template
pronounced_r = re.compile(r"\{\{prononcé\|")
ps = [entry for entry in lines if pronounced_r.search(entry)]

In [11]:
# How many extracted lines contain the `{{prononcé|` template
len(ps)

25

In [12]:
# Display the matching lines in full for manual inspection
ps

["'''Y''' {{pron|i ɡʁɛk|fr}} {{prononcé|[[i grec]]|fr}} {{m}} ",
 "'''Y''' {{pron|ɡʁɑ̃(t‿) i.ɡʁɛk|fr}} {{prononcé|grand y|fr}} {{m}}",
 "'''Y''' {{pron|ˌbɪɡ ˈwaɪ|en}} {{prononcé|big y|en}} ''ou'' {{pron|ˌkæp ˈwaɪ|en}} {{prononcé|cap y|en}}",
 "'''al''' {{prononcé|année-lumière|fr}} {{pron|a.ne ly.mjɛʁ|fr}} {{f}}.",
 "'''Z''' {{pron|ɡʁɑ̃ zɛd|fr}} {{prononcé|grand z|fr}} {{m}}",
 "'''N''' {{pron|nɔʁ|fr}} {{prononcé|nord|fr}} {{m}} {{invar}}, {{abréviation|fr}}",
 "'''y''' {{pron|pə.ti i.ɡʁɛk|fr}} {{prononcé|petit y|fr}}\xa0{{m}}",
 "'''z''' {{pron|pə.ti zɛd|fr}} {{prononcé|petit z|fr}} {{m}}",
 "'''e.g.''' {{pron|fɔɹ.ɪɡ.ˈzæm.pl̩|en}} {{US|nocat=1}}, {{pron|fɔː.ɪɡ.ˈzɑːm.pl̩|en}} {{UK|nocat=1}} {{prononcé|for example|en}}",
 "'''cf.''' {{pron|kɔ̃.fɛʁ|fr}} {{prononcé|confère|fr}} <small>ou</small> {{pron|se.ɛf|fr}} {{invar}}",
 "'''ssi''' {{pron|si e sœl.mɑ̃ si|fr}} {{prononcé|si et seulement si|fr}}",
 "'''iff''' {{pron|ɪf ænd ˈoʊn.li ɪf|en}} {{US|nocat=1}}, {{pron|ɪf ænd ˈəʊn.li ɪf|en}} {