# Explore French Templates

In [1]:
# Register tools
import os
import sys

# Make the parent directory importable (presumably where the local `extract`
# module lives - confirm against the repository layout).
# The membership check keeps the cell idempotent: re-running it does not
# append the same directory to sys.path a second time.
tools_dir = os.path.abspath("../")
if tools_dir not in sys.path:
    sys.path.append(tools_dir)

In [8]:
from collections import Counter

import regex as re

from tqdm import tqdm

from extract import iterate_dump

In [3]:
# Look at French corpus: location of the frwiktionary pages-articles dump,
# given relative to the notebook's working directory. This value is what
# gets handed to iterate_dump when the whole dump is processed.
path = "../frwiktionary-latest-pages-articles.xml.bz2"

In [4]:
# Select characters that are unlikely to appear outside IPA notation
#   https://en.wikipedia.org/wiki/Phonetic_symbols_in_Unicode
# Stored as a set of single characters so it can be intersected with the
# set of characters of each dump line when filtering.
IPA_KEY_CHARACTERS = set("ɑøʁɔɲŋɛɜɪʒːʃɥʊɹ")

In [5]:
# Scan every page of the dump and copy the IPA-looking lines to ipa.txt
with open("ipa.txt", "w", encoding="utf-8", newline="\n") as out_file:
    for title, text in tqdm(iterate_dump(path)):

        # Every page gets a marker line (even when nothing is kept below it)
        # so that a kept line can be traced back to its source page
        out_file.write(f"\n>>> {title} <<<\n")

        # A line is kept when it holds more than two distinct key characters
        out_file.writelines(
            f"{line}\n"
            for line in text.split("\n")
            if len(IPA_KEY_CHARACTERS.intersection(line)) > 2
        )

3820284it [06:10, 10322.06it/s]


In [6]:
# Read the filtered file back: one list entry per line, trailing newline removed
with open("ipa.txt", "r", encoding="utf-8", newline="\n") as in_file:
    lines = []
    for raw_line in in_file:
        lines.append(raw_line.rstrip("\n"))

In [7]:
# Preview the beginning of the filtered file (first 50 entries)
preview = lines[:50]
for entry in preview:
    print(entry)


>>> accueil <<<

>>> lire <<<

>>> encyclopédie <<<

>>> manga <<<

>>> ouvrage <<<
** {{écouter|lang=fr|France (Paris)|ɛ̃.n‿u.vʁaʒ|audio=Fr-ouvrage.ogg|titre=un ouvrage}}

>>> siège <<<

>>> chaise <<<
* Canada : {{pron|ʃɛːz|fr}}, {{phon|ʃaɛ̯z|fr}}

>>> fauteuil <<<

>>> meuble <<<

>>> armchair <<<
{{en-nom-rég|ˌɑː(ɹ)m.ˈtʃeə(ɹ)}}
* {{écouter|lang=en|États-Unis <!-- précisez svp la ville ou la région -->|ˌɑː(ɹ)m.ˈtʃeə(ɹ)|audio=En-us-armchair.ogg}}

>>> mardi <<<

>>> lundi <<<

>>> semaine <<<

>>> militaire <<<

>>> suis <<<

>>> barbe à papa <<<

>>> manchot <<<

>>> pingouin <<<

>>> penguin <<<
{{en-nom-rég|ˈpɛŋ.gwɪn}}
'''penguin''' {{pron|ˈpɛŋ.ɡwɪn|en}}
* {{écouter|lang=en|États-Unis <!-- précisez svp la ville ou la région -->|ˈpɛŋ.gwɪn|audio=En-us-penguin.ogg}}

>>> mercredi <<<

>>> bande dessinée <<<



In [9]:
# Tally template openings over all kept lines.
# The pattern matches "{{", then a name without "|", "{" or "}", then the
# single "|" or "}" that follows it; that trailing delimiter stays part of the key.
template_r = re.compile(r"\{\{[^\|\{\}]+[\|\}]")
counter = Counter(
    opening
    for line in lines
    for opening in template_r.findall(line)
)

In [12]:
# Show most common ones: the 50 most frequent template openings with their
# counts, as (opening, count) pairs in decreasing order of count
counter.most_common(50)

[('{{pron|', 398581),
 ('{{phono|', 60584),
 ('{{fr-rég|', 55346),
 ('{{m}', 32501),
 ('{{f}', 25925),
 ('{{écouter|', 9554),
 ('{{fr-inv|', 5655),
 ('{{mf}', 5236),
 ('{{équiv-pour|', 4838),
 ('{{en-nom-rég|', 4646),
 ('{{US|', 3894),
 ('{{UK|', 3842),
 ('{{conj|', 3401),
 ('{{t|', 3168),
 ('{{voir-conj|', 2400),
 ('{{fr-accord-cons|', 2358),
 ('{{conjugaison|', 1473),
 ('{{prnl|', 1367),
 ('{{invar}', 1269),
 ('{{fr-accord-s|', 1193),
 ('{{i|', 1163),
 ('{{sp}', 1054),
 ('{{n}', 918),
 ('{{fr-accord-rég|', 896),
 ('{{fr-accord-en|', 852),
 ('{{p}', 843),
 ('{{en-conj-rég|', 834),
 ('{{m|', 823),
 ('{{ortho1990}', 788),
 ('{{tradit}', 741),
 ('{{msing}', 711),
 ('{{h muet}', 659),
 ('{{fr-accord-el|', 589),
 ('{{fr-accord-al|', 581),
 ('{{en-conj-rég-e|', 576),
 ('{{de-nom-f-en|', 563),
 ('{{lien pronominal|', 507),
 ('{{fsing}', 504),
 ('{{au singulier uniquement|', 502),
 ('{{fr-accord-eur|', 368),
 ('{{h aspiré}', 360),
 ('{{en-nom|', 359),
 ('{{phon|', 355),
 ('{{fr-accord-mf|', 3