# French SAMPA

In [1]:
import pandas as pd

import regex as re

In [2]:
# Load dataset (tab-separated, UTF-8)
fr_df = pd.read_csv(
    "../fr.tsv",
    sep="\t",
    encoding="utf-8",
    na_filter=False,  # keep empty cells / "nan"-like words as plain strings, not NaN
)
fr_df.head()

Unnamed: 0,text,language,pronunciation
0,accueil,fr,a.kœj
1,lire,fr,liʁ
2,encyclopédie,fr,ɑ̃.si.klɔ.pe.di
3,manga,fr,mɑ̃.ɡa
4,ouvrage,fr,u.vʁaʒ


In [3]:
# French SAMPA
#   https://www.phon.ucl.ac.uk/home/sampa/french.htm
#   https://en.wikipedia.org/wiki/X-SAMPA
# Maps every supported IPA symbol to its SAMPA transcription. A key may span
# several code points (e.g. a vowel followed by a combining tilde).
FRENCH_SAMPA_MAPPING = {
    # Punctuation
    " ": "-",
    ".": ".",
    "ː": ":",
    "\u0361": "",  # combining tie bar: dropped from the output
    "‿": "-\\",  # undertie (linking mark)
    "(": "(",
    ")": ")",
    # Plosive consonants
    "p": "p",  # pont
    "b": "b",  # bon
    "t": "t",  # temps
    "d": "d",  # dans
    "k": "k",  # quand
    "ɡ": "g",  # gant
    # Fricative consonants
    "f": "f",  # femme
    "v": "v",  # vent
    "s": "s",  # sans
    "z": "z",  # zone
    "ʃ": "S",  # champ
    "ʒ": "Z",  # gens
    # Nasal consonants
    "m": "m",  # mont
    "n": "n",  # nom
    "ɲ": "J",  # oignon (palatal nasal)
    "ŋ": "N",  # camping (velar nasal)
    # Liquid consonants
    "l": "l",  # long
    "ʁ": "R",  # rond
    # Vowel glides
    "w": "w",  # coin
    "ɥ": "H",  # juin
    "j": "j",  # ion, pierre
    # Oral vowels
    "i": "i",  # si
    "e": "e",  # ces
    "ɛ": "E",  # seize
    "a": "a",  # patte
    "ɑ": "A",  # pâte
    "ɔ": "O",  # comme
    "o": "o",  # gros
    "u": "u",  # doux
    "y": "y",  # du
    "ø": "2",  # deux
    "œ": "9",  # neuf
    "ə": "@",  # justement
    # Nasal vowels (base vowel + combining tilde)
    "ɛ̃": "e~",  # vin
    "ɑ̃": "a~",  # vent
    "ɔ̃": "o~",  # bon
    "œ̃": "9~",  # brun
    # TODO glottal sounds?
    "h": "h",
    "ʔ": "?",
}

In [4]:
# Use a regular expression to properly segment (and validate) the input.
# Every mapping key becomes one alternative of a repeated group:
#   - keys are escaped with `re.escape` so any regex metacharacter is taken literally;
#   - longer keys are listed first so multi-codepoint symbols (e.g. a vowel followed
#     by a combining tilde) are tried before their single-codepoint prefix.
parts = [
    re.escape(symbol)
    for symbol in sorted(FRENCH_SAMPA_MAPPING, key=len, reverse=True)
]
IPA_R = re.compile("(" + "|".join(parts) + ")*")

# Map symbols to their SAMPA equivalent
def convert(pronunciation):
    """Convert an IPA pronunciation string to its SAMPA transcription.

    Args:
        pronunciation: IPA string; it must consist solely of symbols that are
            keys of FRENCH_SAMPA_MAPPING (an empty string is accepted).

    Returns:
        The SAMPA equivalents of the IPA symbols, concatenated in order.

    Raises:
        ValueError: if the string cannot be fully segmented into known symbols.
    """
    match = IPA_R.fullmatch(pronunciation)
    if match is None:
        # Without this guard the failure surfaces as an opaque AttributeError
        raise ValueError(
            f"Cannot convert {pronunciation!r}: it contains an unsupported IPA symbol"
        )
    # `captures(1)` returns every repetition matched by group 1, i.e. the
    # sequence of IPA symbols in order
    symbols = match.captures(1)
    return "".join(FRENCH_SAMPA_MAPPING[symbol] for symbol in symbols)

In [5]:
# Replace IPA by SAMPA
# NOTE: not idempotent, since re-running this cell feeds SAMPA output back into `convert`
fr_df = fr_df.assign(pronunciation=fr_df["pronunciation"].map(convert))
fr_df.head()

Unnamed: 0,text,language,pronunciation
0,accueil,fr,a.k9j
1,lire,fr,liR
2,encyclopédie,fr,a~.si.klO.pe.di
3,manga,fr,ma~.ga
4,ouvrage,fr,u.vRaZ


In [6]:
# Export the converted dataset as a UTF-8, tab-separated file with Unix line endings
fr_df.to_csv(
    "../fr.sampa.tsv",
    index=False,
    sep="\t",
    encoding="utf-8",
    # `line_terminator` was renamed `lineterminator` in pandas 1.5 and removed in 2.0
    lineterminator="\n",
)