# Analyze IPA charset

The official French Wiktionary usage of IPA symbols is documented in the [IPA table of Annexe:Prononciation/français](https://fr.wiktionary.org/wiki/Annexe:Prononciation/fran%C3%A7ais#Tableau_de_l%E2%80%99API).

In [1]:
from collections import Counter

import regex as re

import pandas as pd

In [2]:
# Read the tab-separated French dump.
# na_filter=False turns off missing-value detection, so every field is kept
# exactly as written in the file.
df = pd.read_table('../fr.raw.tsv', na_filter=False)
df.head()

Unnamed: 0,text,language,pronunciation
0,accueil,fr,a.kœj
1,lire,fro,liʁ
2,lire,fr,liʁ
3,encyclopédie,fr,ɑ̃.si.klɔ.pe.di
4,manga,en,ˈmæŋ.ɡə


In [3]:
# Retain only the rows tagged "fr"; the language column is constant after
# the filter, so it is dropped.
fr_df = df.loc[df["language"].eq("fr")].drop(columns="language")

In [4]:
# Tally every Unicode code point used across the French pronunciations.
# Combining marks are separate code points, so they get their own entry.
ipa_characters = Counter("".join(fr_df["pronunciation"]))

In [5]:
# Display every (character, count) pair, ordered from most to least frequent
ipa_characters.most_common()

[('.', 4282317),
 ('ʁ', 1506139),
 ('a', 1176332),
 ('i', 947977),
 ('e', 912426),
 ('ɔ', 755352),
 ('t', 736713),
 ('s', 675157),
 ('̃', 650478),
 ('ɛ', 604601),
 ('ə', 583862),
 ('l', 524632),
 ('k', 480223),
 ('z', 425464),
 ('d', 424553),
 ('j', 420889),
 ('n', 360140),
 ('p', 330666),
 ('m', 319391),
 ('ɑ', 296781),
 ('b', 249885),
 ('o', 206807),
 ('f', 206212),
 ('y', 201957),
 ('ɡ', 143374),
 ('u', 137120),
 ('v', 136112),
 ('ʒ', 114572),
 (' ', 106084),
 ('ʃ', 101838),
 ('w', 63937),
 ('œ', 32460),
 ('ɥ', 27246),
 ('ɲ', 22710),
 ('ø', 19413),
 ('‿', 16207),
 (')', 4936),
 ('(', 4933),
 ('ŋ', 1485),
 ('h', 641),
 ('ː', 488),
 ('ˈ', 312),
 ('r', 303),
 ('g', 135),
 ('ɪ', 89),
 ('·', 87),
 ('ʎ', 62),
 ('͡', 58),
 ('ʔ', 54),
 ('̥', 50),
 ('-', 48),
 ('ǝ', 45),
 ('ε', 43),
 ('x', 39),
 ('ʀ', 30),
 ('ɹ', 29),
 ('ɾ', 22),
 ('ˌ', 21),
 (',', 20),
 ('ʊ', 20),
 ('ʰ', 18),
 ('ɜ', 15),
 ('ẽ', 14),
 ('?', 13),
 ('æ', 12),
 ('ð', 12),
 ('/', 12),
 ('\\', 10),
 ('ɣ', 8),
 ('c', 8),
 ('ʌ', 8)

In [6]:
# Inspect the entries whose pronunciation contains "h", one of the
# low-frequency symbols in the tally (literal substring match).
fr_df.loc[fr_df["pronunciation"].str.contains("h", regex=False)]

Unnamed: 0,text,pronunciation
1296,bahá’í,ba.hɑ.i
10621,hagarde,ha.ɡaʁd
10622,haineuse,hɛ.nøz
10623,haletante,hal.tɑ̃t
10625,hardie,haʁ.di
10628,hasardeuse,ha.zaʁ.døz
10651,houleuse,hu.løz
10656,huppée,hy.pe
10664,hâbleuse,hɑ.bløz
10665,hâtive,ha.tiv


In [7]:
# Inspect the entries whose pronunciation contains "ʔ", one of the
# low-frequency symbols in the tally (literal substring match).
fr_df.loc[fr_df["pronunciation"].str.contains("ʔ", regex=False)]

Unnamed: 0,text,pronunciation
6258,déhiscence,de.ʔi.sɑ̃s
9596,de dehors,də də.ʔɔʁ
11716,par-dehors,paʁ də.ʔɔʁ
16077,haha,a.ʔa
16145,rimer comme hallebarde et miséricorde,ʁime kɔm ʔal.baʁ.d‿e mi.ze.ʁi.kɔʁd
101697,tapisserie à personnages,ta.pis.ʁi ʔa pɛʁ.sɔ.naʒ
101870,conseil aulique,kɔ̃.sɛj ʔo.lik
119145,indo-hellénique,ɛ̃.do ʔɛ.le.nik
121350,abrahamiques,a.bʁa.ʔa.mik
193735,djihads,dʒi.ʔad
