In [93]:
import pandas as pd
import numpy as np
import re

# === Load data ===
# Read only the columns at positions 1 and 3 of the CSV; after the rename
# below they are exposed as "Form" and "Tone".
shanghai = pd.read_csv(
    "shanghai.csv",
    header=0,
    usecols=[1, 3],
    # dtype keys must be the CSV header names. The form column is headed
    # "聲韻" (it is renamed to "Form" below); the earlier key "form" matched
    # no column, so the requested string dtype was never applied.
    dtype={"聲韻": "string", "聲調": "category"},
)

# Give the columns English names and drop rows with a missing form or tone.
shanghai = shanghai.rename(columns={"聲韻": "Form", "聲調": "Tone"}).dropna()

# === Tone mapping ===
# print(f"Tonal inventory: {shanghai['Tone'].unique()}")

# Tone digit -> letter label ("5" is written H, "1" is written L).
tone_map = {"5": "H", "4": "h", "3": "M", "2": "l", "1": "L"}


def map_tone_sequence(seq):
    """Translate the tone digits inside a label into letter labels.

    Every character of ``str(seq)`` that is a key of ``tone_map`` is replaced
    by its letter; all other characters (for example the tone-category name
    that precedes the digits) are discarded.

    Returns ``None`` when ``seq`` is missing, otherwise the joined letters
    (possibly an empty string).
    """
    if pd.isna(seq):
        return None
    letters = [tone_map[symbol] for symbol in str(seq) if symbol in tone_map]
    return "".join(letters)

# Letter-coded tone for every row (digits translated, other characters dropped).
shanghai["tone_letter"] = shanghai["Tone"].apply(map_tone_sequence)

# Report the inventory of distinct characters that occur in any form.
print(
    "all characters: "
    + "".join(sorted(set("".join(shanghai["Form"].dropna().unique()))))
)

# === Feature extraction ===
# Character classes used to reduce a transcribed form to a structural code.
feature_system = {
    "vocalic":   {"a", "i", "u", "o", "e", "y", "ø", "ɑ", "ɔ", "ə", "ɤ", "ɪ", "ɿ"},
    "sonorant":  {"l", "m", "n", "ɲ", "ŋ", "ȵ"},
    "consonantal": {
        "b", "d", "f", "h", "k", "l", "m", "n", "p", "s", "t", "v", "z",
        "ɲ", "ȵ", "ɕ", "ŋ", "ɡ", "ɦ", "ʑ", "ʔ", "ʥ", "ʦ", "ʨ"
    },
    "cg": {"ʔ", "ɦ"},   # consonantal glottalization
    # Every non-sonorant member of "consonantal" must also appear in exactly
    # one of the two voicing sets below, otherwise get_feature() emits nothing
    # for it. "ʑ" and "ɕ" were previously missing and were silently dropped
    # from the structural code (e.g. "ɕi" came out as "V" instead of "TV").
    "voiced": {"b", "d", "ɡ", "z", "v", "l", "j", "ɦ", "ʥ", "ʑ"},
    "voiceless": {"f", "h", "k", "p", "s", "t", "ʦ", "ʨ", "ʔ", "ɕ"},
}

def get_feature(form):
    """Map each character of ``form`` to a one-letter structural code.

    Codes, checked in this order:
      V - vocalic
      G - glottal ("cg") and voiced
      g - glottal ("cg") and voiceless
      N - consonantal and sonorant
      D - consonantal, voiced, not sonorant
      T - consonantal and voiceless

    Characters that belong to none of these classes (for example diacritics)
    contribute nothing to the result. ``form`` is converted with ``str()``
    first. Returns the concatenated codes as a string.
    """
    classes = feature_system
    codes = []
    for ch in str(form):
        if ch in classes["vocalic"]:
            codes.append("V")
        elif ch in classes["cg"] and ch in classes["voiced"]:
            codes.append("G")
        elif ch in classes["cg"] and ch in classes["voiceless"]:
            codes.append("g")
        elif ch in classes["consonantal"]:
            if ch in classes["sonorant"]:
                codes.append("N")
            elif ch in classes["voiced"]:
                # Sonorants were handled just above, so this is an obstruent.
                codes.append("D")
            elif ch in classes["voiceless"]:
                codes.append("T")
    return "".join(codes)

# Structural skeleton of every form, derived from the form as loaded
# (characters outside every feature class add nothing to the skeleton).
shanghai["simple"] = shanghai["Form"].map(get_feature)

# === Diacritic stripping ===
# Diacritic -> replacement text (all are removed outright).
strip_diacritics = {
    "ʰ": "",   # superscript h
    "̃": "",   # combining tilde
    "̍": "",   # combining vertical line above
}


def replace_diacritics(s):
    """Return ``str(s)`` with each key of ``strip_diacritics`` replaced by its value.

    The replacements are applied one after another, in the dictionary's order.
    """
    cleaned = str(s)
    for mark in strip_diacritics:
        cleaned = cleaned.replace(mark, strip_diacritics[mark])
    return cleaned

# Strip the diacritics from the forms; the skeletons in "simple" were already
# derived before this point.
shanghai["Form"] = shanghai["Form"].apply(replace_diacritics)

print(f"All syllable structures in the corpus:\n {shanghai['simple'].unique()}")

print("The corpus shows some superheavy structures inconsistent with literature such as:")
four_slot = shanghai["simple"].str.len() == 4
print(shanghai.loc[four_slot, "simple"].unique())

# === Filtering ===
# Drop every syllable whose skeleton ends in two vowels plus a coda
# (VVG, VVN or VVg). Single-vowel syllables with a coda are kept.
# (Filtering on "length == 4" alone would leave the three-slot VVg / VVN
# skeletons in.)
long_rime_with_coda = shanghai["simple"].str.endswith(("VVG", "VVN", "VVg"))
shanghai = shanghai[~long_rime_with_coda]

print(f"After filtering: {shanghai['simple'].unique()}")

print(f"{shanghai['tone_letter'].unique()}")
shanghai

all characters: abdefhiklmnopstuvyzøŋȵɑɔɕəɡɤɦɪɿʑʔʥʦʨʰ̃̍
All syllable structures in the corpus:
 ['Vg' 'VVg' 'TVg' 'TVVg' 'VN' 'V' 'VV' 'VVN' 'TV' 'TVV' 'TVN' 'NV' 'NVN'
 'TVVN' 'NVV' 'GV' 'N' 'DVg' 'DVVg' 'NVg' 'NVVg' 'GVg' 'GVVg' 'DV' 'DVV'
 'DVN' 'DVVN' 'NVVN' 'GVN' 'GVV' 'GVVN' 'GN']
The corpus shows some superheavy structures inconsistent with literature such as:
['TVVg' 'TVVN' 'DVVg' 'NVVg' 'GVVg' 'DVVN' 'NVVN' 'GVVN']
After filtering: ['Vg' 'TVg' 'VN' 'V' 'VV' 'TV' 'TVV' 'TVN' 'NV' 'NVN' 'NVV' 'GV' 'N' 'DVg'
 'NVg' 'GVg' 'DV' 'DVV' 'DVN' 'GVN' 'GVV' 'GN']
['H', 'MH', 'HM', 'L', 'LM']
Categories (5, object): ['H', 'MH', 'HM', 'L', 'LM']


Unnamed: 0,Form,Tone,tone_letter,simple
0,aʔ,陰入5,H,Vg
3,oʔ,陰入5,H,Vg
5,əʔ,陰入5,H,Vg
9,paʔ,陰入5,H,TVg
10,poʔ,陰入5,H,TVg
...,...,...,...,...
673,ɦiŋ,陽去13,LM,GVN
675,ɦoŋ,陽去13,LM,GVN
677,ɦəl,陽去13,LM,GVN
678,ɦm,陽去13,LM,GN


In [94]:
# === Embed tone in vowel ===
def embed_tone_in_vowel(tone, syl):
    """Fuse a tone label into a syllable skeleton and space-separate the symbols.

    The number of ``"V"`` slots in ``syl`` and the length of ``tone`` select
    which part of the skeleton carries the tone:

    * one V, one-letter tone  : V  -> á (H), à (L), a (M)
    * one V, two-letter tone  : V  -> â (MH), ǎ (HM), ā (LM)
    * two Vs, two-letter tone : VV -> â (MH), ǎ (HM), ā (LM)
    * no V                    : N  -> ń (MH), ň (HM), ņ (LM)

    Any other combination, or a tone missing from the relevant table, leaves
    the skeleton untouched. The result is returned with a single space
    between symbols, e.g. ``("H", "TVg") -> "T á g"``.
    """
    vowel_slots = syl.count("V")
    tone_size = len(tone)

    # Pick the substring that carries the tone and the tone -> symbol table.
    if vowel_slots == 1 and tone_size == 1:
        carrier, marks = "V", {"H": "á", "L": "à", "M": "a"}
    elif vowel_slots == 1 and tone_size == 2:
        carrier, marks = "V", {"MH": "â", "HM": "ǎ", "LM": "ā"}
    elif vowel_slots == 2 and tone_size == 2:
        carrier, marks = "VV", {"MH": "â", "HM": "ǎ", "LM": "ā"}
    elif vowel_slots == 0:
        carrier, marks = "N", {"MH": "ń", "HM": "ň", "LM": "ņ"}
    else:
        carrier, marks = "", {}

    toned = syl.replace(carrier, marks[tone]) if tone in marks else syl
    return " ".join(toned)
# Display the frame; it is unchanged at this point because the function above
# has only been defined, not applied yet.
shanghai

Unnamed: 0,Form,Tone,tone_letter,simple
0,aʔ,陰入5,H,Vg
3,oʔ,陰入5,H,Vg
5,əʔ,陰入5,H,Vg
9,paʔ,陰入5,H,TVg
10,poʔ,陰入5,H,TVg
...,...,...,...,...
673,ɦiŋ,陽去13,LM,GVN
675,ɦoŋ,陽去13,LM,GVN
677,ɦəl,陽去13,LM,GVN
678,ɦm,陽去13,LM,GN


In [95]:

# Fuse each row's tone into its skeleton. The two columns are iterated
# directly instead of using row-wise ``DataFrame.apply(axis=1)``: this always
# yields a flat list of strings, whereas row-wise apply can return a DataFrame
# when the frame is empty, which cannot be assigned to a single column.
shanghai["with_tone"] = [
    embed_tone_in_vowel(tone, skeleton)
    for tone, skeleton in zip(shanghai["tone_letter"], shanghai["simple"])
]

# Keep each distinct toned skeleton once and export them, one per line,
# without index or header.
shanghai_tone = pd.Series(shanghai["with_tone"].unique())
shanghai_tone.to_csv("~/Documents/BUFIA/data/shanghai_data.txt", index=False, header=False)

shanghai_tone

0       á g
1     T á g
2       â N
3         â
4       T â
5     T â N
6       N â
7     N â N
8       G â
9         ň
10        ǎ
11      ǎ N
12      T ǎ
13    T ǎ N
14      N ǎ
15    N ǎ N
16      G ǎ
17    D à g
18    N à g
19    G à g
20      D ā
21    D ā N
22      N ā
23    N ā N
24      ā N
25        ā
26    G ā N
27      G ā
28      G ņ
dtype: object

In [98]:

# Symbol inventory: every character used in the toned skeletons (spaces
# removed), plus "ń", which is added by hand.
all_symbol = sorted(
    set("".join(shanghai["with_tone"].dropna().unique()).replace(" ", "")) | {"ń"}
)

print(f"Symbols: {all_symbol}")

features = [
    "nas",    # nasal
    "cons",   # consonantal
    "son",    # sonorant
    "voi",    # voiced
    "cg",     # consonantal glottalization
    "H",      # upper register tone
    "L",      # presumably lower register tone ("+" on à, ā, ņ)
    "short",
]

# Feature-by-symbol table, initialised to "0" in every cell.
df = pd.DataFrame("0", index=features, columns=all_symbol)

# One entry per feature row: (feature, default value, {value: symbols that
# receive it}). "fall" is not in `features`; assigning to it appends a row.
feature_spec = [
    ("cons",  "-", {"+": ['T', 'D', 'G', 'N', 'g', 'ń', 'ņ', 'ň']}),
    ("son",   "+", {"-": ['T', 'D', 'G', 'g']}),
    ("nas",   "-", {"+": ['N', 'ń', 'ņ', 'ň']}),
    ("voi",   "+", {"-": ['T', 'g']}),
    ("cg",    "-", {"+": ['G', 'g']}),
    ("H",     "0", {"+": ['á', 'ǎ', 'ň'],
                    "-": ['à', 'â', 'ā', 'ń', 'ņ']}),
    ("L",     "0", {"+": ['à', 'ā', 'ņ'],
                    "-": ['á', 'ǎ', 'â', 'ń', 'ň']}),
    ("short", "0", {"+": ['á', 'à'],
                    "-": ['â', 'ń', 'ņ', 'ň', 'ǎ', 'ā']}),
    ("fall",  "0", {"+": ['ň', 'ǎ'],
                    "-": ['â', 'ń', 'á', 'à', 'ņ', 'ā']}),
]

for feature, default, overrides in feature_spec:
    # Fill the whole row with its default, then overwrite the listed symbols.
    df.loc[feature] = default
    for value, symbols in overrides.items():
        df.loc[feature, symbols] = value

print(df)

df.to_csv("~/Documents/BUFIA/data/shanghai_features.csv")

Symbols: ['D', 'G', 'N', 'T', 'g', 'à', 'á', 'â', 'ā', 'ń', 'ņ', 'ň', 'ǎ']
       D  G  N  T  g  à  á  â  ā  ń  ņ  ň  ǎ
nas    -  -  +  -  -  -  -  -  -  +  +  +  -
cons   +  +  +  +  +  -  -  -  -  +  +  +  -
son    -  -  +  -  -  +  +  +  +  +  +  +  +
voi    +  +  +  -  -  +  +  +  +  +  +  +  +
cg     -  +  -  -  +  -  -  -  -  -  -  -  -
H      0  0  0  0  0  -  +  -  -  -  -  +  +
L      0  0  0  0  0  +  -  -  +  -  +  -  -
short  0  0  0  0  0  +  +  -  -  -  -  -  -
fall   0  0  0  0  0  -  -  -  -  -  -  +  +


In [87]:
# Look up rows of one particular type: toned skeleton starting with "D"
# and tone label starting with "H".
onset_is_D = shanghai["with_tone"].str.startswith("D")
tone_starts_H = shanghai["tone_letter"].str.startswith("H")
filtered = shanghai.loc[
    onset_is_D & tone_starts_H,
    ["Form", "tone_letter", "Tone", "with_tone"],
]
print(f"After filtering: {filtered.values.tolist()}")


After filtering: []
