In [36]:
# collapse all forms into one long string, then take set of chars
all_chars = set(''.join(shanghai['Form']))
print(f"Phonemes: {set(all_chars)}")

Phonemes: {'o', 'ʥ', 't', 'f', 'ȵ', 'ʑ', 'n', 'p', 'ɿ', 'k', '̍', 'd', 'ʦ', 'y', 'ɤ', 'ŋ', 'ɦ', 'ʨ', 'ʔ', 'i', 'b', 'v', 'l', 'ɑ', 'ə', 'ɪ', 'ø', 'm', 'h', 'ʰ', 'ɔ', 'ɕ', 'ɡ', '̃', 'e', 's', 'u', 'a', 'z'}


In [103]:
import pandas as pd
import numpy as np
import re

# === Load data ===
shanghai = pd.read_csv(
    "shanghai.csv",
    header=0,
    usecols=[1, 3],
    dtype={"form": "string", "聲調": "category"}
)

# Rename columns and drop missing rows
shanghai = shanghai.rename(columns={"聲韻": "Form", "聲調": "Tone"}).dropna()

# === Tone mapping ===
print(f"Tonal inventory: {shanghai['Tone'].unique()}")

tone_map = {
    "5": "H",
    "4": "M+",
    "3": "M",
    "2": "M-",
    "1": "L",
}

def map_tone_sequence(seq):
    """Convert digit tone labels to letter labels."""
    if pd.isna(seq):
        return None
    return "".join(tone_map.get(ch, "") for ch in str(seq))

shanghai["tone_letter"] = shanghai["Tone"].apply(map_tone_sequence)

# === Feature extraction ===
feature_system = {
    "vocalic":   {"a", "i", "u", "o", "e", "y", "ø", "ɑ", "ɔ", "ə", "ɤ", "ɪ", "ɿ"},
    "sonorant":  {"l", "m", "n", "ɲ", "ŋ", "ȵ"},
    "consonantal": {
        "b", "d", "f", "h", "k", "l", "m", "n", "p", "s", "t", "v", "z",
        "ɲ", "ȵ", "ɕ", "ŋ", "ɡ", "ɦ", "ʑ", "ʔ", "ʥ", "ʦ", "ʨ"
    },
    "cg": {"ʔ"},   # consonantal glottalization
    "voiced": {"b", "d", "ɡ", "z", "v", "l", "j", "ɦ", "ʥ"},
    "voiceless": {"f", "h", "k", "p", "s", "t", "ʦ", "ʨ"},
}

def get_feature(form):
    """Map characters in Form to a simplified structural code."""
    simplified = ""
    for ch in str(form):
        if ch in feature_system["vocalic"]:
            simplified += "V"
        elif ch in feature_system["cg"]:
            simplified += "G"
        elif ch in feature_system["consonantal"]:
            if ch in feature_system["sonorant"]:
                simplified += "N"
            elif ch in feature_system["voiced"]:
                simplified += "D"
            elif ch in feature_system["voiceless"]:
                simplified += "T"
    return simplified

shanghai["simple"] = shanghai["Form"].apply(get_feature)

# === Diacritic stripping ===
strip_diacritics = {
    "ʰ": "",   # superscript h
    "̃": "",   # combining tilde
    "̍": "",   # combining vertical line above
}

def replace_diacritics(s):
    s = str(s)
    for old, new in strip_diacritics.items():
        s = s.replace(old, new)
    return s

shanghai["Form"] = shanghai["Form"].apply(replace_diacritics)

print(f"Before filtering: {shanghai['simple'].unique()}")

# === Filtering ===。
# Remove forms ending with glottal stop or superheavy syllables
shanghai = shanghai[~shanghai["simple"].str.endswith(("VVG", "VVN"))]

print(f"After filtering: {shanghai['simple'].unique()}")

print(f"{shanghai['tone_letter'].unique()}")

Tonal inventory: ['陰入5', '陰去35', '陰平53', '陽入1', '陽去13']
Categories (5, object): ['陰入5', '陰去35', '陰平53', '陽入1', '陽去13']
Before filtering: ['VG' 'VVG' 'TVG' 'TVVG' 'VN' 'V' 'VV' 'VVN' 'TV' 'TVV' 'TVN' 'NV' 'NVN'
 'TVVN' 'NVV' 'DV' 'N' 'DVG' 'DVVG' 'NVG' 'NVVG' 'DVV' 'DVN' 'DVVN' 'NVVN'
 'DN']
After filtering: ['VG' 'TVG' 'VN' 'V' 'VV' 'TV' 'TVV' 'TVN' 'NV' 'NVN' 'NVV' 'DV' 'N' 'DVG'
 'NVG' 'DVV' 'DVN' 'DN']
['H', 'MH', 'HM', 'L', 'LM']
Categories (5, object): ['H', 'MH', 'HM', 'L', 'LM']


In [156]:
replace_tone = { "H": "á", "L": "à", "M": "a" }
def embed_tone_in_vowel(tone, syl):
    simplied_form = list(syl)  # make it mutable
    v_positions = [i for i, c in enumerate(simplied_form) if c == "V"]

    if len(tone) == len(v_positions):  # match
        for j, t in enumerate(tone):
            simplied_form[v_positions[j]] = replace_tone[t]
#
        # return "".join(simplied_form)
    
    elif len(tone) == 2 and len(v_positions) == 1:  # match
        if tone == "MH":
            simplied_form = syl.replace("V", "â")
        elif tone == "HM":
            simplied_form = syl.replace("V", "ǎ")
        elif tone == "LM":    
            simplied_form = syl.replace("V", "ā")
        
    # elif len(v_positions)== 0:  # no vowel
    #     #drop
    return "".join(simplied_form)


shanghai["with_tone"] = shanghai.apply(
    lambda row: embed_tone_in_vowel(row["tone_letter"], row["simple"]), axis=1
)

shanghai = shanghai.dropna()
shanghai_tone = pd.Series(shanghai['with_tone'].unique())
shanghai_tone.to_csv("shanghai_tonotactics.txt", index=False, header=False)


In [158]:

# symbols (from your inventory)
all_symbol = sorted(set(''.join(shanghai['with_tone'].fillna(''))))
print(f"Symbols: {all_symbol}")
# list of features you want
features = [
    "nasal", "voi", "cg","fall","Upper"
]

# create DataFrame with "0" everywhere
df = pd.DataFrame("0", index=features, columns=all_symbol)

# example: fill in a few cells manually
# make every symbol "-" first

df.loc["nasal"] = "-"
df.loc["nasal", ['N']] = "+"

df.loc["voi"] = "-"
df.loc["voi", ['D']] = "+"

df.loc["cg"] = "-"
df.loc["cg", ['G']] = "+"


df.loc["Fall"] = "0"
df.loc["Fall", ['â']] = "+"
df.loc["Fall", ['a', 'à', 'á', 'ā', 'ǎ']] = "-"

df.loc["Upper"] = "0"
df.loc["Upper", ['â','á','ǎ']] = "+"
df.loc["Upper", ['a','à','ā',]] = "-"

print(df)

df.to_csv("features.csv")


Symbols: ['D', 'G', 'N', 'T', 'a', 'à', 'á', 'â', 'ā', 'ǎ']
       D  G  N  T  a  à  á  â  ā  ǎ
nasal  -  -  +  -  -  -  -  -  -  -
voi    +  -  -  -  -  -  -  -  -  -
cg     -  +  -  -  -  -  -  -  -  -
fall   0  0  0  0  0  0  0  0  0  0
Upper  0  0  0  0  -  -  +  +  -  +
Fall   0  0  0  0  -  -  -  +  -  -
