In [9]:
import pandas as pd 
import numpy as np 

In [3]:

shanghai = pd.read_csv(
    'shanghai.csv',
    header=0,
    na_values=['', 'NA', 'NaN'],
    usecols=[1, 3, 4, 5],
    dtype={'form': 'string', '聲調': 'category', '字元': 'string', '字數': 'Int64'}
)

# Rename columns
shanghai = shanghai.rename(columns={'聲韻': 'Form', '聲調': 'Tone', '字元': 'Char', '字數': 'Count'})
shanghai.to_csv('shanghai_cleaned.csv', index=False)
# Check info




In [26]:
# get all phonemes
shanghai = pd.read_csv('shanghai_cleaned.csv')
phonemes = set(''.join(shanghai['Form'].dropna()))
vowel_phonemes = set('aeiouyøɑɔəɤɪɿ')
cons = phonemes - vowel_phonemes

print("All phonemes:", ''.join(sorted(phonemes)))
print("Vowel phonemes:", ''.join(sorted(vowel_phonemes)))
print("Consonant phonemes:", ''.join(sorted(cons))) 

# Define diacritics to remove
diacritics = {"ʰ", "̃", "̍", "ː", "ˑ", "̥", "̬"}

# Example: filter from a set
phonemes = [p for p in phonemes if p not in diacritics]

print("Filtered phonemes:", phonemes)



All phonemes: abdefhiklmnopstuvyzøŋȵɑɔɕəɡɤɦɪɿʑʔʥʦʨʰ̃̍
Vowel phonemes: aeiouyøɑɔəɤɪɿ
Consonant phonemes: bdfhklmnpstvzŋȵɕɡɦʑʔʥʦʨʰ̃̍
Filtered phonemes: ['ø', 'ɑ', 'ɿ', 'v', 'n', 'z', 'd', 'i', 'ŋ', 'b', 'f', 'ɦ', 'ʔ', 'e', 'u', 'ʦ', 's', 'ɤ', 'ʑ', 'ɔ', 'ʥ', 'ə', 'l', 'ɡ', 'ȵ', 'k', 'ʨ', 'y', 'a', 'ɪ', 'ɕ', 't', 'm', 'h', 'o', 'p']


In [36]:
import pandas as pd

# Example phoneme inventory
# phonemes = ["p","t","k","m","n","a","i","u","#"]  

# --- Define feature sets ---
feature_system = {
    "vocalic":   {"a","i","u","o","e","y","ø","ɑ","ɔ","ə","ɤ","ɪ","ɿ"},
    "sonorant":  {"m","n","ɲ","ȵ","l","j","a","i","u","o","e","y","ø","ɑ","ɔ","ə","ɤ","ɪ","ɿ"},
    "consonantal": {"b", "d", "f", "h", "k", "l", "m", "n", "p", "s", "t", "v", "z", 
                    "ɲ", "ȵ", "ɕ", "ɡ", "ɦ", "ʑ", "ʔ", "ʥ", "ʦ", "ʨ"},
    "cg":        {"ʔ"},   # consonantal glottalization
    "voice":     {"b","d","ɡ","z","v","m","n","ɲ",'ŋ',"ȵ","l","j",
                  "a","i","u","o","e","y","ø","ɑ","ɔ","ə","ɤ","ɪ","ɿ"}
}

# --- Build feature chart ---
data = {}
for feature, segs in feature_system.items():
    row = []
    for p in phonemes:
        if p in segs:
            row.append("+")            
        else:
            row.append("-")
    data[feature] = row

# --- DataFrame: features as rows, phonemes as columns ---
df = pd.DataFrame(data, index=phonemes).T

df.to_csv("feature_chart.csv", index=True)


df


Unnamed: 0,ø,ɑ,ɿ,v,n,z,d,i,ŋ,b,...,ʨ,y,a,ɪ,ɕ,t,m,h,o,p
vocalic,+,+,+,-,-,-,-,+,-,-,...,-,+,+,+,-,-,-,-,+,-
sonorant,+,+,+,-,+,-,-,+,-,-,...,-,+,+,+,-,-,+,-,+,-
consonantal,-,-,-,+,+,+,+,-,-,+,...,+,-,-,-,+,+,+,+,-,+
cg,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
voice,+,+,+,+,+,+,+,+,+,+,...,-,+,+,+,-,-,+,-,+,-


In [37]:
shanghai

Unnamed: 0,Form,Tone,Char,Count,Features
0,aʔ,陰入5,"阿, 鴨, 押, 壓",4.0,"{'voi': '++', 'syl': '+-', 'glottal': '-+'}"
1,iaʔ,陰入5,約,1.0,"{'voi': '+++', 'syl': '++-', 'glottal': '--+'}"
2,uaʔ,陰入5,挖,1.0,"{'voi': '+++', 'syl': '++-', 'glottal': '--+'}"
3,oʔ,陰入5,"惡, 握, 屋, 沃, 噩, 幄, 齷, 渥",8.0,"{'voi': '++', 'syl': '+-', 'glottal': '-+'}"
4,ioʔ,陰入5,郁,1.0,"{'voi': '+++', 'syl': '++-', 'glottal': '--+'}"
...,...,...,...,...,...
675,ɦoŋ,陽去13,"虹, 弘, 宏, 紅, 洪, 鴻, 泓",7.0,"{'voi': '+++', 'syl': '-++', 'glottal': '---'}"
676,ɦioŋ,陽去13,"勻, 允, 云, 雲, 韻, 運, 暈, 孕, 螢, 榮, 營, 塋, 熊, 雄, 融, 容...",30.0,"{'voi': '++++', 'syl': '-+++', 'glottal': '----'}"
677,ɦəl,陽去13,"兒, 爾, 二, 貳, 而, 耳, 餌, 邇, 洱, 珥",10.0,"{'voi': '+++', 'syl': '-+-', 'glottal': '---'}"
678,ɦm̍,陽去13,"畝, 嘸",2.0,"{'voi': '++_', 'syl': '--_', 'glottal': '--_'}"
