In [114]:
from collections import Counter, defaultdict
import re

from IPython.core.display import HTML
import numpy as np
import pandas as pd
import pydub
import seaborn as sns
from tqdm.auto import tqdm
tqdm.pandas()

## Prepare data

### CEDICT

In [2]:
!wget -O - https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz | gunzip > cedict_ts.u8

--2023-10-04 10:53:24--  https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz
Resolving www.mdbg.net (www.mdbg.net)... 45.79.102.60
Connecting to www.mdbg.net (www.mdbg.net)|45.79.102.60|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3802936 (3.6M) [application/x-gzip]
Saving to: ‘STDOUT’


2023-10-04 10:53:26 (5.55 MB/s) - written to stdout [3802936/3802936]



In [3]:
from cedict_utils.cedict import CedictParser
parser = CedictParser()
parser.read_file("cedict_ts.u8")
entries = parser.parse()

cedict = defaultdict(set)
for entry in entries:
    cedict[entry.simplified].add(entry.pinyin.lower().replace("u:", "v"))

### AISHELL

In [37]:
df = pd.read_csv("train.sep.txt", header=None, engine="python")
df.columns = ["file", "pinyin", "text"]
df["file"] = df.file.str.rstrip(".wav")
df["pinyin"] = df.pinyin.str.strip()
df["text"] = df.text.str.strip()

### Preprocessing

#### Infer underlying tones

In [38]:
count = 0
found_sandhi = False
pinyin_re = re.compile(r"^([a-z]+)([0-9])$")
new_pinyin_list = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    if row.pinyin.strip().count(" ") != row.text.strip().count(" "):
        count += 1
        continue
        
    pinyin = row.pinyin.strip().split(" ")
    hanzi = row.text.strip().split(" ")
    
    new_pinyin = []
    for j, (pinyin_j, hanzi_j) in enumerate(zip(pinyin, hanzi)):
        segments, tone = pinyin_re.findall(pinyin_j)[0]
        cedict_j = cedict[hanzi_j]
        if tone == "2" and any(reading[:-1] == segments and reading[-1] == "3" for reading in cedict_j):
            # found_sandhi = True
            new_pinyin_j = segments + "6"
        else:
            new_pinyin_j = pinyin_j
        new_pinyin.append(new_pinyin_j)
    
    new_pinyin_list.append(" ".join(new_pinyin))
        
count

  0%|          | 0/63262 [00:00<?, ?it/s]

0

In [39]:
df["pinyin_underlying"] = new_pinyin_list
df["has_sandhi"] = df.pinyin != df.pinyin_underlying
df

Unnamed: 0,file,pinyin,text,pinyin_underlying,has_sandhi
0,SSB00050001,guang3 zhou1 nv3 da4 xue2 sheng1 deng1 shan1 s...,广 州 女 大 学 生 登 山 失 联 四 天 警 方 找 到 疑 似 女 尸,guang3 zhou1 nv3 da4 xue2 sheng1 deng1 shan1 s...,False
1,SSB00050002,zhun1 zhong4 ke1 xue2 gui1 lv4 de5 yao1 qiu2,尊 重 科 学 规 律 的 要 求,zhun1 zhong4 ke1 xue2 gui1 lv4 de5 yao1 qiu2,False
2,SSB00050003,qi1 lu4 wu2 ren2 shou4 piao4,七 路 无 人 售 票,qi1 lu4 wu2 ren2 shou4 piao4,False
3,SSB00050004,hei1 ke4 xuan1 bu4 zhi3 yao4 bo1 da2 mou3 yi2 ...,黑 客 宣 布 只 要 拨 打 某 一 个 电 话,hei1 ke4 xuan1 bu4 zhi3 yao4 bo1 da6 mou3 yi2 ...,True
4,SSB00050005,bei3 jing1 wan4 ke1 zhong3 jing1 li3 liu2 xiao...,北 京 万 科 总 经 理 刘 肖 的 观 点 极 具 代 表 性,bei3 jing1 wan4 ke1 zhong3 jing1 li3 liu2 xiao...,False
...,...,...,...,...,...
63257,SSB19560477,qing2 ge1 dui4 chang4,情 歌 对 唱,qing2 ge1 dui4 chang4,False
63258,SSB19560478,da4 jia1 shi4 bu2 shi4 hui4 geng4 jing1 xi3 yi...,大 家 是 不 是 会 更 惊 喜 一 些 呢,da4 jia1 shi4 bu2 shi4 hui4 geng4 jing1 xi3 yi...,False
63259,SSB19560479,huan4 yi4 yu4 zheng4 lao3 han4 shi2 ji3 nian2 ...,患 抑 郁 症 老 汉 十 几 年 足 不 出 户 身 背 五 四 次 违 章 记 录,huan4 yi4 yu4 zheng4 lao3 han4 shi2 ji3 nian2 ...,False
63260,SSB19560480,yi4 bai3 san1 shi2 wu3 wan4 jiu3 qian1 yi1 bai...,一 百 三 十 五 万 九 千 一 百 二 十 三,yi4 bai3 san1 shi2 wu3 wan4 jiu3 qian1 yi1 bai...,False


In [7]:
df.has_sandhi.mean()

0.295675128829313

#### Compute unigram and bigram vocabularies

In [8]:
vocabulary = Counter(df.pinyin.str.cat(sep=" ").strip().split(" "))
vocabulary.most_common(30)

[('de5', 24707),
 ('shi4', 15367),
 ('shi2', 10132),
 ('you3', 8592),
 ('yi4', 7129),
 ('me5', 6579),
 ('zai4', 6525),
 ('wo3', 5619),
 ('yi1', 5425),
 ('bu4', 5159),
 ('ren2', 5003),
 ('shen2', 4888),
 ('le5', 4801),
 ('san1', 4782),
 ('er4', 4763),
 ('gong1', 4747),
 ('yi2', 4585),
 ('zhong1', 4521),
 ('qi1', 4455),
 ('ji4', 4396),
 ('he2', 4285),
 ('dao4', 4226),
 ('cheng2', 3928),
 ('si4', 3893),
 ('jia1', 3710),
 ('yuan2', 3689),
 ('ta1', 3677),
 ('you2', 3645),
 ('li4', 3628),
 ('bai3', 3560)]

In [9]:
idx2syl = list(vocabulary.keys())
syl2idx = {syl: idx for idx, syl in enumerate(idx2syl)}

In [10]:
bigrams = np.zeros((len(idx2syl), len(idx2syl)))
for _, row in tqdm(df.iterrows(), total=len(df)):
    toks = row.pinyin.strip().split(" ")
    hanzi_toks = row.text.strip().split(" ")
    for tok1, tok2, hanzi_tok1, hanzi_tok2 in zip(toks, toks[1:], hanzi_toks, hanzi_toks[1:]):
        bigrams[syl2idx[tok1], syl2idx[tok2]] += 1

  0%|          | 0/63262 [00:00<?, ?it/s]

In [11]:
bigram_df = pd.DataFrame(bigrams)
bigram_df.index = idx2syl
bigram_df.index.name = "tok1"
bigram_df.columns = idx2syl
bigram_df.columns.name = "tok2"
bigram_df = pd.DataFrame(bigram_df.unstack().reset_index(name="count"))
bigram_df

Unnamed: 0,tok2,tok1,count
0,guang3,guang3,0.0
1,guang3,zhou1,0.0
2,guang3,nv3,0.0
3,guang3,da4,0.0
4,guang3,xue2,1.0
...,...,...,...
3400331,cong5,xingr2,0.0
3400332,cong5,nan1,0.0
3400333,cong5,liur1,0.0
3400334,cong5,ning1,0.0


In [12]:
bigram_df[["tok1_base", "tok1_tone"]] = bigram_df.tok1.str.extract("([^0-9]+)([0-9])")
bigram_df[["tok2_base", "tok2_tone"]] = bigram_df.tok2.str.extract("([^0-9]+)([0-9])")

In [13]:
bigram_df["tok1_tone"] = bigram_df.tok1_tone.astype(int)
bigram_df["tok2_tone"] = bigram_df.tok2_tone.astype(int)

In [14]:
bigram_df[(bigram_df.tok2_tone == 3) & (bigram_df.tok1_tone == 3) & (bigram_df["count"] > 0)].sort_values("count")

Unnamed: 0,tok2,tok1,count,tok1_base,tok1_tone,tok2_base,tok2_tone
533269,yu3,zheng3,1.0,zheng,3,yu,3
682468,bao3,chan3,1.0,chan,3,bao,3
682487,bao3,wang3,1.0,wang,3,bao,3
682511,bao3,zhu3,1.0,zhu,3,bao,3
682518,bao3,yi3,1.0,yi,3,bao,3
...,...,...,...,...,...,...,...
108853,wu3,bai3,35.0,bai,3,wu,3
638081,jiu3,bai3,51.0,bai,3,jiu,3
136733,you3,qu3,55.0,qu,3,you,3
136507,you3,dian3,61.0,dian,3,you,3


#### Compute unigram and bigram vocabularies with underlying pinyin

In [15]:
vocabulary = Counter(df.pinyin_underlying.str.cat(sep=" ").strip().split(" "))
vocabulary.most_common(30)

[('de5', 24707),
 ('shi4', 15367),
 ('shi2', 10020),
 ('you3', 8592),
 ('yi4', 7129),
 ('me5', 6579),
 ('zai4', 6525),
 ('wo3', 5619),
 ('yi1', 5425),
 ('bu4', 5159),
 ('ren2', 4999),
 ('shen2', 4857),
 ('le5', 4801),
 ('san1', 4782),
 ('er4', 4763),
 ('gong1', 4747),
 ('zhong1', 4521),
 ('qi1', 4455),
 ('ji4', 4396),
 ('he2', 4285),
 ('yi2', 4228),
 ('dao4', 4226),
 ('cheng2', 3928),
 ('si4', 3893),
 ('jia1', 3710),
 ('ta1', 3677),
 ('yuan2', 3655),
 ('li4', 3628),
 ('bai3', 3560),
 ('xian4', 3531)]

In [16]:
idx2syl = list(vocabulary.keys())
syl2idx = {syl: idx for idx, syl in enumerate(idx2syl)}

In [17]:
bigrams = np.zeros((len(idx2syl), len(idx2syl)))
for _, row in tqdm(df.iterrows(), total=len(df)):
    toks = row.pinyin_underlying.strip().split(" ")
    hanzi_toks = row.text.strip().split(" ")
    for tok1, tok2, hanzi_tok1, hanzi_tok2 in zip(toks, toks[1:], hanzi_toks, hanzi_toks[1:]):
        bigrams[syl2idx[tok1], syl2idx[tok2]] += 1

  0%|          | 0/63262 [00:00<?, ?it/s]

In [18]:
bigram_df = pd.DataFrame(bigrams)
bigram_df.index = idx2syl
bigram_df.index.name = "tok1"
bigram_df.columns = idx2syl
bigram_df.columns.name = "tok2"
bigram_df = pd.DataFrame(bigram_df.unstack().reset_index(name="count"))
bigram_df

Unnamed: 0,tok2,tok1,count
0,guang3,guang3,0.0
1,guang3,zhou1,0.0
2,guang3,nv3,0.0
3,guang3,da4,0.0
4,guang3,xue2,1.0
...,...,...,...
4235359,cong5,xingr2,0.0
4235360,cong5,nan1,0.0
4235361,cong5,liur1,0.0
4235362,cong5,ning1,0.0


In [19]:
bigram_df

Unnamed: 0,tok2,tok1,count
0,guang3,guang3,0.0
1,guang3,zhou1,0.0
2,guang3,nv3,0.0
3,guang3,da4,0.0
4,guang3,xue2,1.0
...,...,...,...
4235359,cong5,xingr2,0.0
4235360,cong5,nan1,0.0
4235361,cong5,liur1,0.0
4235362,cong5,ning1,0.0


In [20]:
bigram_df[["tok1_base", "tok1_tone"]] = bigram_df.tok1.str.extract("([^0-9]+)([0-9])")
bigram_df[["tok2_base", "tok2_tone"]] = bigram_df.tok2.str.extract("([^0-9]+)([0-9])")

In [21]:
bigram_df["tok1_tone"] = bigram_df.tok1_tone.astype(int)
bigram_df["tok2_tone"] = bigram_df.tok2_tone.astype(int)

---

## Background checks

### What are 3-3 instances in the corpus?

In [128]:
df_33 = df[df.pinyin_underlying.str.contains(r"\w+3 \w+3")]

In [137]:
HTML("<p>" + "</p><p>".join(df_33.sample(n=10).apply(lambda row: f"<tt>{row.file}</tt><br/>{row.pinyin_underlying}<br/>{row.text}<br/> <audio controls src='intermediates/AISHELL/mfa_input/{row.file[:7]}/{row.file}.wav'></audio>", axis=1)) + "</p>")

## Stimulus search

Find cases of bigrams A+B where B is T3 and either
1. A is T2 and there is an attested segmentally identical T3, or
2. A is T3 and there is an attested segmentally identical T2

### Case 1

In [22]:
case1_df = bigram_df.loc[(bigram_df["tok1_tone"] == 2) & (bigram_df["tok2_tone"] == 3)]

case1_alternates = bigram_df.set_index(["tok1", "tok2"]).index.intersection(list(zip(case1_df.tok1.str.replace("2", "6"), case1_df.tok2)))
case1_alternates = bigram_df.set_index(["tok1", "tok2"]).loc[case1_alternates]

case1_df = pd.merge(case1_df, case1_alternates[["count", "tok1_base"]].reset_index(),
                    how="inner", left_on=["tok1_base", "tok2"], right_on=["tok1_base", "tok2"],
                    suffixes=("", "_alternate"))
case1_df["count_mean"] = (case1_df["count"] + case1_df.count_alternate) / 2
case1_df["count_ratio"] = case1_df["count"] / (case1_df["count_alternate"] + 1e-4)
case1_df

Unnamed: 0,tok2,tok1,count,tok1_base,tok1_tone,tok2_base,tok2_tone,tok1_alternate,count_alternate,count_mean,count_ratio
0,guang3,xue2,1.0,xue,2,guang,3,xue6,0.0,0.5,10000.0
1,guang3,lian2,0.0,lian,2,guang,3,lian6,0.0,0.0,0.0
2,guang3,yi2,0.0,yi,2,guang,3,yi6,0.0,0.0,0.0
3,guang3,qiu2,0.0,qiu,2,guang,3,qiu6,0.0,0.0,0.0
4,guang3,wu2,0.0,wu,2,guang,3,wu6,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
81743,huai3,ku2,0.0,ku,2,huai,3,ku6,0.0,0.0,0.0
81744,huai3,ruan2,0.0,ruan,2,huai,3,ruan6,0.0,0.0,0.0
81745,huai3,gang2,0.0,gang,2,huai,3,gang6,0.0,0.0,0.0
81746,huai3,tui2,0.0,tui,2,huai,3,tui6,0.0,0.0,0.0


In [113]:
# case1_df[case1_df.count_ratio.between(1/4, 4)].sort_values("count_mean", ascending=False).head(50)

### Case 2

In [24]:
case2_df = bigram_df.loc[(bigram_df["tok1_tone"] == 6) & (bigram_df["tok2_tone"] == 3)]

case2_alternates = bigram_df.set_index(["tok1", "tok2"]).index.intersection(list(zip(case1_df.tok1.str.replace("6", "2"), case1_df.tok2)))
case2_alternates = bigram_df.set_index(["tok1", "tok2"]).loc[case2_alternates]

case2_df = pd.merge(case2_df, case2_alternates[["count", "tok1_base"]].reset_index(),
                    how="inner", left_on=["tok1_base", "tok2"], right_on=["tok1_base", "tok2"],
                    suffixes=("", "_alternate"))
case2_df["count_mean"] = (case2_df["count"] + case2_df.count_alternate) / 2
case2_df["count_ratio"] = case2_df["count"] / (case2_df["count_alternate"] + 1e-4)
case2_df

Unnamed: 0,tok2,tok1,count,tok1_base,tok1_tone,tok2_base,tok2_tone,tok1_alternate,count_alternate,count_mean,count_ratio
0,guang3,da6,0.0,da,6,guang,3,da2,1.0,0.5,0.0
1,guang3,wu6,0.0,wu,6,guang,3,wu2,0.0,0.0,0.0
2,guang3,si6,0.0,si,6,guang,3,si2,0.0,0.0,0.0
3,guang3,tie6,0.0,tie,6,guang,3,tie2,0.0,0.0,0.0
4,guang3,bai6,0.0,bai,6,guang,3,bai2,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
81743,huai3,mo6,0.0,mo,6,huai,3,mo2,0.0,0.0,0.0
81744,huai3,tan6,0.0,tan,6,huai,3,tan2,0.0,0.0,0.0
81745,huai3,zhai6,0.0,zhai,6,huai,3,zhai2,0.0,0.0,0.0
81746,huai3,luan6,0.0,luan,6,huai,3,luan2,0.0,0.0,0.0


In [112]:
# case2_df[case2_df.count_ratio.between(1/4, 4)].sort_values("count_mean", ascending=False).head(50)

## Play

In [36]:
aligned_df = pd.read_csv("intermediates/AISHELL/words_aligned.csv").set_index(["file", "word_idx"])
# Drop sentence-initial, sentence-final silence; renumber word idxs
aligned_df = aligned_df[~aligned_df.text.isna()]
aligned_df.index.set_names("drop", level="word_idx", inplace=True)
aligned_df["word_idx"] = aligned_df.groupby("file").cumcount()
aligned_df = aligned_df.reset_index("drop").set_index("word_idx", append=True)
aligned_df

Unnamed: 0_level_0,Unnamed: 1_level_0,drop,start,stop,text
file,word_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SSB11250133,0,1,0.28,0.46,澳
SSB11250133,1,2,0.46,0.59,大
SSB11250133,2,3,0.59,0.98,利
SSB11250133,3,4,0.98,1.07,亚
SSB11250133,4,5,1.07,1.18,门
...,...,...,...,...,...
SSB11610110,16,21,5.90,6.10,赛
SSB11610110,17,22,6.10,6.22,一
SSB11610110,18,23,6.22,6.64,举
SSB11610110,19,24,6.64,6.88,得


In [51]:
# Check that sentence lengths agree
counts_from_aligned = aligned_df.groupby("file").size()
counts_from_df = df.set_index("file").text.str.count(" ") + 1
assert (counts_from_aligned == counts_from_df).all(), "Count mismatches between two data annotations"

In [107]:
show = case1_df[case1_df.count_ratio.between(1/4, 4)].sort_values("count_mean", ascending=False)
# show = case2_df[case2_df.count_ratio.between(1/4, 4)].sort_values("count_mean", ascending=False).head(5)
rendered = []

sentences_per_item = 5
render_items = 2

item_df = []

# For the given bigram item, extract specific sentences + audio clips
for i, (_, row) in enumerate(tqdm(show.iterrows(), total=len(show))):
    show_i_left = df[df.pinyin_underlying.str.contains(f"{row.tok1} {row.tok2}")].head(sentences_per_item)
    show_i_right = df[df.pinyin_underlying.str.contains(f"{row.tok1_alternate} {row.tok2}")].head(sentences_per_item)
    
    if len(show_i_left) == 0 or len(show_i_right) == 0:
        continue
        
    def prepare_instance(instance_row, tok1, tok2):
        item_start_char = instance_row.pinyin_underlying.index(f"{tok1} {tok2}")
        item_end_char = item_start_char + len(f"{tok1} {tok2}")
        item_start_idx = instance_row.pinyin_underlying[:item_start_char].count(" ")
        item_end_idx = instance_row.pinyin_underlying[:item_end_char].count(" ") + 1
        hanzi_toks = instance_row.text.split(" ")

        # Compute metadata
        preceding_syllable_forms_word = item_start_idx > 0 and ''.join(hanzi_toks[item_start_idx - 1:item_start_idx + 1]) in cedict

        # Retrieve alignment data
        item_aligned = aligned_df.loc[instance_row.file]
        
        # Retrieve containing file
        speaker_id = instance_row.file[:7]
        sound_path = f"intermediates/AISHELL/mfa_input/{speaker_id}/{instance_row.file}.wav"
        sound = pydub.AudioSegment.from_wav(sound_path)
        # Slice sound
        buffer_size = 1 # buffer size in seconds
        slice_onset = int(max(0, item_aligned.iloc[item_start_idx].start - buffer_size) * 1000)
        slice_offset = int((item_aligned.iloc[item_end_idx - 1].stop + buffer_size) * 1000)
        sound_sliced = sound[slice_onset:slice_offset]
        sound_file = f"intermediates/AISHELL/render/{instance_row.file}_{row.tok1_base}{row.tok2}.wav"
        sound_sliced.export(f"{sound_file}", format="wav")

        return {
            "file": instance_row.file,
            "hanzi": " ".join(hanzi_toks),
            "pinyin": instance_row.pinyin,
            "pinyin_underlying": instance_row.pinyin_underlying,
            "tok1": tok1,
            "tok2": tok2,
            "start_token_idx": item_start_idx,
            "end_token_idx": item_end_idx,
            "start_time": item_aligned.iloc[item_start_idx].start,
            "end_time": item_aligned.iloc[item_end_idx - 1].stop,

            "preceding_syllable_forms_word": preceding_syllable_forms_word,
            
            "slice_start": slice_onset,
            "slice_end": slice_offset,
            "slice_file": sound_file,
        }

    def render_instance(info):
        # Render text
        html_toks = info["hanzi"].split(" ")
        for idx in range(info["start_token_idx"], info["end_token_idx"]):
            html_toks[idx] = f"<strong>{html_toks[idx]}</strong>"
        if info["preceding_syllable_forms_word"]:
            for idx in range(info["start_token_idx"] - 1, info["start_token_idx"] + 1):
                html_toks[idx] = f"<span style='color: red'>{html_toks[idx]}</span>"
        pinyin = info["pinyin_underlying"].replace(f"{info['tok1']} {info['tok2']}", f"<strong>{info['tok1']} {info['tok2']}</strong>")
        
        return f"<tr><td style='font-family: monospace; font-size: 75%'>" + \
               f"{info['file']} <audio controls src='{info['slice_file']}'></audio></td>" + \
               f"<td><p>{''.join(html_toks)}<br/><small>{pinyin}</small></p></td></tr>"

    left_data = [prepare_instance(item, row.tok1, row.tok2) for _, item in show_i_left.iterrows()]
    right_data = [prepare_instance(item, row.tok1_alternate, row.tok2) for _, item in show_i_right.iterrows()]

    data = pd.concat([pd.DataFrame.from_records(left_data), pd.DataFrame.from_records(right_data)],
                     names=["case"], keys=[2, 6])
    item_df.append(data)
    
    if i < render_items:
        left_rendered = [render_instance(data_i) for data_i in left_data]
        right_rendered = [render_instance(data_i) for data_i in right_data]
        
        left_rendered = "<table>" + "".join(left_rendered) + "</table>"
        right_rendered = "<table>" + "".join(right_rendered) + "</table>"
        
        rendered.append(f"<tr><th colspan='2' style='text-align: center'>{row.tok1_base} {row.tok2}<br/><span style='font-weight: normal'>T6 T3: {row['count']}; T2 T3: {row.count_alternate}</span></th></tr>"
                        f"<tr><td>{left_rendered}</td><td>{right_rendered}</td></tr>")

HTML("<table>" + "".join(rendered) + "</table>")

# Presentation conditions

# Single syllable
# S: guo6
# Q: guo6 vs guo2

# A+B syllables
# S: guo6 you3
# Q: guo6 vs guo2

# A+B+right context (is this worth anything??)
# S: guo6 you3 shen2 me5
# Q: guo6 vs guo2

# Left context+A+B+Right context
# S: gan1 su4 de5 shui6 guo6 you3 shen2 me5
# Q: guo6 vs guo2

# Ways of accounting for bigram

# Add left context up to containing constituent of A/A+B
# Maybe easier to code: add left characters s.t. left context + A forms a word in the lexicon
# S: shui6 guo6 you3 shen2 me5
# Q: ..

# ----

# Variables to code in the df for controls and/or augmented designs:
# - do A+B form a word? (check lexicon)
# - do (A-1) + A form a word? (check lexicon)

  0%|          | 0/512 [00:00<?, ?it/s]

guo you3 T6 T3: 27.0; T2 T3: 17.0,guo you3 T6 T3: 27.0; T2 T3: 17.0.1
SSB00090085 文明帝国有什么 wen2 ming2 di4 guo2 you3 shen2 me5SSB00330050 国有独资类医院有什么 guo2 you3 du2 zi1 lei4 yi1 yuan4 you3 shen2 me5SSB03390017 他描述其公司为国有的国际化公司 ta1 miao2 shu4 qi2 gong1 si1 wei4 guo2 you3 de5 guo2 ji4 hua4 gong1 si1SSB03800333 东芝中国有限公司公关宣传部对记者表示 dong1 zhi1 zhong1 guo2 you3 xian4 gong1 si1 gong1 guan1 xuan1 chuan2 bu4 dui4 ji4 zhe6 biao3 shi4SSB04260384 连续有国有和民营资本投资设立保险公司 lian2 xu4 you3 guo2 you3 he2 min2 ying2 zi1 ben3 tou2 zi1 she4 li4 bao6 xian3 gong1 si1,SSB00330082 甘肃的水果有什么 gan1 su4 de5 shui6 guo6 you3 shen2 me5SSB00570108 目前大中华区苹果有一家直营店 mu4 qian2 da4 zhong1 hua2 qu1 ping2 guo6 you3 yi4 jia1 zhi2 ying2 dian4SSB00730128 其中样本检测结果有疑问的运动员多达八百人 qi2 zhong1 yang4 ben6 jian3 ce4 jie2 guo6 you3 yi2 wen4 de5 yun4 dong4 yuan2 duo1 da2 ba1 bai3 ren2SSB02000391 如果有更多的开发商参与进来 ru2 guo6 you3 geng4 duo1 de5 kai1 fa1 shang1 can1 yu4 jin4 lai2SSB03930398 如果有人理解的话 ru2 guo6 you3 ren2 li6 jie3 de5 hua4
guo shou3 T6 T3: 15.0; T2 T3: 17.0,guo shou3 T6 T3: 15.0; T2 T3: 17.0
SSB01390493 杜虹被媒体称为中国首例冰冻自己等待复生的逝者 du4 hong2 bei4 mei2 ti3 cheng1 wei2 zhong1 guo2 shou3 lie4 bin1 dong4 zi4 ji6 den3 dai4 fu4 sheng1 de5 si4 zhe3SSB02730319 女子用滴滴刷单获利三万被批捕系全国首例 nv6 zi3 yong4 di1 di1 shua1 dan1 huo4 li4 san1 wan4 bei4 pi1 bu3 xi4 quan2 guo2 shou3 li4SSB03850134 万科是全国首个住宅销售超过千亿的房企 wan4 ke1 shi4 quan2 guo2 shou3 ge4 zhu4 zai2 xiao1 shou4 cao1 guo4 qian1 yi4 de5 fan2 qi3SSB03850435 这是中国首次推动成立国际性金融业组织 zhe4 shi4 zhong1 guo2 shou3 ci4 tui1 dong4 cheng2 li4 guo2 ji4 xing4 jin1 rong2 ye4 zu3 zhi1SSB04260365 万科希望成为中国首席素质教育产业服务商 wan4 ke1 xi1 wang4 cheng2 wei2 zhong1 guo2 shou3 xi2 su4 zhi4 jiao4 yu4 chan3 ye4 fu2 wu4 shang1,SSB01220183 发现挎包和两部苹果手机被偷 fa1 xian4 kua4 bao1 he2 liang3 bu4 ping2 guo6 shou3 ji1 bei4 tou1SSB01970059 新款苹果手机的产量更大 xin1 kuan3 ping2 guo6 shou3 ji1 de5 chan3 liang4 geng4 da4SSB01970095 苹果手机维持高利润率的原因是什么 ping2 guo6 shou3 ji1 wei2 chi2 gao1 li4 run4 lv4 de5 yuan2 yin1 shi4 shen2 me5SSB01970226 苹果手机的利润占有率上涨 ping2 guo6 shou3 ji1 de5 li4 run4 zhan4 you3 lv4 shang4 zhang3SSB03150020 苹果手机的代工厂是台湾广达电脑公司 ping2 guo6 shou3 ji1 de5 dai4 gong1 chang3 shi4 tai2 wan1 guang3 da2 dian4 nao3 gong1 si1

0,1
SSB00090085,文明帝国有什么 wen2 ming2 di4 guo2 you3 shen2 me5
SSB00330050,国有独资类医院有什么 guo2 you3 du2 zi1 lei4 yi1 yuan4 you3 shen2 me5
SSB03390017,他描述其公司为国有的国际化公司 ta1 miao2 shu4 qi2 gong1 si1 wei4 guo2 you3 de5 guo2 ji4 hua4 gong1 si1
SSB03800333,东芝中国有限公司公关宣传部对记者表示 dong1 zhi1 zhong1 guo2 you3 xian4 gong1 si1 gong1 guan1 xuan1 chuan2 bu4 dui4 ji4 zhe6 biao3 shi4
SSB04260384,连续有国有和民营资本投资设立保险公司 lian2 xu4 you3 guo2 you3 he2 min2 ying2 zi1 ben3 tou2 zi1 she4 li4 bao6 xian3 gong1 si1

0,1
SSB00330082,甘肃的水果有什么 gan1 su4 de5 shui6 guo6 you3 shen2 me5
SSB00570108,目前大中华区苹果有一家直营店 mu4 qian2 da4 zhong1 hua2 qu1 ping2 guo6 you3 yi4 jia1 zhi2 ying2 dian4
SSB00730128,其中样本检测结果有疑问的运动员多达八百人 qi2 zhong1 yang4 ben6 jian3 ce4 jie2 guo6 you3 yi2 wen4 de5 yun4 dong4 yuan2 duo1 da2 ba1 bai3 ren2
SSB02000391,如果有更多的开发商参与进来 ru2 guo6 you3 geng4 duo1 de5 kai1 fa1 shang1 can1 yu4 jin4 lai2
SSB03930398,如果有人理解的话 ru2 guo6 you3 ren2 li6 jie3 de5 hua4

0,1
SSB01390493,杜虹被媒体称为中国首例冰冻自己等待复生的逝者 du4 hong2 bei4 mei2 ti3 cheng1 wei2 zhong1 guo2 shou3 lie4 bin1 dong4 zi4 ji6 den3 dai4 fu4 sheng1 de5 si4 zhe3
SSB02730319,女子用滴滴刷单获利三万被批捕系全国首例 nv6 zi3 yong4 di1 di1 shua1 dan1 huo4 li4 san1 wan4 bei4 pi1 bu3 xi4 quan2 guo2 shou3 li4
SSB03850134,万科是全国首个住宅销售超过千亿的房企 wan4 ke1 shi4 quan2 guo2 shou3 ge4 zhu4 zai2 xiao1 shou4 cao1 guo4 qian1 yi4 de5 fan2 qi3
SSB03850435,这是中国首次推动成立国际性金融业组织 zhe4 shi4 zhong1 guo2 shou3 ci4 tui1 dong4 cheng2 li4 guo2 ji4 xing4 jin1 rong2 ye4 zu3 zhi1
SSB04260365,万科希望成为中国首席素质教育产业服务商 wan4 ke1 xi1 wang4 cheng2 wei2 zhong1 guo2 shou3 xi2 su4 zhi4 jiao4 yu4 chan3 ye4 fu2 wu4 shang1

0,1
SSB01220183,发现挎包和两部苹果手机被偷 fa1 xian4 kua4 bao1 he2 liang3 bu4 ping2 guo6 shou3 ji1 bei4 tou1
SSB01970059,新款苹果手机的产量更大 xin1 kuan3 ping2 guo6 shou3 ji1 de5 chan3 liang4 geng4 da4
SSB01970095,苹果手机维持高利润率的原因是什么 ping2 guo6 shou3 ji1 wei2 chi2 gao1 li4 run4 lv4 de5 yuan2 yin1 shi4 shen2 me5
SSB01970226,苹果手机的利润占有率上涨 ping2 guo6 shou3 ji1 de5 li4 run4 zhan4 you3 lv4 shang4 zhang3
SSB03150020,苹果手机的代工厂是台湾广达电脑公司 ping2 guo6 shou3 ji1 de5 dai4 gong1 chang3 shi4 tai2 wan1 guang3 da2 dian4 nao3 gong1 si1


In [141]:
len(case1_df[(case1_df["count"] > 2) & (case1_df["count_alternate"] > 2)])

68

In [142]:
case1_df[(case1_df["count"] > 2) & (case1_df["count_alternate"] > 2)]

Unnamed: 0,tok2,tok1,count,tok1_base,tok1_tone,tok2_base,tok2_tone,tok1_alternate,count_alternate,count_mean,count_ratio
858,zhi3,yi2,7.0,yi,2,zhi,3,yi6,3.0,5.0,2.333256
866,zhi3,wei2,13.0,wei,2,zhi,3,wei6,5.0,9.0,2.599948
1296,bei3,yu2,8.0,yu,2,bei,3,yu6,4.0,6.0,1.999950
1724,li3,yu2,3.0,yu,2,li,3,yu6,3.0,3.0,0.999967
2366,bai3,yu2,8.0,yu,2,bai,3,yu6,3.0,5.5,2.666578
...,...,...,...,...,...,...,...,...,...,...,...
25979,shui3,li2,3.0,li,2,shui,3,li6,4.0,3.5,0.749981
26340,si3,chang2,3.0,chang,2,si,3,chang6,3.0,3.0,0.999967
30409,cai3,fu2,8.0,fu,2,cai,3,fu6,3.0,5.5,2.666578
30829,hao3,mei2,3.0,mei,2,hao,3,mei6,19.0,11.0,0.157894


In [108]:
item_df = pd.concat(item_df)
item_df

Unnamed: 0_level_0,Unnamed: 1_level_0,file,hanzi,pinyin,pinyin_underlying,tok1,tok2,start_token_idx,end_token_idx,start_time,end_time,preceding_syllable_forms_word,slice_start,slice_end,slice_file
case,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,0,SSB00090085,文 明 帝 国 有 什 么,wen2 ming2 di4 guo2 you3 shen2 me5,wen2 ming2 di4 guo2 you3 shen2 me5,guo2,you3,3,5,0.93,1.21,True,0,2210,intermediates/AISHELL/render/SSB00090085_guoyo...
2,1,SSB00330050,国 有 独 资 类 医 院 有 什 么,guo2 you3 du2 zi1 lei4 yi1 yuan4 you3 shen2 me5,guo2 you3 du2 zi1 lei4 yi1 yuan4 you3 shen2 me5,guo2,you3,0,2,0.26,0.86,False,0,1859,intermediates/AISHELL/render/SSB00330050_guoyo...
2,2,SSB03390017,他 描 述 其 公 司 为 国 有 的 国 际 化 公 司,ta1 miao2 shu4 qi2 gong1 si1 wei4 guo2 you3 de...,ta1 miao2 shu4 qi2 gong1 si1 wei4 guo2 you3 de...,guo2,you3,7,9,1.60,1.91,False,600,2910,intermediates/AISHELL/render/SSB03390017_guoyo...
2,3,SSB03800333,东 芝 中 国 有 限 公 司 公 关 宣 传 部 对 记 者 表 示,dong1 zhi1 zhong1 guo2 you3 xian4 gong1 si1 go...,dong1 zhi1 zhong1 guo2 you3 xian4 gong1 si1 go...,guo2,you3,3,5,0.84,1.21,True,0,2210,intermediates/AISHELL/render/SSB03800333_guoyo...
2,4,SSB04260384,连 续 有 国 有 和 民 营 资 本 投 资 设 立 保 险 公 司,lian2 xu4 you3 guo2 you3 he2 min2 ying2 zi1 be...,lian2 xu4 you3 guo2 you3 he2 min2 ying2 zi1 be...,guo2,you3,3,5,1.28,1.64,False,280,2639,intermediates/AISHELL/render/SSB04260384_guoyo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,0,SSB05440080,可 我 们 通 过 晨 跑 却 让 生 理 机 能 永 葆 青 春,ke2 wo3 men5 tong1 guo4 chen2 pao3 que4 rang4 ...,ke6 wo3 men5 tong1 guo4 chen2 pao3 que4 rang4 ...,yong6,bao3,13,15,2.60,2.92,False,1600,3920,intermediates/AISHELL/render/SSB05440080_yongb...
2,0,SSB04270377,其 中 三 水 上 月 新 建 住 宅 成 交 套 数儿 环 比 增 幅 五 区 第 一,qi2 zhong1 san1 shui3 shang4 yue4 xin1 jian4 z...,qi2 zhong1 san1 shui3 shang4 yue4 xin1 jian4 z...,fu2,wu3,17,19,4.41,4.68,True,3410,5680,intermediates/AISHELL/render/SSB04270377_fuwu3...
6,0,SSB11250489,金 府 五 金 机 电 城 金 府 路,jin1 fu2 wu3 jin1 ji1 dian4 cheng2 jin1 fu3 lu4,jin1 fu6 wu3 jin1 ji1 dian4 cheng2 jin1 fu3 lu4,fu6,wu3,1,3,0.43,0.87,False,0,1870,intermediates/AISHELL/render/SSB11250489_fuwu3...
2,0,SSB01450418,娱 乐 节 目 昨 日 暂 停 五 天 抗 战 剧 避 免 手 撕 鬼 子,yu2 le4 jie2 mu4 zuo2 ri4 zan4 ting2 wu3 tian1...,yu2 le4 jie2 mu4 zuo2 ri4 zan4 ting2 wu3 tian1...,ting2,wu3,7,9,1.76,2.51,True,760,3510,intermediates/AISHELL/render/SSB01450418_tingw...


In [None]:
item_df.to_csv("intermediates/AISHELL/item_df.csv")