In [1]:
import logging
import os
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import dtale
import util

In [2]:
# Root directory of the zhongguoyuyan CSV export. Set the ZHONGGUOYUYAN_CSV
# environment variable to run against a checkout elsewhere; when it is unset
# the original machine-specific path is used.
prefix = os.environ.get('ZHONGGUOYUYAN_CSV', r'C:\users\lerna\git\zhongguoyuyan\csv')
dialect_path = os.path.join(prefix, 'dialect')

# Both tables are read with their first CSV column as the index.
location = pd.read_csv(os.path.join(dialect_path, 'location.csv'), index_col=0)
char = pd.read_csv(os.path.join(prefix, 'words.csv'), index_col=0)

# NOTE(review): util.get_dialect is defined outside this notebook; it is used
# below as a string Series aligned with `location` -- confirm.
dialect = util.get_dialect(location)

# Keep the locations whose dialect label ends with '官话' or equals '晋方言'.
# na=False makes a missing label count as "no match" explicitly, so the mask
# is always boolean.
mandarin = location[dialect.str.endswith('官话', na=False) | (dialect == '晋方言')]

In [3]:
# Browse the location table with the derived dialect label as an extra column.
location_with_dialect = pd.concat([location, dialect.to_frame(name='dialect')], axis=1)
dtale.show(location_with_dialect)







In [4]:
# Load the data files of the selected locations from `dialect_path`.
# Later cells index the result by character id on the rows and by
# (location, part) pairs on the columns.
# NOTE(review): util.load_data is defined outside this notebook -- confirm that
# the column order follows `mandarin.index`, which a later cell relies on.
data = util.load_data(dialect_path, mandarin.index, transpose=True)

2022-10-07 23:16:12,114 - INFO     - loading 463 data files ...
2022-10-07 23:16:12,115 - INFO     - loading C:\users\lerna\git\zhongguoyuyan\csv\dialect\06K06mb01dz.csv ...
2022-10-07 23:16:12,170 - INFO     - loading C:\users\lerna\git\zhongguoyuyan\csv\dialect\06K10mb01dz.csv ...
2022-10-07 23:16:12,227 - INFO     - loading C:\users\lerna\git\zhongguoyuyan\csv\dialect\06K09mb01dz.csv ...
2022-10-07 23:16:12,272 - INFO     - loading C:\users\lerna\git\zhongguoyuyan\csv\dialect\06K11mb01dz.csv ...
2022-10-07 23:16:12,308 - INFO     - loading C:\users\lerna\git\zhongguoyuyan\csv\dialect\06K07mb01dz.csv ...
2022-10-07 23:16:12,345 - INFO     - loading C:\users\lerna\git\zhongguoyuyan\csv\dialect\06K08mb01dz.csv ...
2022-10-07 23:16:12,387 - INFO     - loading C:\users\lerna\git\zhongguoyuyan\csv\dialect\06F98mb01dz.csv ...
2022-10-07 23:16:12,425 - INFO     - loading C:\users\lerna\git\zhongguoyuyan\csv\dialect\06F90mb01dz.csv ...
2022-10-07 23:16:12,460 - INFO     - loading C:\users\le

In [5]:
# Inspect the loaded columns of a single location next to the character table.
sample_location = '03E82'
dtale.show(pd.concat([char, data.loc[:, sample_location]], axis=1))







In [6]:
# For each syllable part, turn every location's column into a token-count
# matrix and glue the per-location matrices together side by side.
#   codes[part]  -- horizontally stacked count matrices, one block per column
#   limits[part] -- column offsets: block i spans limits[part][i]:limits[part][i + 1]
codes = {}
limits = {}

for part in ('initial', 'finals', 'tone'):
    part_columns = data.loc[:, pd.IndexSlice[:, part]]

    blocks = []
    vocab_sizes = []
    for column in part_columns:
        # Tokens are whitespace-separated and case is preserved.
        vectorizer = CountVectorizer(lowercase=False, token_pattern=r'\S+', dtype=np.int32)
        blocks.append(vectorizer.fit_transform(data.loc[:, column]))
        vocab_sizes.append(len(vectorizer.vocabulary_))

    # Running total of the vocabulary sizes, with a leading zero.
    offsets = np.zeros(len(vocab_sizes) + 1, dtype=np.int64)
    offsets[1:] = np.cumsum(vocab_sizes, dtype=np.int64)

    codes[part] = sp.hstack(blocks)
    limits[part] = offsets

In [43]:
# Rule tables, one per syllable part. Every entry is a pair of tuples of
# character ids (row labels of `char`); the two tuples become the `cid1` and
# `cid2` columns of the `rule` frame built below.
# NOTE(review): presumably each pair asks whether the two character groups are
# pronounced alike in a dialect -- confirm the intended reading.

# Rules on initials. The same group appears in several entries, each time
# paired with a different counterpart group.
initial_rule = (
    ((14, 29, 135, 140, 171, 198, 555, 663, 850, 859, 889), (58, 134, 169, 297, 488, 491, 497, 544, 552, 553, 705, 810, 818, 857)),
    ((14, 29, 135, 140, 171, 198, 555, 663, 850, 859, 889), (13, 59, 139, 213, 299, 498, 528, 571, 842, 903)),
    ((61, 199, 277, 860), (58, 134, 169, 297, 488, 491, 497, 544, 552, 553, 705, 810, 818, 857)),
    ((61, 199, 277, 860), (13, 59, 139, 213, 299, 498, 528, 571, 842, 903)),
    ((120, 282, 327, 334, 430, 463, 532, 709, 894, 923), (1, 17, 62, 131, 156, 175, 280, 326, 332, 372, 387, 427, 428, 461, 533, 558, 794, 796, 804, 832, 890, 891, 917, 943)),
    ((120, 282, 327, 334, 430, 463, 532, 709, 894, 923), (2, 157, 333, 374, 434, 462, 531, 573, 623, 708, 716, 893, 904, 920, 922)),
    ((3, 121, 159, 160, 216, 283, 335, 389, 559, 574, 896, 905, 924, 925, 937), (1, 17, 62, 131, 156, 175, 280, 326, 332, 372, 387, 427, 428, 461, 533, 558, 794, 796, 804, 832, 890, 891, 917, 943)),
    ((3, 121, 159, 160, 216, 283, 335, 389, 559, 574, 896, 905, 924, 925, 937), (2, 157, 333, 374, 434, 462, 531, 573, 623, 708, 716, 893, 904, 920, 922)),
    ((12, 320, 414, 659, 699, 958), (330, 362, 449, 458, 538, 539, 551, 596, 641, 657, 691, 698, 745, 756, 863, 973)),
    ((12, 320, 414, 659, 699, 958), (597, 621, 881)),
    ((364, 365, 512, 660), (330, 362, 449, 458, 538, 539, 551, 596, 641, 657, 691, 698, 745, 756, 863, 973)),
    ((364, 365, 512, 660), (597, 621, 881)),
)
# Rules on finals.
finals_rule = (
    ((198, 213), (155,)),
    ((152, 153), (202, 203, 221)),
    ((858, 859, 860, 861, 862, 868, 869), (810,)),
    ((204, 223, 238, 239, 240), (202, 203, 221)),
    ((204, 223, 238, 239, 240), (222, 235, 236, 237)),
    ((714, 715, 786), (570, 571, 572)),
    ((714, 715, 786), (276, 277, 278, 279)),
    ((802,), (841, 842, 843, 853)),
    ((809,), (848,)),
    ((822, 824), (651,)),
    ((802,), (818, 903))
)
# Rules on tones.
tone_rule = (
    ((19, 105, 148, 159, 240, 256, 277, 283, 345, 364, 559, 660, 873, 924), (3, 45, 61, 106, 121, 133, 186, 192, 216, 236, 335, 365, 473, 568, 860, 896, 925)),
    ((456, 518, 574, 576, 578, 650, 652, 703, 753, 806, 821, 843, 885, 887, 905, 937, 948, 962), (11, 12, 14, 29, 32, 44, 47, 78, 87, 135, 140, 143, 147, 171, 180, 198, 207, 239))
)

# Collect the three rule tables into one frame: one row per rule, with the two
# character-id groups in `cid1` / `cid2` and the syllable part in `part`.
rule_tables = {'initial': initial_rule, 'finals': finals_rule, 'tone': tone_rule}
rule = pd.concat(
    [
        pd.DataFrame(pairs, columns=['cid1', 'cid2']).assign(part=label)
        for label, pairs in rule_tables.items()
    ],
    axis=0,
    ignore_index=True
)

# Label each rule "<part>:<first character of group 1>=<first character of group 2>".
lead_char1 = char.loc[rule['cid1'].str[0], 'item'].values
lead_char2 = char.loc[rule['cid2'].str[0], 'item'].values
rule.insert(0, 'name', rule['part'] + ':' + lead_char1 + '=' + lead_char2)

# One row per (rule, character id); the index still carries the rule number.
cid1 = rule['cid1'].explode()
cid2 = rule['cid2'].explode()

# Spell out every group as the concatenation of its characters.
for target, ids in (('char1', cid1), ('char2', cid2)):
    rule[target] = char.loc[ids, 'item'].groupby(ids.index).agg(''.join)

In [44]:
# Browse the rule table (name, id groups, part, spelled-out character groups).
dtale.show(rule)



In [45]:
def _group_counts(code, groups):
    """Sum the encoded rows of every character group.

    Parameters
    ----------
    code : sparse matrix with one row per row of `data`
    groups : iterable of tuples of character ids (row labels of `data`)

    Returns
    -------
    Dense array of shape (number of groups, number of columns of `code`);
    row k is the column-wise sum of the rows selected by groups[k].
    """
    return np.stack(
        [
            # np.asarray(...).ravel() flattens the row sum whether the sparse
            # container hands back an np.matrix or a plain ndarray, and keeps
            # the result one-dimensional even when there is a single column.
            np.asarray(code[[data.index.get_loc(c) for c in g]].sum(axis=0)).ravel()
            for g in groups
        ],
        axis=0
    )


# For every rule and every location: L1-normalise the token counts of both
# character groups within that location's column block and take the sum of
# their element-wise product.
homos = []

for part in ('initial', 'finals', 'tone'):
    r = rule[rule['part'] == part]
    if r.shape[0] > 0:
        code = codes[part]
        lim = limits[part]

        code1 = _group_counts(code, r['cid1'])
        code2 = _group_counts(code, r['cid2'])

        homos.append(pd.DataFrame(
            np.stack(
                [np.sum(
                    normalize(code1[:, lim[i]:lim[i + 1]], norm='l1')
                        * normalize(code2[:, lim[i]:lim[i + 1]], norm='l1'),
                    axis=1
                ) for i in range(lim.shape[0] - 1)],
                axis=0
            ),
            # NOTE(review): assumes the column blocks of `codes[part]` follow
            # the order of `mandarin.index` -- confirm against util.load_data.
            index=mandarin.index,
            columns=r.index
        ))

# Put the rules back in their original order and label the columns by rule name.
homo = pd.concat(homos, axis=1).sort_index(axis=1)
homo.columns = rule.loc[homo.columns, 'name']

In [46]:
# Show the per-location rule scores next to each location's administrative
# fields and dialect label.
location_info = mandarin[['province', 'city', 'country']]
dialect_label = dialect.loc[mandarin.index].to_frame(name='dialect')
dtale.show(pd.concat([location_info, dialect_label, homo], axis=1))



In [47]:
# One row per character id used by any rule, listing the rule numbers and the
# comma-joined rule names it takes part in.
rule_char = (
    pd.concat([cid1, cid2], axis=0)
    .to_frame(name='cid')
    .rename_axis(index='rules')
    .reset_index()
    .assign(names=lambda membership: rule.loc[membership['rules'], 'name'].values)
    .groupby('cid')
    .agg({'rules': tuple, 'names': ','.join})
)
rule_char.insert(0, 'char', char['item'])


def _syllable_part(part):
    """Return the `part` columns of the rule characters, keyed by location only."""
    return data.loc[rule_char.index, pd.IndexSlice[:, part]].droplevel(axis=1, level=1)


# Concatenate initial + finals + tone cell by cell and attach the result to
# the character table.
pronunciation = pd.concat([
    rule_char,
    _syllable_part('initial') + _syllable_part('finals') + _syllable_part('tone'),
], axis=1)

In [48]:
# Browse the rule characters together with their per-location values
# (initial, finals and tone concatenated).
dtale.show(pronunciation)

