In [41]:
import os
import logging
import pandas as pd
import numpy as np
import scipy.sparse as sp
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import normalize, OneHotEncoder, PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, mutual_info_classif, VarianceThreshold, SelectKBest
from sklearn.metrics.pairwise import linear_kernel, cosine_distances
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from util import clean_data

logging.getLogger().setLevel(logging.INFO)
np.random.seed(1234567)

In [61]:
def load_data(prefix, ids, suffix='mb01dz.csv'):
    '''加载方言字音数据'''

    logging.info('loading {} data files ...'.format(len(ids)))

    load_ids = []
    dialects = []
    for id in ids:
        try:
            fname = os.path.join(prefix, id + suffix)
            logging.info(f'loading {fname} ...')
            d = pd.read_csv(
                fname,
                encoding='utf-8',
                index_col='iid',
                usecols=('iid', 'initial', 'finals', 'tone'),
                dtype={ 'iid': int, 'initial': str, 'finals': str, 'tone': str}
            )
        except Exception as e:
            logging.error('cannot load file {}: {}'.format(fname, e))
            continue

        d = clean_data(d)
        dialects.append(d)
        load_ids.append(id)

    logging.info('done. {} data loaded'.format(len(dialects)))

    data = pd.concat(
        [d.groupby(d.index).agg(' '.join) for d in dialects],
        axis=1,
        keys=load_ids
    ).dropna(axis=0, how='all').fillna('')

    logging.info(f'load data of {data.shape[0]} characters x {len(dialects)} dialects, ' \
        + f'{sum(d.shape[0] for d in dialects)} valid records')
    return data

def encode_data(data):
    features = []
    limits = []
    for col in data.columns.levels[1]:
        d = data.loc[:, (slice(None), col)]

        fea = []
        cat = []
        for i in range(d.shape[1]):
            cv = CountVectorizer(lowercase=False, token_pattern=r'\S+', binary=True)
            fea.append(cv.fit_transform(d.iloc[:, i]))
            cat.append(len(cv.vocabulary_))

        features.append(sp.hstack(fea, format='csr'))
        limits.append(np.insert(np.cumsum(cat), 0, 0))

    return features, limits

def get_cluster(features, p=0.99, cluster=200):
    vt = VarianceThreshold(p * (1 - p))
    kmeans = KMeans(cluster).fit(vt.fit_transform(features))
    logging.info(f'keep {vt.get_support(indices=True).shape[0]}/{features.shape[1]} after filtered')
    return vt.get_support(indices=True), kmeans.labels_, sp.csr_matrix(kmeans.cluster_centers_)

def get_homophone(features, limits):
    indices = np.triu_indices(features.shape[0])
    homophone = sp.vstack(
        [sp.csr_matrix(linear_kernel(features[:, limits[i]:limits[i + 1]])[indices]) \
            for i in range(limits.shape[0] - 1)],
        format='csr'
    )
    return homophone, indices

def merge_homophone(homophome, p=0.998, clusters=1000):
    vt = VarianceThreshold(p * (1 - p))
    kmeans = KMeans(clusters).fit(vt.fit_transform(homophome).T)
    logging.info(f'keep {vt.get_support(indices=True).shape[0]}/{homophone.shape[1]} after filtered')
    return vt.get_support(indices=True), kmeans.labels_, kmeans.cluster_centers_.T

In [3]:
def clean_dialect_tag(tag):
    return np.where(
        tag.str.contains('[吴闽赣粤湘晋徽]'),
        tag.str.replace('.*([吴闽赣粤湘晋徽]).*', r'\1方言', regex=True),
        np.where(
            tag.str.contains('客'),
            tag.str.replace('.*(客).*', '客家方言', regex=True),
            np.where(
                tag.str.contains('平话'),
                tag.str.replace('.*平话.*', '平话', regex=True),
                np.where(
                    tag.str.contains('北京|东北|冀鲁|胶辽|中原|兰银|江淮|西南'),
                    tag.str.replace('.*(北京|东北|冀鲁|胶辽|中原|兰银|江淮|西南).*', r'\1官话', regex=True),
                    np.where(
                        tag.str.contains('湖南|韶州'),
                        tag.str.replace('.*(湖南|韶州).*', '\1土话', regex=True),
                        np.nan
                    )
                )
            )
        )
    )

def get_dialect(location):
    dialect = pd.Series(clean_dialect_tag(location['area']), index=location.index)
    return dialect.where(dialect.notna(), clean_dialect_tag(location['slice']))

In [4]:
prefix = r'D:\git\zhongguoyuyan\csv'
location = pd.read_csv(os.path.join(prefix, 'dialect', 'location.csv'), index_col=0)
char = pd.read_csv(os.path.join(prefix, 'words.csv'), index_col=0)

In [62]:
data = load_data(os.path.join(prefix, 'dialect'), location.sample(100).index)
data = data[data.index.isin(char.index)]
codes, limits = encode_data(data)

clusters = []
cluster_names = []
homophones = []
rule_names = []
rule_group_names = []
for code, lim in zip(codes, limits):
    _, cl, centers = get_cluster(code)
    cl_name = char.groupby(cl)['item'].agg(''.join).reindex(np.arange(centers.shape[0]))
    features = sp.vstack([sp.csr_matrix(code[cl == i].mean(axis=0)) for i in range(centers.shape[0])])
    homophone, indices = get_homophone(features, lim)
    rule_name = pd.Series(f'{cl_name.loc[indices[0][j]]}={cl_name.loc[indices[1][j]]}' for j in range(indices[0].shape[0]))
    idx, homophone_cluster, homophone_centers = merge_homophone(homophone)
    homophone_cluster = pd.Series(homophone_cluster, index=idx)
    rule_group_name = rule_name[idx].groupby(homophone_cluster).agg(' '.join).reindex(np.arange(homophone_centers.shape[0]))
    clusters.append(cl)
    cluster_names.append(cl_name)
    homophones.append(homophone)
    rule_names.append(rule_name)
    rule_group_names.append(rule_group_name)

INFO:root:loading 100 data files ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\03E89mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\10114mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\14E63mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\15972mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\28396mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\26A39mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\10G71mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\21352mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\01021mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\12D80mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\26A38mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\01E08mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\csv\dialect\27388mb01dz.csv ...
INFO:root:loading D:\git\zhongguoyuyan\c

In [63]:
display(cluster_names)
display(rule_group_names)

[0                 箍古苦裤吴五虎壶户乌
 1                          缚
 2                         绝雪
 3                        厌炎盐
 4      柄平病明命镜影饼名领井清姓轻赢钉厅听零青星
                ...          
 195                        吞
 196                   力壁劈踢笛历
 197                        让
 198                       甲鸭
 199             胎台袋来菜财该改海爱盖害
 Name: item, Length: 200, dtype: object,
 0                    芋意衣腰要优幼厌音烟印一隐秧样约蝇影益育拥
 1                                    猪主转砖准
 2                       锁岁死四丝嫂三伞酸算孙笋索塞送松宋缩
 3                                        嫩
 4      把布贝拜摆杯碑比宝包饱表扮八班板变扁憋搬半拨笔本帮绑剥北冰逼百兵柄饼壁
                       ...                 
 195                                      剩
 196                                  西洗息惜锡
 197                                    去器气
 198                                      择
 199                             吕犁梨李料立连律亮力
 Name: item, Length: 200, dtype: object,
 0                                                      迎
 1      嫁借化布做锯付爱带盖拜晒对碎怪挂岁肺戏四试记醉费贵罩孝笑照要钓叫够瘦暗占剑变

[0                     菊育曲局玉浴=靴月越决缺 菊育曲局玉浴=血 靴月越决缺=肉 肉=血
 1                                    冷省耕=冷省耕 冷省耕=生更梗坑硬争
 2                    阔活=着弱 果过火货祸=着弱 多拖锣左躲螺坐锁错所=着弱 脱夺=着弱
 3     冻痛动洞弄粽送宋=朋猛棚 东懂通桶铜聋葱冬脓松=朋猛棚 宫龙松恭共=朋猛棚 朋猛棚=中虫终充...
 4                 竹畜粥叔熟烛属=赌土图杜奴路租初锄数数 赌土图杜奴路租初锄数数=读鹿族毒赎
                             ...                        
 95                                  集急及吸匹七吉一=日 鼻立习笔密栗=日
 96    百拍白麦=贝杯配赔背煤妹碑 百拍白麦=肺飞费肥 得特刻黑测=贝杯配赔背煤妹碑 得特刻黑测=肺...
 97    冻痛动洞弄粽送宋=横 东懂通桶铜聋葱冬脓松=横 宫龙松恭共=横 横=中虫终充重肿种冲 横=公...
 98                            搭踏杂塔蜡插闸=杉 法达辣擦八扎杀发罚=杉 杉=大
 99    西洗鸡溪契系=吹水 西洗鸡溪契系=灰回会 西洗鸡溪契系=垂追锤 抖偷头豆楼走瘦=朝照烧 桂规...
 Length: 100, dtype: object,
 0                                           竖树顺=鼠 书输水=鼠
 1     下夏校限学行=寺随 程=区圈劝缺曲 左租做紫资子嘴醉早灶走卒作粽足=柿 终肿种烛=柿 州折针...
 2                                         窄=窄 窄=纸指罩找扎争摘
 3     间角=茄渠棋桥球权裙穷 可开口看渴糠壳肯坑客哭=区圈劝缺曲 可开口看渴糠壳肯坑客哭=溪 茄渠...
 4                                               猪主转砖准=柱
                             ...                        
 9

In [129]:
clusters = pd.DataFrame(np.column_stack(clusters), index=data.index, columns=data.columns.levels[1])
tmp = pd.concat([char[['item']].reindex(clusters.index).fillna(''), clusters], axis=1, join='inner')
cluster_names = [tmp.groupby(c)['item'].agg(''.join) for c in clusters.columns]

for name in cluster_names:
    display(name)

finals
0            紫刺资死四子字丝祠寺
1                皮比屁地梨李
2                  吞墩蹲轮
3                    猪鼠
4      品贫民邻进亲新紧银印引筋劲勤近隐
             ...       
195                   骨
196                   兄
197                 木服目
198                   硬
199                 网王旺
Name: item, Length: 199, dtype: object

initial
0             意衣腰要优幼厌音烟印一隐秧约蝇影益拥
1                           举句卷橘
2                            碎岁笋
3      锣螺路来雷类老楼拉蓝蜡兰懒烂辣乱轮浪落冷聋鹿六龙绿
4           大杜袋弟递地道豆淡碟达垫断夺定笛动洞读毒
                 ...            
195                           任褥
196                         桃头潭糖
197                          决均军
198                            弄
199                            辰
Name: item, Length: 199, dtype: object

tone
0                                                 吉橘竹菊足烛
1                                                      费
2      多歌靴沙瓜花租乌猪书输该街低西鸡溪杯灰歪碑知资饥丝衣规追龟飞刀高包抓交焦烧腰箫钩修州休优丢三...
3                                     可谱所府解拐紫感险品产典显反隐党纺孔
4                                                     林连
                             ...                        
195                                                    胖
196                                                    兴
197                                                    市
198    躲锁果火假哑姐写赌古苦虎许主改海摆矮洗纸比死子喜几嘴水鬼宝早嫂好饱找小抖走狗酒手九胆敢减点板...
199                                                    屋
Name: item, Length: 199, dtype: object

In [196]:
features = np.column_stack(homophone_centers)
labels = get_dialect(location)
train_location, test_location, train_features, test_features, train_target, test_target = train_test_split(
    location[labels.notna()],
    features[labels.notna()],
    labels[labels.notna()],
    test_size=0.2
)

In [101]:
selector = SelectKBest(chi2, k=10000)
features = selector.fit_transform(train_features, train_target)

In [135]:
feature_names = pd.Series([f'{clusters.columns[i]}:{cluster_names[i].loc[indices[i][0][j]]}={cluster_names[i].loc[indices[i][1][j]]}' for i in range(len(cluster_names)) for j in range(indices[i][0].shape[0])])
display(feature_names[np.argsort(-selector.scores_)[:100]])

13367                                        finals:项=荣永
21279    initial:把布贝拜摆杯碑比宝包饱表扮八班板变扁憋搬半拨笔本帮绑剥北冰逼百兵柄饼壁=肥饭房
27860                                   initial:下学=欠牵劝庆轻
10916                                       finals:正声城=兄
33203                                   initial:鞋咸=欠牵劝庆轻
                              ...                       
35244                                initial:课苦裤块快宽阔=肥饭房
37258                            initial:乌歪卫位围胃碗挖弯温握翁屋=项
42574                                   tone:笔骨北得色谷福宿曲=伞
26605                                 initial:火货花化灰欢婚慌=鱼
23834        initial:多躲赌带低对刀钓抖丢搭胆点店跌单典端短墩蹲党灯等凳得打钉顶东懂冻冬=传
Length: 100, dtype: object

In [148]:
kmeans = KMeans(1000).fit(features.T)
feature_group_names = feature_names[selector.get_support()].groupby(kmeans.labels_).agg(' '.join)
display(feature_group_names)

0      tone:校任恨凤=客 tone:坏画话换县认让=客 tone:柿=客 tone:淡=客 t...
1      finals:接叶业贴碟协孽捏=夹 finals:瓜瓦花化华挂画话划=外 finals:月越...
2      tone:鸽接贴歇切泼脱缺约剥=着 tone:笔骨北得色谷福宿曲=着 tone:刻侧测畜=着...
3      tone:林连=六 tone:河斜壶回骑朝含咸传船贫辰朋程瓶停横红=六 tone:茄婆爬茶蛇...
4               initial:下学=吸 initial:吸=虾瞎 initial:吸=夏校限行
                             ...                        
995    finals:择=歌个可鹅饿河课 finals:歌个可鹅饿河课=虱特刻侧测色 finals:...
996                               finals:胎台袋来菜财该改开海爱盖害=还
997              initial:绕=艺蚁义 initial:绕=银 initial:绕=验严业
998                       initial:女泥黏念孽年捏娘=弱 initial:弱=牛
999    initial:写西洗小笑箫修心鲜线先选雪新想削息姓惜星锡=虾瞎 initial:下学=像 ...
Length: 1000, dtype: object

In [197]:
def benchmark(features, target, estimators):
    for e in estimators:
        scores = cross_val_score(e, features, target)
        display(f'{e} {scores.mean()}+-{scores.std()}')

In [198]:
benchmark(train_features, train_target, (
    make_pipeline(
        # PolynomialFeatures(interaction_only=True, include_bias=False),
        LogisticRegression(penalty='l1', solver='saga')
    ),
    make_pipeline(
        # PolynomialFeatures(interaction_only=True, include_bias=False),
        SVC(),
    ),
    DecisionTreeClassifier(max_depth=20),
    GradientBoostingClassifier(n_estimators=20)
))



"Pipeline(steps=[('logisticregression',\n                 LogisticRegression(penalty='l1', solver='saga'))]) 0.8817950581395347+-0.013413653851443866"



"Pipeline(steps=[('svc', SVC())]) 0.8538032945736435+-0.014323550441179056"



'DecisionTreeClassifier(max_depth=20) 0.7387839147286821+-0.041584076108233364'



'GradientBoostingClassifier(n_estimators=20) 0.8087330426356589+-0.01007184009058067'

In [246]:
test_location.assign(label=test_target, pred=pred)[pred != test_target]

Unnamed: 0,longitude,latitude,insertDate,uploader,firstLevelid,sheetName,filepath,province,city,country,...,minorityInfo,dialectInfo,operaInfo,source,degree,area,slice,slices,label,pred
05D22,119.641667,41.35,,,05D22,,辽宁/凌源需交文件电子版/模板表/其他/概况.xls,辽宁,朝阳市,凌源市,...,（无）,凌源方言属北京官话朝峰片。凌源处于北京、辽宁、河北三省交界地带，因此兼具多个官话方言区的特点...,皮影戏、评戏、京剧、京韵大鼓、二人转在当地比较流行。其中凌源皮影戏于2006年被列入第一批国...,yubao,common,北京官话,朝峰片,系属不明,北京官话,东北官话
15990,113.208333,25.233333,,,15990,,广东/皈塘需交文件电子版/模板表/其他/概况.xls,广东,韶关,乐昌,...,（无）,皈塘村委下辖各村小组所有人均说皈塘话。另外，从原皈塘村分出单立的现京口村的全部人，金鸡、新岩...,有少数老年人会唱花鼓戏（使用湖南衡阳一带的话）和客家山歌（使用客家话）,yubao,common,平话土话,韶州片,皈塘小片,平话方言,湘方言
08F70,119.316667,29.483333,,,08F70,,浙江/寿昌需交文件电子版/模板表/其他/概况.xls,浙江,杭州市,建德市,...,（无）,1.寿昌话，分布于旧寿昌县。2.建德话，主要分布于原建德县境内，一般一个镇一种口音，例如梅城...,越剧、婺剧，在当地皆盛行。,yubao,common,徽语严州片,系属不明,系属不明,徽方言,吴方言
35H24,109.7,36.666667,,,35H24,,濒危方言/陕西延安老户话需交文件电子版/模板表/其他/概况.xls,濒危方言,延安,宝塔区,...,无呈区域分布的少数民族语言,本区主要有两种方言：老户话和上头话，同属于晋语。老户话，是延安的原住民方言，上头话是来自榆林...,延安宝塔区当下流行的地方曲艺有说书、民歌、晋剧、眉户戏等，但都不是本地原有的。用老户话说唱的...,yubao,endangered,晋语,志延片,系属不明,晋方言,中原官话
11D10,115.391667,39.616667,,,11D10,,河北/涞水需交文件电子版/模板表/其他/概况.xls,河北,保定,涞水,...,本县无少数民族语言，少数民族均说汉语。,涞水方言大致分为城关话和山区话两种，山区话主要指三坡镇、九龙镇方言，城关话指平原地带各乡镇方...,（无）,yubao,common,冀鲁官话,保唐片,定霸小片,冀鲁官话,北京官话
26D48,111.253887,25.199988,,,26D48,,湖南/江永需交文件电子版/模板表/其他/概况.xls,湖南,永州,江永,...,有过山瑶，又称高山瑶话，属于瑶族勉方言。主要分布在松柏、千家峒、兰溪、源口四个瑶族乡的少数村...,江永县的汉语方言有很多种，主要分为三大类：一类是以上江圩、城关潇浦镇为代表的土话，一类是以松...,（无）,yubao,common,湖南土话,永州,江永,土话,湘方言
23J18,108.658567,19.101105,,,23J18,,海南/东方军话需交文件电子版/模板表/其他/概况.xls,海南,（无）,东方市,...,少数民族语言主要是黎语，人口约8.5万人；其次是哥隆话（村话），哥隆人民族成分归为汉族，但是...,汉语方言主要有军话（属西南官话）、海南话（属闽语）、儋州话（属粤语），其中军话主要分布在八所...,军话民歌用军话演唱，形式一般为男女对唱，也有独唱、合唱、小组唱等；内容一般为表达爱情，也有思...,yubao,common,官话,西南官话,海南军话,西南官话,江淮官话
02A10,117.1,26.391667,,,02A10,,福建/明溪需交文件电子版/模板表/其他/概况.xls,福建,三明市,明溪县,...,（无）,明溪县有四镇五乡。明溪话通常指建县以来的政治文化经济中心雪峰镇话。此外，城关乡、瀚仙镇、沙溪...,明溪县夏阳乡有大腔戏，系从永安引进，一般只在夏阳演出。明溪县城较少地方戏演出。,yubao,common,客家话区,闽西片,闽客赣过渡片,客家方言,闽方言
14A65,102.866667,36.333333,,,14A65,,甘肃/红古需交文件电子版/模板表/其他/概况.xls,甘肃,兰州市,红古区,...,（无）,红古区方言内部口音有窑街口音和红古口音。窑街口音主要分布在区内窑街镇，其语音系统与兰州话基本...,红古地区流行的主要剧种有秦腔、眉户。一般在年节、庙会演出或伴随社会庆典活动时演出，多为折子戏...,yubao,common,兰银官话,金城片,红古小片,兰银官话,中原官话
35H13,109.941667,18.575,,,35H13,,濒危方言/海南陵水疍家话需交文件电子版/模板表/其他/概况.xls,濒危方言,（无）,陵水黎族自治县,...,（无）,新村镇通行当地闵语（海南话）和疍家话两种方言，全镇1万3千人基本会说海南话，其中将近1万人同...,疍家调，原来叫咸水歌，新加坡、澳门、香港等地粤语人群都会唱。本地疍家人50岁以上才会唱，年轻...,yubao,endangered,中原官话,信蚌片,系属不明,中原官话,粤方言
