In [252]:
import os
import logging
import pandas as pd
import numpy as np
import scipy.sparse as sp
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
from sklearn.feature_selection import chi2, mutual_info_classif, SelectKBest
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from util import clean_data
from similarity import load_data

np.random.seed(1234567)

In [95]:
def get_cluster(features, cluster):
    onehot = OneHotEncoder(handle_unknown='ignore').fit(
        SimpleImputer(missing_values='', strategy='most_frequent').fit_transform(features)
    ).transform(features)
    return KMeans(cluster).fit_predict(onehot)

def get_char_cluster(data):
    columns = {'initial': 100, 'finals': 200, 'tone': 50}

    result = pd.DataFrame(index=data.index)
    for col, cluster in columns.items():
        result[col] = get_cluster(data.loc[:, (slice(None), col)], cluster)

    return result

In [41]:
def get_homophone(data, chars):
    data = data[data.index.isin(chars)]
    mat = pd.DataFrame((data.values[:, None] == data.values[None, :]))
    mat = mat.groupby(data.index).any().astype(float).reindex(chars)
    mat = mat.transpose().groupby(data.index).mean().reindex(chars)
    return np.minimum(mat, mat.T)

def merge_homophone(homophone, clusters):
    return homophone.groupby(clusters).mean().transpose().groupby(clusters).mean()

def get_homophone_groups(cluster, prefix, ids, suffix='mb01dz.csv'):
    columns = ('initial', 'finals', 'tone')
    indices = {}
    for col in columns:
        indices[col] = np.triu_indices(cluster[col].nunique(), 1)

    homophones = {}
    for id in ids:
        fname = os.path.join(prefix, f'{id}{suffix}')
        data = clean_data(pd.read_csv(
            fname,
            index_col='iid',
            dtype={'iid': int, 'initial': str, 'finals': str, 'tone': str}
        ))

        for col in columns:
            homophone = merge_homophone(get_homophone(data[col], cluster.index), cluster[col])
            homophones.setdefault(col, []).append(homophone.values[indices[col]])

    data = []
    for col in columns:
        data.append(pd.DataFrame(
            np.stack(homophones[col]),
            index=ids,
            columns=pd.MultiIndex.from_arrays(indices[col])
        ))

    return pd.concat(data, axis=1, keys=columns)

In [141]:
def clean_dialect_tag(tag):
    return np.where(
        tag.str.contains('[吴闽赣粤湘晋徽]'),
        tag.str.replace('.*([吴闽赣粤湘晋徽]).*', r'\1方言', regex=True),
        np.where(
            tag.str.contains('客'),
            tag.str.replace('.*(客).*', '客家方言', regex=True),
            np.where(
                tag.str.contains('平话'),
                tag.str.replace('.*平话.*', '平话', regex=True),
                np.where(
                    tag.str.contains('北京|东北|冀鲁|胶辽|中原|兰银|江淮|西南'),
                    tag.str.replace('.*(北京|东北|冀鲁|胶辽|中原|兰银|江淮|西南).*', r'\1官话', regex=True),
                    np.where(
                        tag.str.contains('湖南|韶州'),
                        tag.str.replace('.*(湖南|韶州).*', '\1土话', regex=True),
                        np.nan
                    )
                )
            )
        )
    )

def get_dialect(location):
    dialect = pd.Series(clean_dialect_tag(location['area']), index=location.index)
    return dialect.where(dialect.notna(), clean_dialect_tag(location['slice']))

In [193]:
prefix = r'D:\git\zhongguoyuyan\csv'
location = pd.read_csv(os.path.join(prefix, 'dialect', 'location.csv'), index_col=0)
char = pd.read_csv(os.path.join(prefix, 'words.csv'), index_col=0)

In [194]:
ids, data = load_data(os.path.join(prefix, 'dialect'), location.index)
data.index = data.index.astype(int)

929  0895     iŋ      th  NaN         挺
946  0918     uŋ       t  NaN         懂
169  0161      i          NaN   42    泥
822  0717      ɔ       l    文         落
1008  0779                   　         撞
132  0104      i     tsh  不可～         取
233  0229      u      ts  鸽～。词缀         子
405  0369                 NaN         有
564  0487    iaŋ       h  有～公司         限
529  0470                 NaN         看
648  0582                 NaN         惯
652  0584                 NaN         还
621  0568                 NaN         换
203  0192              h  文，～剧   35    话
292   0223     ts              文。食～         指
390   0295      h             文。～学习         好
400   0300      m              文。～腰         猫
422   0313     ts               文～点         焦
446   0326      t              文。～勾         钓
695   0508      t              文。盘～         缠
722   0527     ph                ～舟         扁
756   0548     ts              文。～约         节
768   0554     ph                 文         判
771   0555     ph    

In [195]:
cluster = get_char_cluster(data)
cluster = cluster[cluster.index.isin(char.index)]
for col in ('initial', 'finals', 'tone'):
    display(char['item'].groupby(cluster[col]).agg(''.join))

initial
0                                      闰
1                                      褥
2                    制知照州占折针汁战镇震张章证织贞整正粥
3                                  西洗息惜锡
4     爷野夜吴五余雨芋裕移姨摇右炎盐叶延圆院园远越匀云运痒药蝇赢荣育容用浴
                     ...                
95                                  瓦外危顽
96                                     溪
97                                   验严业
98                                 下夏限学行
99                                   虎壶户
Name: item, Length: 100, dtype: object

finals
0         宝抱毛帽刀讨桃道脑老早灶草糙造嫂高熬好号
1                          钉顶零
2                         鸡溪契系
3      东懂冻通桶痛铜动洞聋弄粽葱送公孔烘红冬统脓松宋
4                          拆窄摘
                ...           
195                          歪
196                         夹瞎
197                          绳
198                          袜
199                          虱
Name: item, Length: 200, dtype: object

tone
0                                                监挖闩钢浆听
1                                                  裤糙票秤
2     锣鹅磨螺牙爷奴吴鱼来埋泥犁煤雷儿移眉梨姨围毛熬摇楼流牛油南蓝岩炎盐严林难兰颜棉连延言年莲完顽...
3                                                  数喊伞柄
4                       搭鸽夹鸭接跌贴擦八扎杀瞎歇憋捏切拨泼脱刷缺托削约郭剥桌拍拆摘锡
5                               雾遇裕币艺递系背卫义治类胃校袖验任健传恨旺弄凤
6                                          闸协集习及达杰截极择额族
7                                         业灭列孽末越律物鹤弱特或育
8                                           折急决吉卒橘格竹菊足烛
9                                     坐下柱弟被市抱厚妇舅淡断近上棒动重
10    个破过课货嫁借化布做错锯去付数句菜爱贝带盖拜戒派晒制世剃契配对碎怪挂快岁肺桂刺寄戏屁四器试记...
11                                          瓦五吕雨耳老藕有远痒网
12                                        撤设阔作各霍握测式策益壁畜
13                                               插割刮恶壳隔
14                                            鼻舌薄着学食白席石
15                                       立入密栗日墨力历木鹿目六绿浴
16                                                    肉
17                                         

In [196]:
homophone = get_homophone_groups(cluster, os.path.join(prefix, 'dialect'), location.index)
homophone.fillna(0, inplace=True)

s -> ss 1
ə -> æə 1
53 -> 5353 1


In [197]:
labels = get_dialect(location)

In [260]:
# selector = SelectKBest(mutual_info_classif, k=1000)
selector = SelectKBest(chi2, k=1000)
selector.fit(homophone[labels.notna()], labels[labels.notna()])

SelectKBest(k=1000, score_func=<function chi2 at 0x0000020BA3B25D30>)

In [261]:
for col, c1, c2 in homophone.columns[np.argsort(-selector.scores_)[:100]]:
    print(f'{col} 1 = {c1} {"".join(char.loc[cluster[col] == c1, "item"])}, {col} 2 = {c2} {"".join(char.loc[cluster[col] == c2, "item"])}')

finals 1 = 6 本盆门, finals 2 = 117 官宽欢换碗
initial 1 = 90 箍, initial 2 = 96 溪
initial 1 = 22 开口看渴糠壳肯坑客哭, initial 2 = 96 溪
initial 1 = 8 鸡寄饥记几季叫九剑金急建肩见结紧吉筋劲姜脚镜经击, initial 2 = 79 跪柜共
finals 1 = 70 初锄数数, finals 2 = 182 杉
initial 1 = 5 多躲赌带低对刀钓抖丢搭胆点店跌单典端短墩蹲党灯等凳得打钉顶东懂冻冬, initial 2 = 59 治侄着直
finals 1 = 16 纸师指柿事使试时市, finals 2 = 199 虱
initial 1 = 22 开口看渴糠壳肯坑客哭, initial 2 = 98 下夏限学行
initial 1 = 22 开口看渴糠壳肯坑客哭, initial 2 = 58 虾孝瞎
initial 1 = 58 虾孝瞎, initial 2 = 87 去契器气欠牵庆轻
initial 1 = 81 鞋蟹咸项, initial 2 = 87 去契器气欠牵庆轻
initial 1 = 87 去契器气欠牵庆轻, initial 2 = 98 下夏限学行
finals 1 = 192 兄, finals 2 = 197 绳
tone 1 = 15 立入密栗日墨力历木鹿目六绿浴, tone 2 = 38 块
initial 1 = 79 跪柜共, initial 2 = 83 举句卷决均橘军菊
initial 1 = 20 河海害好号后厚含盒喊汉汗还恨鹤黑红, initial 2 = 87 去契器气欠牵庆轻
tone 1 = 7 业灭列孽末越律物鹤弱特或育, tone 2 = 38 块
finals 1 = 40 蓬风丰凤梦封蜂缝, finals 2 = 76 冷生省梗坑争耕
initial 1 = 44 爱矮暗安恩恶, initial 2 = 98 下夏限学行
initial 1 = 58 虾孝瞎, initial 2 = 92 吃
initial 1 = 6 规滚郭, initial 2 = 39 轿舅旧件近
tone 1 = 10 个破过课货嫁借化布做错锯去付数句菜爱贝带盖拜戒派晒制世剃契配对碎怪挂快岁肺桂刺寄戏屁四器试记意气

In [264]:
train_location, test_location, train_features, test_features, train_target, test_target = train_test_split(
    location[labels.notna()],
    sp.csr_matrix(selector.transform(homophone[labels.notna()])),
    labels[labels.notna()],
    test_size=0.2
)

In [248]:
def benchmark(features, target, estimators):
    for e in estimators:
        scores = cross_val_score(e, features, target)
        display(f'{e} {scores.mean()}+-{scores.std()}')

In [265]:
benchmark(train_features, train_target, (
    make_pipeline(
        PolynomialFeatures(interaction_only=True, include_bias=False),
        LogisticRegression(penalty='l1', solver='saga')
    ),
    make_pipeline(
        PolynomialFeatures(interaction_only=True, include_bias=False),
        SVC(),
    ),
    DecisionTreeClassifier(max_depth=20),
    GradientBoostingClassifier(n_estimators=20)
))



"Pipeline(steps=[('polynomialfeatures',\n                 PolynomialFeatures(include_bias=False, interaction_only=True)),\n                ('logisticregression',\n                 LogisticRegression(penalty='l1', solver='saga'))]) 0.8834423449612403+-0.030410517621067393"



"Pipeline(steps=[('polynomialfeatures',\n                 PolynomialFeatures(include_bias=False, interaction_only=True)),\n                ('svc', SVC())]) 0.867890019379845+-0.030007329142986632"



'DecisionTreeClassifier(max_depth=20) 0.7745760658914729+-0.025316730583401687'



'GradientBoostingClassifier(n_estimators=20) 0.8211845930232557+-0.014334434183380627'

In [246]:
test_location.assign(label=test_target, pred=pred)[pred != test_target]

Unnamed: 0,longitude,latitude,insertDate,uploader,firstLevelid,sheetName,filepath,province,city,country,...,minorityInfo,dialectInfo,operaInfo,source,degree,area,slice,slices,label,pred
05D22,119.641667,41.35,,,05D22,,辽宁/凌源需交文件电子版/模板表/其他/概况.xls,辽宁,朝阳市,凌源市,...,（无）,凌源方言属北京官话朝峰片。凌源处于北京、辽宁、河北三省交界地带，因此兼具多个官话方言区的特点...,皮影戏、评戏、京剧、京韵大鼓、二人转在当地比较流行。其中凌源皮影戏于2006年被列入第一批国...,yubao,common,北京官话,朝峰片,系属不明,北京官话,东北官话
15990,113.208333,25.233333,,,15990,,广东/皈塘需交文件电子版/模板表/其他/概况.xls,广东,韶关,乐昌,...,（无）,皈塘村委下辖各村小组所有人均说皈塘话。另外，从原皈塘村分出单立的现京口村的全部人，金鸡、新岩...,有少数老年人会唱花鼓戏（使用湖南衡阳一带的话）和客家山歌（使用客家话）,yubao,common,平话土话,韶州片,皈塘小片,平话方言,湘方言
08F70,119.316667,29.483333,,,08F70,,浙江/寿昌需交文件电子版/模板表/其他/概况.xls,浙江,杭州市,建德市,...,（无）,1.寿昌话，分布于旧寿昌县。2.建德话，主要分布于原建德县境内，一般一个镇一种口音，例如梅城...,越剧、婺剧，在当地皆盛行。,yubao,common,徽语严州片,系属不明,系属不明,徽方言,吴方言
35H24,109.7,36.666667,,,35H24,,濒危方言/陕西延安老户话需交文件电子版/模板表/其他/概况.xls,濒危方言,延安,宝塔区,...,无呈区域分布的少数民族语言,本区主要有两种方言：老户话和上头话，同属于晋语。老户话，是延安的原住民方言，上头话是来自榆林...,延安宝塔区当下流行的地方曲艺有说书、民歌、晋剧、眉户戏等，但都不是本地原有的。用老户话说唱的...,yubao,endangered,晋语,志延片,系属不明,晋方言,中原官话
11D10,115.391667,39.616667,,,11D10,,河北/涞水需交文件电子版/模板表/其他/概况.xls,河北,保定,涞水,...,本县无少数民族语言，少数民族均说汉语。,涞水方言大致分为城关话和山区话两种，山区话主要指三坡镇、九龙镇方言，城关话指平原地带各乡镇方...,（无）,yubao,common,冀鲁官话,保唐片,定霸小片,冀鲁官话,北京官话
26D48,111.253887,25.199988,,,26D48,,湖南/江永需交文件电子版/模板表/其他/概况.xls,湖南,永州,江永,...,有过山瑶，又称高山瑶话，属于瑶族勉方言。主要分布在松柏、千家峒、兰溪、源口四个瑶族乡的少数村...,江永县的汉语方言有很多种，主要分为三大类：一类是以上江圩、城关潇浦镇为代表的土话，一类是以松...,（无）,yubao,common,湖南土话,永州,江永,土话,湘方言
23J18,108.658567,19.101105,,,23J18,,海南/东方军话需交文件电子版/模板表/其他/概况.xls,海南,（无）,东方市,...,少数民族语言主要是黎语，人口约8.5万人；其次是哥隆话（村话），哥隆人民族成分归为汉族，但是...,汉语方言主要有军话（属西南官话）、海南话（属闽语）、儋州话（属粤语），其中军话主要分布在八所...,军话民歌用军话演唱，形式一般为男女对唱，也有独唱、合唱、小组唱等；内容一般为表达爱情，也有思...,yubao,common,官话,西南官话,海南军话,西南官话,江淮官话
02A10,117.1,26.391667,,,02A10,,福建/明溪需交文件电子版/模板表/其他/概况.xls,福建,三明市,明溪县,...,（无）,明溪县有四镇五乡。明溪话通常指建县以来的政治文化经济中心雪峰镇话。此外，城关乡、瀚仙镇、沙溪...,明溪县夏阳乡有大腔戏，系从永安引进，一般只在夏阳演出。明溪县城较少地方戏演出。,yubao,common,客家话区,闽西片,闽客赣过渡片,客家方言,闽方言
14A65,102.866667,36.333333,,,14A65,,甘肃/红古需交文件电子版/模板表/其他/概况.xls,甘肃,兰州市,红古区,...,（无）,红古区方言内部口音有窑街口音和红古口音。窑街口音主要分布在区内窑街镇，其语音系统与兰州话基本...,红古地区流行的主要剧种有秦腔、眉户。一般在年节、庙会演出或伴随社会庆典活动时演出，多为折子戏...,yubao,common,兰银官话,金城片,红古小片,兰银官话,中原官话
35H13,109.941667,18.575,,,35H13,,濒危方言/海南陵水疍家话需交文件电子版/模板表/其他/概况.xls,濒危方言,（无）,陵水黎族自治县,...,（无）,新村镇通行当地闵语（海南话）和疍家话两种方言，全镇1万3千人基本会说海南话，其中将近1万人同...,疍家调，原来叫咸水歌，新加坡、澳门、香港等地粤语人群都会唱。本地疍家人50岁以上才会唱，年轻...,yubao,endangered,中原官话,信蚌片,系属不明,中原官话,粤方言
