In [1]:
import os
import logging
import pandas as pd
import dtale
import numpy as np
from sklearn.feature_selection import chi2, VarianceThreshold, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import eli5
from util import get_dialect

# Let INFO-level records through the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

# Seed NumPy's global RNG so code that draws from it is repeatable across runs.
np.random.seed(77213)

In [2]:
# Directory containing location.csv. The DIALECT_DATA_DIR environment variable,
# when set, overrides the default local checkout path so the notebook can be
# run on another machine without editing this cell.
prefix = os.environ.get('DIALECT_DATA_DIR', r'D:\git\zhongguoyuyan\csv\dialect')

# Location table, indexed by the first CSV column.
location = pd.read_csv(os.path.join(prefix, 'location.csv'), index_col=0)

# Feature table, read from the current working directory; the 'id' column is
# read as str and used as the index.
homophones = pd.read_csv('homophone2.csv', index_col='id', dtype={'id': str})

In [3]:
# Hold out 20% of the rows of `homophones` as the test split.
train_data, test_data = train_test_split(homophones, test_size=0.2)

# Look up the dialect label of every row in each split via its index in `location`.
train_dialect, test_dialect = (
    get_dialect(location.loc[part.index]) for part in (train_data, test_data)
)

In [4]:
# Rank the training features by variance (largest first) and keep the top 100.
variances = VarianceThreshold(0).fit(train_data).variances_
idx = np.argsort(-variances)[:100]

# Take the selected columns by position only. Passing columns=train_data.columns
# to the DataFrame constructor would reindex the slice back to *every* column,
# filling the unselected ones with NaN and discarding the variance ordering.
df = train_data.iloc[:, idx].copy()

# Put the dialect label in the first column for easier browsing.
df.insert(0, 'dialect', train_dialect)
dtale.show(df, name='train_variance', ignore_duplicate=True)



In [6]:
# Score features with the chi-squared statistic against the dialect label,
# using only the training rows whose label is not the empty string.
labeled = train_dialect != ''
labeled_data = train_data[labeled]
labeled_dialect = train_dialect[labeled]

selector = SelectKBest(chi2, k=1000).fit(labeled_data, labeled_dialect)

# Positions of the 100 highest-scoring features, best first.
idx = np.argsort(-selector.scores_)[:100]
df = pd.DataFrame(
    labeled_data.iloc[:, idx],
    index=labeled_data.index,
    columns=train_data.columns[idx]
)

# Put the dialect label in the first column for easier browsing.
df.insert(0, 'dialect', labeled_dialect)
dtale.show(df, name='train_chi2')



In [7]:
# Fit an L1-penalised logistic regression (saga solver, no intercept) on the
# training rows with a non-empty dialect label, then report on both splits.
train_labeled = train_dialect != ''
test_labeled = test_dialect != ''

lr = LogisticRegression(penalty='l1', solver='saga', fit_intercept=False)
lr.fit(train_data[train_labeled], train_dialect[train_labeled])

# Training report first, then the held-out report.
for features, labels in (
    (train_data[train_labeled], train_dialect[train_labeled]),
    (test_data[test_labeled], test_dialect[test_labeled]),
):
    print(classification_report(labels, lr.predict(features)))

              precision    recall  f1-score   support

        东北官话       0.84      1.00      0.92        27
        中原官话       0.95      0.97      0.96        62
        兰银官话       1.00      0.91      0.95        11
        冀鲁官话       0.96      1.00      0.98        23
        北京官话       1.00      1.00      1.00        11
         吴方言       1.00      1.00      1.00        47
        客家方言       0.94      0.97      0.96        34
          平话       1.00      0.78      0.88         9
         徽方言       1.00      1.00      1.00        11
         晋方言       0.97      0.97      0.97        35
        江淮官话       1.00      0.97      0.98        33
        湖南土话       0.00      0.00      0.00         1
         湘方言       0.93      0.96      0.95        28
         粤方言       0.95      0.97      0.96        38
        胶辽官话       1.00      0.64      0.78        14
        西南官话       0.99      0.99      0.99       147
         赣方言       0.98      1.00      0.99        57
         闽方言       0.98    

In [None]:
# Display the 10 largest weights of the logistic regression, labelled with the
# feature column names.
lr_feature_names = train_data.columns.values
eli5.explain_weights(lr, top=10, feature_names=lr_feature_names)

In [5]:
# Binary task: does the dialect label end with '官话'?
# Rows whose label is the empty string are dropped from both splits.
train_labeled = train_dialect != ''
test_labeled = test_dialect != ''

train_feature = train_data[train_labeled]
test_feature = test_data[test_labeled]
train_target = np.char.endswith(train_dialect[train_labeled], '官话')
test_target = np.char.endswith(test_dialect[test_labeled], '官话')

# A linear kernel keeps the model expressible as per-feature weights.
svm = SVC(kernel='linear')
svm.fit(train_feature, train_target)

# Training report first, then the held-out report.
for feature, target in ((train_feature, train_target), (test_feature, test_target)):
    print(classification_report(target, svm.predict(feature)))

# Display the 10 largest weights, labelled with the feature column names.
eli5.explain_weights(svm, top=10, feature_names=train_data.columns.values)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       314
        True       1.00      1.00      1.00       328

    accuracy                           1.00       642
   macro avg       1.00      1.00      1.00       642
weighted avg       1.00      1.00      1.00       642

              precision    recall  f1-score   support

       False       0.93      0.96      0.94        71
        True       0.97      0.95      0.96        91

    accuracy                           0.95       162
   macro avg       0.95      0.95      0.95       162
weighted avg       0.95      0.95      0.95       162



Weight?,Feature
+0.207,initial_制知照州占折针汁战镇震张章证织贞整正粥=柱住
+0.173,tone_奴如儿严颜延言莲顽原民邻纯娘能迎形荣容=渠_锣鹅牙爷鱼来埋泥犁雷移眉梨姨围毛熬摇牛油南蓝岩盐难棉年人门匀云忙名赢零脓=渠_茄爬茶赔皮垂锤桃桥条头蚕钳沉钱前裙糖床藤蓬虫=渠_渠=螺蛇华吴壶余煤回怀时随肥楼流嫌林兰连缠完还船圆园神银魂轮坟黄房防狂王绳行明横营红熊雄浓龙_渠=河婆斜图徐除锄台财排牌柴鞋池骑迟祠棋朝浮绸愁球潭含咸甜寻琴弹田盘全传权贫陈辰勤盆唇长尝朋层棚平程城瓶停铜穷
+0.145,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=逼_歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂=逼_多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶=逼
+0.127,initial_把布贝拜摆杯碑比宝包饱表扮八班板变扁憋搬半拨笔本帮绑剥北冰逼百兵柄饼壁=被抱棒
+0.124,tone_七虱一出黑织哭屋=削_笔骨北得色谷福=削_铁雪血脚百尺=削_搭鸭擦割八扎杀瞎刷刮郭剥桌=削_接贴歇切拨泼脱缺约=削_吸失息积惜击缩=削
+0.112,tone_奴如儿严颜延言莲顽原民邻纯娘能迎形荣容=橘菊_锣鹅牙爷鱼来埋泥犁雷移眉梨姨围毛熬摇牛油南蓝岩盐难棉年人门匀云忙名赢零脓=橘菊_茄爬茶赔皮垂锤桃桥条头蚕钳沉钱前裙糖床藤蓬虫=橘菊_螺蛇华吴壶余煤回怀时随肥楼流嫌林兰连缠完还船圆园神银魂轮坟黄房防狂王绳行明横营红熊雄浓龙=橘菊_河婆斜图徐除锄台财排牌柴鞋池骑迟祠棋朝浮绸愁球潭含咸甜寻琴弹田盘全传权贫陈辰勤盆唇长尝朋层棚平程城瓶停铜穷=橘菊
+0.111,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=七虱一出黑织哭屋_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=搭鸭擦割八扎杀瞎刷刮郭剥桌_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=接贴歇切拨泼脱缺约_七虱一出黑织哭屋=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_七虱一出黑织哭屋=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶_搭鸭擦割八扎杀瞎刷刮郭剥桌=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_搭鸭擦割八扎杀瞎刷刮郭剥桌=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶_歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂=接贴歇切拨泼脱缺约_接贴歇切拨泼脱缺约=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶
… 1490 more positive …,… 1490 more positive …
… 1500 more negative …,… 1500 more negative …
-0.121,initial_破谱铺派配屁炮票品骗片判泼匹胖拍劈=被抱棒


In [26]:
# Binary task restricted to rows labelled '赣方言' or '客家方言':
# the target is True for '客家方言' and False for '赣方言'.
pair = ['赣方言', '客家方言']
train_in_pair = np.isin(train_dialect, pair)
test_in_pair = np.isin(test_dialect, pair)

train_feature = train_data[train_in_pair]
test_feature = test_data[test_in_pair]
train_target = train_dialect[train_in_pair] == '客家方言'
test_target = test_dialect[test_in_pair] == '客家方言'

# A linear kernel keeps the model expressible as per-feature weights.
svm = SVC(kernel='linear')
svm.fit(train_feature, train_target)

# Training report first, then the held-out report.
for feature, target in ((train_feature, train_target), (test_feature, test_target)):
    print(classification_report(target, svm.predict(feature)))

# Display the 10 largest weights, labelled with the feature column names.
eli5.explain_weights(svm, top=10, feature_names=train_data.columns.values)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00        57
        True       1.00      1.00      1.00        34

    accuracy                           1.00        91
   macro avg       1.00      1.00      1.00        91
weighted avg       1.00      1.00      1.00        91

              precision    recall  f1-score   support

       False       1.00      0.92      0.96        12
        True       0.83      1.00      0.91         5

    accuracy                           0.94        17
   macro avg       0.92      0.96      0.93        17
weighted avg       0.95      0.94      0.94        17



Weight?,Feature
+0.068,finals_皮义戏移比地梨饥器姨李记棋喜意希衣=死四_死四=寄骑屁几气季
+0.059,initial_爷野夜余雨芋裕移姨摇有右油炎盐叶延引痒样药蝇赢营育用浴=哑鸭_哑鸭=意衣腰要优幼厌音烟印一隐秧约影益拥_哑鸭=圆院冤园远越匀云运永
+0.051,finals_把爬马骂茶沙拉打=瓜挂话_假嫁牙虾下夏哑=瓜挂话_瓜挂话=车蛇射
+0.049,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=坐柱被舅淡断近上重_歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂=坐柱被舅淡断近上重_坐柱被舅淡断近上重=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶
… 1602 more positive …,… 1602 more positive …
… 1387 more negative …,… 1387 more negative …
-0.050,finals_岁嘴随醉=女徐举锯去渠鱼许余取句区遇雨芋剧
-0.055,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=猫_猫=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_猫=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶
-0.056,finals_桂规亏跪龟柜鬼贵=卫危位围胃
-0.064,finals_紫刺资子字丝祠寺=死四


In [28]:
# Binary task: is the dialect label exactly '晋方言'?
# Rows whose label is the empty string are dropped from both splits.
train_labeled = train_dialect != ''
test_labeled = test_dialect != ''

train_feature = train_data[train_labeled]
test_feature = test_data[test_labeled]
train_target = train_dialect[train_labeled] == '晋方言'
test_target = test_dialect[test_labeled] == '晋方言'

# A linear kernel keeps the model expressible as per-feature weights.
svm = SVC(kernel='linear')
svm.fit(train_feature, train_target)

# Training report first, then the held-out report.
for feature, target in ((train_feature, train_target), (test_feature, test_target)):
    print(classification_report(target, svm.predict(feature)))

# Display the 10 largest weights, labelled with the feature column names.
eli5.explain_weights(svm, top=10, feature_names=train_data.columns.values)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       607
        True       1.00      1.00      1.00        35

    accuracy                           1.00       642
   macro avg       1.00      1.00      1.00       642
weighted avg       1.00      1.00      1.00       642

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       153
        True       1.00      1.00      1.00         9

    accuracy                           1.00       162
   macro avg       1.00      1.00      1.00       162
weighted avg       1.00      1.00      1.00       162



Weight?,Feature
+0.069,tone_奴如儿严颜延言莲顽原民邻纯娘能迎形荣容=削_锣鹅牙爷鱼来埋泥犁雷移眉梨姨围毛熬摇牛油南蓝岩盐难棉年人门匀云忙名赢零脓=削_茄爬茶赔皮垂锤桃桥条头蚕钳沉钱前裙糖床藤蓬虫=削_螺蛇华吴壶余煤回怀时随肥楼流嫌林兰连缠完还船圆园神银魂轮坟黄房防狂王绳行明横营红熊雄浓龙=削_河婆斜图徐除锄台财排牌柴鞋池骑迟祠棋朝浮绸愁球潭含咸甜寻琴弹田盘全传权贫陈辰勤盆唇长尝朋层棚平程城瓶停铜穷=削
… 1259 more positive …,… 1259 more positive …
… 1731 more negative …,… 1731 more negative …
-0.063,finals_各鹤恶壳=破婆磨磨摸
-0.066,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=憋_憋=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_憋=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶
-0.067,finals_贝杯配赔背煤妹碑=被眉
-0.068,initial_吕犁梨李料流林立连列莲邻栗律两亮力领零历=奴脑闹南难暖嫩能脓_奴脑闹南难暖嫩能脓=锣螺路来雷类老楼拉蓝蜡兰懒烂辣乱轮浪落冷聋弄鹿六龙绿
-0.073,tone_厌店汉战建进镇俊=做_个过货嫁借化布锯数句爱贝带拜戒晒制世对碎怪挂岁肺桂寄戏四试记意醉季费贵灶罩孝笑照要钓叫够富副瘦暗占剑变线扇见半算惯印劲粪壮向放证镜姓正冻粽送宋=做_破课错裤去菜剃配快刺屁器气靠炮票凑臭欠炭看骗片判劝寸困唱胖秤庆痛=做
-0.074,tone_七虱一出黑织哭屋=削_笔骨北得色谷福=削_铁雪血脚百尺=削_搭鸭擦割八扎杀瞎刷刮郭剥桌=削_接贴歇切拨泼脱缺约=削_吸失息积惜击缩=削
-0.083,finals_破婆磨磨摸=歌个可鹅饿河课
