In [42]:
import os
import logging
import pandas as pd
import dtale
import numpy as np
from sklearn.feature_selection import chi2, VarianceThreshold, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import eli5
from util import get_dialect

# Let INFO-level (and above) records through the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
# Seed NumPy's global random state.
np.random.seed(seed=77213)

In [2]:
# Directory holding the dialect CSV files (absolute, machine-specific path).
prefix = r'D:\git\zhongguoyuyan\csv\dialect'

# Location table: the first CSV column becomes the index.
location_csv = os.path.join(prefix, 'location.csv')
location = pd.read_csv(location_csv, index_col=0)

# Homophone feature table, read from the current working directory;
# the 'id' column is parsed as str and used as the index.
homophones = pd.read_csv(
    'homophone.csv',
    index_col='id',
    dtype={'id': str},
)

In [3]:
# Hold out 20% of the rows for testing. No random_state is passed here.
train_data, test_data = train_test_split(homophones, test_size=0.2)

# Look up each split's rows in `location` and turn them into dialect labels
# with the project helper get_dialect (train first, then test).
train_dialect, test_dialect = (
    get_dialect(location.loc[split.index]) for split in (train_data, test_data)
)

In [4]:
# Rank features by their variance on the training set and keep the positions
# of the 100 most variable ones.
variances = VarianceThreshold(0).fit(train_data).variances_
idx = np.argsort(-variances)[:100]

# Build the table from the selected columns only. Passing the full
# `train_data.columns` here would reindex the 100-column slice back to every
# column and fill the unselected ones with NaN, so the labels must be
# `train_data.columns[idx]`.
df = pd.DataFrame(
    train_data.iloc[:, idx],
    index=train_data.index,
    columns=train_data.columns[idx]
)
# Put the dialect label first so it sits next to the features in the viewer.
df.insert(0, 'dialect', train_dialect)
dtale.show(df, name='train_variance', ignore_duplicate=True)



In [6]:
# Work only with training rows whose dialect label is non-empty.
has_label = train_dialect != ''
labelled_data = train_data[has_label]
labelled_dialect = train_dialect[has_label]

# Chi-squared score of every feature against the dialect label.
selector = SelectKBest(chi2, k=1000).fit(labelled_data, labelled_dialect)
# Column positions of the 100 highest-scoring features.
idx = np.argsort(-selector.scores_)[:100]

df = pd.DataFrame(
    labelled_data.iloc[:, idx],
    index=labelled_data.index,
    columns=train_data.columns[idx],
)
df.insert(0, 'dialect', labelled_dialect)
dtale.show(df, name='train_chi2')



In [7]:
# Rows with a non-empty dialect label, per split.
train_has_label = train_dialect != ''
test_has_label = test_dialect != ''

# L1-penalised multiclass logistic regression without an intercept term.
lr = LogisticRegression(penalty='l1', solver='saga', fit_intercept=False)
lr.fit(train_data[train_has_label], train_dialect[train_has_label])

# Per-dialect precision/recall: training split first, then the held-out split.
train_pred = lr.predict(train_data[train_has_label])
print(classification_report(train_dialect[train_has_label], train_pred))
test_pred = lr.predict(test_data[test_has_label])
print(classification_report(test_dialect[test_has_label], test_pred))

              precision    recall  f1-score   support

        东北官话       0.84      1.00      0.92        27
        中原官话       0.95      0.97      0.96        62
        兰银官话       1.00      0.91      0.95        11
        冀鲁官话       0.96      1.00      0.98        23
        北京官话       1.00      1.00      1.00        11
         吴方言       1.00      1.00      1.00        47
        客家方言       0.94      0.97      0.96        34
          平话       1.00      0.78      0.88         9
         徽方言       1.00      1.00      1.00        11
         晋方言       0.97      0.97      0.97        35
        江淮官话       1.00      0.97      0.98        33
        湖南土话       0.00      0.00      0.00         1
         湘方言       0.93      0.96      0.95        28
         粤方言       0.95      0.97      0.96        38
        胶辽官话       1.00      0.64      0.78        14
        西南官话       0.99      0.99      0.99       147
         赣方言       0.98      1.00      0.99        57
         闽方言       0.98    

In [None]:
# Show the 10 top-weighted features of the logistic regression, labelled with
# the feature column names. Kept as the cell's last expression so it renders.
eli5.explain_weights(
    lr,
    top=10,
    feature_names=train_data.columns.values,
)

In [27]:
# Binary task: Min (闽方言) versus every other labelled dialect.
train_has_label = train_dialect != ''
test_has_label = test_dialect != ''

# Features: all rows with a non-empty dialect label.
train_feature = train_data[train_has_label]
test_feature = test_data[test_has_label]

# Targets: True where the label is exactly '闽方言'.
train_target = train_dialect[train_has_label] == '闽方言'
test_target = test_dialect[test_has_label] == '闽方言'

In [29]:
# Linear-kernel SVM on the current binary task.
svm = SVC(kernel='linear')
svm.fit(train_feature, train_target)

# Report on the training split, then on the held-out split.
train_pred = svm.predict(train_feature)
test_pred = svm.predict(test_feature)
print(classification_report(train_target, train_pred))
print(classification_report(test_target, test_pred))

# Last expression: table of the 20 top-weighted features.
eli5.explain_weights(svm, top=20, feature_names=train_data.columns.values)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       589
        True       1.00      1.00      1.00        53

    accuracy                           1.00       642
   macro avg       1.00      1.00      1.00       642
weighted avg       1.00      1.00      1.00       642

              precision    recall  f1-score   support

       False       1.00      0.99      1.00       154
        True       0.89      1.00      0.94         8

    accuracy                           0.99       162
   macro avg       0.94      1.00      0.97       162
weighted avg       0.99      0.99      0.99       162



Weight?,Feature
+0.084,<BIAS>
+0.057,initial_风丰封蜂=祸华回会怀坏魂或_祸华回会怀坏魂或=父犯罚佛防凤服缝_祸华回会怀坏魂或=府付肺费富副法反翻发粉福
+0.057,initial_破谱铺派配屁炮票品骗片判泼匹胖拍劈=被抱棒
+0.056,initial_奴脑闹南难暖嫩能脓=染热人认日让
+0.055,initial_假嫁锯交减夹监甲奸江讲=轿舅旧件近_鸡寄饥记几季=轿舅旧件近_叫九剑金急建肩见结紧吉筋劲姜脚镜经击=轿舅旧件近_举句卷决均橘军=轿舅旧件近
+0.054,initial_多躲赌带低对刀钓抖丢搭胆点店跌单典端短墩蹲党灯等凳得打钉顶东懂冻冬=猪主转砖准_多躲赌带低对刀钓抖丢搭胆点店跌单典端短墩蹲党灯等凳得打钉顶东懂冻冬=追中竹_梯剃添贴天铁厅听挺踢=锤虫_大杜袋弟递地道豆淡碟达垫断夺定笛动洞读毒=池迟朝绸沉缠陈长_大杜袋弟递地道豆淡碟达垫断夺定笛动洞读毒=除传_大杜袋弟递地道豆淡碟达垫断夺定笛动洞读毒=侄着直_大杜袋弟递地道豆淡碟达垫断夺定笛动洞读毒=重_图台条甜弹田藤铜=池迟朝绸沉缠陈长_图台条甜弹田藤铜=除传_图台条甜弹田藤铜=侄着直_图台条甜弹田藤铜=重_锤虫=拖土胎讨偷贪毯塔炭脱吞汤托通桶痛统_锤虫=桃头潭糖停_池迟朝绸沉缠陈长=踏特_侄着直=踏特
+0.051,initial_嫌协现县形熊雄=风丰封蜂_嫌协现县形熊雄=父犯罚佛防凤服缝_嫌协现县形熊雄=府付肺费富副法反翻发粉福_许系戏喜希=风丰封蜂_许系戏喜希=父犯罚佛防凤服缝_许系戏喜希=府付肺费富副法反翻发粉福_大杜袋弟递地道豆淡碟达垫断夺定笛动洞读毒=追中竹_风丰封蜂=靴休险歇显血熏响向兴兄凶_下夏校限学行=府付肺费富副法反翻发粉福_靴休险歇显血熏响向兴兄凶=父犯罚佛防凤服缝_靴休险歇显血熏响向兴兄凶=府付肺费富副法反翻发粉福_父犯罚佛防凤服缝=虾孝_府付肺费富副法反翻发粉福=虾孝
+0.051,tone_奴如儿严颜延言莲顽原民邻纯娘能迎形荣容=虾_锣鹅牙爷鱼来埋泥犁雷移眉梨姨围毛熬摇牛油南蓝岩盐难棉年人门匀云忙名赢零脓=虾_茄爬茶赔皮垂锤桃桥条头蚕钳沉钱前裙糖床藤蓬虫=虾_虾=螺蛇华吴壶余煤回怀时随肥楼流嫌林兰连缠完还船圆园神银魂轮坟黄房防狂王绳行明横营红熊雄浓龙_虾=河婆斜图徐除锄台财排牌柴鞋池骑迟祠棋朝浮绸愁球潭含咸甜寻琴弹田盘全传权贫陈辰勤盆唇长尝朋层棚平程城瓶停铜穷
+0.051,initial_河害号后盒还恨鹤红=父犯罚佛防凤服缝_河害号后盒还恨鹤红=府付肺费富副法反翻发粉福_风丰封蜂=海好喊汉黑烘_海好喊汉黑烘=父犯罚佛防凤服缝_海好喊汉黑烘=府付肺费富副法反翻发粉福
+0.051,initial_爷野夜余雨芋裕移姨摇有右油炎盐叶延引痒样药蝇赢营育用浴=爱矮暗安恩恶_爱矮暗安恩恶=意衣腰要优幼厌音烟印一隐秧约影益拥_爱矮暗安恩恶=圆院冤园远越匀云运永


In [30]:
# Binary task: Mandarin varieties (label ends with '官话') versus the rest,
# restricted to rows with a non-empty dialect label.
train_has_label = train_dialect != ''
test_has_label = test_dialect != ''

train_feature = train_data[train_has_label]
test_feature = test_data[test_has_label]

# Targets: True where the label string ends with '官话'.
train_target = np.char.endswith(train_dialect[train_has_label], '官话')
test_target = np.char.endswith(test_dialect[test_has_label], '官话')

In [5]:
# Chi-squared score of every feature against the current binary target.
selector = SelectKBest(chi2, k=1000).fit(train_feature, train_target)
# Column positions of the 100 highest-scoring features.
idx = np.argsort(-selector.scores_)[:100]

top_columns = train_feature.columns[idx]
df = pd.DataFrame(
    train_feature.iloc[:, idx],
    index=train_feature.index,
    columns=top_columns,
)

# Leading context columns: binary label, dialect name, and a
# "province,city,country" string (missing parts become empty strings).
place_parts = location.loc[train_feature.index, ['province', 'city', 'country']].fillna('')
place_text = place_parts.apply(lambda row: ','.join(row), axis=1)
df.insert(0, 'label', train_target)
df.insert(1, 'dialect', train_dialect[train_dialect != ''])
df.insert(2, 'location', place_text)

dtale.show(df, name='guanhua_chi2')



In [31]:
# Fit a linear-kernel SVM on the current train_feature / train_target pair.
svm = SVC(kernel='linear')
svm.fit(train_feature, train_target)

# Classification reports in order: training split, held-out split.
fitted_pred = svm.predict(train_feature)
print(classification_report(train_target, fitted_pred))
heldout_pred = svm.predict(test_feature)
print(classification_report(test_target, heldout_pred))

# Cell output: the 20 top-weighted features.
eli5.explain_weights(
    svm,
    top=20,
    feature_names=train_data.columns.values,
)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       314
        True       1.00      1.00      1.00       328

    accuracy                           1.00       642
   macro avg       1.00      1.00      1.00       642
weighted avg       1.00      1.00      1.00       642

              precision    recall  f1-score   support

       False       0.93      0.96      0.94        71
        True       0.97      0.95      0.96        91

    accuracy                           0.95       162
   macro avg       0.95      0.95      0.95       162
weighted avg       0.95      0.95      0.95       162



Weight?,Feature
+0.207,initial_制知照州占折针汁战镇震张章证织贞整正粥=柱住
+0.173,tone_奴如儿严颜延言莲顽原民邻纯娘能迎形荣容=渠_锣鹅牙爷鱼来埋泥犁雷移眉梨姨围毛熬摇牛油南蓝岩盐难棉年人门匀云忙名赢零脓=渠_茄爬茶赔皮垂锤桃桥条头蚕钳沉钱前裙糖床藤蓬虫=渠_渠=螺蛇华吴壶余煤回怀时随肥楼流嫌林兰连缠完还船圆园神银魂轮坟黄房防狂王绳行明横营红熊雄浓龙_渠=河婆斜图徐除锄台财排牌柴鞋池骑迟祠棋朝浮绸愁球潭含咸甜寻琴弹田盘全传权贫陈辰勤盆唇长尝朋层棚平程城瓶停铜穷
+0.145,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=逼_歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂=逼_多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶=逼
+0.127,initial_把布贝拜摆杯碑比宝包饱表扮八班板变扁憋搬半拨笔本帮绑剥北冰逼百兵柄饼壁=被抱棒
+0.124,tone_七虱一出黑织哭屋=削_笔骨北得色谷福=削_铁雪血脚百尺=削_搭鸭擦割八扎杀瞎刷刮郭剥桌=削_接贴歇切拨泼脱缺约=削_吸失息积惜击缩=削
+0.112,tone_奴如儿严颜延言莲顽原民邻纯娘能迎形荣容=橘菊_锣鹅牙爷鱼来埋泥犁雷移眉梨姨围毛熬摇牛油南蓝岩盐难棉年人门匀云忙名赢零脓=橘菊_茄爬茶赔皮垂锤桃桥条头蚕钳沉钱前裙糖床藤蓬虫=橘菊_螺蛇华吴壶余煤回怀时随肥楼流嫌林兰连缠完还船圆园神银魂轮坟黄房防狂王绳行明横营红熊雄浓龙=橘菊_河婆斜图徐除锄台财排牌柴鞋池骑迟祠棋朝浮绸愁球潭含咸甜寻琴弹田盘全传权贫陈辰勤盆唇长尝朋层棚平程城瓶停铜穷=橘菊
+0.111,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=七虱一出黑织哭屋_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=搭鸭擦割八扎杀瞎刷刮郭剥桌_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=接贴歇切拨泼脱缺约_七虱一出黑织哭屋=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_七虱一出黑织哭屋=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶_搭鸭擦割八扎杀瞎刷刮郭剥桌=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_搭鸭擦割八扎杀瞎刷刮郭剥桌=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶_歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂=接贴歇切拨泼脱缺约_接贴歇切拨泼脱缺约=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶
+0.107,initial_取签浅切亲七抢雀清青=钱前全_车抽臭撤厂唱秤尺=池迟朝绸沉缠陈长_错菜刺草糙凑擦村寸仓测策葱=财蚕层
+0.103,tone_夜坏二闹右念岸乱浪亮硬命用=做_币校袖任健恨凤=做_谢雾害败递会地事柜位寿办便垫现顺闰让剩洞共=做_大夏射步住树袋画话字号轿豆旧汗换饭万县匠病定=做_骂妹面问=做_饿磨路卖外味帽庙料烂慢面认嫩样梦=做_坐柱被舅淡断近上重=做_户罪造犯善件静动=做_做=遇艺义类胃验运旺_做=祸下后限
+0.102,tone_厌店汉战建进镇俊=做_个过货嫁借化布锯数句爱贝带拜戒晒制世对碎怪挂岁肺桂寄戏四试记意醉季费贵灶罩孝笑照要钓叫够富副瘦暗占剑变线扇见半算惯印劲粪壮向放证镜姓正冻粽送宋=做_破课错裤去菜剃配快刺屁器气靠炮票凑臭欠炭看骗片判劝寸困唱胖秤庆痛=做


In [43]:
# Binary task: Hakka (客家方言) versus Gan (赣方言); other dialects are dropped.
train_in_pair = np.isin(train_dialect, ['赣方言', '客家方言'])
test_in_pair = np.isin(test_dialect, ['赣方言', '客家方言'])

train_feature = train_data[train_in_pair]
test_feature = test_data[test_in_pair]

# Targets: True where the label is exactly '客家方言'.
train_target = train_dialect[train_in_pair] == '客家方言'
test_target = test_dialect[test_in_pair] == '客家方言'

In [11]:
# Chi-squared score of every feature against the current binary target.
selector = SelectKBest(chi2, k=1000).fit(train_feature, train_target)
# Column positions of the 100 highest-scoring features.
idx = np.argsort(-selector.scores_)[:100]

df = pd.DataFrame(
    train_feature.iloc[:, idx],
    index=train_feature.index,
    columns=train_feature.columns[idx],
)

# Dialect names for the rows labelled Gan or Hakka.
pair_mask = np.isin(train_dialect, ['赣方言', '客家方言'])
# "province,city,country" per row, with missing parts as empty strings.
place_parts = location.loc[train_feature.index, ['province', 'city', 'country']].fillna('')
place_text = place_parts.apply(lambda row: ','.join(row), axis=1)

df.insert(0, 'label', train_target)
df.insert(1, 'dialect', train_dialect[pair_mask])
df.insert(2, 'location', place_text)
dtale.show(df, name='kejia_chi2')



In [22]:
# Linear-kernel SVM for the current binary task.
svm = SVC(kernel='linear')
svm.fit(train_feature, train_target)

# Training-split report followed by the held-out report.
pred_on_train = svm.predict(train_feature)
pred_on_test = svm.predict(test_feature)
print(classification_report(train_target, pred_on_train))
print(classification_report(test_target, pred_on_test))

# Cell output: the 10 top-weighted features.
eli5.explain_weights(svm, top=10, feature_names=train_data.columns.values)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00        57
        True       1.00      1.00      1.00        34

    accuracy                           1.00        91
   macro avg       1.00      1.00      1.00        91
weighted avg       1.00      1.00      1.00        91

              precision    recall  f1-score   support

       False       1.00      0.92      0.96        12
        True       0.83      1.00      0.91         5

    accuracy                           0.94        17
   macro avg       0.92      0.96      0.93        17
weighted avg       0.95      0.94      0.94        17



Weight?,Feature
+0.068,finals_皮义戏移比地梨饥器姨李记棋喜意希衣=死四_死四=寄骑屁几气季
+0.059,initial_爷野夜余雨芋裕移姨摇有右油炎盐叶延引痒样药蝇赢营育用浴=哑鸭_哑鸭=意衣腰要优幼厌音烟印一隐秧约影益拥_哑鸭=圆院冤园远越匀云运永
+0.051,finals_把爬马骂茶沙拉打=瓜挂话_假嫁牙虾下夏哑=瓜挂话_瓜挂话=车蛇射
+0.049,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=坐柱被舅淡断近上重_歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂=坐柱被舅淡断近上重_坐柱被舅淡断近上重=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶
… 1602 more positive …,… 1602 more positive …
… 1387 more negative …,… 1387 more negative …
-0.050,finals_岁嘴随醉=女徐举锯去渠鱼许余取句区遇雨芋剧
-0.055,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=猫_猫=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_猫=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶
-0.056,finals_桂规亏跪龟柜鬼贵=卫危位围胃
-0.064,finals_紫刺资子字丝祠寺=死四


In [55]:
# Depth-1 decision tree (a single split) on the current binary task.
dt = DecisionTreeClassifier(max_depth=1)
dt.fit(train_feature, train_target)

# Training-split report followed by the held-out report.
stump_train_pred = dt.predict(train_feature)
print(classification_report(train_target, stump_train_pred))
stump_test_pred = dt.predict(test_feature)
print(classification_report(test_target, stump_test_pred))

# Cell output: top-10 feature importances of the tree.
eli5.explain_weights(
    dt,
    top=10,
    feature_names=train_data.columns.values,
)

              precision    recall  f1-score   support

       False       0.93      0.96      0.95        57
        True       0.94      0.88      0.91        34

    accuracy                           0.93        91
   macro avg       0.93      0.92      0.93        91
weighted avg       0.93      0.93      0.93        91

              precision    recall  f1-score   support

       False       1.00      0.92      0.96        12
        True       0.83      1.00      0.91         5

    accuracy                           0.94        17
   macro avg       0.92      0.96      0.93        17
weighted avg       0.95      0.94      0.94        17



Weight,Feature
1.0000,initial_尾袜蚊问网=吴五_吴五=味晚万
0,initial_爷野夜余雨芋裕移姨摇有右油炎盐叶延引痒样药蝇赢营育用浴=爷野夜余雨芋裕移姨摇有右油炎盐叶延引痒样药蝇赢营育用浴_爷野夜余雨芋裕移姨摇有右油炎盐叶延引痒样药蝇赢营育用浴=意衣腰要优幼厌音烟印一隐秧约影益拥
0,initial_遇=哑鸭
0,initial_斜=愁_斜=柿_斜=勺_斜=床_静=愁_静=柴_静=垂_静=事时市_静=勺_静=床_锄=寻_钱前全=愁_钱前全=柿_钱前全=事时市_愁=谢袖习席_愁=匠_愁=集截_愁=绝_愁=像_愁=寻_愁=徐_柿=谢袖习席_柿=匠_柿=集截_柿=像_柿=寻_柿=徐_尝=匠_尝=集截_尝=像_柴=谢袖习席_柴=匠_柴=集截_柴=绝_柴=寻_柴=徐_谢袖习席=床_垂=匠_垂=集截_垂=绝_垂=像_事时市=匠_事时市=集截_事时市=绝_匠=勺_匠=床_闸=寻_闸=徐_集截=床_绝=床_像=勺_像=床_寻=床_徐=床
0,initial_球钳琴权勤穷=球钳琴权勤穷_球钳琴权勤穷=茄渠骑棋桥裙_图台条甜弹田藤铜=桃头潭糖停_茄渠骑棋桥裙=茄渠骑棋桥裙
0,initial_世烧手扇设身失伤升式声叔=辰_书输水=辰
0,finals_壁劈踢笛锡=岁嘴随醉_壁劈踢笛锡=雷类_壁劈踢笛锡=对罪碎_岁嘴随醉=极益击_岁嘴随醉=鼻笔匹密栗七吉一_岁嘴随醉=立集习急及吸_岁嘴随醉=逼力息积惜席历_极益击=雷类_极益击=对罪碎_雷类=鼻笔匹密栗七吉一_雷类=立集习急及吸_雷类=逼力息积惜席历_鼻笔匹密栗七吉一=对罪碎_立集习急及吸=对罪碎_逼力息积惜席历=对罪碎
0,initial_左租做紫资子嘴醉早灶走卒作粽=造_制知照州占折针汁战镇震张章证织贞整正粥=治_静=姐借焦酒尖接剪节浆积_竖树顺=书输水_姐借焦酒尖接剪节浆积=匠_下夏校限学行=虾孝
0,initial_圈劝=菊_开口看渴糠壳肯坑客哭=及杰健极剧局_开口看渴糠壳肯坑客哭=轿舅旧件近_开口看渴糠壳肯坑客哭=菊_及杰健极剧局=课苦裤块快宽阔_去器气=跪柜共_去器气=菊_可亏靠困筐刻孔=菊_课苦裤块快宽阔=菊_区契缺曲=菊_敲=菊_菊=欠牵庆轻
0,tone_夜坏二闹右念岸乱浪亮硬命用=扮_币校袖任健恨凤=扮_谢雾害败递会地事柜位寿办便垫现顺闰让剩洞共=扮_大夏射步住树袋画话字号轿豆旧汗换饭万县匠病定=扮_骂妹面问=扮_饿磨路卖外味帽庙料烂慢面认嫩样梦=扮_遇艺义类胃验运旺=扮


In [36]:
# Binary task: Jin (晋方言) versus the Mandarin varieties (labels ending in
# '官话'); rows with any other label are dropped.
train_keep = (train_dialect == '晋方言') | np.char.endswith(train_dialect, '官话')
test_keep = (test_dialect == '晋方言') | np.char.endswith(test_dialect, '官话')

train_feature = train_data[train_keep]
test_feature = test_data[test_keep]

# Targets: True where the label is exactly '晋方言'.
train_target = train_dialect[train_keep] == '晋方言'
test_target = test_dialect[test_keep] == '晋方言'

In [14]:
# Chi-squared score of every feature against the Jin-vs-Mandarin target.
selector = SelectKBest(chi2, k=1000).fit(train_feature, train_target)
# Column positions of the 100 highest-scoring features.
idx = np.argsort(-selector.scores_)[:100]
df = pd.DataFrame(
    train_feature.iloc[:, idx],
    index=train_feature.index,
    columns=train_feature.columns[idx]
)
df.insert(0, 'label', train_target)
# The dialect column has to be filtered with the same row mask that produced
# train_feature (Jin, or any label ending in '官话'). Filtering with
# `train_dialect != ''` would select every labelled row, which does not match
# the rows of this subset.
jin_or_guanhua = (train_dialect == '晋方言') | np.char.endswith(train_dialect, '官话')
df.insert(1, 'dialect', train_dialect[jin_or_guanhua])
# "province,city,country" per row, with missing parts as empty strings.
df.insert(2, 'location', location.loc[train_feature.index, ['province', 'city', 'country']].fillna('').apply(lambda x: ','.join(x), axis=1))
dtale.show(df, name='jin_chi2')



Executing shutdown due to inactivity...


2022-04-12 19:31:36,121 - INFO     - Executing shutdown due to inactivity...


Executing shutdown...


2022-04-12 19:32:07,202 - INFO     - Executing shutdown...


In [39]:
# Linear-kernel SVM for the current binary task.
svm = SVC(kernel='linear')
svm.fit(train_feature, train_target)

# Print the training-split report, then the held-out report.
report_train = classification_report(train_target, svm.predict(train_feature))
print(report_train)
report_test = classification_report(test_target, svm.predict(test_feature))
print(report_test)

# Cell output: the 20 top-weighted features.
eli5.explain_weights(svm, top=20, feature_names=train_data.columns.values)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       328
        True       1.00      1.00      1.00        35

    accuracy                           1.00       363
   macro avg       1.00      1.00      1.00       363
weighted avg       1.00      1.00      1.00       363

              precision    recall  f1-score   support

       False       1.00      1.00      1.00        91
        True       1.00      1.00      1.00         9

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



Weight?,Feature
+0.069,tone_奴如儿严颜延言莲顽原民邻纯娘能迎形荣容=削_锣鹅牙爷鱼来埋泥犁雷移眉梨姨围毛熬摇牛油南蓝岩盐难棉年人门匀云忙名赢零脓=削_茄爬茶赔皮垂锤桃桥条头蚕钳沉钱前裙糖床藤蓬虫=削_螺蛇华吴壶余煤回怀时随肥楼流嫌林兰连缠完还船圆园神银魂轮坟黄房防狂王绳行明横营红熊雄浓龙=削_河婆斜图徐除锄台财排牌柴鞋池骑迟祠棋朝浮绸愁球潭含咸甜寻琴弹田盘全传权贫陈辰勤盆唇长尝朋层棚平程城瓶停铜穷=削
+0.058,tone_奴如儿严颜延言莲顽原民邻纯娘能迎形荣容=虾_锣鹅牙爷鱼来埋泥犁雷移眉梨姨围毛熬摇牛油南蓝岩盐难棉年人门匀云忙名赢零脓=虾_茄爬茶赔皮垂锤桃桥条头蚕钳沉钱前裙糖床藤蓬虫=虾_虾=螺蛇华吴壶余煤回怀时随肥楼流嫌林兰连缠完还船圆园神银魂轮坟黄房防狂王绳行明横营红熊雄浓龙_虾=河婆斜图徐除锄台财排牌柴鞋池骑迟祠棋朝浮绸愁球潭含咸甜寻琴弹田盘全传权贫陈辰勤盆唇长尝朋层棚平程城瓶停铜穷
… 1274 more positive …,… 1274 more positive …
… 1703 more negative …,… 1703 more negative …
-0.053,finals_木目=竹畜粥叔熟烛赎属褥_木目=读鹿族毒
-0.053,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=摸_摸=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_摸=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶
-0.054,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=笔骨北得色谷福_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=作各恶策_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=刻侧测畜_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=吉竹足烛_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=铁雪血脚百尺_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=格隔_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=橘菊_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=节结角国_笔骨北得色谷福=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_笔骨北得色谷福=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶_作各恶策=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_作各恶策=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶_刻侧测畜=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_刻侧测畜=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶_吉竹足烛=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_吉竹足烛=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶_铁雪血脚百尺=歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂_铁雪血脚百尺=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶_歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂=格隔_歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂=橘菊_歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂=节结角国_多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶=格隔_多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶=橘菊_多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶=节结角国
-0.056,initial_歌个果过瓜古该改盖怪拐挂桂龟鬼贵高钩狗够感鸽甘敢肝割官关惯刮根滚骨钢各光郭国更梗格耕隔公谷宫恭=跪柜共
-0.057,tone_七虱一出黑织哭屋=挖_踢锡=挖_托拍拆=挖_挖=搭鸭擦割八扎杀瞎刷刮郭剥桌_挖=接贴歇切拨泼脱缺约_挖=吸失息积惜击缩
-0.059,finals_竹畜粥叔熟烛赎属褥=竹畜粥叔熟烛赎属褥


In [40]:
# Binary task: Pinghua (平话) versus Yue (粤方言); other dialects are dropped.
train_in_pair = np.isin(train_dialect, ['粤方言', '平话'])
test_in_pair = np.isin(test_dialect, ['粤方言', '平话'])

train_feature = train_data[train_in_pair]
test_feature = test_data[test_in_pair]

# Targets: True where the label is exactly '平话'.
train_target = train_dialect[train_in_pair] == '平话'
test_target = test_dialect[test_in_pair] == '平话'

In [41]:
# Linear-kernel SVM for the current binary task.
svm = SVC(kernel='linear')
svm.fit(train_feature, train_target)

# Training-split report, then held-out report.
in_sample_pred = svm.predict(train_feature)
print(classification_report(train_target, in_sample_pred))
out_of_sample_pred = svm.predict(test_feature)
print(classification_report(test_target, out_of_sample_pred))

# Cell output: the 20 top-weighted features.
eli5.explain_weights(
    svm,
    top=20,
    feature_names=train_data.columns.values,
)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00        38
        True       1.00      1.00      1.00         9

    accuracy                           1.00        47
   macro avg       1.00      1.00      1.00        47
weighted avg       1.00      1.00      1.00        47

              precision    recall  f1-score   support

       False       1.00      1.00      1.00         8
        True       1.00      1.00      1.00         2

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



Weight?,Feature
+0.844,<BIAS>
+0.053,tone_靴区溪歪知资师希休优丢肩宽蹲均筐僧耕贞翁丰宫恭=钢_歌花街鸡杯龟包箫钩衫金音班弯圈墩村孙仓疮窗灯坑钉厅星葱蜂=钢_钢=多拖沙车瓜铺租箍乌猪初书输胎该开低梯西灰碑饥丝衣吹规亏追飞刀高抓抄交敲焦烧腰偷修抽州贪三甘尖签添心参针深单肝安山间奸鲜天先牵烟搬端酸官欢关砖翻冤吞根恩亲新身筋婚温春分军熏帮汤糠张装霜章伤姜秧光慌方桩双江冰升生更争兵清声轻青经兄东通公冬松风中终充封冲凶
+0.046,initial_验严业=泥黏念孽年捏娘
+0.044,initial_奴脑闹南难暖嫩能脓=验严业
+0.042,initial_乌卫位围胃完碗挖弯温王旺握翁屋=瓦外危顽
+0.040,tone_厌店汉战建进镇俊=糙_个过货嫁借化布锯数句爱贝带拜戒晒制世对碎怪挂岁肺桂寄戏四试记意醉季费贵灶罩孝笑照要钓叫够富副瘦暗占剑变线扇见半算惯印劲粪壮向放证镜姓正冻粽送宋=糙_破课错裤去菜剃配快刺屁器气靠炮票凑臭欠炭看骗片判劝寸困唱胖秤庆痛=糙
+0.039,finals_胎台袋来菜财该改开海爱盖害=带拜排埋摆派牌柴晒败
… 1343 more positive …,… 1343 more positive …
… 1624 more negative …,… 1624 more negative …
-0.038,finals_装壮疮床霜桩撞窗双=汤糖浪仓钢糠_装壮疮床霜桩撞窗双=帮忙党绑胖棒
