# 必要なもののインポート
今回は発言内容からどの会議で発言されたものか予測することを考える。

In [4]:
import pandas as pd
import numpy as np
import MeCab
import mojimoji
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.decomposition import PCA
from sklearn import svm
import matplotlib.pyplot as plt

%matplotlib inline

# データセットの読み込み

In [5]:
df_input = pd.read_csv('../data/kokkai.csv', header=0)
display(df_input.head())
print(len(df_input))

Unnamed: 0,date,house,meeting,speech_order,text
0,2019-12-03,参議院,経済産業委員会,0,令和元年十二月三日（火曜日）\r\n 午前十時開会\r\n ───────────...
1,2019-12-03,参議院,経済産業委員会,1,○委員長（礒崎哲史君）　ただいまから経済産業委員会を開会いたします。\r\n　委員の異動につ...
2,2019-12-03,参議院,経済産業委員会,2,○委員長（礒崎哲史君）　政府参考人の出席要求に関する件についてお諮りいたします。\r\n　外...
3,2019-12-03,参議院,経済産業委員会,3,○委員長（礒崎哲史君）　御異議ないと認め、さよう決定いたします。\r\n ──────...
4,2019-12-03,参議院,経済産業委員会,4,○委員長（礒崎哲史君）　外国為替及び外国貿易法第十条第二項の規定に基づき、北朝鮮を仕向地とす...


8653


# 形態素解析の関数

In [6]:
def text_to_words(text, stop_word_pass='../stopwords/Japanese.txt'):
    # stopword listをつくる
    stopword_list = []
    with open(stop_word_pass, 'r') as f:
        stopword_list = f.readlines()
        
    stopword_list = [x.strip() for x in stopword_list if x.strip()] 
    #形態素解析を始める
    m = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
    m.parse('')
    #text = normalize_text(text)
    text = mojimoji.zen_to_han(text, kana=False)
    m_text = m.parse(text)
    basic_words = []
    #mecabの出力結果を単語ごとにリスト化
    m_text = m_text.split('\n')
    for row in m_text:
        #Tab区切りで形態素、その品詞等の内容と分かれているので単語部のみ取得
        word = row.split("\t")[0]
        #最終行はEOS
        if word == 'EOS':
            break
        else:
            pos = row.split('\t')[1]
            slice_ = pos.split(',')
            #品詞を取得する
            parts = slice_[0]
            if parts == '記号':
                continue

            #活用語の場合は活用指定ない原型を取得する。
            elif slice_[0] in ('形容詞', '動詞') and slice_[-3] not in stopword_list:
                    basic_words.append(slice_[-3])

            #活用しない語についてはそのままの語を取得する
            elif slice_[0] =='名詞' and word not in stopword_list:
                basic_words.append(word)

    basic_words = ' '.join(basic_words)
    return basic_words

# 学習データの整形
本会議を0、経済産業員会を1とコード化する。また、発言内容を上記関数で分かち書きする。

In [8]:
meeting_index = {'本会議':0, '経済産業委員会':1}
df_input['meeting_index'] = [0 if x not in meeting_index or meeting_index[x] == 0 else 1 \
                             for x in df_input['meeting'].values.tolist()]
df_input['date'] = pd.to_datetime(df_input['date'])
df_input['text_ana'] = df_input['text'].map(text_to_words)

In [9]:
print('Dataset size:{}'.format(len(df_input)))
df_input.head()

Dataset size:8653


Unnamed: 0,date,house,meeting,speech_order,text,meeting_index,text_ana
0,2019-12-03,参議院,経済産業委員会,0,令和元年十二月三日（火曜日）\r\n 午前十時開会\r\n ───────────...,1,令和元年 十二月三日 火曜日 午前 十時 開会 委員 異動 十一月二十八日 辞任 補欠 選任...
1,2019-12-03,参議院,経済産業委員会,1,○委員長（礒崎哲史君）　ただいまから経済産業委員会を開会いたします。\r\n　委員の異動につ...,1,委員長 礒崎哲史 君 経済産業委員会 開会 いたす 委員 異動 報告 いたす 昨日 三木亨 ...
2,2019-12-03,参議院,経済産業委員会,2,○委員長（礒崎哲史君）　政府参考人の出席要求に関する件についてお諮りいたします。\r\n　外...,1,委員長 礒崎哲史 君 政府参考人 出席 要求 件 諮る いたす 外国為替及び外国貿易法 第十...
3,2019-12-03,参議院,経済産業委員会,3,○委員長（礒崎哲史君）　御異議ないと認め、さよう決定いたします。\r\n ──────...,1,委員長 礒崎哲史 君 異議 ない 認める 決定 いたす
4,2019-12-03,参議院,経済産業委員会,4,○委員長（礒崎哲史君）　外国為替及び外国貿易法第十条第二項の規定に基づき、北朝鮮を仕向地とす...,1,委員長 礒崎哲史 君 外国為替及び外国貿易法 第十条 項 規定 基づく 北朝鮮 仕向 する ...


## tf-idfの導入
今回はtf-idfを次元圧縮したベクトルを用いることにする。

In [10]:
dim=200
cv_vec = CountVectorizer(max_df=0.5, min_df=0.03)
corpus_bgw = cv_vec.fit_transform(df_input['text_ana'])
tf_vec = TfidfTransformer()
corpus_tfidf = tf_vec.fit_transform(corpus_bgw)
print('tfidf shape:{}'.format(corpus_tfidf.shape))
pca = PCA(n_components=dim)
corpus_pca = pca.fit_transform(corpus_tfidf.toarray())
print('After pca shape:{}'.format(corpus_pca.shape))
print('PCA explained variance ratio:{}'.format(np.sum(pca.explained_variance_ratio_)))

tfidf shape:(8653, 353)
After pca shape:(8653, 200)
PCA explained variance ratio:0.8558992934816846


In [11]:
dim_cols = ['dim_{}'.format(x) for x in range(dim)]
df_tfpca = pd.DataFrame(corpus_pca, columns=dim_cols)
df_tfpca['label'] = df_input['meeting_index']
df_tfpca['date'] = df_input['date']
display(df_tfpca.head())

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_192,dim_193,dim_194,dim_195,dim_196,dim_197,dim_198,dim_199,label,date
0,0.01548,0.150351,-0.045292,0.034151,-0.063571,0.094059,0.153211,0.050254,-0.065363,-0.013487,...,0.009607,0.041123,0.058695,0.062654,-0.072238,-0.065187,0.013826,-0.009905,1,2019-12-03
1,0.172795,0.215151,-0.22812,-0.071218,-0.084779,0.026997,0.133393,0.406687,0.186794,0.076466,...,0.006588,-0.009375,-0.002399,-0.012564,-0.001359,-0.007281,0.003006,0.000717,1,2019-12-03
2,0.221388,0.34243,-0.029578,-0.036825,0.328804,-0.054806,0.104023,-0.03212,-0.180322,0.127935,...,0.000758,-0.008356,-0.014685,-0.030353,0.018807,0.005729,0.024684,0.004312,1,2019-12-03
3,0.305793,0.457033,-0.051423,-0.228069,0.384287,-0.065583,-0.107993,-0.017622,-0.178406,-0.004184,...,-0.022973,-0.008128,-0.01198,0.011132,-0.007459,0.025789,0.004151,-0.013116,1,2019-12-03
4,0.154039,0.157632,-0.111747,0.113941,-0.058315,0.070563,0.018036,-0.017467,-0.002123,-0.078375,...,-0.006274,0.022962,0.002795,0.009949,0.020707,-0.003801,-0.024727,-0.038998,1,2019-12-03


In [12]:
df_set = df_tfpca[df_tfpca['date'] <= pd.to_datetime('2019-09-30')]
df_test = df_tfpca[df_tfpca['date'] >= pd.to_datetime('2019-10-01')]
print('Train valid data.')
display(df_set.head())
print('size:{}'.format(len(df_set)))
print('Test data.')
display(df_test.head())
print('size:{}'.format(len(df_test)))

Train valid data.


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_192,dim_193,dim_194,dim_195,dim_196,dim_197,dim_198,dim_199,label,date
163,0.128364,0.233438,-0.069051,0.164082,-0.19314,-0.021783,0.184147,0.027749,-0.111847,-0.15448,...,-0.014178,-0.082267,0.049903,-0.07455,-0.02803,-0.075032,0.018745,-0.03186,1,2019-08-05
164,0.147029,0.261018,-0.111666,-0.012182,-0.033572,0.145582,-0.036826,0.08774,0.091134,-0.038529,...,0.022645,-0.024272,-0.001591,0.005094,-0.020229,0.008245,0.012399,-0.021905,1,2019-08-05
165,0.17907,0.224289,-0.026965,0.16874,0.014812,-0.165203,0.098889,-0.066376,-0.169791,-0.130339,...,-0.015628,-0.082651,0.037827,-0.082074,-0.028076,-0.05892,0.005076,-0.006306,1,2019-08-05
166,0.263988,0.507164,-0.042052,-0.23058,0.411908,-0.077507,-0.153724,-0.007484,-0.206117,-0.000412,...,0.005555,0.013819,0.008028,-0.009898,0.008402,-0.011337,-0.007204,0.015634,1,2019-08-05
167,0.278721,0.4571,0.010302,-0.255953,0.417662,-0.042821,-0.05091,0.09319,-0.183419,0.010764,...,-0.023454,0.002147,-0.005139,-0.028996,0.019921,0.00816,-0.024946,-0.004384,1,2019-08-05


size:6525
Test data.


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_192,dim_193,dim_194,dim_195,dim_196,dim_197,dim_198,dim_199,label,date
0,0.01548,0.150351,-0.045292,0.034151,-0.063571,0.094059,0.153211,0.050254,-0.065363,-0.013487,...,0.009607,0.041123,0.058695,0.062654,-0.072238,-0.065187,0.013826,-0.009905,1,2019-12-03
1,0.172795,0.215151,-0.22812,-0.071218,-0.084779,0.026997,0.133393,0.406687,0.186794,0.076466,...,0.006588,-0.009375,-0.002399,-0.012564,-0.001359,-0.007281,0.003006,0.000717,1,2019-12-03
2,0.221388,0.34243,-0.029578,-0.036825,0.328804,-0.054806,0.104023,-0.03212,-0.180322,0.127935,...,0.000758,-0.008356,-0.014685,-0.030353,0.018807,0.005729,0.024684,0.004312,1,2019-12-03
3,0.305793,0.457033,-0.051423,-0.228069,0.384287,-0.065583,-0.107993,-0.017622,-0.178406,-0.004184,...,-0.022973,-0.008128,-0.01198,0.011132,-0.007459,0.025789,0.004151,-0.013116,1,2019-12-03
4,0.154039,0.157632,-0.111747,0.113941,-0.058315,0.070563,0.018036,-0.017467,-0.002123,-0.078375,...,-0.006274,0.022962,0.002795,0.009949,0.020707,-0.003801,-0.024727,-0.038998,1,2019-12-03


size:2128


# ホールドアウト法
ホールドアウト法で分析を行う。 　
層化サンプリングをする場合はstratify=df_set['XXXX']のように指定する。

In [13]:
print('All labels')
print('Label 0:{}'.format((df_set['label'] == 0).sum()))
print('Label 1:{}'.format((df_set['label'] == 1).sum()))
print('All:{}'.format(len(df_set)))

df_train, df_valid = train_test_split(df_set, random_state=8, stratify=df_set['label'], train_size=0.8, shuffle=True)

print('Train labels')
print('Label 0:{}'.format((df_train['label'] == 0).sum()))
print('Label 1:{}'.format((df_train['label'] == 1).sum()))
print('All:{}'.format(len(df_train)))
display(df_train.head())

print('Valid labels')
print('Label 0:{}'.format((df_valid['label'] == 0).sum()))
print('Label 1:{}'.format((df_valid['label'] == 1).sum()))
print('All:{}'.format(len(df_valid)))
display(df_valid.head())


All labels
Label 0:2309
Label 1:4216
All:6525
Train labels
Label 0:1847
Label 1:3373
All:5220


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_192,dim_193,dim_194,dim_195,dim_196,dim_197,dim_198,dim_199,label,date
4943,-0.138703,-0.031234,0.03593,-0.019048,-0.014765,0.085486,0.013622,-0.300632,-0.002284,0.039136,...,0.025348,-0.010257,0.036882,0.055556,0.003515,0.025739,-0.009996,0.04231,1,2019-04-17
8118,-0.080125,0.019422,0.030633,-0.115016,-0.067461,-0.037645,0.094802,0.150745,-0.06568,-0.074706,...,-0.001984,0.058087,0.028693,0.054284,0.069841,0.025573,-0.027615,-0.025974,1,2019-03-20
1444,-0.165271,0.051804,0.014086,-0.058171,-0.137151,-0.155063,-0.066494,0.013811,-0.023909,-0.094563,...,-0.001266,-0.015898,0.058159,0.000224,-0.058687,-0.020469,-0.026786,-0.005531,1,2019-05-24
8143,-0.145408,-0.017275,-0.002852,0.093215,0.049748,0.064688,0.017865,-0.067039,0.005588,0.030138,...,0.008443,-0.05948,-0.069256,0.003078,0.027003,-0.013239,-0.060421,-0.055471,1,2019-03-20
2662,-0.279757,-0.039488,0.004553,0.154825,0.100458,-0.077836,-0.17538,0.183937,-0.00395,-0.179601,...,0.002386,0.026005,0.03652,-0.004778,0.045615,0.000202,0.007658,-0.047008,0,2019-04-19


Valid labels
Label 0:462
Label 1:843
All:1305


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_192,dim_193,dim_194,dim_195,dim_196,dim_197,dim_198,dim_199,label,date
7789,-0.093893,0.01128,0.00075,0.022411,0.082389,0.00709,0.002765,0.019806,-0.033287,-0.024309,...,0.023042,-0.047553,-0.033701,0.006388,-0.024786,0.004641,0.014969,0.016055,1,2019-05-28
8567,-0.112255,-0.009925,0.036265,-0.189578,-0.14288,-0.020626,0.131112,0.001348,-0.024091,0.064437,...,0.007443,-0.001576,-0.006943,0.020373,-0.078177,-0.00839,0.041788,0.050028,1,2019-06-12
2654,-0.215523,-0.033732,-0.035143,0.321753,0.233926,0.122052,-0.120857,-0.053289,0.009997,-0.089943,...,-0.046902,0.025197,0.017205,-0.01883,-0.004785,-0.035837,0.002324,-0.012266,0,2019-04-19
4529,-0.174851,-0.014797,-0.037267,0.275954,0.103588,-0.062965,-0.10757,0.092719,-0.039235,-0.11819,...,-0.023426,0.020682,-0.018463,0.008521,0.051746,0.011335,0.017877,-0.00359,0,2019-03-12
4875,-0.139103,-0.003064,0.037702,-0.145304,-0.078453,-0.059788,0.048388,0.107609,-0.054276,-0.058054,...,0.011784,-0.006039,-0.001642,0.011103,0.037459,0.044306,-0.012732,-0.048264,1,2019-04-17


## SVC

In [None]:
cs = [10**x for x in range(-2, 3)]
df_train_vec = df_train[dim_cols]
df_valid_vec = df_valid[dim_cols]
df_train_lab = df_train['label']
df_valid_lab = df_valid['label']

result_list = {}
for c in cs:
    ## Train and Predict with SVM ##
    svm = SVC(kernel='linear', C=c, probability=True)
    print('C:{} train start'.format(c))
    svm.fit(df_train_vec, df_train_lab)
    print('C:{} train end'.format(c))
    predicts = svm.predict(df_valid_vec)
    predict_probas = svm.predict_proba(df_valid_vec)[:, 1]
    ## Train and Predict with SVM ##
    print('Confusion matrix')
    ## Confusion Matrix
    cm = confusion_matrix(df_valid_lab, predicts)
    display(pd.DataFrame(cm, index=['actual_0', 'actual_1'], columns=['predict_0', 'predict_1']))
    '''
    accuracy: 正解率
    precision: 精度
    recall: 再現率
    '''
    accuracy = accuracy_score(df_valid_lab, predicts)
    precision = precision_score(df_valid_lab, predicts)
    recall = recall_score(df_valid_lab, predicts)
    f1 = f1_score(df_valid_lab, predicts)
    fpr_all, tpr_all, thresholds = roc_curve(df_valid_lab, predict_probas)
    auc_num = auc(fpr_all, tpr_all)
    print('Accuracy:{} Precision:{} Recall:{}, F1score:{}'.format(accuracy, precision, recall, f1))
    print('auc:{}'.format(auc_num))
    ## Describe ROC curve ##
    fig, axe = plt.subplots()
    axe.plot(fpr_all, tpr_all, label='C:'.format(c))
    axe.set_title('ROC curve C:{}'.format(c))
    axe.set_xlabel('FPR')
    axe.set_ylabel('TPR')
    axe.legend()
    axe.grid()
    result_list[c] = [accuracy, precision, recall, f1, svm]
    print()

C:0.01 train start
C:0.01 train end
Confusion matrix


Unnamed: 0,predict_0,predict_1
actual_0,302,160
actual_1,4,839


Accuracy:0.8743295019157088 Precision:0.8398398398398398 Recall:0.9952550415183867, F1score:0.9109663409337676
auc:0.9858241797743578

C:0.1 train start


### Test dataで確認する。

In [None]:
df_test_vec = df_test[dim_cols]
df_test_lab = df_test['label']

model = result_list[10][-1]
predicts = model.predict(df_test_vec)
predict_probas = svm.predict_proba(df_test_vec)[:, 1]
cm = confusion_matrix(df_test_lab, predicts)
display(pd.DataFrame(cm, index=['actual_0', 'actual_1'], columns=['predict_0', 'predict_1']))
accuracy = accuracy_score(df_test_lab, predicts)
precision = precision_score(df_test_lab, predicts)
recall = recall_score(df_test_lab, predicts)
f1 = f1_score(df_test_lab, predicts)
fpr_all, tpr_all, thresholds = roc_curve(df_test_lab, predict_probas)
auc_num = auc(fpr_all, tpr_all)
print('Accuracy:{} Precision:{} Recall:{}, F1score:{}'.format(accuracy, precision, recall, f1))
print('auc:{}'.format(auc_num))
## Describe ROC curve ##
fig, axe = plt.subplots()
axe.plot(fpr_all, tpr_all, label='C:'.format(10))
axe.set_title('ROC curve C:{}'.format(10))
axe.set_xlabel('FPR')
axe.set_ylabel('TPR')
axe.legend()
axe.grid()


# 交差検証


In [12]:
set_vec = df_set[dim_cols].values
set_lab = df_set['label'].values
kfold = StratifiedKFold(n_splits=10, random_state=8).split(set_vec, set_lab)
for i, (train, valid) in enumerate(kfold):
    svm = SVC(kernel='linear', C=10, probability=True)
    svm.fit(set_vec[train], set_lab[train])
    predicts = svm.predict(set_vec[valid])
    accuracy = accuracy_score(set_lab[valid], predicts)
    print('Fold:{} Accuracy:{}'.format(i, accuracy))



Fold:0 Accuracy:0.992616899097621
Fold:1 Accuracy:0.992616899097621
Fold:2 Accuracy:0.992616899097621
Fold:3 Accuracy:0.985233798195242
Fold:4 Accuracy:0.9934372436423298
Fold:5 Accuracy:0.9967186218211649
Fold:6 Accuracy:0.9901558654634947
Fold:7 Accuracy:0.9917965545529123
Fold:8 Accuracy:0.9786710418375718
Fold:9 Accuracy:0.9958949096880131
