# bayes filter調査

## 入力データ

- trset トレーニングセット（既知データ）
- testset テストセット（未知データ）

In [36]:
import pandas as pd

trset=pd.read_csv("in_tr.csv")
trset

Unnamed: 0,flg,date,url,wd1,wd2,wd3
0,0,2017/02/20,url1,1.0,2.0,1.0
1,0,2017/02/21,url2,1.0,,
2,1,2017/02/21,url3,2.0,,1.0
3,1,2017/02/21,url4,2.0,1.0,2.0


In [37]:
testset=pd.read_csv("in_test.csv")
testset

Unnamed: 0,date,url,wd1,wd2,wd3
0,2017/02/20,url1,,1.0,1.0
1,2017/02/21,url2,1.0,1.0,
2,2017/02/21,url3,2.0,,2.0
3,2017/02/21,url4,2.0,1.0,1.0


## [メモ]ベイジアンフィルタ

http://qiita.com/ynakayama/items/ca3f5e9d762bbd50ad1f

In [38]:

import numpy as np

from sklearn.naive_bayes import GaussianNB

X = np.array([[1,2,3,4,5,6,7,8],
              [1,1,3,4,5,6,6,7],
              [2,1,2,4,5,8,8,8]]) # 特徴ベクトル
y = np.array([1, 2, 3]) # そのラベル
t = np.array([2,2,4,5,6,8,7,8]) # テストデータ

clf = GaussianNB() # 正規分布を仮定したベイズ分類
clf.fit(X, y) # 学習をする
clf.predict(t) # => [3]



array([1])

# 想定データに対して、ベイズフィルタを適用してみる

### トレーニングデータから、教師フラグ部分と、形態素解析部分を抽出

In [39]:
import pandas as pd

trset=pd.read_csv("in_tr.csv",  header=None, skiprows=1, names=['flg','date','url','word1','word2','word3']).fillna(0)
trset

Unnamed: 0,flg,date,url,word1,word2,word3
0,0,2017/02/20,url1,1.0,2.0,1.0
1,0,2017/02/21,url2,1.0,0.0,0.0
2,1,2017/02/21,url3,2.0,0.0,1.0
3,1,2017/02/21,url4,2.0,1.0,2.0


In [40]:
flags=trset['flg']
flags

0    0
1    0
2    1
3    1
Name: flg, dtype: int64

In [41]:
datas=trset.ix[:,3:]
datas

Unnamed: 0,word1,word2,word3
0,1.0,2.0,1.0
1,1.0,0.0,0.0
2,2.0,0.0,1.0
3,2.0,1.0,2.0


### テストデータから、形態素解析部分を抽出

In [42]:

testset=pd.read_csv("in_test.csv").fillna(0)
testdatas=testset.ix[:,2:]
testdatas

Unnamed: 0,wd1,wd2,wd3
0,0.0,1.0,1.0
1,1.0,1.0,0.0
2,2.0,0.0,2.0
3,2.0,1.0,1.0


### ベイズフィルタ

トレーニングデータから学習。
テストデータを使って予測する。予測結果は、arrayでなく、Seriesに変換してみる（最終的な結合用）

In [45]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB


clf = GaussianNB() # 正規分布を仮定したベイズ分類
#clf = BernoulliNB() # ベルヌーイ分布を仮定したベイズ分類
#clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) # 多項分布を仮定したベイズ分類

clf.fit(datas, flags) # 学習をする
res=clf.predict(testdatas)
pd.Series(res)

0    0
1    0
2    1
3    1
dtype: int64

テストデータに、予測結果を連接して結果を出力。

In [46]:
pd.concat([pd.Series(res),testset],axis=1)

Unnamed: 0,0,date,url,wd1,wd2,wd3
0,0,2017/02/20,url1,0.0,1.0,1.0
1,0,2017/02/21,url2,1.0,1.0,0.0
2,1,2017/02/21,url3,2.0,0.0,2.0
3,1,2017/02/21,url4,2.0,1.0,1.0


## 参考URL

* Naive Bayes
http://qiita.com/ynakayama/items/ca3f5e9d762bbd50ad1f
http://universityofbigdata.net/competition/tutorial/5681717746597888

* python dataframe, pandas
+ http://sinhrks.hatenablog.com/entry/2015/01/28/073327
+ http://blog.pepese.com/entry/2016/09/04/144109
+ http://www.mwsoft.jp/programming/numpy/pandas_csv.html
+ http://keisanbutsuriya.hateblo.jp/entry/2015/12/16/161410#csvファイルをデータフレームとして読み込む
+ http://sinhrks.hatenablog.com/entry/2014/11/12/233216


# 本物のデータ形式を使ってみる
## https://github.com/suesh32/tokenizer_app
* トレーニングデータ learn_data.csv
* テストデータ  test_data.csv

In [47]:
import pandas as pd

#trfile="in_tr.csv"
trfile="learn_data.csv"

trset=pd.read_csv(trfile,  header=None, skiprows=1).fillna(0)
trset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,210,211,212,213,214,215,216,217,218,219
0,0,2016-02-06,http://style.nikkei.com/article/DGXMZO12054150...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2016-02-06,http://www.nikkei.com/article/DGKKZO12543610U7...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0,2016-02-01,http://www.nikkei.com/article/DGKKASFK25H41_R3...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,0,2016-01-31,http://www.nikkei.com/article/DGKKZO12336510R3...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,2016-02-06,http://www.excite.co.jp/News/column_g/20170206...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,2016-02-04,http://www.gizmodo.jp/2017/02/lets-see-how-pai...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,2016-02-07,https://www.bloomberg.co.jp/news/articles/2017...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,2016-02-06,http://thebridge.jp/2017/02/how-ai-will-transf...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,2016-02-07,http://dd.hokkaido-np.co.jp/news/science/scien...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,2016-02-04,https://www.advertimes.com/20170204/article243...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
flags=trset[0-0]
print(flags)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
Name: 0, dtype: int64


In [49]:
keitaiso_array=trset.ix[:,3:]
print(keitaiso_array)

    3    4    5    6    7    8    9    10   11   12  ...   210  211  212  213  \
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...   0.0  0.0  0.0  0.0   
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...   0.0  0.0  0.0  0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...   0.0  0.0  0.0  0.0   
3   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...   0.0  0.0  0.0  0.0   
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...   0.0  0.0  0.0  0.0   
5   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...   1.0  0.0  0.0  0.0   
6   0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0 ...   0.0  1.0  1.0  0.0   
7   0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0 ...   0.0  0.0  0.0  0.0   
8   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0 ...   0.0  0.0  2.0  1.0   
9   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...   0.0  0.0  0.0  0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...   0.0  0.0  0.0  0.0   
11  0.0  0.0  0.0  0.0  0.0 

In [50]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB


clf = GaussianNB() # 正規分布を仮定したベイズ分類
#clf = BernoulliNB() # ベルヌーイ分布を仮定したベイズ分類
#clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) # 多項分布を仮定したベイズ分類

clf.fit(keitaiso_array, flags) # 学習をする


GaussianNB()

In [51]:
#testfile="in_test.csv"
testfile="test_data.csv"

testset=pd.read_csv(testfile, header=None, skiprows=1).fillna(0)
testdatas=testset.ix[:,2:]
print(testdatas)

     2    3    4    5    6    7    8    9    10   11  ...   209  210  211  \
0      0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
1      0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
2      0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
3      0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
4      0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
5      0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
6      0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
7      0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
8      0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
9      0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
10     0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
11     0    0    0    0    0    0    0    0    0    0 ...     0    0    0   

In [52]:
res=clf.predict(testdatas)
pd.Series(res)

0      0
1      1
2      1
3      0
4      0
5      0
6      0
7      1
8      1
9      0
10     0
11     1
12     1
13     1
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     1
24     0
25     0
26     1
27     0
28     0
29     1
      ..
145    1
146    0
147    0
148    0
149    0
150    0
151    0
152    0
153    1
154    1
155    0
156    0
157    0
158    0
159    0
160    0
161    0
162    0
163    0
164    0
165    0
166    0
167    0
168    0
169    0
170    0
171    0
172    0
173    0
174    1
dtype: int64

In [54]:
result=pd.concat([pd.Series(res),testset],axis=1)
result

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,...,209,210,211,212,213,214,215,216,217,218
0,0,2017-03-18,http://www.itmedia.co.jp/,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2017-03-18,http://www.itmedia.co.jp/news/,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,2017-03-18,http://www.itmedia.co.jp/business/,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2017-03-18,http://mag.executive.itmedia.co.jp/,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,2017-03-18,http://www.itmedia.co.jp/enterprise/,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,2017-03-18,http://techtarget.itmedia.co.jp/,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,2017-03-18,http://marketing.itmedia.co.jp/,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,2017-03-18,http://techfactory.itmedia.co.jp/,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,2017-03-18,http://www.itmedia.co.jp/smartjapan/,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,2017-03-18,http://www.itmedia.co.jp/mobile/,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
result.to_csv('result.csv')