## scikit-learn チートシート（日本語）

http://wisteriahill.sakura.ne.jp/CMS/WordPress/wp-content/uploads/2019/06/scikit-learn-cheat-sheet-0-2.png

## GaussianNBとは

https://avinton.com/academy/naive-bayes/

## Unicode のコードポイントとは

▼コードポイント変換

https://chobitool.com/unicodepoint/

## ordとは

https://techacademy.jp/magazine/29162


In [8]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Unicodeのコードポイント頻度測定
def count_codePoint(str):
    """Return the relative frequency of each Unicode code point in a string.

    The result is a float array of length 65535 whose index is the code
    point, so only code points 0..65534 are counted. Characters with a
    larger code point are skipped, but they still count toward the length
    used for normalisation.

    As a teaching aid, the function also prints the input string, the raw
    counts, the string length and the normalised frequencies.

    Args:
        str: The text to analyse. (The name shadows the builtin; it is kept
            unchanged so existing callers keep working.)

    Returns:
        numpy.ndarray of shape (65535,) holding count / len(str) for each
        code point. For an empty string an all-zero array is returned.
    """
    text = str  # local alias so the body does not lean on the shadowed builtin name
    size = 65535

    # One slot per code point; the code point itself is the index.
    counter = np.zeros(size)

    for ch in text:
        code_point = ord(ch)
        # Valid indices are 0..size-1, so anything >= size has no slot.
        if code_point >= size:
            continue
        counter[code_point] += 1

    # Normalise by the number of characters; an empty string would divide
    # by zero, so it maps to the all-zero vector instead.
    length = len(text)
    normalized = counter / length if length else counter.copy()

    # Show each step of the normalisation for readers following along.
    print(text)
    print('counter:', counter)
    print('len(str):', length)
    print('counter/len(str):', normalized)
    print('')

    return normalized

# Prepare the training data: one sample sentence per language
ja_str = 'これは日本語の文章です。'  # Japanese
en_str = 'This is English Sentences.'  # English
th_str = 'นี่เป็นประโยคภาษาไทย'  # Thai
si_str = 'මේක සිංහල'  # Sinhala
it_str = 'Questo è italiano.'  # Italian
es_str = 'Esto es español.'  # Spanish
mm_str = 'ဒါက ဗမာပဲ။'  # Burmese

# (label, sentence) pairs; features and labels are derived from this one
# list so they cannot drift out of alignment
train_samples = [
    ('ja', ja_str),
    ('en', en_str),
    ('th', th_str),
    ('si', si_str),
    ('it', it_str),
    ('es', es_str),
    ('mm', mm_str),
]
x_train = [count_codePoint(sentence) for _, sentence in train_samples]
y_train = [label for label, _ in train_samples]

# Train the classifier (fit returns the fitted estimator itself)
clf = GaussianNB().fit(x_train, y_train)

# Prepare the evaluation data
ja_test_str = 'こんにちは'
en_test_str = 'Hello'
th_test_str = 'สวัสดี'
si_test_str = 'ආයුබෝවන්'
it_test_str = 'Buongiorno'
es_test_str = 'Buenas tardes'
mm_test_str = 'မင်္ဂလာပါ.'

# The order of the evaluation samples does not matter, as long as the
# labels stay paired with their sentences
test_samples = [
    ('mm', mm_test_str),
    ('es', es_test_str),
    ('it', it_test_str),
    ('si', si_test_str),
    ('en', en_test_str),
    ('th', th_test_str),
    ('ja', ja_test_str),
]
x_test = [count_codePoint(sentence) for _, sentence in test_samples]
y_test = [label for label, _ in test_samples]

# Evaluate: predict a label for each sample and report the accuracy
y_pred = clf.predict(x_test)
print(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("正解率 = " , accuracy)

これは日本語の文章です。
counter: [0. 0. 0. ... 0. 0. 0.]
len(str): 12
counter/len(str): [0. 0. 0. ... 0. 0. 0.]

This is English Sentences.
counter: [0. 0. 0. ... 0. 0. 0.]
len(str): 26
counter/len(str): [0. 0. 0. ... 0. 0. 0.]

นี่เป็นประโยคภาษาไทย
counter: [0. 0. 0. ... 0. 0. 0.]
len(str): 20
counter/len(str): [0. 0. 0. ... 0. 0. 0.]

මේක සිංහල
counter: [0. 0. 0. ... 0. 0. 0.]
len(str): 9
counter/len(str): [0. 0. 0. ... 0. 0. 0.]

Questo è italiano.
counter: [0. 0. 0. ... 0. 0. 0.]
len(str): 18
counter/len(str): [0. 0. 0. ... 0. 0. 0.]

Esto es español.
counter: [0. 0. 0. ... 0. 0. 0.]
len(str): 16
counter/len(str): [0. 0. 0. ... 0. 0. 0.]

ဒါက ဗမာပဲ။
counter: [0. 0. 0. ... 0. 0. 0.]
len(str): 10
counter/len(str): [0. 0. 0. ... 0. 0. 0.]

မင်္ဂလာပါ.
counter: [0. 0. 0. ... 0. 0. 0.]
len(str): 10
counter/len(str): [0. 0. 0. ... 0. 0. 0.]

Buenas tardes
counter: [0. 0. 0. ... 0. 0. 0.]
len(str): 13
counter/len(str): [0. 0. 0. ... 0. 0. 0.]

Buongiorno
counter: [0. 0. 0. ... 0. 0. 0.]
len(str): 10


In [5]:
# Hint for readers unsure what the normalisation expression does:
# an arithmetic operation between an array and a scalar is applied to
# every element of the array.
a = np.zeros(shape=100)
b = np.add(a, 100)
b

array([100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100.])