# SCDVクラスのテスト

In [6]:
import numpy as np
import pandas as pd

# import reload 用
import importlib

In [4]:
# error display
# target_name 間違っている出力の名前
def error_display(test_set, true_set, target_name=""):
    """Build the failure message shared by the assertions in this notebook.

    Args:
        test_set: Value that was actually observed by the test.
        true_set: Value that was expected.
        target_name: Label of the compared quantity; it is prepended verbatim
            (it is not passed through ``str.format``).

    Returns:
        str: ``target_name`` followed by
        ``" different. test setting : <test_set>, true settings : <true_set>"``.
    """
    template = " different. test setting : {0}, true settings : {1}"
    return target_name + template.format(test_set, true_set)

## クラスの記載

In [46]:
import scdv

In [157]:
# Re-import: reload the scdv module and then re-run the from-imports so that
# SCDV, Word and Document are bound to the objects of the reloaded module.
importlib.reload(scdv)
from scdv import SCDV
from scdv import Word
from scdv import Document

## テスト

### Word

In [158]:
# Word.match のテスト
def test_Word_match():
    """Check that ``Word.match`` accepts the word's own name and rejects another string.

    Raises:
        AssertionError: If either expectation is violated.
    """
    word = "A"
    model = Word(word)
    assert model.match(word), "word name " + error_display(word, model.name)

    # A different string must not match.
    # `not` is required here: `~` applied to a Python bool yields -1 or -2,
    # both of which are truthy, so `assert ~model.match(...)` could never fail.
    word_notMatch = "b"
    assert not model.match(word_notMatch), "word name " + error_display(word_notMatch, model.name)

In [159]:
test_Word_match()

In [160]:
# Word の get_name のテスト
def test_Word_get_name():
    """Check that ``Word.get_name`` returns the name passed to the constructor.

    Raises:
        AssertionError: If the returned name differs from the constructor argument.
    """
    expected_name = "a"
    target = Word(expected_name)

    is_same_name = target.get_name() == expected_name
    assert is_same_name, error_display(target.get_name(), expected_name, "get_name")

In [161]:
test_Word_get_name()

In [162]:
# set_idf のテスト
def test_Word_set_idf():
    """Check that a value stored with ``Word.set_idf`` is returned by ``Word.get_idf``.

    Raises:
        AssertionError: If the value read back differs from the value stored.
    """
    expected_idf = 0.3

    target = Word("apple")
    target.set_idf(expected_idf)

    is_same_idf = target.get_idf() == expected_idf
    assert is_same_idf, error_display(target.get_idf(), expected_idf, "idf")

In [163]:
test_Word_set_idf()

In [164]:
# calc_clustered_vector のテスト
def test_Word_calc_clustered_vector():
    """Check the length and the values of ``Word.calc_clustered_vector``.

    The layout expected by the assertions below is one block of ``len(vector)``
    entries per cluster, where entry ``i`` of block ``k`` equals
    ``idf * vector[i] * cluster_probability[k]``.

    Raises:
        AssertionError: If the length or any entry differs from the expectation.
    """
    name = "apple"
    idf = 0.3
    vector = np.array([2, 4, 2, 1])
    cluster_idx = 2
    cluster_probability = np.array([0.1, 0.7, 0.2])

    word = Word(name)
    word.set_vector(vector)
    word.set_idf(idf)
    word.set_cluster_idx(cluster_idx)
    word.set_cluster_probability(cluster_probability)

    clustered_vector = word.calc_clustered_vector()

    dimension_vector = vector.shape[0]
    expected_dimension = dimension_vector * cluster_probability.shape[0]
    assert clustered_vector.shape[0] == expected_dimension, error_display(
        clustered_vector.shape[0], expected_dimension, "dimension of clustered vector")

    # Check every entry of the concatenated vector.
    for idx_cluster_probability, prob in enumerate(cluster_probability):
        for idx_vector, value in enumerate(vector):
            test_value = clustered_vector[idx_vector + idx_cluster_probability * dimension_vector]
            true_value = idf * value * prob
            # Compare with a tolerance: the implementation may multiply the three
            # factors in a different order, which can change the last float bits.
            assert np.isclose(test_value, true_value), error_display(
                test_value, true_value, "value of clustered_vector")

In [165]:
test_Word_calc_clustered_vector()

### SCDV

In [170]:
# SCDVのコンストラクタのテスト
def test_SCDV_constract():
    """Check that the ``SCDV`` constructor stores its four positional arguments.

    The attributes checked are ``num_cluster``, ``random_seed``, ``threshold``
    and ``embedding_dimension``.

    Raises:
        AssertionError: If any attribute differs from the constructor argument.
    """
    num_cluster = 10
    random_seed = 1
    threshold = 0.1
    embedding_dimension = 300

    # Named `model` (not `scdv`) so the imported `scdv` module is not shadowed.
    model = SCDV(num_cluster, random_seed, threshold, embedding_dimension)

    assert model.num_cluster == num_cluster, "num_cluster " + error_display(model.num_cluster, num_cluster)
    # The label used to read "num_cluster " here, which misreported the failing field.
    assert model.random_seed == random_seed, "random_seed " + error_display(model.random_seed, random_seed)
    assert model.threshold == threshold, "threshold " + error_display(model.threshold, threshold)
    assert model.embedding_dimension == embedding_dimension, "embedding_dimension " + error_display(model.embedding_dimension, embedding_dimension)

In [171]:
test_SCDV_constract()

In [172]:
# remove_vocabulary のテスト
def test_SCDV_remove_vocabulary():
    """Check that ``SCDV.remove_vocabulary`` drops exactly the requested word.

    Raises:
        AssertionError: If the removed word is still reported by
            ``get_vocabulary`` or if any other word has disappeared.
    """
    lst_lst_word = [["a", "b"], ["c", "d", "e", ""], ["a"]]
    remove_word = "a"

    model = SCDV()
    model.set_vocabulary(lst_lst_word)
    model.remove_vocabulary(remove_word)

    # Unique words of the corpus, minus the one that was removed.
    lst_answer = list({word for lst_word in lst_lst_word for word in lst_word})
    lst_answer.remove(remove_word)

    # The removed word must be gone.
    assert remove_word not in model.get_vocabulary(), "removed word {0} is exist in vocab.".format(remove_word)

    # Every other word must still be present.
    for ans in lst_answer:
        assert ans in model.get_vocabulary(), "{0} is not exist in vocab.".format(ans)
        

In [173]:
test_SCDV_remove_vocabulary()

In [174]:
# set_vocabulary のテスト
def test_SCDV_set_vocabulary():
    """Check that ``SCDV.set_vocabulary`` registers every distinct input word once.

    Raises:
        AssertionError: If the vocabulary size differs from the number of
            distinct input words, or if an input word is missing.
    """
    lst_lst_word = [["a", "b"], ["c", "d", "e", ""], ["a"]]
    lst_word = [word for lst_input in lst_lst_word for word in lst_input]

    model = SCDV()
    model.set_vocabulary(lst_lst_word)

    # The vocabulary must hold one entry per distinct word.
    # The message reports the real sizes; it used to take len() of the
    # `set_vocabulary` method (a TypeError) and a hard-coded expected count.
    num_unique_word = len(set(lst_word))
    assert len(model.vocabulary) == num_unique_word, error_display(len(model.vocabulary), num_unique_word, "num of vocab")

    # Every input word must be registered; collect the names once, not per word.
    registered_names = {word_vocab.name for word_vocab in model.vocabulary}
    for word in lst_word:
        assert word in registered_names, "word " + word + " is not in vocabulary."

In [175]:
test_SCDV_set_vocabulary()

#### word2vec

In [178]:
# make_word2VecModel のテスト
def test_SCDV_make_word2VecModel():
    """Check the model built by ``SCDV.make_word2VecModel``.

    Every word returned by ``get_vocabulary`` must be retrievable from
    ``model.word2vec``, and the second axis of ``word2vec.wv.syn0`` must equal
    the requested embedding dimension.

    Raises:
        AssertionError: If a word has no vector or the dimension differs.
    """
    lst_lst_word = [["a", "b"], ["c", "d", "e", ""], ["a"]]
    embedding_dimension = 100

    model = SCDV(embedding_dimension=embedding_dimension)
    model.set_vocabulary(lst_lst_word)

    # Build the word2vec model.
    model.make_word2VecModel(lst_lst_word)

    vec_model = model.word2vec

    # Every vocabulary word must have a vector.
    for word in model.get_vocabulary():
        try:
            vec_model[word]
        except KeyError:
            # NOTE(review): assumes `model.word2vec` is a gensim Word2Vec model,
            # whose lookup raises KeyError for unknown words - confirm in scdv.py.
            # The previous `except NameError` could never match a failed lookup.
            raise AssertionError("Word '{0}' is not exist.".format(word))

    # The embedding dimension must match the requested one. The message now reads
    # the same attribute as the condition (`wv.syn0`, not `syn0`).
    test_dimension = vec_model.wv.syn0.shape[1]
    assert test_dimension == embedding_dimension, error_display(test_dimension, embedding_dimension, "embedding_dimension")
    

In [179]:
test_SCDV_make_word2VecModel()

Training word2Vec model...




In [182]:
# set_word2VecModel のテスト
def test_SCDV_set_word2VecModel():
    """Check that ``SCDV.set_word2Vec`` copies the model's vectors onto the ``Word`` objects.

    Every entry of ``model.vocabulary`` must expose a vector equal to the one
    ``model.word2vec`` returns for the same name. Words without a vector are
    expected to have been removed from the vocabulary already, so a failed
    lookup counts as a test failure.

    Raises:
        AssertionError: If a vector is missing or differs from the model's vector.
    """
    lst_lst_word = [["a", "b"], ["c", "d", "e", ""], ["a"]]

    model = SCDV()
    model.set_vocabulary(lst_lst_word)

    # Build the word2vec model, then push its vectors into the vocabulary.
    model.make_word2VecModel(lst_lst_word, min_word_count=2)
    model.set_word2Vec()

    vec_model = model.word2vec

    for word in model.vocabulary:
        try:
            word_vector = word.get_vector()
            model_vector = vec_model[word.get_name()]
        except (AttributeError, KeyError, NameError):
            # NOTE(review): which exception a missing vector raises is not visible
            # here; KeyError is what a gensim lookup raises, AttributeError is a
            # guess for an unset Word vector - confirm in scdv.py.
            raise AssertionError("Word vector '{0}' is not exist.".format(word.get_name()))

        # The comparison used to end in `.all` without the call parentheses; a
        # bound method is always truthy, so the assertion could never fail.
        assert np.array_equal(model_vector, word_vector), "'{0}' vector is different betweend the one of word2vec model and the other of class Word.".format(word.get_name())

In [183]:
test_SCDV_set_word2VecModel()

Training word2Vec model...




In [184]:
# get_word2vec のテスト
def test_SCDV_get_word2Vec():
    """Check the shape of the array returned by ``SCDV.get_word2Vec``.

    The first axis must equal the vocabulary size and the second axis the
    requested embedding dimension.

    Raises:
        AssertionError: If either axis has an unexpected length.
    """
    corpus = [["a", "b"], ["c", "d", "e", ""], ["a"]]
    embedding_dimension = 100

    model = SCDV(embedding_dimension=embedding_dimension)
    model.set_vocabulary(corpus)

    # Build the word2vec model, attach the vectors, then read them back.
    model.make_word2VecModel(corpus)
    model.set_word2Vec()
    word_vectors = model.get_word2Vec()

    assert word_vectors.shape[0] == len(model.get_vocabulary()), error_display(
        word_vectors.shape[0], len(model.get_vocabulary()), "num of word")
    assert word_vectors.shape[1] == embedding_dimension, error_display(
        word_vectors.shape[1], embedding_dimension, "embedding dimension")


In [185]:
test_SCDV_get_word2Vec()

Training word2Vec model...


#### clustering

In [186]:
# calc_cluster_probability のテスト
def test_SCDV_calc_cluster_probability():
    """Check the outputs of ``SCDV.calc_cluster_probability``.

    The probability matrix must have one column per cluster, and every
    returned cluster index must lie in ``[0, num_cluster)``.

    Raises:
        AssertionError: If the column count or any index is out of range.
    """
    lst_lst_word = [["a", "b"], ["c", "d", "e", ""], ["a"]]
    num_cluster = 3

    model = SCDV(num_cluster=num_cluster)
    model.set_vocabulary(lst_lst_word)

    # Build the word2vec model and attach the vectors.
    model.make_word2VecModel(lst_lst_word)
    model.set_word2Vec()

    # Build the clustering model.
    model.make_clusterModel()

    idx, idx_proba = model.calc_cluster_probability()

    # The message reports the column count; it used to pass the row idx_proba[1].
    assert idx_proba.shape[1] == num_cluster, error_display(idx_proba.shape[1], num_cluster, "num of cluster")
    # Negative indices are rejected as well as indices >= num_cluster.
    assert all(0 <= idx_cluster < num_cluster for idx_cluster in idx), "out of cluster num"

In [187]:
test_SCDV_calc_cluster_probability()

Training word2Vec model...
Training clustering model...


In [188]:
# set_cluster のテスト
def test_SCDV_set_cluster():
    """Check that ``SCDV.set_cluster`` stores the cluster results on each ``Word``.

    For the word at position ``idx`` of ``model.vocabulary`` the stored cluster
    index must equal ``idx_cluster[idx]`` and the stored membership
    probabilities must equal ``idx_proba[idx]``, where both arrays come from
    ``calc_cluster_probability``.

    Raises:
        AssertionError: If a stored index or probability vector differs.
    """
    lst_lst_word = [["a", "b"], ["c", "d", "e", ""], ["a"]]
    num_cluster = 3

    model = SCDV(num_cluster=num_cluster)
    model.set_vocabulary(lst_lst_word)

    # Build the word2vec model and attach the vectors.
    model.make_word2VecModel(lst_lst_word)
    model.set_word2Vec()

    # Build the clustering model; the getter is still called, but its result
    # was never used, so it is no longer bound to a name.
    model.make_clusterModel()
    model.get_clusterModel()
    model.set_cluster()
    idx_cluster, idx_proba = model.calc_cluster_probability()

    for idx, word in enumerate(model.vocabulary):
        assert word.get_cluster_idx() == idx_cluster[idx], error_display(word.get_cluster_idx(), idx_cluster[idx], "cluster id")
        # The old check compared the bound method itself to the whole matrix and
        # never called `.all`, so it was always truthy.
        # NOTE(review): assumes row `idx` of idx_proba belongs to this word,
        # mirroring the idx_cluster[idx] lookup above - confirm in scdv.py.
        assert np.allclose(word.get_cluster_probability(), idx_proba[idx]), "the probability of cluster membership is different for {0}".format(word.name)

In [189]:
test_SCDV_set_cluster()

Training word2Vec model...
Training clustering model...


#### idf

In [191]:
# calc_idf_by_word のテスト
def test_SCDV_calc_idf_by_word():
    """Check that ``SCDV.calc_idf_by_word`` only reports words of the vocabulary.

    Raises:
        AssertionError: If a returned feature name is not in ``get_vocabulary()``.
    """
    corpus = [["apple", "banana"], ["corch", "banana", "empty", ""], ["apple"]]

    model = SCDV()
    model.set_vocabulary(corpus)

    # Only the feature names are needed; the idf values themselves are ignored.
    idf_result = model.calc_idf_by_word(corpus)
    feature_names, _ = idf_result

    # Each word that received an idf value must exist in the vocabulary.
    for feature_name in feature_names:
        is_registered = feature_name in model.get_vocabulary()
        assert is_registered, "The idf value of {0} is not calculated.".format(feature_name)

In [192]:
test_SCDV_calc_idf_by_word()

In [195]:
# set_idf のテスト
def test_SCDV_set_idf():
    """Check that ``SCDV.set_idf`` stores the computed idf value on each ``Word``.

    Only vocabulary entries whose stored idf is non-zero are compared with the
    value returned by ``calc_idf_by_word`` for the same name.

    Raises:
        AssertionError: If a stored idf differs from the computed one.
        ValueError: If a word with a non-zero idf is absent from ``feature_names``.
    """
    lst_lst_word = [["apple", "banana"], ["corch", "banana", "empty", ""], ["apple", "I"]]

    model = SCDV()
    model.set_vocabulary(lst_lst_word)

    # Compute the idf values, then store them on the vocabulary.
    feature_names, idf = model.calc_idf_by_word(lst_lst_word)
    model.set_idf(feature_names, idf)

    # list() lets the lookup work whether feature_names is a list or an array.
    lst_feature_name = list(feature_names)

    for word in model.vocabulary:
        idf_word = word.get_idf()
        # Only check words that actually received an idf value.
        if idf_word != 0:
            true_idf = idf[lst_feature_name.index(word.get_name())]
            # The old check ended in `.all` without calling it, so it was
            # always truthy and could never fail.
            assert np.isclose(idf_word, true_idf), "The idf value of {0} is different.".format(word.get_name())

In [196]:
test_SCDV_set_idf()

#### clustered vector

In [197]:
# make_clustered_vector のテスト
def test_SCDV_make_clustered_vector():
    """Smoke-test ``SCDV.make_clustered_vector`` after the full preparation pipeline.

    The function only verifies that the calls run without raising; it makes
    no assertion about the resulting vectors.
    """
    corpus = [["apple", "banana"], ["corch", "banana", "empty", ""], ["apple"]]
    num_cluster = 3

    model = SCDV(num_cluster = num_cluster)
    model.set_vocabulary(corpus)

    # Word vectors.
    model.make_word2VecModel(corpus)
    model.set_word2Vec()

    # Cluster assignment.
    model.make_clusterModel()
    model.set_cluster()

    # idf values: compute, then store.
    vocab_terms, idf_values = model.calc_idf_by_word(corpus)
    model.set_idf(vocab_terms, idf_values)

    # Clustered vectors.
    model.make_clustered_vector()

    # TODO: assert that each word's clustered vector has the expected dimension.

In [198]:
test_SCDV_make_clustered_vector()

Training word2Vec model...
Training clustering model...


#### Document の平均スパースベクトル

In [230]:
# set_documentのテスト
def test_SCDV_set_document():
    """Check that ``SCDV.set_documents`` builds one document per input word list.

    A word removed from the vocabulary beforehand is expected to be absent
    from the documents; all remaining words must be present.

    Raises:
        AssertionError: If the document count, a document's word count or the
            presence of a word is not as expected.
    """
    lst_lst_word = [["apple", "banana"], ["corch", "banana", "empty", ""], ["apple"]]
    remove_word = "banana"

    model = SCDV()
    model.set_vocabulary(lst_lst_word)
    model.remove_vocabulary(remove_word)
    model.set_documents(lst_lst_word)

    # The number of documents must match (observed value first, expected second).
    assert len(lst_lst_word) == len(model.documents), "num of documents " + error_display(len(model.documents), len(lst_lst_word))

    # The words of each document must match.
    for idx, lst_word in enumerate(lst_lst_word):
        document = model.documents[idx]
        # Build the expected words in a new list instead of calling
        # lst_word.remove(): the fixture was already handed to the model, so
        # mutating it in place could change the data under test, and remove()
        # would only drop the first occurrence.
        lst_expected_word = [word for word in lst_word if word != remove_word]
        assert len(document.words) == len(lst_expected_word), "num of words " + error_display(len(document.words), len(lst_expected_word))

        for word in lst_expected_word:
            assert document.isExist_word(word), "{0} is not exist in document.".format(word)

In [231]:
test_SCDV_set_document()

In [246]:
# make_meanDocumentVector のテスト
def test_SCDV_make_meanDocumentVector():
    """Check the shape of the vectors produced by ``SCDV.make_meanDocumentVector``.

    Each document's mean word vector must be one-dimensional with
    ``num_cluster * embedding_dimension`` entries.

    Raises:
        AssertionError: If a mean vector has the wrong length or more than one axis.
    """
    lst_lst_word = [["apple", "banana"], ["corch", "banana", "empty", ""], ["apple"]]
    num_cluster = 3
    embedding_dimension = 100

    model = SCDV(num_cluster=num_cluster, embedding_dimension=embedding_dimension)
    model.set_vocabulary(lst_lst_word)

    # Word vectors.
    model.make_word2VecModel(lst_lst_word)
    model.set_word2Vec()

    # Cluster assignment.
    model.make_clusterModel()
    model.set_cluster()

    # idf values.
    feature_names, idf = model.calc_idf_by_word(lst_lst_word)
    model.set_idf(feature_names, idf)

    # Clustered word vectors.
    model.make_clustered_vector()

    # Documents and their mean vectors.
    model.set_documents(lst_lst_word)
    model.make_meanDocumentVector()

    # Check the dimension of every document vector.
    true_dimension = num_cluster * embedding_dimension
    for document in model.get_documents():
        mean_vector = document.get_meanWordVector()
        assert mean_vector.shape[0] == true_dimension, error_display(mean_vector.shape[0], true_dimension, "dimension of mean vector")
        # The old try / bare-except probe swallowed its own AssertionError, so a
        # vector with a second axis could never fail; check the rank directly.
        assert len(mean_vector.shape) == 1, "dimension is out of range"

In [247]:
test_SCDV_make_meanDocumentVector()

Training word2Vec model...
Training clustering model...


In [253]:
# Re-import: reload the scdv module and then re-run the from-imports so that
# SCDV, Word and Document are bound to the objects of the reloaded module.
importlib.reload(scdv)
from scdv import SCDV
from scdv import Word
from scdv import Document

In [254]:
# make_sparceDocumentVector のテスト
def test_SCDV_make_sparceDocumentVector():
    """Check the shape of the vectors produced by ``SCDV.make_sparceDocumentVector``.

    Each document's sparse mean vector must be one-dimensional with
    ``num_cluster * embedding_dimension`` entries.

    Raises:
        AssertionError: If a sparse vector has the wrong length or more than one axis.
    """
    lst_lst_word = [["apple", "banana"], ["corch", "banana", "empty", ""], ["apple"]]
    num_cluster = 3
    embedding_dimension = 100

    model = SCDV(num_cluster=num_cluster, embedding_dimension=embedding_dimension)
    model.set_vocabulary(lst_lst_word)

    # Word vectors.
    model.make_word2VecModel(lst_lst_word)
    model.set_word2Vec()

    # Cluster assignment.
    model.make_clusterModel()
    model.set_cluster()

    # idf values.
    feature_names, idf = model.calc_idf_by_word(lst_lst_word)
    model.set_idf(feature_names, idf)

    # Clustered word vectors.
    model.make_clustered_vector()

    # Documents, their mean vectors, then the sparse vectors.
    model.set_documents(lst_lst_word)
    model.make_meanDocumentVector()
    model.make_sparceDocumentVector()

    # Check the dimension of every sparse document vector.
    true_dimension = num_cluster * embedding_dimension
    for document in model.get_documents():
        # Inspect the sparse vector throughout; the message and the rank probe
        # used to read the mean vector instead.
        sparce_vector = document.get_sparceMeanVector()
        assert sparce_vector.shape[0] == true_dimension, error_display(sparce_vector.shape[0], true_dimension, "dimension of mean vector")
        # The old try / bare-except probe swallowed its own AssertionError, so a
        # vector with a second axis could never fail; check the rank directly.
        assert len(sparce_vector.shape) == 1, "dimension is out of range"

In [255]:
test_SCDV_make_sparceDocumentVector()

Training word2Vec model...
Training clustering model...
