# Pythonサンプルプログラム

In [1]:
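# %pip install scikit-learn nltk gensim transformers torch numpy pandas

# 必要なNLTKリソースをダウンロード
# （stopwords はTF-IDFのセル、punkt / punkt_tab は word_tokenize を使うセルで必要）
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab')  # NLTK 3.9 以降の word_tokenize で必要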

In [2]:
# 情報検索

def run_search_documents():
    """Run the TF-IDF document-search demo and print the ranked results.

    Builds a TF-IDF index over a small hard-coded list of English sentences,
    scores every sentence against a hard-coded query with cosine similarity
    and prints one ``Score: ..., Document: ...`` line per sentence, best
    match first.

    Requires scikit-learn, numpy and the NLTK ``stopwords`` corpus.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np
    from nltk.corpus import stopwords

    def search_documents(documents, search_words):
        """Rank ``documents`` by TF-IDF cosine similarity to ``search_words``.

        Args:
            documents: Sequence of document strings to index.
            search_words: Query string.

        Returns:
            List of ``(score, document)`` tuples sorted by descending score.
            Documents with equal scores keep their original relative order.
            An empty ``documents`` sequence yields an empty list.
        """
        # Nothing to index or rank; return early instead of letting the
        # vectorizer fail on an empty corpus.
        if len(documents) == 0:
            return []

        # TF-IDF vectorizer that ignores NLTK's English stop words.
        stop_words = stopwords.words('english')
        vectorizer = TfidfVectorizer(stop_words=stop_words)

        # Fit on the corpus, then project the query into the same space.
        tfidf_matrix = vectorizer.fit_transform(documents)
        search_words_vector = vectorizer.transform([search_words])

        # One similarity score per document.
        cosine_similarities = cosine_similarity(search_words_vector, tfidf_matrix).flatten()

        # Negate for descending order; a stable sort keeps tied documents
        # (typically many 0.0 scores) in input order on every run.
        sorted_indices = np.argsort(-cosine_similarities, kind="stable")

        # Return the results as (score, document) pairs.
        return [(cosine_similarities[i], documents[i]) for i in sorted_indices]

    print("# 情報検索")

    # Sample documents.
    documents = [
        "The stock market is experiencing unprecedented growth.",
        "Local football team wins the championship.",
        "New advancements in AI technology are transforming the industry.",
        "The government announces new economic policies.",
        "Celebrity couple announces their engagement.",
        "New study reveals health benefits of a balanced diet."
    ]

    # Search query.
    query = "AI technology"

    # Run the search.
    results = search_documents(documents, query)

    # Print the results.
    for score, doc in results:
        print(f"Score: {score:.4f}, Document: {doc}")

# # Test
# run_search_documents()


In [3]:
# 情報フィルタリング

def run_information_filtering():
    """Run the TF-IDF information-filtering demo and print the ranked results.

    Scores a small hard-coded list of news-style sentences against a
    hard-coded multi-topic query using TF-IDF vectors and cosine similarity,
    then prints one ``Score: ..., Document: ...`` line per sentence, best
    match first.

    Requires scikit-learn, numpy and the NLTK ``stopwords`` corpus.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np
    from nltk.corpus import stopwords

    def filter_information(documents, search_words):
        """Rank ``documents`` by TF-IDF cosine similarity to ``search_words``.

        Args:
            documents: Sequence of document strings to index.
            search_words: Query string describing the topics of interest.

        Returns:
            List of ``(score, document)`` tuples sorted by descending score.
            Documents with equal scores keep their original relative order.
            An empty ``documents`` sequence yields an empty list.
        """
        # Nothing to index or rank; return early instead of letting the
        # vectorizer fail on an empty corpus.
        if len(documents) == 0:
            return []

        # TF-IDF vectorizer that ignores NLTK's English stop words.
        stop_words = stopwords.words('english')
        vectorizer = TfidfVectorizer(stop_words=stop_words)

        # Fit on the corpus, then project the query into the same space.
        tfidf_matrix = vectorizer.fit_transform(documents)
        search_words_vector = vectorizer.transform([search_words])

        # One similarity score per document.
        cosine_similarities = cosine_similarity(search_words_vector, tfidf_matrix).flatten()

        # Negate for descending order; a stable sort keeps tied documents
        # (typically many 0.0 scores) in input order on every run.
        sorted_indices = np.argsort(-cosine_similarities, kind="stable")

        # Return the results as (score, document) pairs.
        return [(cosine_similarities[i], documents[i]) for i in sorted_indices]

    print("# 情報フィルタリング（TF-IDF）")

    # Sample news articles.
    documents = [
        "The stock market is experiencing unprecedented growth.",
        "Local football team wins the championship.",
        "New advancements in AI technology are transforming the industry.",
        "The government announces new economic policies.",
        "Celebrity couple announces their engagement.",
        "New study reveals health benefits of a balanced diet."
    ]

    # Search query.
    search_word = "AI technology and economic policies"

    # Run the filtering.
    results = filter_information(documents, search_word)

    # Print the results.
    for score, doc in results:
        print(f"Score: {score:.4f}, Document: {doc}")

# # Test
# run_information_filtering()


In [4]:
# 情報フィルタリング

def run_information_filtering_word2vec():
    """Run the Word2Vec information-filtering demo and print ranked results.

    Trains a small Word2Vec model on the hard-coded sample sentences,
    represents each sentence (and the query) as the mean of its word vectors,
    and prints one ``Score: ..., Document: ...`` line per sentence ordered by
    descending cosine similarity to the query.

    Requires gensim, scikit-learn, numpy and the NLTK tokenizer data used by
    ``word_tokenize``.
    """
    from gensim.models import Word2Vec
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np
    from nltk.tokenize import word_tokenize

    def tokenize(text):
        """Lower-case ``text`` and return its alphanumeric word tokens."""
        return [token for token in word_tokenize(text.lower()) if token.isalnum()]

    def documents_to_vectors(documents, model):
        """Convert each document to the mean of its in-vocabulary word vectors.

        Documents with no in-vocabulary word are mapped to a zero vector of
        length ``model.vector_size``.

        Returns:
            A numpy array with one row per document.
        """
        vectors = []
        for document in documents:
            word_vectors = [
                model.wv[word]
                for word in tokenize(document)
                if word in model.wv.key_to_index
            ]
            if word_vectors:
                vectors.append(np.mean(word_vectors, axis=0))
            else:
                vectors.append(np.zeros(model.vector_size))
        return np.array(vectors)

    def filter_information(documents, search_words, model):
        """Rank ``documents`` by cosine similarity to ``search_words``.

        Args:
            documents: Sequence of document strings.
            search_words: Query string.
            model: Trained Word2Vec model supplying the word vectors.

        Returns:
            List of ``(score, document)`` tuples sorted by descending score.
            Documents with equal scores keep their original relative order.
            An empty ``documents`` sequence yields an empty list.
        """
        # An empty corpus would produce a 1-D empty array that
        # cosine_similarity cannot accept.
        if len(documents) == 0:
            return []

        # Convert the documents to vectors.
        doc_vectors = documents_to_vectors(documents, model)

        # Convert the query to a vector (a one-row matrix).
        search_words_vector = documents_to_vectors([search_words], model)

        # One similarity score per document.
        cosine_similarities = cosine_similarity(search_words_vector, doc_vectors).flatten()

        # Negate for descending order; stable sort keeps ties in input order.
        sorted_indices = np.argsort(-cosine_similarities, kind="stable")

        # Return the results as (score, document) pairs.
        return [(cosine_similarities[i], documents[i]) for i in sorted_indices]

    print("# 情報フィルタリング（Word2Vec）")

    # Sample news articles.
    documents = [
        "The stock market is experiencing unprecedented growth.",
        "Local football team wins the championship.",
        "New advancements in AI technology are transforming the industry.",
        "The government announces new economic policies.",
        "Celebrity couple announces their engagement.",
        "New study reveals health benefits of a balanced diet."
    ]

    # Search query.
    search_word = "AI technology and economic policies"

    # Train the Word2Vec model on the sample corpus, tokenized exactly the
    # same way as at lookup time so every trained token can be looked up.
    tokenized_documents = [tokenize(doc) for doc in documents]

    # A fixed seed plus a single worker thread makes training repeatable
    # within one interpreter session; gensim documents that reproducibility
    # across interpreter launches additionally needs PYTHONHASHSEED to be set.
    model = Word2Vec(
        tokenized_documents,
        vector_size=100,
        window=5,
        min_count=1,
        workers=1,
        seed=42,
    )

    # Run the filtering.
    results = filter_information(documents, search_word, model)

    # Print the results.
    for score, doc in results:
        print(f"Score: {score:.4f}, Document: {doc}")

# # Test
# run_information_filtering_word2vec()

In [5]:
# 情報フィルタリング

def run_information_filtering_bert():
    """Run the BERT information-filtering demo and print ranked results.

    Loads the pretrained ``bert-base-uncased`` tokenizer and model, embeds
    each hard-coded sample sentence (and the query) as the hidden state of
    its first token, and prints one ``Score: ..., Document: ...`` line per
    sentence ordered by descending cosine similarity to the query.

    Requires transformers, torch, scikit-learn and numpy.
    """
    from transformers import BertTokenizer, BertModel
    import torch
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np

    # Load the BERT model and tokenizer.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    # Inference only: make evaluation mode explicit so dropout is disabled
    # regardless of how the model was loaded.
    model.eval()

    def documents_to_vectors(documents):
        """Embed each document as the last-layer vector of its first token.

        Each document is encoded on its own (inputs truncated to 512 tokens)
        with gradient tracking disabled.

        Returns:
            A numpy array with one row per document.
        """
        vectors = []
        for document in documents:
            inputs = tokenizer(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            # Use the vector at sequence position 0 (the [CLS] token).
            cls_vector = outputs.last_hidden_state[:, 0, :].numpy()
            vectors.append(cls_vector.flatten())
        return np.array(vectors)

    def filter_information(documents, search_words):
        """Rank ``documents`` by cosine similarity to ``search_words``.

        Args:
            documents: Sequence of document strings.
            search_words: Query string.

        Returns:
            List of ``(score, document)`` tuples sorted by descending score.
            Documents with equal scores keep their original relative order.
            An empty ``documents`` sequence yields an empty list.
        """
        # An empty corpus would produce a 1-D empty array that
        # cosine_similarity cannot accept.
        if len(documents) == 0:
            return []

        # Convert the documents to vectors.
        doc_vectors = documents_to_vectors(documents)

        # Convert the query to a vector (a one-row matrix).
        search_words_vector = documents_to_vectors([search_words])

        # One similarity score per document.
        cosine_similarities = cosine_similarity(search_words_vector, doc_vectors).flatten()

        # Negate for descending order; stable sort keeps ties in input order.
        sorted_indices = np.argsort(-cosine_similarities, kind="stable")

        # Return the results as (score, document) pairs.
        return [(cosine_similarities[i], documents[i]) for i in sorted_indices]

    print("# 情報フィルタリング（BERT）")

    # Sample news articles.
    documents = [
        "The stock market is experiencing unprecedented growth.",
        "Local football team wins the championship.",
        "New advancements in AI technology are transforming the industry.",
        "The government announces new economic policies.",
        "Celebrity couple announces their engagement.",
        "New study reveals health benefits of a balanced diet."
    ]

    # Search query.
    search_word = "AI technology and economic policies"

    # Run the filtering.
    results = filter_information(documents, search_word)

    # Print the results.
    for score, doc in results:
        print(f"Score: {score:.4f}, Document: {doc}")

# # Test
# run_information_filtering_bert()

In [6]:
def main():
    """Run the four demos in order: TF-IDF search, then TF-IDF, Word2Vec and BERT filtering.

    Each call prints its own heading and ranked results; nothing is returned.
    """

    # Information retrieval (TF-IDF search)
    run_search_documents()

    # Information filtering (TF-IDF)
    run_information_filtering()

    # Information filtering (Word2Vec)
    run_information_filtering_word2vec()

    # Information filtering (BERT)
    run_information_filtering_bert()

In [7]:
# Entry point: run all demos via main() when this code is executed as the
# main module; the guard skips the call if the code is imported instead.
if __name__ == "__main__":
    main()

# 情報検索
Score: 0.6042, Document: New advancements in AI technology are transforming the industry.
Score: 0.0000, Document: The stock market is experiencing unprecedented growth.
Score: 0.0000, Document: Local football team wins the championship.
Score: 0.0000, Document: The government announces new economic policies.
Score: 0.0000, Document: Celebrity couple announces their engagement.
Score: 0.0000, Document: New study reveals health benefits of a balanced diet.
# 情報フィルタリング（TF-IDF）
Score: 0.4908, Document: The government announces new economic policies.
Score: 0.4272, Document: New advancements in AI technology are transforming the industry.
Score: 0.0000, Document: The stock market is experiencing unprecedented growth.
Score: 0.0000, Document: Local football team wins the championship.
Score: 0.0000, Document: Celebrity couple announces their engagement.
Score: 0.0000, Document: New study reveals health benefits of a balanced diet.
# 情報フィルタリング（Word2Vec）
Score: 0.5785, Document: The go