# Pythonサンプルプログラム

In [1]:
# %pip install scikit-learn nltk
# import nltk
# nltk.download('stopwords')

In [2]:
def filter_information(documents, search_words):
    """Rank documents by TF-IDF cosine similarity to a query string.

    A TF-IDF vocabulary is fitted on ``documents`` (with NLTK's English stop
    words removed), the query is projected into the same space, and every
    document is scored by cosine similarity against the query.

    Args:
        documents: Iterable of document strings to rank.
        search_words: Query string; it is vectorized as a single document.

    Returns:
        A list of ``(score, document)`` tuples ordered from highest to lowest
        score. Documents with equal scores keep their input order. An empty
        list is returned when ``documents`` is empty.

    Raises:
        LookupError: If the NLTK ``stopwords`` corpus has not been downloaded.
        ValueError: If the fitted vocabulary is empty, e.g. every document
            consists only of stop words (raised by ``TfidfVectorizer``).
    """
    # Materialize once so generators and other iterables can be indexed below.
    documents = list(documents)
    if not documents:
        # TfidfVectorizer cannot be fitted on an empty corpus; nothing to rank.
        return []

    import numpy as np
    from nltk.corpus import stopwords
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # TF-IDF vectorizer that ignores English stop words.
    stop_words = stopwords.words('english')
    vectorizer = TfidfVectorizer(stop_words=stop_words)

    # Fit on the corpus, then map the query into the same feature space.
    tfidf_matrix = vectorizer.fit_transform(documents)
    search_words_vector = vectorizer.transform([search_words])

    # Score every document against the query and sort in descending order.
    # A stable sort keeps tied documents in their original order.
    cosine_similarities = cosine_similarity(search_words_vector, tfidf_matrix).flatten()
    sorted_indices = np.argsort(-cosine_similarities, kind="stable")

    # Return the results as (score, document) pairs.
    return [(cosine_similarities[i], documents[i]) for i in sorted_indices]

In [3]:
# 情報検索

def run_search_documents():
    """Run the document-search demo and print the ranked results.

    Ranks three sample sentences against the query ``"quick dog"`` using
    ``filter_information`` and prints one ``Score: ..., Document: ...`` line
    per document, best match first.
    """
    print("# 情報検索")

    # Sample documents.
    documents = [
        "The quick brown fox jumps over the lazy dog.",
        "Never jump over the lazy dog quickly.",
        "A quick brown dog outpaces a quick fox.",
    ]

    # Search query.
    query = "quick dog"

    # Run the search.
    results = filter_information(documents, query)

    # Display the results.
    for score, doc in results:
        print(f"Score: {score:.4f}, Document: {doc}")

# # Test
# run_search_documents()


In [4]:
# 情報フィルタリング

def run_information_filtering():
    """Run the information-filtering demo and print the ranked results.

    Ranks six sample news headlines against the interest profile
    ``"AI technology and economic policies"`` using ``filter_information`` and
    prints one ``Score: ..., Document: ...`` line per headline, best match
    first.
    """
    print("# 情報フィルタリング")

    # Sample news articles.
    documents = [
        "The stock market is experiencing unprecedented growth.",
        "Local football team wins the championship.",
        "New advancements in AI technology are transforming the industry.",
        "The government announces new economic policies.",
        "Celebrity couple announces their engagement.",
        "New study reveals health benefits of a balanced diet.",
    ]

    # Search words describing the topics of interest.
    search_word = "AI technology and economic policies"

    # Run the information filtering.
    results = filter_information(documents, search_word)

    # Display the results.
    for score, doc in results:
        print(f"Score: {score:.4f}, Document: {doc}")

# # Test
# run_information_filtering()


In [5]:
def main():
    """Run both demos in order: document search, then information filtering."""
    demos = (
        run_search_documents,        # document search
        run_information_filtering,   # information filtering
    )
    for demo in demos:
        demo()


In [6]:
# Run the demos only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

# 情報検索
Score: 0.7123, Document: A quick brown dog outpaces a quick fox.
Score: 0.5032, Document: The quick brown fox jumps over the lazy dog.
Score: 0.1828, Document: Never jump over the lazy dog quickly.
# 情報フィルタリング
Score: 0.4908, Document: The government announces new economic policies.
Score: 0.4272, Document: New advancements in AI technology are transforming the industry.
Score: 0.0000, Document: The stock market is experiencing unprecedented growth.
Score: 0.0000, Document: Local football team wins the championship.
Score: 0.0000, Document: Celebrity couple announces their engagement.
Score: 0.0000, Document: New study reveals health benefits of a balanced diet.
