In [22]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Document Term Matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()
# print(X.shape)

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [23]:
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [28]:
# # Term Freqeuncy
# tf = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names_out())
# tf
# Term Freqeuncy
tf = pd.DataFrame([list(X.toarray())]).transpose()
tf

Unnamed: 0,0
0,"[0.0, 0.46979138557992045, 0.5802858236844359,..."
1,"[0.0, 0.6876235979836938, 0.0, 0.2810886740337..."
2,"[0.511848512707169, 0.0, 0.0, 0.26710378764216..."
3,"[0.0, 0.46979138557992045, 0.5802858236844359,..."


In [4]:
# Document Frequency 
df = tf.astype(bool).sum(axis = 0)
df

and         1
document    3
first       2
is          4
one         1
second      1
the         4
third       1
this        4
dtype: int64

In [14]:
# 문서 개수
D = len(tf)

# Inverse Document Frequency
idf = np.log((D) / (df+1))
idf

and         0.693147
document    0.000000
first       0.287682
is         -0.223144
one         0.693147
second      0.693147
the        -0.223144
third       0.693147
this       -0.223144
dtype: float64

In [16]:
# TF-IDF
tfidf = tf * idf                      
tfidf = tfidf / np.linalg.norm(tfidf, axis = 1, keepdims = True)
tfidf

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.0,0.74728,-0.383655,0.0,0.0,-0.383655,0.0,-0.383655
1,0.0,0.0,0.0,-0.161306,0.0,0.960178,-0.161306,0.0,-0.161306
2,0.569372,0.0,0.0,-0.095652,0.569372,0.0,-0.095652,0.569372,-0.095652
3,0.0,0.0,0.74728,-0.383655,0.0,0.0,-0.383655,0.0,-0.383655


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 예시 텍스트 데이터
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    "The last document is here."
]

# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# 특정 문서의 TF-IDF 벡터 얻기
query = "This is the second document."
query_vector = tfidf_vectorizer.transform([query])

# 코사인 유사도 계산
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

# 유사도가 높은 순으로 정렬된 인덱스 얻기
sorted_indices = cosine_similarities.argsort()[::-1]

# 상위 몇 개의 문서를 출력
top_k = 3
for i in range(top_k):
    print(f"Similarity with Document {sorted_indices[i]}: {cosine_similarities[sorted_indices[i]]:.2f}")
    print(f"Document {sorted_indices[i]}: {documents[sorted_indices[i]]}\n")


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# 예시 텍스트 데이터
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    "The last document is here."
]

# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# TF-IDF 벡터를 numpy 배열로 변환하여 각 문서의 임베딩 얻기
document_embeddings = np.array(tfidf_matrix.toarray())

# 결과 출력
for i, embedding in enumerate(document_embeddings):
    print(f"Document {i} Embedding:")
    print(embedding)
    print()


Document 0 Embedding:
[0.         0.42712001 0.6116585  0.         0.36125537 0.
 0.         0.         0.36125537 0.         0.42712001]

Document 1 Embedding:
[0.         0.64612571 0.         0.         0.2732445  0.
 0.         0.57343426 0.2732445  0.         0.32306286]

Document 2 Embedding:
[0.51492278 0.         0.         0.         0.24536346 0.
 0.51492278 0.         0.24536346 0.51492278 0.29009851]

Document 3 Embedding:
[0.         0.42712001 0.6116585  0.         0.36125537 0.
 0.         0.         0.36125537 0.         0.42712001]

Document 4 Embedding:
[0.         0.33841126 0.         0.60067757 0.28622608 0.60067757
 0.         0.         0.28622608 0.         0.        ]

