# Vector 거리 계산

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances, cosine_similarity

# Toy corpus: four short documents built from the same two words,
# differing only in how many times each word is repeated.
corpus = [
    "Hello World",
    "Hello World World",
    "Hello Hello World",
    "Hello Hello World World",
]
# One label per document, in corpus order: Doc1 .. Doc4.
doc_names = [f"Doc{position}" for position in range(1, len(corpus) + 1)]

# Bag-of-words: learn the vocabulary from the corpus, then turn every
# document into a vector of per-term occurrence counts.
bow_vectorizer = CountVectorizer()
bow_vectorizer.fit(corpus)
bow_matrix = bow_vectorizer.transform(corpus)

# Show the dense count matrix (one row per document) and the vocabulary
# terms that the columns correspond to.
print("BoW 행렬:\n", bow_matrix.toarray())
print("BoW 특성 이름:\n", bow_vectorizer.get_feature_names_out())

BoW 행렬:
 [[1 1]
 [1 2]
 [2 1]
 [2 2]]
BoW 특성 이름:
 ['hello' 'world']


In [2]:
# Pairwise Manhattan (L1) distances between the document count vectors.
manhattan_dist = manhattan_distances(bow_matrix)

print("\n맨하탄 거리:")
# Header line: an 8-character gutter (the width of the row-label column)
# followed by each document name right-aligned in a 10-character column.
print(" " * 8 + "".join(f"{label:>10}" for label in doc_names))
# Body: one line per document -- its name in the gutter, then its distance
# to every document, formatted with two decimals.
for row_label, row in zip(doc_names, manhattan_dist):
    print(f"{row_label:>8}" + "".join(f"{value:10.2f}" for value in row))


맨하탄 거리:
              Doc1      Doc2      Doc3      Doc4
    Doc1      0.00      1.00      1.00      2.00
    Doc2      1.00      0.00      2.00      1.00
    Doc3      1.00      2.00      0.00      1.00
    Doc4      2.00      1.00      1.00      0.00


In [3]:
# Pairwise Euclidean (L2) distances between the document count vectors.
euclidean_dist = euclidean_distances(bow_matrix)

print("\n유클리드 거리:")
# Header line: an 8-character gutter (the width of the row-label column)
# followed by each document name right-aligned in a 10-character column.
print(" " * 8 + "".join(f"{label:>10}" for label in doc_names))
# Body: one line per document -- its name in the gutter, then its distance
# to every document, formatted with two decimals.
for row_label, row in zip(doc_names, euclidean_dist):
    print(f"{row_label:>8}" + "".join(f"{value:10.2f}" for value in row))


유클리드 거리:
              Doc1      Doc2      Doc3      Doc4
    Doc1      0.00      1.00      1.00      1.41
    Doc2      1.00      0.00      1.41      1.00
    Doc3      1.00      1.41      0.00      1.00
    Doc4      1.41      1.00      1.00      0.00


In [4]:
# Cosine distance is defined here as 1 - cosine similarity.
cosine_sim = cosine_similarity(bow_matrix)
# Floating-point round-off can leave the similarity of two parallel vectors
# (e.g. a document with itself) marginally above 1, which would make the
# distance a tiny negative number and print as "-0.00". Clamp at zero so
# the result is always a valid, non-negative distance.
cosine_dist = (1 - cosine_sim).clip(min=0.0)

print("\n코사인 거리:")
# Header line: an 8-character gutter (the width of the row-label column)
# followed by each document name right-aligned in a 10-character column.
print(" " * 8 + "".join(f"{label:>10}" for label in doc_names))
# Body: one line per document -- its name in the gutter, then its distance
# to every document, formatted with two decimals.
for row_label, row in zip(doc_names, cosine_dist):
    print(f"{row_label:>8}" + "".join(f"{value:10.2f}" for value in row))


코사인 거리:
              Doc1      Doc2      Doc3      Doc4
    Doc1      0.00      0.05      0.05      0.00
    Doc2      0.05      0.00      0.20      0.05
    Doc3      0.05      0.20      0.00      0.05
    Doc4      0.00      0.05      0.05      0.00
