In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import ngrams

In [2]:
main_data = pd.read_csv('data/main_data.csv')
main_data[['CTGRY_ONE_NM', 'CTGRY_TWO_NM', 'CTGRY_THREE_NM']].head()

Unnamed: 0,CTGRY_ONE_NM,CTGRY_TWO_NM,CTGRY_THREE_NM
0,문화관광/명소,관광지,테마공원/대형놀이공원
1,문화관광/명소,관광지,테마공원/대형놀이공원
2,문화관광/명소,관광지,테마공원/대형놀이공원
3,문화관광/명소,관광지,테마공원/대형놀이공원
4,문화관광/명소,관광지,테마공원/대형놀이공원


In [3]:
# 데이터 준비/로드

category_data = {
    'CTGRY_ONE_NM' : main_data['CTGRY_ONE_NM'],
    'CTGRY_TWO_NM' : main_data['CTGRY_TWO_NM'],
    'CTGRY_THREE_NM' : main_data['CTGRY_THREE_NM'],
}
category_df = pd.DataFrame(category_data)
category_df.head()

Unnamed: 0,CTGRY_ONE_NM,CTGRY_TWO_NM,CTGRY_THREE_NM
0,문화관광/명소,관광지,테마공원/대형놀이공원
1,문화관광/명소,관광지,테마공원/대형놀이공원
2,문화관광/명소,관광지,테마공원/대형놀이공원
3,문화관광/명소,관광지,테마공원/대형놀이공원
4,문화관광/명소,관광지,테마공원/대형놀이공원


In [4]:
# 카테고리 결합
category_df['combined_ctgry'] = category_df['CTGRY_ONE_NM'] + '/' +category_df['CTGRY_TWO_NM'] + '/' + category_df['CTGRY_THREE_NM']
category_df['combined_ctgry'].head()

0    문화관광/명소/관광지/테마공원/대형놀이공원
1    문화관광/명소/관광지/테마공원/대형놀이공원
2    문화관광/명소/관광지/테마공원/대형놀이공원
3    문화관광/명소/관광지/테마공원/대형놀이공원
4    문화관광/명소/관광지/테마공원/대형놀이공원
Name: combined_ctgry, dtype: object

In [5]:
# 유사도 계산
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF 벡터화
vectorizer = TfidfVectorizer(ngram_range=(2, 4), max_features=500) # 객체 생성
tfidf_matrix = vectorizer.fit_transform(category_df['combined_ctgry'])

# cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [6]:
# 결과

# 유사도 매트릭스 -> data frame
cosine_sim_df = pd.DataFrame(cosine_sim, index=main_data['FCLTY_NM'], columns=main_data['FCLTY_NM'])
cosine_sim_df.head()

FCLTY_NM,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,...,화조원,훈데르트바서파크,훈데르트바서파크,훈데르트바서파크,훈데르트바서파크,ICC제주국제컨벤션센터,ICC제주국제컨벤션센터,ICC제주국제컨벤션센터,ICC제주국제컨벤션센터,ICC제주국제컨벤션센터
FCLTY_NM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9.81파크,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.058658,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9.81파크,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.058658,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9.81파크,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.058658,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9.81파크,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.058658,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9.81파크,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.058658,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# 한글 폰트
import matplotlib
import matplotlib.font_manager as fm

font_path = 'C:\\Windows\\Fonts\\gulim.ttc'
font = fm.FontProperties(fname=font_path).get_name()
matplotlib.rc('font', family=font) # 폰트 등록

In [12]:
display(cosine_sim_df)

FCLTY_NM,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,9.81파크,...,화조원,훈데르트바서파크,훈데르트바서파크,훈데르트바서파크,훈데르트바서파크,ICC제주국제컨벤션센터,ICC제주국제컨벤션센터,ICC제주국제컨벤션센터,ICC제주국제컨벤션센터,ICC제주국제컨벤션센터
FCLTY_NM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9.81파크,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.058658,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9.81파크,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.058658,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9.81파크,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.058658,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9.81파크,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.058658,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9.81파크,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.058658,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ICC제주국제컨벤션센터,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
ICC제주국제컨벤션센터,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
ICC제주국제컨벤션센터,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
ICC제주국제컨벤션센터,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# 'category_df'와 'main_data'를 병합하여 장소 정보를 추가
category_df = category_df.merge(main_data[['FCLTY_NM']], how='left', left_on='CTGRY_ONE_NM', right_on='FCLTY_NM')
# 
# # 또는 'ctgry_two'와 'ctgry_three'에 대해서도 같은 방식으로 병합
# category_df = category_df.merge(main_data[['FCLTY_NM']], how='left', left_on='CTGRY_TWO_NM', right_on='장소', suffixes=('_ctgry_one', '_ctgry_two'))
# category_df = category_df.merge(main_data[['FCLTY_NM']], how='left', left_on='ctgry_three', right_on='장소', suffixes=('_ctgry_two', '_ctgry_three'))

category_df.head()

Unnamed: 0,CTGRY_ONE_NM,CTGRY_TWO_NM,CTGRY_THREE_NM,combined_ctgry,FCLTY_NM
0,문화관광/명소,관광지,테마공원/대형놀이공원,문화관광/명소/관광지/테마공원/대형놀이공원,
1,문화관광/명소,관광지,테마공원/대형놀이공원,문화관광/명소/관광지/테마공원/대형놀이공원,
2,문화관광/명소,관광지,테마공원/대형놀이공원,문화관광/명소/관광지/테마공원/대형놀이공원,
3,문화관광/명소,관광지,테마공원/대형놀이공원,문화관광/명소/관광지/테마공원/대형놀이공원,
4,문화관광/명소,관광지,테마공원/대형놀이공원,문화관광/명소/관광지/테마공원/대형놀이공원,


In [9]:
# 상위 5개 유사도 높은 항목을 출력
import numpy as np

# 각 행에 대해 가장 유사한 항목 출력
for i in range(len(cosine_sim)):
    similar_indices = cosine_sim[i].argsort()[-6:-1][::-1]  # 가장 유사한 5개 항목
    print(f"유사한 항목 {main_data['FCLTY_NM'][i]}와 유사한 항목들:")
    for idx in similar_indices:
        print(f"- {main_data['FCLTY_NM'][idx]}: 유사도 {cosine_sim[i][idx]:.2f}")
    print("\n")


유사한 항목 9.81파크와 유사한 항목들:
- 고흐의정원: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00


유사한 항목 9.81파크와 유사한 항목들:
- 고흐의정원: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00


유사한 항목 9.81파크와 유사한 항목들:
- 고흐의정원: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00


유사한 항목 9.81파크와 유사한 항목들:
- 고흐의정원: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00


유사한 항목 9.81파크와 유사한 항목들:
- 고흐의정원: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00


유사한 항목 9.81파크와 유사한 항목들:
- 고흐의정원: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00


유사한 항목 9.81파크와 유사한 항목들:
- 고흐의정원: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00


유사한 항목 9.81파크와 유사한 항목들:
- 고흐의정원: 유사도 1.00
- 제주코코몽에코파크: 유사도 1.00
- 제주코코몽에코파크: