In [126]:
import pandas as pd
import numpy as np
import nltk
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [127]:
# Load the product catalogue and drop every row that has a NaN in any column.
# reset_index(drop=True) renumbers the surviving rows 0..n-1 so that row labels
# equal row positions; the ranking cell further down looks rows up with
# positional indices produced by argsort, which would otherwise hit the wrong
# row (or raise KeyError) whenever dropna removed something.
data = pd.read_csv('yoitdata_shoes-boots.csv').dropna().reset_index(drop=True)

# Sanity check: the text column used for training must not contain nulls.
num_null = data['item_name'].isnull().sum()
print(f"Number of null values in 'item_name': {num_null}")

Number of null values in 'item_name': 0


In [128]:
# Verify that no column still contains NaN values.
has_null_by_column = data.isnull().any()  # boolean Series, one entry per column
null_cols = data.columns[has_null_by_column]
print(f"Columns with null values: {null_cols}")

Columns with null values: Index([], dtype='object')


In [129]:
# Drop every row that holds a NaN in any column.
# NOTE(review): the load cell already calls dropna, so this is expected to be a no-op.
data.dropna(axis=0, how='any', inplace=True)

In [130]:
# Preprocessing: lower-case each product name and split it into NLTK word tokens.
# The result is a list of token lists, one per row of `data`, in row order.
item_name = []
for name in data['item_name']:
    item_name.append(nltk.word_tokenize(name.lower()))



In [131]:
# Train the Word2Vec model on the tokenized product names.
model = Word2Vec(
    sentences=item_name,
    vector_size=100,  # dimensionality of each word vector
    window=5,
    min_count=5,      # words seen fewer than 5 times are left out of the vocabulary
    workers=4,
)


In [132]:
# Preprocess the query sentence exactly like the corpus:
# lower-case first, then NLTK word tokenization.
input_sentence = (
    'Rugged Flex 6 Inch Waterproof Composite Toe EH Work Boots Work Shoes Beige- Mens- Size 10.5 2E'
)
input_words = nltk.word_tokenize(str.lower(input_sentence))


In [133]:
# Word2Vec representation of the query: the mean of the vectors of all query
# tokens that are in the model vocabulary (tokens outside it are skipped).
input_word_vectors = []
for word in input_words:
    if word in model.wv:
        word_vector = model.wv[word]  # look the vector up once
        if not np.isnan(word_vector).any():
            input_word_vectors.append(word_vector)

# Averaging an empty list would only produce NaN plus a RuntimeWarning and make
# the similarity cell fail later with an unrelated-looking error, so stop here
# with a clear message instead.
if not input_word_vectors:
    raise ValueError(
        'None of the input words are in the Word2Vec vocabulary; '
        'cannot build an input vector.'
    )

# Shape (1, vector_size): cosine_similarity expects a 2-D array.
input_vector = np.mean(input_word_vectors, axis=0).reshape(1, -1)


In [134]:
# Word2Vec representation of every product name: one row per name, holding the
# mean of the vectors of its in-vocabulary tokens.
# The width comes from the trained model rather than a repeated literal.
sent_vectors = np.zeros((len(item_name), model.wv.vector_size))
for i, tokens in enumerate(item_name):
    word_vectors = [
        model.wv[word]
        for word in tokens
        if word in model.wv and not np.isnan(model.wv[word]).any()
    ]
    if not word_vectors:
        # No usable token: np.mean([]) would be NaN and emit a RuntimeWarning.
        # Report the row (same message as before) and leave it as all zeros.
        print(f'Sentence vector {i} contains NaN.')
        continue
    sent_vectors[i] = np.mean(word_vectors, axis=0)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Sentence vector 2057 contains NaN.
Sentence vector 3121 contains NaN.
Sentence vector 7791 contains NaN.
Sentence vector 9768 contains NaN.
Sentence vector 12669 contains NaN.
Sentence vector 15745 contains NaN.
Sentence vector 16079 contains NaN.
Sentence vector 17409 contains NaN.
Sentence vector 21469 contains NaN.
Sentence vector 22737 contains NaN.
Sentence vector 25547 contains NaN.
Sentence vector 31970 contains NaN.
Sentence vector 34031 contains NaN.
Sentence vector 35481 contains NaN.


#### cosine similarity

In [172]:
# Cosine similarity between the query vector and every product-name vector;
# the result has shape (1, number of product names).
similarity = cosine_similarity(input_vector, sent_vectors)

# Print the 20 most similar product names, best match first.
top_indices = similarity[0].argsort()[::-1][:20]
for i, index in enumerate(top_indices):
    # `index` is a row POSITION in sent_vectors, so it must be resolved with
    # .iloc; plain [] is a label lookup and the labels of `data` may have gaps
    # after dropna.
    print(f'{i+1}. {data["item_name"].iloc[index]}, {similarity[0][index]}')

1. Rugged Flex 6 Inch Waterproof Composite Toe EH Work Boots Work Shoes Beige- Mens- Size 10.5 2E, 1.0
2. Sierra Calf Metallic Snip Toe Cowboy Boots  Black- Womens- Size 8 B, 0.9999505968563551
3. Ronnie Pull On Booties Boots Black- Womens- Size 6 WW, 0.9997088594798211
4. AMP LT Wedge Waterproof 6 Work Boot, 0.9996869877486143
5. Stella Water Resistant Zippered Booties Boots Brown- Womens- Size 6.5 M, 0.9996802700777425
6. Gone Fish'n Rain Boots (Toddler/Little Kid/Big Kid), 0.9996312374106349
7. Pran Cer Chelsea Boots Casual Shoes Brown- Womens- Size 8.5 M, 0.9996224329958503
8. WorkHog 11 inch Waterproof Soft Toe Work Boots Work Shoes Brown- Mens- Size 13 D, 0.9996079443071337
9. Titan EV 6" Waterproof Composite Toe Work Boots Casual Shoes Brown- Mens- Size 10 M, 0.9995887158257
10. Edgewater Classic Mid, 0.999565616152658
11. Comfort Foam Jo Bootie, 0.9995633520408582
12. Floatride Energy 8" Electrical Soft Toe Work Boots Work Shoes Brown- Mens- Size 10.5 W, 0.9994922195917119
13. 

#### jaccard similarity

#### cosine similarity using spaCy

#### cosine similarity using SciPy