<a href="https://colab.research.google.com/github/imymemineyay/Studying_NLP/blob/main/wine_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

과제 1. 코사인 유사도 기반 와인 추천 시스템 개발
- 제공받은 csv 파일 로드
- 상위 5000개의 와인데이터만 추출하여 변수에 저장
- description열 추출
- 불용어 전처리(길이가 짧은 단어, 불용어 사전에 등록되어 있는 단어, 5000개의 문서 중 단어의 등장 횟수가 3회 이하인 경우 제거)
- DTM 구성(만약 시간이 너무 많이 소요되면 상위 1000개만 가지고 하기)
- TFIDF 행렬구성(5000*단어개수)
- 코사인 유사도 (5000*5000)
- ex) 번호가 50번에 해당되는 와인과 가장 유사한 와인을 추천해줘
- 가장 유사한 와인 10개 추천

코사인 유사도 최대값 10개 구해서 그 유사도에 해당하는 와인의 번호 출력

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


In [None]:
# Load the provided CSV file.

df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/광군제(11.11)/NLP/winemag-data-130k-v2.csv'
)

In [None]:
# Keep only the first 5000 wine records and store them in a variable.

wine = df.iloc[:5000]

In [None]:
# Extract the description column (the title column is kept alongside it).

wine = wine[['description', 'title']]
wine.head()

Unnamed: 0,description,title
0,"Aromas include tropical fruit, broom, brimston...",Nicosia 2013 Vulkà Bianco (Etna)
1,"This is ripe and fruity, a wine that is smooth...",Quinta dos Avidagos 2011 Avidagos Red (Douro)
2,"Tart and snappy, the flavors of lime flesh and...",Rainstorm 2013 Pinot Gris (Willamette Valley)
3,"Pineapple rind, lemon pith and orange blossom ...",St. Julian 2013 Reserve Late Harvest Riesling ...
4,"Much like the regular bottling from 2012, this...",Sweet Cheeks 2012 Vintner's Reserve Wild Child...


In [None]:
# Stop-word preprocessing (short words, words in the stop-word dictionary,
# and words that occur 3 times or fewer across the 5000 documents are to be removed).
# NOTE(review): this cell only passes the built-in English stop-word list;
# the low-frequency filter is applied by hand in later cells.

tfidf = TfidfVectorizer(stop_words='english')
wine_m = tfidf.fit_transform(wine.description)

In [None]:
# Raw term counts per document, using the same English stop-word list.
count_vt = CountVectorizer(stop_words='english')
wine_vt = count_vt.fit_transform(wine.description)

In [None]:
# Convert the document-term count matrix to a dense array for display.
wine_vt.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Total count of every word over all documents, kept as a (1, n_words) array
# because later cells read it as wine_ct_matrix[0].
# The explicit reshape replaces the former [0][0][0] chain, which was a no-op
# that only worked because the column sum came back as a 2-D np.matrix.
wine_ct_matrix = np.asarray(wine_vt.sum(axis=0)).reshape(1, -1)

In [None]:
wine_ct_matrix[0].max()  # highest total frequency of any single word

3063

In [None]:
wine_ct_matrix.shape[1]  # total number of words

8179

In [None]:
# Words that occur 3 times or fewer across the 5000 documents are removed.
# Collect their column indices (total count below 4). The vocabulary size is
# taken from the count vector itself instead of a hard-coded 8179, so the cell
# keeps working if the corpus or vectorizer settings change.
lst = np.flatnonzero(wine_ct_matrix[0] < 4).tolist()

In [None]:
# Dense TF-IDF matrix wrapped in a DataFrame (one row per description,
# one column per vocabulary index).
wine_pre = pd.DataFrame(wine_m.toarray())
wine_pre

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8169,8170,8171,8172,8173,8174,8175,8176,8177,8178
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Column indices to keep: every vocabulary index that is not in the removal list.
# The width comes from the TF-IDF matrix rather than a hard-coded 8179, and the
# removal list is turned into a set so each membership test is O(1) instead of
# scanning the whole list.
dropped = set(lst)
saved_lst = [col for col in range(wine_m.shape[1]) if col not in dropped]

print(saved_lst)

[0, 4, 5, 7, 10, 11, 12, 13, 15, 17, 18, 20, 23, 25, 29, 38, 64, 66, 67, 68, 69, 70, 72, 74, 76, 77, 79, 81, 82, 83, 84, 85, 86, 87, 88, 91, 92, 93, 94, 95, 96, 97, 98, 100, 103, 109, 110, 111, 112, 114, 116, 119, 122, 124, 128, 130, 133, 135, 136, 139, 142, 143, 145, 148, 150, 153, 156, 158, 164, 166, 169, 173, 178, 182, 183, 185, 187, 189, 198, 200, 210, 211, 213, 214, 216, 217, 219, 221, 226, 227, 229, 230, 244, 245, 246, 247, 251, 253, 260, 264, 265, 267, 268, 269, 271, 272, 276, 279, 295, 301, 313, 316, 317, 318, 319, 321, 322, 324, 325, 326, 329, 330, 335, 342, 344, 346, 348, 354, 357, 362, 364, 365, 368, 372, 373, 375, 376, 377, 379, 383, 384, 387, 389, 395, 396, 397, 399, 400, 401, 405, 407, 408, 412, 414, 420, 433, 434, 435, 443, 455, 459, 461, 462, 463, 464, 465, 467, 468, 471, 472, 474, 475, 482, 484, 485, 486, 490, 491, 492, 494, 495, 499, 507, 509, 515, 521, 522, 523, 525, 526, 527, 543, 546, 550, 551, 552, 553, 560, 571, 572, 575, 578, 581, 586, 587, 598, 601, 602, 604, 6

In [None]:
# Keep only the retained vocabulary columns (selected by position).
wine_pre = wine_pre.take(saved_lst, axis=1)
wine_pre

Unnamed: 0,0,4,5,7,10,11,12,13,15,17,...,8160,8162,8163,8165,8166,8168,8169,8171,8173,8176
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Convert the filtered DataFrame back to a NumPy array.
tfidf_m = wine_pre.to_numpy()
tfidf_m

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Pairwise cosine similarity of every document against every other document
# (with a single argument the matrix is compared against itself).
cos_sim = cosine_similarity(tfidf_m)
print(cos_sim.shape)
print(cos_sim[0])

(5000, 5000)
[1.         0.0150337  0.01370403 ... 0.00910957 0.         0.03358695]


In [None]:
# Show the title column.
wine['title']

0                       Nicosia 2013 Vulkà Bianco  (Etna)
1           Quinta dos Avidagos 2011 Avidagos Red (Douro)
2           Rainstorm 2013 Pinot Gris (Willamette Valley)
3       St. Julian 2013 Reserve Late Harvest Riesling ...
4       Sweet Cheeks 2012 Vintner's Reserve Wild Child...
                              ...                        
4995    Mud House 2007 Swan Sauvignon Blanc (Marlborough)
4996     Fattoria Alois 2006 Cunto Pallagrello (Campania)
4997                      Florio NV Fine Sweet  (Marsala)
4998    Vice Versa 2005 Le Petit Vice Cabernet Sauvign...
4999    Viña Mar de Casablanca 2008 Reserva Especial S...
Name: title, Length: 5000, dtype: object

In [None]:
# Inspect the record at row position 516.
wine.iloc[516]

description    This is a relatively thick and dense wine, gri...
title          Davis Family 2014 Soul Patch Estate Grown Pino...
Name: 516, dtype: object

In [None]:
# Title of the wine at row position 49 (column position 1 is 'title').
i = wine['title'].iloc[49]
print(i)

Vignerons de Bel Air 2011 Eté Indien  (Brouilly)


In [None]:
def recommend(t, cosine_sim=cos_sim):  # t: title of the query wine
  """Return the titles of the 10 wines most similar to the wine titled *t*.

  The query row is located directly in ``wine['title']``, so the function
  does not depend on a separately built title-to-index lookup.

  Args:
    t: Exact title string of the query wine, as stored in ``wine['title']``.
    cosine_sim: Square similarity matrix indexed by row position of ``wine``.

  Returns:
    The ``wine['title']`` entries of up to 10 most similar wines, ordered by
    decreasing similarity. The query wine itself is never included.

  Raises:
    KeyError: If no row of ``wine`` has the title *t*.

  Side effects:
    Prints the selected (position, score) pairs and the list of positions.
  """
  # Positional lookup of the title; if a title occurs more than once the
  # first occurrence is used.
  matches = np.flatnonzero((wine['title'] == t).to_numpy())
  if matches.size == 0:
    raise KeyError(f"title not found in wine['title']: {t!r}")
  idx = int(matches[0])

  sim_scores = list(enumerate(cosine_sim[idx]))
  ranked = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)
  # Drop the query row by position instead of assuming it sorts first:
  # ties (or an all-zero row) can place another wine at rank 0.
  ss = [pair for pair in ranked if pair[0] != idx][:10]
  print(ss)

  res = [pair[0] for pair in ss]
  print(res)
  return wine['title'].iloc[res]

In [None]:
# Recommend the wines most similar to the wine at row position 49.
recommend(t='Vignerons de Bel Air 2011 Eté Indien  (Brouilly)')

[(30, 0.4728694130117888), (4091, 0.4622114339243789), (2396, 0.42736711962396173), (4561, 0.4049711854885382), (1552, 0.4011213804846594), (3602, 0.4004374765517892), (4563, 0.395994292202158), (4397, 0.39463986866359063), (3400, 0.3935506078939345), (2657, 0.37909491996976724)]
[30, 4091, 2396, 4561, 1552, 3602, 4563, 4397, 3400, 2657]


30      Domaine de la Madone 2012 Nouveau  (Beaujolais...
4091     Casca Wines 2013 Santos da Casa Red (Alentejano)
2396    Rui Roboredo Madeira 2015 Beyra Colheita Red (...
4561    Jose Maria Vieira 2014 Borges Quinta da Soalhe...
1552    Adega Cooperativa de Borba 2016 Cuvée B Red (A...
3602                    La Croix de Renaud 2009  Bordeaux
4563    Les Vignerons Réunis de Monségur 2014 Château ...
4397    Georges Duboeuf 2015 Clos Reyssie  (Pouilly-Fu...
3400    Parras Vinhos 2014 Montaria Tinto Red (Alentej...
2657                  Château Soutard 2009  Saint-Émilion
Name: title, dtype: object