# Instalação e Importação de Bibliotecas

In [None]:
# Install the packages listed in ../requirements.txt; the %pip magic installs
# into the environment of the running kernel.
%pip install -r "../requirements.txt"

In [None]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np

# Fetch the NLTK resources used further down: the Punkt tokenizer models
# (needed by word_tokenize) and the stop-word lists.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer models from the 'punkt_tab' resource
# instead of 'punkt'. On older releases that do not know this package the call
# only reports an error and returns False, so requesting it is harmless.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\55139\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\55139\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Leitura e Limpeza dos Dados

In [28]:
# Load the raw CSV dataset (path is relative to the notebook's working directory).
df_raw = pd.read_csv("../data/winemag-data-130k-v2.csv")

In [29]:
# Keep only the two text columns used by the rest of the notebook.
text_columns = ["designation", "description"]
df = df_raw[text_columns]

In [30]:
# Remove repeated rows, then rows with a missing value in either column, and
# renumber the index from 0 so labels and positions coincide again.
df = (
    df
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

In [31]:
# Show three random rows as a quick sanity check of the cleaned frame.
df.sample(3)

Unnamed: 0,designation,description
43158,Cuvée 4,The light-gold color of this wine shows its ma...
58656,Reserve,This wine is 86% Cabernet Sauvignon and 14% Pe...
70240,Le Vigne di Turano,The Sartori family is behind this wonderful vi...


# Remoção de "Stop Words"

In [32]:
# English stop words from the NLTK corpus, stored as a set so the per-token
# membership test in remove_stopwords is a hash lookup.
stop_words = set(stopwords.words("english"))

In [33]:
def remove_stopwords(column: pd.Series) -> pd.Series:
    """Strip stop words and non-alphabetic tokens from every text in ``column``.

    Each value is tokenized with ``word_tokenize``. A token is kept only when
    it consists solely of alphabetic characters and its lower-cased form is
    not in the module-level ``stop_words`` set. Kept tokens retain their
    original casing and are re-joined with single spaces.

    Args:
        column: Series of strings to clean.

    Returns:
        Series holding one cleaned string per input value.
    """

    def _clean(description):
        kept_tokens = []
        for token in word_tokenize(description):
            if token.isalpha() and token.lower() not in stop_words:
                kept_tokens.append(token)
        return " ".join(kept_tokens)

    return column.apply(_clean)

In [34]:
# Store the stop-word-free text in a new column, leaving the original
# description untouched for display later on.
df["clean_description"] = remove_stopwords(df["description"])

# Vetorização com TF-IDF

In [35]:
# TF-IDF vectorizer created with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [36]:
# Learn the vocabulary and IDF weights from the cleaned descriptions and
# build the document-term matrix in one step.
df_matrix = vectorizer.fit_transform(df["clean_description"])

In [37]:
# (number of descriptions, number of vocabulary terms)
df_matrix.shape

(85439, 26655)

## Entrada de Dados do Usuário

In [38]:
# Lower-cased designations kept in a set: the selection loop only needs
# membership tests, and a set answers them without scanning every entry.
available_wines = set(df["designation"].str.lower())

# Three random designations (original casing) offered to the user as suggestions.
random_wines = df.sample(3)["designation"].values.tolist()

In [48]:
# Keep prompting until the user either asks for a random wine ("000") or types
# a designation that exists in the dataset (compared case-insensitively).
while True:
    print("\nSugestões de vinhos:")
    for suggestion in random_wines:
        print(f"- {suggestion}")

    typed_name = input("\nDigite o nome de um vinho que você goste (ou digite 000 para aleatório): ")
    chosen_wine = typed_name.strip().lower()

    # "000" is the sentinel for "pick one at random for me".
    if chosen_wine == "000":
        chosen_wine = df.sample()["designation"].str.lower().values[0]
        print(f"\n🍷 Vinho aleatório selecionado: {chosen_wine}")
        break

    if chosen_wine in available_wines:
        print(f"\n✅ Vinho selecionado: {chosen_wine}")
        break

    # Unknown name: report it and draw fresh suggestions for the next round.
    print("\n❌ Vinho não disponível. Tente novamente.")
    random_wines = df["designation"].dropna().sample(3).values


Sugestões de vinhos:
- Small Lot Reserve
- Piastraia
- Marquis de Lafayette

🍷 Vinho aleatório selecionado: viña famatina


In [49]:
# First row whose lower-cased designation equals the choice; a designation may
# appear on more than one row, in which case only the first is used as reference.
chosen_wine_line = df.loc[df["designation"].str.lower() == chosen_wine].iloc[0]

In [50]:
# Drop every row carrying the chosen designation so the wine cannot be
# recommended to itself.
# NOTE: this rebinds df, so re-running the previous cell after this one finds
# no matching row and fails; re-run from the data-loading cells instead.
df = df[df["designation"].str.lower() != chosen_wine]

In [51]:
# Candidate descriptions in df row order, with the chosen wine's description
# appended as the last element.
to_compare_list = df["clean_description"].tolist() + [chosen_wine_line["clean_description"]]

In [52]:
# Vectorize with the already-fitted vocabulary (transform, not fit_transform),
# so the columns match those of df_matrix.
to_compare_matrix = vectorizer.transform(to_compare_list)

In [53]:
# (remaining candidates + 1 row for the chosen wine, vocabulary size)
to_compare_matrix.shape

(85403, 26655)

# Aplicação do Algoritmo de Similaridade

In [54]:
# Compare the last row (the chosen wine) against all other rows (the
# candidates); the result is a 2-D array with a single row, one score per candidate.
cosine_sim = cosine_similarity(to_compare_matrix[-1], to_compare_matrix[:-1])

# Retorno dos Resultados

In [55]:
# np.argmax works on the flattened array; with a single-row matrix the flat
# index is the position of the best-scoring candidate.
most_sim = np.argmax(cosine_sim)

In [56]:
# Subscripts inside the f-strings use single quotes: reusing the enclosing
# double quote only parses on Python 3.12+ (PEP 701) and is a SyntaxError on
# earlier versions.
print(f"✅Tendo como base o vinho: {chosen_wine}")
print(f" Descrição: \n ->{chosen_wine_line['description']} ")


# most_sim is a position within the filtered df (same order as the candidate
# list), hence .iloc rather than label-based access.
print(f"\n🍷O vinho recomendado foi: {df['designation'].iloc[most_sim]}")
print(f" Descrição: \n ->{df['description'].iloc[most_sim]} ")


print(f"\n🧾Similidade calculada: {cosine_sim[0,most_sim]}   ")

✅Tendo como base o vinho: viña famatina
 Descrição: 
 ->Stalky and green smelling, with tobacco and stemmy fruit aromas. The palate is scratchy and stewy, with soupy flavors of tomato and raspberry. Herbal on the deficient finish. 

🍷O vinho recomendado foi: Estate Bottled
 Descrição: 
 ->This is herbal, burnt and stalky smelling, then raw and herbal tasting, with weedy notes and a scratchy finish. 

🧾Similidade calculada: 0.3735280540019786   
