# Instalação e Importação de Bibliotecas

In [None]:
# Install the packages listed in ../requirements.txt; the %pip magic installs
# into the environment of the running kernel.
%pip install -r "../requirements.txt"

In [None]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np

# Fetch every NLTK resource this notebook reads later on:
#   - "punkt" / "punkt_tab": tokenizer data used by word_tokenize (newer NLTK
#     releases look up "punkt_tab" instead of "punkt");
#   - "stopwords": the corpus read by stopwords.words("english").
# Without the "stopwords" download a fresh environment fails with LookupError.
for nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(nltk_resource)

## Leitura e Limpeza dos Dados

In [66]:
# Read the raw dataset from its relative path under ../data.
raw_data_path = "../data/winemag-data-130k-v2.csv"
df_raw = pd.read_csv(raw_data_path)

In [67]:
# Keep only the two columns the rest of the notebook uses: the wine name
# ("designation") and its text description.
columns_of_interest = ["designation", "description"]
df = df_raw[columns_of_interest]

In [68]:
# Clean-up in three chained steps: drop repeated rows, drop rows with a
# missing value in either column, then renumber the index starting at 0.
df = (
    df
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

In [69]:
# Display three random rows as a quick look at the cleaned frame.
df.sample(n=3)

Unnamed: 0,designation,description
81533,Libra,Light strawberry fruit is quickly overtaken wi...
35518,Les Bertins Premier Cru,The Bertins Premier Cru vineyard is on the sou...
53264,Sommerberg Grand Cru,Mellow inviting aromas of baked apple make for...


# Remoção de "Stop Words"

In [70]:
# NLTK's English stop-word list, stored as a set for fast membership checks.
english_stop_words = stopwords.words("english")
stop_words = set(english_stop_words)

In [71]:
def remove_stopwords(column: pd.Series) -> pd.Series:
    """Strip stop words and non-alphabetic tokens from every text in ``column``.

    Each value is tokenized with ``word_tokenize``. A token is kept only when
    its lowercase form is not in the module-level ``stop_words`` set and the
    token is purely alphabetic. Kept tokens retain their original casing and
    are joined back together with single spaces.

    Args:
        column: Series of text values to clean.

    Returns:
        A Series holding the cleaned text for each value of ``column``.
    """

    def _clean(text):
        kept_tokens = []
        for token in word_tokenize(text):
            if token.lower() in stop_words:
                continue
            if not token.isalpha():
                continue
            kept_tokens.append(token)
        return " ".join(kept_tokens)

    return column.apply(_clean)

In [72]:
# Store the cleaned text in a new column, next to the original description.
clean_description = remove_stopwords(df["description"])
df["clean_description"] = clean_description

# Vetorização com TF-IDF

In [73]:
# TF-IDF vectorizer created with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [74]:
# Fit the vectorizer on the cleaned descriptions and encode them in one step.
corpus = df["clean_description"]
df_matrix = vectorizer.fit_transform(corpus)

In [75]:
# Shape of the TF-IDF matrix produced by fit_transform.
df_matrix.shape

(85439, 26655)

## Entrada de Dados do Usuário

In [76]:
# Lowercased wine names, used to validate what the user types in.
available_wines = df["designation"].str.lower().tolist()

# Three random wine names shown as suggestions in the prompt.
random_wines = list(df.sample(3)["designation"])

In [77]:
# Keep prompting until the user types a known wine name or asks for a random
# one with "000". Every failed attempt draws a fresh set of suggestions.
while True:
    print("\nSugestões de vinhos:")
    for suggestion in random_wines:
        print(f"- {suggestion}")

    raw_answer = input("\nDigite o nome de um vinho que você goste (ou digite 000 para aleatório): ")
    # Normalize the answer the same way the available names were normalized.
    chosen_wine = raw_answer.strip().lower()

    if chosen_wine == "000":
        # Random pick: take the lowercased designation of one sampled row.
        sampled_row = df.sample()
        chosen_wine = sampled_row["designation"].str.lower().values[0]
        print(f"\n🍷 Vinho aleatório selecionado: {chosen_wine}")
        break

    if chosen_wine in available_wines:
        print(f"\n✅ Vinho selecionado: {chosen_wine}")
        break

    # Unknown name: report it and refresh the suggestions for the next round.
    print("\n❌ Vinho não disponível. Tente novamente.")
    random_wines = df["designation"].dropna().sample(3).values


Sugestões de vinhos:
- The Hermit
- Terrassen
- Terre de Vertus Premier Cru

🍷 Vinho aleatório selecionado: dutton ranch shop block


In [78]:
# Rows whose lowercased designation equals the chosen wine; more than one row
# may match, so the first one is used as the reference wine.
is_chosen = df["designation"].str.lower() == chosen_wine
chosen_wine_line = df.loc[is_chosen].iloc[0]

In [79]:
# Drop every row that shares the chosen designation so the reference wine
# cannot be recommended back to the user.
# NOTE: `df` is rebound to the filtered frame, so this cell is not idempotent.
same_designation = df["designation"].str.lower() == chosen_wine
df = df[~same_designation]

In [80]:
# Candidate descriptions first, the reference wine's description last, so the
# reference sits in the final position of the list.
candidate_descriptions = df["clean_description"].tolist()
reference_description = chosen_wine_line["clean_description"]
to_compare_list = candidate_descriptions + [reference_description]

In [81]:
# Encode the candidate and reference descriptions with the already fitted
# vectorizer (transform only, no refit).
to_compare_matrix = vectorizer.transform(to_compare_list)

# Aplicação do Algoritmo de Similaridade

In [82]:
# The last row is the reference wine; every earlier row is a candidate.
reference_vector = to_compare_matrix[-1]
candidate_vectors = to_compare_matrix[:-1]
cosine_sim = cosine_similarity(reference_vector, candidate_vectors)

# Retorno dos Resultados

In [83]:
# Position of the highest similarity score among the candidates.
most_sim = cosine_sim.argmax()

In [84]:
# Report the reference wine, the recommended wine and their similarity score.
# Subscripts inside the f-strings use single quotes: reusing the enclosing
# double quote inside a replacement field is only valid from Python 3.12 on
# and is a SyntaxError on earlier interpreters.
print(f"✅Tendo como base o vinho: {chosen_wine}")
print(f" Descrição: \n ->{chosen_wine_line['description']} ")


# `most_sim` is a position within the filtered `df`, hence `.iloc`.
print(f"\n🍷O vinho recomendado foi: {df['designation'].iloc[most_sim]}")
print(f" Descrição: \n ->{df['description'].iloc[most_sim]} ")


print(f"\n🧾Similidade calculada: {cosine_sim[0,most_sim]}   ")

✅Tendo como base o vinho: dutton ranch shop block
 Descrição: 
 ->This is great white wine, especially at this price. With no oak, it shows pure, clean orange, pineapple, fig and vanilla flavors that are brightened by crisp acidity. Flashy and showy, yet complex, it's a beautiful wine to drink now. 

🍷O vinho recomendado foi: Estate Monte Bello Vineyards
 Descrição: 
 ->Rich, ripe and flashy, with honey, pineapple and orange jam and toasty oak flavors. It's brightened with crisp acidity. With some heat from alcohol, it's deliciously approachable now. 

🧾Similidade calculada: 0.372587494342177   
