# Instalando Bibliotecas

In [None]:
%pip install -r "../requirements.txt"

## Importando Bibliotecas

In [None]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np

# NLTK resources this notebook relies on. Each must be available locally,
# otherwise NLTK raises LookupError when the resource is first used:
#   - punkt_tab: tokenizer data used by word_tokenize
#   - stopwords: word lists read by stopwords.words("english")
nltk.download('punkt_tab')
nltk.download('stopwords')

## Leitura e Limpeza dos Dados

In [595]:
# Load the raw wine reviews CSV; the path is relative to the notebook folder.
raw_csv_path = "../data/winemag-data-130k-v2.csv"
df_raw = pd.read_csv(raw_csv_path)

In [596]:
# Keep only the two columns used below: the wine name and its review text.
selected_columns = ["designation", "description"]
df = df_raw[selected_columns]

In [597]:
# Remove repeated rows and rows with a missing value, then renumber the
# index from 0 so labels match row positions.
df = (
    df.drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

In [598]:
# Preview three randomly chosen rows of the cleaned frame.
df.sample(n=3)

Unnamed: 0,designation,description
77030,Le Village,"Attractive, luscious fruits are balanced by ac..."
83944,Xplorador,"Ripe and ready, with powerful dark fruit. This..."
66680,Laguna Knoll Vineyard,Raspberry crumble immediately comes to mind in...


## Removendo "Stop Words"

In [599]:
# English stop words from NLTK, stored as a set for membership tests.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

In [None]:
def remove_stopwords(column: pd.Series) -> pd.Series:
    """Strip stop words and non-alphabetic tokens from every text in ``column``.

    Each value is tokenized with ``word_tokenize``. A token is kept only when
    its lowercase form is not in the module-level ``stop_words`` set and the
    token consists solely of alphabetic characters. Kept tokens retain their
    original casing and are re-joined with single spaces.

    Args:
        column: Series whose values are text strings.

    Returns:
        Series with the filtered text for each input value.
    """

    def _filter_text(text):
        kept_tokens = []
        for token in word_tokenize(text):
            if token.lower() in stop_words:
                continue
            if not token.isalpha():
                # Drops punctuation, numbers and mixed tokens.
                continue
            kept_tokens.append(token)
        return " ".join(kept_tokens)

    return column.apply(_filter_text)

In [None]:
# Add the stop-word-free version of each description as a new column.
clean_descriptions = remove_stopwords(df["description"])
df["clean_description"] = clean_descriptions

## Vetorizando os Dados

In [602]:
# TF-IDF vectorizer created with its default settings; it is fitted on the
# cleaned descriptions in the next cell.
vectorizer = TfidfVectorizer()

In [603]:
# Learn the vocabulary and IDF weights from the cleaned descriptions and
# encode each description as one row of the resulting matrix.
clean_corpus = df["clean_description"]
df_matrix = vectorizer.fit_transform(clean_corpus)

In [620]:
# Matrix dimensions: (number of descriptions, number of vocabulary terms).
df_matrix.shape

(85439, 26655)

## Entrada de Dados

In [None]:
# Lowercased designations, used to validate the name typed by the user.
designation_values = df["designation"].values
available_wines = [name.lower() for name in designation_values]

# Three random designations shown as the initial suggestions.
random_wines = df.sample(n=3)["designation"].values.tolist()

In [None]:
# Keep prompting until a usable wine name is obtained. The loop ends either
# with a randomly drawn wine (sentinel "000") or with a name found in
# available_wines; any other input shows fresh suggestions and asks again.
while True:
    print("\nSugestões de vinhos:")
    for wine in random_wines:
        print(f"- {wine}")

    typed_name = input("\nDigite o nome de um vinho que você goste (ou digite 000 para aleatório): ")
    # Normalize the same way available_wines was built (lowercase).
    chosen_wine = typed_name.strip().lower()

    if chosen_wine == "000":
        # Sentinel: draw one random row and use its lowercased designation.
        random_row = df.sample()
        chosen_wine = random_row["designation"].str.lower().values[0]
        print(f"\n🍷 Vinho aleatório selecionado: {chosen_wine}")
        break

    if chosen_wine in available_wines:
        print(f"\n✅ Vinho selecionado: {chosen_wine}")
        break

    print("\n❌ Vinho não disponível. Tente novamente.")
    # Refresh the random suggestions before asking again.
    random_wines = df["designation"].dropna().sample(3).values


Sugestões de vinhos:
- The Chairman Series
- Ridgecrest Vineyards Reserve
- Indian Wells

❌ Vinho não disponível. Tente novamente.

Sugestões de vinhos:
- LFNG Blind Trust
- Ried Rüsselgarten Select
- Barton

🍷 Vinho aleatório selecionado: privat reserva brut nature rosé


In [607]:
# Rows whose lowercased designation equals the chosen name.
is_chosen_wine = df["designation"].str.lower() == chosen_wine
# If several rows match, only the first one is used as the reference wine.
chosen_wine_line = df.loc[is_chosen_wine].iloc[0]

In [608]:
# Drop every row carrying the chosen designation so the wine cannot be
# recommended to itself. This rebinds `df`; the index is not reset, so the
# remaining rows keep their previous labels.
is_other_wine = df["designation"].str.lower() != chosen_wine
df = df[is_other_wine]

In [None]:
# Candidate descriptions first, with the chosen wine's description appended
# as the last element.
to_compare_list = df["clean_description"].tolist()
to_compare_list.append(chosen_wine_line["clean_description"])

In [None]:
# Encode the texts with the already-fitted vectorizer (transform only, no
# refit), producing one row per entry of to_compare_list.
# NOTE(review): this re-vectorizes descriptions that df_matrix was already
# built from; reusing df_matrix may avoid the extra pass, to be confirmed.
to_compare_matrix = vectorizer.transform(to_compare_list)

## Aplicando o Algoritmo

In [611]:
# The last row is the chosen wine; every row before it is a candidate.
chosen_vector = to_compare_matrix[-1]
candidate_vectors = to_compare_matrix[:-1]
# One row of similarities: chosen wine versus each candidate.
cosine_sim = cosine_similarity(chosen_vector, candidate_vectors)

## Obtendo e Retornando o Resultado

In [612]:
# Position of the highest similarity. cosine_sim holds a single row, so the
# flattened argmax is also the candidate's position within that row.
most_sim = cosine_sim.argmax()

In [None]:
# Report the reference wine, the recommended wine and their similarity.
#
# The values are pulled into variables first: using double-quoted subscripts
# directly inside double-quoted f-strings is only valid on Python 3.12+ and
# is a SyntaxError on earlier versions. The printed text is unchanged.
chosen_description = chosen_wine_line["description"]

# most_sim is a position among the candidates, hence positional .iloc access.
recommended_name = df["designation"].iloc[most_sim]
recommended_description = df["description"].iloc[most_sim]

# cosine_sim has one row (the chosen wine), so row 0 is the only valid row.
similarity_score = cosine_sim[0, most_sim]

print(f"✅Tendo como base o vinho: {chosen_wine}")
print(f" Descrição de {chosen_wine}: \n ->{chosen_description} ")


print(f"\n🍷O vinho recomendado foi: {recommended_name}")
print(f" Descrição de {recommended_name}: \n ->{recommended_description} ")


print(f"\n🧾Similidade calculada: {similarity_score}   ")

✅Tendo como base o vinho: privat reserva brut nature rosé
 Descrição de privat reserva brut nature rosé: 
 ->Fresh, floral, citrusy aromas precede a well-balanced palate. Sleek, dry raspberry and plum flavors are fresh and forward, while the finish on this sparkling Mourvèdre is salty, genuine, elegant and stylish. 

🍷O vinho recomendado foi: Fincas
 Descrição de Fincas: 
 ->Fresh aromas of white gumdrop, quince and peach precede a peachy, citrusy palate. Dry citrus flavors persist on a clean finish. This is a fresh Chardonnay where less is more. 

🧾Similidade calculada: 0.3114928113237686 
