In [1]:
import pandas as pd
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import pickle
from dotenv import load_dotenv
import os
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
import spacy

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [2]:
wines_df = pd.read_csv('data/winemag-data-130k-v2.csv').head(10000)

# rename column 'Unnamed: 0' to 'id'
wines_df.rename(columns={'Unnamed: 0': 'id'}, inplace=True)

print(wines_df.shape)
wines_df.head()

(10000, 14)


Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


## Embedding

In [3]:
import os
import spacy
from pymongo import MongoClient
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

# Load the spaCy medium model with word vectors
nlp = spacy.load('en_core_web_md')

# Initialize the SentenceTransformer once to avoid repeated loading
gist_model = SentenceTransformer('avsolatorio/GIST-Embedding-v0')

def get_spacy_embedding(text):
    """Generate spaCy embeddings for the given text."""
    doc = nlp(text)
    return doc.vector

def get_gist_embedding(text):
    """Generate Gist embeddings for the given text."""
    return gist_model.encode(text)

def load_embeddings_to_db(wines_df):
    """
    Load MongoDB database with spaCy and Gist embeddings from wines_df descriptions.
    """
    # Load environment variables
    load_dotenv()
    uri = os.getenv("MONGODB_URI")
    db_name = "BottleBuddy"
    collection_name = "Wine"
    
    # Connect to MongoDB
    client = MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]

    # Iterate over each row in the DataFrame
    for index, row in wines_df.iterrows():
        text = row['description']
        doc_id = row['id']  # Assuming 'id' is the column name in wines_df that corresponds to MongoDB '_id'
        
        # Generate both embeddings
        spacy_embedding = get_spacy_embedding(text).tolist()
        gist_embedding = get_gist_embedding(text).tolist()

        # Update MongoDB document
        update_result = collection.update_one(
            {"_id": doc_id},
            {"$set": {"spacy_embeddings": spacy_embedding, "gist_embeddings": gist_embedding}}
        )

        # Optional: Print out the result of the update
        print(f"Updated document ID {doc_id}: {update_result.modified_count} document modified.")

In [4]:
# Call the function
load_embeddings_to_db(wines_df)

Updated document ID 0: 1 document modified.
Updated document ID 1: 1 document modified.
Updated document ID 2: 1 document modified.
Updated document ID 3: 1 document modified.
Updated document ID 4: 1 document modified.
Updated document ID 5: 1 document modified.
Updated document ID 6: 1 document modified.
Updated document ID 7: 1 document modified.
Updated document ID 8: 1 document modified.
Updated document ID 9: 1 document modified.
Updated document ID 10: 1 document modified.
Updated document ID 11: 1 document modified.
Updated document ID 12: 1 document modified.
Updated document ID 13: 1 document modified.
Updated document ID 14: 1 document modified.
Updated document ID 15: 1 document modified.
Updated document ID 16: 1 document modified.
Updated document ID 17: 1 document modified.
Updated document ID 18: 1 document modified.
Updated document ID 19: 1 document modified.
Updated document ID 20: 1 document modified.
Updated document ID 21: 1 document modified.
Updated document ID 

In [10]:
# save the top 10000 wines to a csv file
wines_df.to_csv('data/wines10k.csv', index=False)

In [11]:
wines_tags = pd.read_csv('data/wines10k_tags.csv')
wines_tags.head()

Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,tags
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,"apple, citrus, tropical"
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,"berry, firm tannins"
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,"citrus, high acidity, tropical"
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,"citrus, tropical"
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,


In [12]:
wines_tags['description'][4]

"Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew."

In [62]:
import os
from dotenv import load_dotenv
import openai

def generate_wine_tags(description):
    """
    Use the OpenAI API to generate tags for a wine description.
    The tags are selected from a predefined list:
    'apple', 'berry', 'citrus', 'dry', 'earth', 'firm tannins', 'floral', 'full-bodied',
    'high acidity', 'light-bodied', 'low acidity', 'medium-bodied', 'oak', 'off-dry',
    'semi-sweet', 'smooth tannins', 'spice', 'stone fruit', 'sweet', 'tropical'.
    """
    load_dotenv()
    # connect to openai
    client = openai.Client(api_key=os.getenv('OPENAI_API_KEY'))

    # declare system instructions
    system_instructions = ("Given the description of a wine, assign tags that best describe its characteristics. "
                           "Please use only the following tags as options: 'apple', 'berry', 'citrus', 'dry', 'earth', 'firm tannins', "
                           "'floral', 'full-bodied', 'high acidity', 'light-bodied', 'low acidity', 'medium-bodied', 'oak', 'off-dry', "
                           "'semi-sweet', 'smooth tannins', 'spice', 'stone fruit', 'sweet', 'tropical'. "
                           "Your answer should be a comma-separated list of tags."
                           "Example output: dry, full-bodied, high acidity")

    # generate the tags
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_instructions},
                {"role": "user", "content": description},
            ]
        )
        tags = completion.choices[0].message.content
        output_list = [item.strip() for item in tags.split(',')]

    except Exception as e:
        print("Failed to generate wine tags.")
        print(e)
        output_list = None
    
    return output_list

In [63]:
tags = generate_wine_tags(wines_tags['description'][5])

print(tags)

['berry', 'herbal', 'full-bodied', 'high acidity', 'spice']


In [65]:
def update_db(wines_df):
    """
    Load MongoDB database with spaCy and Gist embeddings from wines_df descriptions.
    """
    # Load environment variables
    load_dotenv()
    uri = os.getenv("MONGODB_URI")
    db_name = "BottleBuddy"
    collection_name = "Wine"
    
    # Connect to MongoDB
    client = MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]

    # Iterate over each row in the DataFrame
    for index, row in wines_df.iterrows():
        text = row['description']
        doc_id = row['id']  # Assuming 'id' is the column name in wines_df that corresponds to MongoDB '_id'

        tags = generate_wine_tags(text)
        country = row['country']
        designation = row['designation']
        points = row['points']
        province = row['province']
        region_1 = row['region_1']
        region_2 = row['region_2']
        taster_name = row['taster_name']
        variety = row['variety']
        winery = row['winery']

        # Update MongoDB document
        update_result = collection.update_one(
            {"_id": doc_id},
            {"$set": {
                "tags": tags,
                "country": country,
                "designation": designation,
                "points": points,
                "province": province,
                "region_1": region_1,
                "region_2": region_2,
                "taster_name": taster_name,
                "variety": variety,
                "winery": winery
            }}
        )

        # Optional: Print out the result of the update
        print(f"Updated document ID {doc_id}: {update_result.modified_count} document modified.")

In [67]:
update_db(wines_df)

Updated document ID 0: 1 document modified.
Updated document ID 1: 1 document modified.
Updated document ID 2: 1 document modified.
Updated document ID 3: 1 document modified.
Updated document ID 4: 1 document modified.
Updated document ID 5: 1 document modified.
Updated document ID 6: 1 document modified.
Updated document ID 7: 1 document modified.
Updated document ID 8: 1 document modified.
Updated document ID 9: 1 document modified.
Updated document ID 10: 1 document modified.
Updated document ID 11: 1 document modified.
Updated document ID 12: 1 document modified.
Updated document ID 13: 1 document modified.
Updated document ID 14: 1 document modified.
Updated document ID 15: 1 document modified.
Updated document ID 16: 1 document modified.
Updated document ID 17: 1 document modified.
Updated document ID 18: 1 document modified.
Updated document ID 19: 1 document modified.
Updated document ID 20: 1 document modified.
Updated document ID 21: 1 document modified.
Updated document ID 