In [75]:
import pandas as pd
import os
import json
import time
from narwhals import Boolean
from pydantic import BaseModel, Field
from typing import Optional
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import certifi

In [44]:
base_llm = ChatOpenAI(openai_api_key=os.getenv('whiskey_key'), model='gpt-4o-mini', temperature=0.5)

In [61]:
### Pydantic Object
class Whiskey(BaseModel):
    year: int = Field(..., description="The year the whiskey was bottled (if provided)")
    distillery: str = Field(..., description="The name of the distillery or bottler")
    disillery_region: str = Field(..., description='The region the whiskey came from')
    whiskey_country_of_origin: str = Field(..., description='The country the distillery is based in')
    whiskey_name: str = Field(..., description="The name of the whiskey without distiller or age")
    is_blend: str = Field(..., description="Boolean (True/False) for whether the whiskey is a blended or single malt")
    age: str = Field(..., description="The age of the whiskey, amount of time it was in a cask (in years)")
    whiskey_type: str = Field(..., description="The type of whiskey (for example scotch, bourbon, irish, or rye")
    nose_tags: list[str] = Field(..., description='Keywords describing the nose/smell of the whiskey from the provided review')
    palette_tags: list[str] = Field(..., description='Keywords describing the nose/smell of the whiskey from the provided review')
    finish_tags: list[str] = Field(..., description="Keywords describing the finish of the whiskey")
    uuid: str = Field(..., description='The review record UUID information came from for this record')
    rating: int = Field(..., description='The rating from the reddit review, expressed as an integer between 0-100')
    user: str = Field(..., description='The username for the person who posted the review')


In [62]:
whiskey_parser = JsonOutputParser(pydantic_object=Whiskey)

In [63]:
parsing_template = '''
You are provided with a review of a whiskey from reddit.  Your job is to extract metadata and output it according to the instructions below.

### Formatting Instructions
{format_instructions}

### Review
{review}


'''

In [64]:
parsing_prompt = PromptTemplate(
    template = parsing_template,
    input_variables = ['review'],
    partial_variables = {'format_instructions': whiskey_parser.get_format_instructions()}
)

In [65]:
review_llm = base_llm.with_structured_output(Whiskey)
whiskey_chain = parsing_prompt | review_llm

### Load in Review Data

In [8]:
uri = f"mongodb+srv://{os.getenv('whiskeydb_admin')}:{os.getenv('whiskeydb_pwd')}@whiskeyrecommender.mvfds.mongodb.net/?retryWrites=true&w=majority&appName=WhiskeyRecommender"
ca = certifi.where()

# Create a new client and connect to the server
client = MongoClient(uri,
                     server_api=ServerApi('1'),
                     tls=True,
                     tlsAllowInvalidCertificates=False,
                     tlsCAFile=ca)
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [9]:
reddit_reviews = client.reddit_reviews
submissions = reddit_reviews['submissions']

documents = [x for x in submissions.find()]

In [10]:
len(documents)

31030

### Chain Parsing

In [11]:
test_docs = documents[0:10]

In [12]:
documents[0]

{'_id': '1497ef54838b47d7a34f4085720a33da',
 'submission_id': '14uder',
 'whisky_name': '100 Pipers',
 'redditor_name': 'merlinblack',
 'region_or_style': 'Blend',
 'rating': '68',
 'review_date': '12/14/12',
 'first_comment': "My wife and I are on a trip to Thailand to meet her family.  I've seen plenty of whisky here, mostly JW, but this one stood out from the rest.  100 pipers is not something I've seen before and it seems to have quite the following here.  It is a blend at 40% alcohol by volume and 35cl was 220 baht or about $8 Canadian.  I got it more as a novelty as I suspect it is the Thai equivalent of chivas or glenfiddich 12.\n\nColour: caramel, I suspect it is artificially coloured.\n\nNose: (I had some tiger balm on my hands so this may be *way* off) alcohol, little bit of leather and some hints of sweetness.\n\nPalate: very bland, I taste almost nothing really, a bit of woody flavour, the promise of leather and sweetness from the nose is gone.\n\nFinish: short and devoid o

In [66]:
data = whiskey_chain.invoke({'review': documents[1]})

In [67]:
data

Whiskey(year=2018, distillery='1792 Distillery', disillery_region='Kentucky', whiskey_country_of_origin='USA', whiskey_name='225th Anniversary', is_blend='False', age='NAS (nearly a decade)', whiskey_type='Bourbon', nose_tags=['sweet', 'brown sugar', 'dark cherries', 'baking spice'], palette_tags=['cherry pie', 'excellent mouthfeel', 'baking spice', 'cherries', 'brown sugar'], finish_tags=['short', 'hot', 'medium length', 'medium heat'], uuid='5bfcb7bf5f454473b2042da21a0055a8', rating=80, user='scottmotorrad')

### Parse Them All

In [68]:
whiskey_batch = [{'review': x} for x in documents]

In [70]:
# Breaks from rate limits - need to add handling code in to chunk/limit the LLM class calls from langchain
#whiskey_docs = whiskey_chain.batch(whiskey_batch)

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-mehmp72KgMul6sa0TZ4xGRYS on tokens per min (TPM): Limit 200000, Used 199330, Requested 1760. Please try again in 327ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [73]:
parsed_docs = []

In [101]:
parsed_docs

{'review': {'_id': 'c0e4165324544715aee2d112a070c34f',
  'submission_id': 'bgvkd1',
  'whisky_name': 'Auchentoshan 17 (Cask 190 – 1990)',
  'redditor_name': 'devoz',
  'region_or_style': 'Campbeltown',
  'rating': '82',
  'review_date': '4/24/19',
  'first_comment': 'Review 505: Auchentoshan 17 (Cask 190 – 1990) \n\n\n\n* 17 years \n* 46% ABV \n* Ex-bourbon cask \n\n\n\nThanks /u/dramfine for the sample. I find it a little off Auchentoshan would bottle a single cask and then water it down to 46%. I am not the biggest fan of the Auchentoshan profile, so let us see how this goes. \n \n \n \n \n \n**Nose:** Floral, honey, vanilla, malt, grassy, mango \n \n \n**Palate:** Creamy, grassy, malt, vanilla, apple, pepper, honey\n \n \n**Finish:** Malt, honey, floral, vanilla, pepper, apple\n \n \n**Score: 82** \n\n\nThis is a great example of the standard Auchentoshan profile. Which is great it you love the light, grassy, floral malt. Sadly it is not a profile I love. Really easy drinking, would

In [111]:
for i, whiskey in enumerate(whiskey_batch):
    id = whiskey.get('review').get('_id')

    if id in completed_ids:
        pass
    else:
        whiskey_data = whiskey_chain.invoke(whiskey)
        parsed_docs.append(whiskey_data)
        time.sleep(0.5)

    if i%1000 == 0:
        print(f'Processed {i} documents')

Processed 0 documents
Processed 1000 documents
Processed 2000 documents
Processed 3000 documents
Processed 4000 documents
Processed 5000 documents
Processed 6000 documents
Processed 7000 documents
Processed 8000 documents
Processed 9000 documents
Processed 10000 documents
Processed 11000 documents
Processed 12000 documents
Processed 13000 documents
Processed 14000 documents
Processed 15000 documents
Processed 16000 documents
Processed 17000 documents
Processed 18000 documents
Processed 19000 documents
Processed 20000 documents
Processed 21000 documents
Processed 22000 documents
Processed 23000 documents
Processed 24000 documents
Processed 25000 documents
Processed 26000 documents
Processed 27000 documents
Processed 28000 documents
Processed 29000 documents
Processed 30000 documents
Processed 31000 documents


In [112]:
### Need to implement some rate limit fixing.
len(parsed_docs)

31035

In [102]:
completed_ids = [x._id for x in parsed_docs]

In [106]:
completed_ids

['1497ef54838b47d7a34f4085720a33da',
 '1497ef54838b47d7a34f4085720a33da',
 '5bfcb7bf5f454473b2042da21a0055a8',
 '29ec172373944280ad2c2d0158db2928',
 'bffcc71c3ad64363a899fcac1be0cb46',
 '2db4d5ef774240a381d127cb4b41d2df',
 'b7b0cf751a6e4dfd96a0dc79a90ab92b',
 'bc23e67c952b4573a6e1b66789580041',
 'bb12070cc3a840a2a5793071a6222c34',
 '9c721cd60522463a90965238247db881',
 'b4b7f0b6e19940238882fa1ba67568f2',
 '6c430247c0b243f4b3dd26dcd3c5ab0f',
 'fa30f21b1e4b4deebf4a84d1474164a1',
 'd49129f74ced42c9a1163ee3d260d9f3',
 '554439749a3642f69ee758469aa741ad',
 '501fd5bb9a5145ebb7afc719b954450f',
 '9c6a39e67cd140f6a7e68ef1e2f26372',
 '774d45393a7d40db8984c7b36e18772d',
 '611a452a30d248af8a479f1742c206a6',
 '33dd9123c56b45689c4d582bd0f7562e',
 '9dcc3e8f8b8d4a0f96ef59f24b69552d',
 '89b6ac74eadf4f2eb79a41d3b5c45557',
 '17ec64c924f047fdbe224230af1b6b73',
 '28da9b96c38f472eb90c2f12be1440ec',
 'a87b86975e924a0fafe2b1e566d4ee64',
 'b164f76ce5494ba696495e1671181100',
 'a95473f4379e41309946c162c7a6b5d6',
 

In [110]:
whiskey_batch[0].get('review').get('_id') in completed_ids

True

### Parsed Docs to Pickle

In [113]:
import pickle as pkl
pkl.dump(parsed_docs, open('../data/parsed_reviews.pkl', 'wb'), pkl.HIGHEST_PROTOCOL)

### Parsed Docs to Mongo DB

In [114]:
uri = f"mongodb+srv://{os.getenv('whiskeydb_admin')}:{os.getenv('whiskeydb_pwd')}@whiskeyrecommender.mvfds.mongodb.net/?retryWrites=true&w=majority&appName=WhiskeyRecommender"
ca = certifi.where()

# Create a new client and connect to the server
client = MongoClient(uri,
    server_api=ServerApi('1'),
    tls=True,
    tlsAllowInvalidCertificates=False,
    tlsCAFile=ca)
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [115]:
reddit_reviews = client.reddit_reviews
parsed = reddit_reviews['parsed_reviews']

In [116]:
errors = []
i = 0

for i, doc in enumerate(parsed_docs):
    try:
        doc._id = doc.uuid
        # We can't store off the PRAW object since it can't be encoded.
        parsed.insert_one(doc.dict())
    except Exception as e:
        errors.append(doc.uuid)
        print(e)

    if i%1000 == 0:
        print('iteration ', i, 'with ', len(errors), 'errors')

    i += 1

/var/folders/t1/r7nzywq96z7_1663xk78kpmc0000gn/T/ipykernel_90274/2845191331.py:8: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  parsed.insert_one(doc.dict())


iteration  0 with  0 errors
iteration  1000 with  0 errors
iteration  2000 with  0 errors
iteration  3000 with  0 errors
iteration  4000 with  0 errors
iteration  5000 with  0 errors
iteration  6000 with  0 errors
iteration  7000 with  0 errors
iteration  8000 with  0 errors
iteration  9000 with  0 errors
iteration  10000 with  0 errors
iteration  11000 with  0 errors
iteration  12000 with  0 errors
iteration  13000 with  0 errors
iteration  14000 with  0 errors
iteration  15000 with  0 errors
iteration  16000 with  0 errors
iteration  17000 with  0 errors
iteration  18000 with  0 errors
iteration  19000 with  0 errors
iteration  20000 with  0 errors
iteration  21000 with  0 errors
iteration  22000 with  0 errors
iteration  23000 with  0 errors
iteration  24000 with  0 errors
iteration  25000 with  0 errors
iteration  26000 with  0 errors
iteration  27000 with  0 errors
iteration  28000 with  0 errors
iteration  29000 with  0 errors
iteration  30000 with  0 errors
iteration  31000 with