In [1]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import (
    HumanMessage,
)
from typing import List, Optional
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv
import os
import json
from langchain_google_genai import ChatGoogleGenerativeAI
from json import JSONDecodeError
import sys
import stat

  from .autonotebook import tqdm as notebook_tqdm


#### Load up Gemini API

In [2]:
# Load variables from a local .env file into the process environment,
# then read the Gemini key from it.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Initialize LangChain model.
# The key must be passed as `google_api_key`; `token` is not a parameter of
# ChatGoogleGenerativeAI, so the key was previously never handed over
# explicitly (the client could only find it through the environment).
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

#### Read the Template

In [7]:
# Absolute path of the template text file (despite the name, this is a file
# path, not a directory). The path is machine-specific.
pat_dir = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\mouses\templates\template.txt"

# Set the owner read/write permission bits on the file before opening it.
os.chmod(pat_dir, stat.S_IREAD | stat.S_IWRITE)

# Decode explicitly as UTF-8, like every other file read/write in this
# notebook, instead of relying on the platform's default code page.
with open(pat_dir, "r", encoding="utf-8") as f:
    template = f.read()

#### Example Text

In [8]:
# Sample product record used as the example input for the single-product test.
# NOTE: this is a regular (non-raw) triple-quoted string, so every `\n` inside
# the "description" value becomes a real newline character in `text`.
text="""
{
    "url": "https://www.neptun.mk/categories/GLUVCINA/EVEREST-SM-258-Usb-1200dpi-Red--34582-",
    "title": "MOUSE EVEREST SM-258 USB 1200DPI RED (34582)",
    "warranty": "24",
    "regular_price": "219",
    "happy_price": "105",
    "description": [
        "Mouse\nUsb конекција\n1200dpi\nОптички\nБрој на копчиња:3\nДолжина на кабелот:110Cm"
    ],
    "category": "GLUVCINA"
}
"""

#### Create Prompt Template for Gemini

In [9]:
# Prompt layout: the extraction template first, then the product to analyse.
# Placeholders use double braces because the template format is jinja2.
# NOTE: the first placeholder is spelled `tmplate`, and the chain is invoked
# with exactly that key, so the spelling is kept as-is.
prompt_template = """
{{tmplate}}

You need to extract the features of the provided product:
{{text}}
"""

# `input_variables` must name the placeholders exactly as they appear in the
# template; it previously declared "template", which the template never uses.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["tmplate", "text"],
    template_format="jinja2",
)

# Chain that renders the prompt and sends it to the Gemini model.
chain = LLMChain(llm=llm, prompt=prompt)

#### Testing Feature Extraction for Single Product Text 

In [10]:
# Try the chain on the single example product before running it in bulk.
structured_data = chain.invoke(
    {
        "tmplate": template,  # key spelling matches the prompt placeholder
        "text": text,
    }
)

# The model's reply is read from the "text" entry of the chain result.
print(structured_data["text"])

{
    "url": "https://www.neptun.mk/categories/GLUVCINA/EVEREST-SM-258-Usb-1200dpi-Red--34582-",
    "title": "MOUSE EVEREST SM-258 USB 1200DPI RED (34582)",
    "warranty_months": 24,
    "regular_price_mkd": 219,
    "happy_price_mkd": 105,
    "category": "GLUVCINA",
    "description": "Mouse\nUsb конекција\n1200dpi\nОптички\nБрој на копчиња:3\nДолжина на кабелот:110Cm",
    "features":{
        "device_type": "Mouse",
        "connection_type": "Usb конекција",
        "dpi": 1200,
        "tracking_technology": "Оптички",
        "buttons_count": 3,
        "scroll_wheel": "Yes",
        "battery_type": null,
        "ergonomic_design": null,
        "compatible_os": null
    }
}


#### Extract Features for All Products in the Mouse Category

In [11]:
# Folder holding the product JSON files that will be fed to the chain.
input_directory = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_with_categories\GLUVCINA"
# Folder that receives the processed JSON files.
output_directory = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\mouses\mouses_new_template"

# Ensure the destination exists; exist_ok makes this a no-op when it already does.
os.makedirs(name=output_directory, exist_ok=True)

In [13]:
# Run the extraction chain over every .json file in the input directory and
# save each parsed reply as processed_<original file name> in the output
# directory. Files whose reply cannot be used are recorded and reported at the
# end instead of being dropped silently.
skipped_files = []  # (filename, reason) pairs

for filename in os.listdir(input_directory):
    if not filename.endswith('.json'):
        continue

    input_filepath = os.path.join(input_directory, filename)

    # Read the product and close the file before the slow LLM call.
    with open(input_filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # ensure_ascii=False keeps non-ASCII text (e.g. Cyrillic) readable in the
    # prompt instead of \uXXXX escapes, matching the single-product example.
    # A dedicated name is used so the example `text` variable is not overwritten.
    product_text = json.dumps(data, ensure_ascii=False)

    # Process the JSON data using the LangChain model
    llm_result = chain.invoke({"tmplate": template, "text": product_text})

    # Only a reply that parses as JSON is written out; anything else is
    # recorded so it can be inspected or retried later.
    if 'text' not in llm_result:
        skipped_files.append((filename, "chain result has no 'text' entry"))
        continue
    try:
        structured_data = json.loads(llm_result['text'])
    except JSONDecodeError as exc:
        skipped_files.append((filename, f"model reply is not valid JSON ({exc})"))
        continue

    # Determine the output file path
    output_filename = f"processed_{filename}"
    output_filepath = os.path.join(output_directory, output_filename)

    # Write the processed data to the output file
    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        json.dump(structured_data, outfile, indent=4)

print(f"Processed JSON files are saved in {output_directory}")

# Make skipped inputs visible; previously they vanished without a trace.
if skipped_files:
    print(f"{len(skipped_files)} file(s) were skipped:")
    for skipped_name, reason in skipped_files:
        print(f"  {skipped_name}: {reason}")

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 8.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 16.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<lo

Processed JSON files are saved in C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\mouses\mouses_new_template
