In [1]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import (
    HumanMessage,
)
from typing import List, Optional
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv
import os
import json
from langchain_google_genai import ChatGoogleGenerativeAI
from json import JSONDecodeError
import sys
import stat

  from .autonotebook import tqdm as notebook_tqdm


#### Set Up the Gemini API

In [2]:
# Read variables from a local .env file into the process environment.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Initialize LangChain model.
# The key must be passed as `google_api_key`; `token` is not a parameter of
# ChatGoogleGenerativeAI, so the value was never actually handed to the client.
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

#### Read the Prompt Template

In [8]:
# Full path of the text file holding the extraction instructions for the model.
pat_dir = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\tvs\templates\template.txt"

# Set the file's mode to owner read + write before opening it.
os.chmod(pat_dir, stat.S_IREAD | stat.S_IWRITE)

# Decode explicitly as UTF-8 (as the other file reads in this notebook do) so the
# result does not depend on the platform's default locale encoding.
with open(pat_dir, "r", encoding="utf-8") as f:
    template = f.read()

#### Example Text

In [5]:
# Example product record used to try the prompt on a single item.
# Raw string: keeps the JSON escapes (\" and \n) exactly as written, so the
# literal is valid JSON. Without the `r` prefix Python interprets them itself
# and the resulting text no longer parses as JSON.
text = r"""
{
    "url": "https://www.neptun.mk/categories/televizori/FUEGO-55-ELU-620-AND-T",
    "title": "4K UHD SMART DLED TV FUEGO 55 ELU 620 AND T, 55\"(139CM), ANDROID TV",
    "warranty": "24",
    "regular_price": "18.999",
    "happy_price": "16.999",
    "description": [
        "55\"(139cm)4К UHD SMART DLED TV\nРезолуција:3840x2160pix\nAspect Ratio - 16:9\nОсветлeност: 250cd\nКонтраст: 5000:1\nАгол на гледање: 178° x 178°\nПодржани бои:1.07G\nВреме на реакција: 6.5ms\nОперативен систем: Android13\nЗвучници: 2x10W\n3xHDMI,2xUSB, RJ45, OPTICAL, AV IN, CI, EARPHONE OUT\nDVB-T/T2/C/S/S2\nВграден Wi-Fi"
    ],
    "category": "televizori"
}
"""

#### Create Prompt Template for Gemini

In [6]:
# Jinja2 prompt: the extraction instructions are rendered first (`tmplate`),
# followed by the product record to structure (`text`).
prompt_template = """
{{tmplate}}

You need to extract the features of the provided product:
{{text}}
"""

# `input_variables` must name the placeholders exactly as they appear in the
# template. The placeholder is spelled `tmplate`, and the `chain.invoke` calls
# in this notebook pass that key, so the declaration is aligned to it rather
# than renaming the placeholder.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["tmplate", "text"],
    template_format="jinja2",
)
chain = LLMChain(llm=llm, prompt=prompt)

  warn_deprecated(


#### Testing Feature Extraction for a Single Product Text

In [9]:
# Run the chain once on the example product; the model's reply is stored
# under the "text" key of the returned mapping.
structured_data = chain.invoke(dict(tmplate=template, text=text))

print(structured_data["text"])

{
    "url": "https://www.neptun.mk/categories/televizori/FUEGO-55-ELU-620-AND-T",
    "title": "4K UHD SMART DLED TV FUEGO 55 ELU 620 AND T, 55\"(139CM), ANDROID TV",
    "warranty_months": 24,
    "regular_price_mkd": 18999,
    "happy_price_mkd": 16999,
    "category": "televizori",
    "device_type": "TV",
    "description": "55\"(139cm)4К UHD SMART DLED TV\nРезолуција:3840x2160pix\nAspect Ratio - 16:9\nОсветлeност: 250cd\nКонтраст: 5000:1\nАгол на гледање: 178° x 178°\nПодржани бои:1.07G\nВреме на реакција: 6.5ms\nОперативен систем: Android13\nЗвучници: 2x10W\n3xHDMI,2xUSB, RJ45, OPTICAL, AV IN, CI, EARPHONE OUT\nDVB-T/T2/C/S/S2\nВграден Wi-Fi",
    "features":{
        "screen_size_inches": 55,
        "screen_resolution": "3840 x 2160",
        "panel_type": "DLED",
        "smart_tv_os": "Android13",
        "audio_output_watts": 20,
        "hdmi_ports": 3,
        "usb_ports": 2,
        "dvb_t2cs2": "Yes"
    }
}


#### Extract Features for All Products in the TV Category

In [10]:
# Both folders sit under the same project root, so spell that root out once.
project_root = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over"

# Folder holding the input JSON files.
input_directory = project_root + r"\products_with_categories\televizori"
# Folder that receives the processed JSON files.
output_directory = project_root + r"\products_categories_fixed\tvs\tvs_new_template"

# Make sure the destination folder exists (no error if it is already there).
os.makedirs(output_directory, exist_ok=True)

In [11]:
def _strip_code_fence(reply: str) -> str:
    """Return `reply` without a surrounding Markdown code fence, if it has one.

    A reply that starts with ``` (optionally followed by a language tag such as
    ```json) has its first line and a trailing ``` removed. Any other reply is
    returned unchanged apart from stripped outer whitespace.
    """
    cleaned = reply.strip()
    if not cleaned.startswith("```"):
        return cleaned
    # Drop the opening fence line, then a closing fence if one is present.
    _, _, cleaned = cleaned.partition("\n")
    cleaned = cleaned.rstrip()
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


# Input files whose model reply could not be parsed as JSON; reported at the
# end so that dropped products are visible instead of silently missing.
skipped_files = []

# Loop through all JSON files in the input directory (sorted so reruns visit
# the files in the same order).
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.json'):
        continue
    input_filepath = os.path.join(input_directory, filename)

    # Read the product record; the file is closed again before the model call
    # so the handle is not held open while waiting on the API.
    with open(input_filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # ensure_ascii=False keeps non-ASCII characters as-is in the prompt instead
    # of turning them into \uXXXX escapes.
    text = json.dumps(data, ensure_ascii=False)

    # Process the JSON data using the LangChain model
    structured_data = chain.invoke({"tmplate": template, "text": text})
    if 'text' in structured_data:
        try:
            structured_data = json.loads(_strip_code_fence(structured_data['text']))
        except JSONDecodeError as err:
            # Best effort: keep going with the remaining files, but record the miss.
            skipped_files.append(filename)
            print(f"Skipping {filename}: model reply is not valid JSON ({err})")
            continue

    # Determine the output file path
    output_filename = f"processed_{filename}"
    output_filepath = os.path.join(output_directory, output_filename)

    # Write the processed data to the output file (UTF-8, characters unescaped;
    # the parsed content is the same as with the default escaping).
    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        json.dump(structured_data, outfile, indent=4, ensure_ascii=False)

print(f"Processed JSON files are saved in {output_directory}")
if skipped_files:
    print(f"{len(skipped_files)} file(s) were skipped because the reply was not valid JSON: {skipped_files}")

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Processed JSON files are saved in C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\tvs\tvs_new_template
