In [1]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import (
    HumanMessage,
)
from typing import List, Optional
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv
import os
import json
from langchain_google_genai import ChatGoogleGenerativeAI
from json import JSONDecodeError
import sys
import stat

  from .autonotebook import tqdm as notebook_tqdm


#### Load the Gemini API Key and Model

In [2]:
# Pull variables from a local .env file (if present) into the process
# environment, then read the Gemini API key from there so it is never
# hard-coded in the notebook.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Initialize LangChain model
# The documented keyword for the credential is `google_api_key`; the previous
# `token=` keyword is not a documented parameter of ChatGoogleGenerativeAI.
# NOTE(review): the earlier code presumably only worked because the client
# also reads GOOGLE_API_KEY from the environment - confirm.
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

#### Read the Prompt Template

In [3]:
# Path of the text file holding the extraction template that is injected into
# the prompt (despite the name, `pat_dir` points at a file, not a directory).
pat_dir = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\laptops\templates\template.txt"

# Set the owner read/write permission bits on the file before opening it.
os.chmod(pat_dir, stat.S_IREAD | stat.S_IWRITE)

# Read with an explicit encoding instead of the platform default, matching
# how the product JSON files are read elsewhere in this notebook.
# NOTE(review): assumes template.txt is saved as UTF-8 - confirm.
with open(pat_dir, "r", encoding="utf-8") as f:
    template = f.read()

#### Example Text

In [4]:
# Sample product record used to try the extraction chain on a single item.
# It is a raw string on purpose: in a normal string literal Python would
# consume the JSON escapes (`\n`, `\"`), leaving literal newlines and bare
# quotes inside the "description" value, i.e. text that is no longer valid
# JSON and that differs from the `json.dumps` output used in the batch run.
text = r"""
{
    "url": "https://www.neptun.mk/categories/prenosni_kompjuteri/ACER-Swift-Go-14-SFG14-42-R54G-14--R5-7640U-16GB-DDR5-512GB",
    "title": "ЛАПТОП ACER SWIFT GO 14 SFG14-42-R54G, R5-7640U/16GB/512GB",
    "warranty": "24",
    "regular_price": "56.999",
    "happy_price": "54.999",
    "description": [
        "Лаптоп\nДисплеј:14.0\" (35.5cm)\nРезолуција:2.8K OLED SlimBezel Adobe&DCI-P3 100%\nПроцесор:AMD Ryzen™ 5 7640U\nГрафика:AMD Radeon™ Graphics\nRAM Меморија: 16 GB LPDDR5\nДиск (Storage): 512GB PCIe NVMe SSD\nБез оперативен систем\nWIFI6E+BT\nКамера:QHD Camera\nБатерија:65Wh Li-ion battery\nПолнач: 65W TypeC WM adapter"
    ],
    "category": "prenosni_kompjuteri"
}
"""

#### Create Prompt Template for Gemini

In [5]:
# Jinja2-style prompt with two placeholders:
#   tmplate - the extraction template text (the spelling is kept as-is because
#             the `chain.invoke(...)` calls in this notebook pass this key)
#   text    - the product record to extract features from
prompt_template = """
{{tmplate}}

You need to extract the features of the provided product:
{{text}}
"""

# `input_variables` must name the placeholders that actually occur in the
# template. It previously listed "template", which does not appear in the
# prompt, while the real placeholder `tmplate` was undeclared.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["tmplate", "text"],
    template_format="jinja2",
)

# LLMChain returns a dict whose 'text' entry holds the model's reply.
# NOTE(review): LLMChain triggers a deprecation warning in this notebook;
# migrating away from it would change the result shape that the cells using
# `chain.invoke(...)['text']` rely on, so it is kept here.
chain = LLMChain(llm=llm, prompt=prompt)

  warn_deprecated(


#### Testing Feature Extraction for Single Product Text 

In [6]:
# Run the chain once on the sample product and show the model's raw reply.
sample_inputs = {"tmplate": template, "text": text}
structured_data = chain.invoke(sample_inputs)

print(structured_data["text"])

{
    "url": "https://www.neptun.mk/categories/prenosni_kompjuteri/ACER-Swift-Go-14-SFG14-42-R54G-14--R5-7640U-16GB-DDR5-512GB",
    "title": "ЛАПТОП ACER SWIFT GO 14 SFG14-42-R54G, R5-7640U/16GB/512GB",
    "warranty_months": 24,
    "regular_price_mkd": 56999,
    "happy_price_mkd": 54999,
    "category": "prenosni_kompjuteri",
    "device_type": "Laptop",
    "description": "Лаптоп\nДисплеј:14.0\" (35.5cm)\nРезолуција:2.8K OLED SlimBezel Adobe&DCI-P3 100%\nПроцесор:AMD Ryzen™ 5 7640U\nГрафика:AMD Radeon™ Graphics\nRAM Меморија: 16 GB LPDDR5\nДиск (Storage): 512GB PCIe NVMe SSD\nБез оперативен систем\nWIFI6E+BT\nКамера:QHD Camera\nБатерија:65Wh Li-ion battery\nПолнач: 65W TypeC WM adapter",
    "features": {
        "screen_size_inches": 14.0,
        "screen_resolution": "2.8K",
        "screen_type": "OLED SlimBezel",
        "processor_model": "AMD Ryzen™ 5 7640U",
        "gpu_model": "AMD Radeon™ Graphics",
        "ram_size_gb": 16,
        "ram_type": "LPDDR5",
        "storag

#### Extract Features for All Products in the Laptops Category

In [7]:
# Folder holding the input product JSON files.
input_directory = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_with_categories\prenosni_kompjuteri"
# Folder that receives the processed JSON files.
output_directory = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\laptops\laptops_new_template"

# Ensure the destination folder is present; exist_ok makes re-runs harmless.
os.makedirs(name=output_directory, exist_ok=True)

In [8]:
def _parse_model_json(raw_reply):
    """Parse the model's reply as JSON, tolerating a Markdown code fence.

    Args:
        raw_reply: The reply text returned by the chain.

    Returns:
        The decoded JSON value.

    Raises:
        JSONDecodeError: If the reply (after removing an optional surrounding
            ``` / ```json fence) is not valid JSON.
    """
    cleaned = raw_reply.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line (``` or ```json) ...
        _, _, cleaned = cleaned.partition("\n")
        # ... and everything from the closing fence onwards.
        cleaned = cleaned.rsplit("```", 1)[0]
    return json.loads(cleaned)


# Files whose model reply could not be used; reported at the end instead of
# being dropped silently.
failed_files = []

# Loop through all files in the input directory (sorted for a stable order).
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.json'):
        continue
    input_filepath = os.path.join(input_directory, filename)

    # Read the product record, closing the file before the (slow) model call.
    with open(input_filepath, 'r', encoding='utf-8') as file:
        product = json.load(file)

    # Keep non-ASCII characters (e.g. Cyrillic) readable in the prompt instead
    # of escaping them to \uXXXX. Loop-local names are used so the sample
    # `text` / `structured_data` from the single-product test stay untouched.
    product_json = json.dumps(product, ensure_ascii=False)

    # Process the JSON data using the LangChain model
    response = chain.invoke({"tmplate": template, "text": product_json})

    raw_reply = response.get('text') if isinstance(response, dict) else None
    if not isinstance(raw_reply, str):
        # No usable reply text: record the failure rather than writing the raw
        # chain output (which is not the extracted data) to disk.
        failed_files.append(filename)
        continue

    try:
        extracted = _parse_model_json(raw_reply)
    except JSONDecodeError as err:
        print(f"Skipping {filename}: model reply is not valid JSON ({err})")
        failed_files.append(filename)
        continue

    # Determine the output file path
    output_filename = f"processed_{filename}"
    output_filepath = os.path.join(output_directory, output_filename)

    # Write the processed data to the output file, keeping non-ASCII text
    # human-readable (the file is written as UTF-8).
    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        json.dump(extracted, outfile, indent=4, ensure_ascii=False)

print(f"Processed JSON files are saved in {output_directory}")
if failed_files:
    print(f"{len(failed_files)} file(s) could not be processed: {failed_files}")

Processed JSON files are saved in C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\laptops\laptops_new_template
