In [1]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import (
    HumanMessage,
)
from typing import List, Optional
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv
import os
import json
from langchain_google_genai import ChatGoogleGenerativeAI
from json import JSONDecodeError
import sys
import stat

  from .autonotebook import tqdm as notebook_tqdm


#### Load the Gemini API Key and Model

In [2]:
# Pull variables from a local .env file into the process environment, then
# read the Gemini key from there so it is never hard-coded in the notebook.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Initialize LangChain model.
# The key must be passed as `google_api_key`; `token` is not a keyword the
# Gemini chat model uses for the key, so the value read above was never the
# one actually handed to the model.
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

#### Read the Prompt Template

In [25]:
# Absolute path of the instruction template that is placed in front of every
# product in the prompt.
pat_dir = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\gaming_laptops\templates\template2.txt"

# Mark the file as readable and writable for the owner before opening it.
# NOTE(review): presumably a workaround for a permission error on this
# machine - confirm it is still needed.
os.chmod(pat_dir, stat.S_IREAD | stat.S_IWRITE)

# Decode explicitly as UTF-8, like every other file this notebook opens;
# without it Windows falls back to the locale code page and may corrupt or
# reject non-ASCII characters in the template.
with open(pat_dir, "r", encoding="utf-8") as f:
    template = f.read()

#### Example Text

In [26]:
# One scraped product, used to try the chain before the batch run.
# The literal is a raw string so that the JSON escapes (\n, \") reach the
# model verbatim; in a normal string Python would expand them and the text
# would no longer be valid JSON.
text = r"""
{
    "url": "https://www.neptun.mk/categories/gaming_laptopi/ASUS-ROG-Zephyrus-G14-GA403UI-QS048W-14--OLED-R9-8945HS-16GB-DDR5X-1TB-RTX-4070-8GB-WIN11",
    "title": "",
    "warranty": "24",
    "regular_price": "139.999",
    "happy_price": "129.999",
    "description": [
        "Лаптоп\nДисплеј: 14\" (39.6cm)\nРезолуција:(2880 x 1800) 3К, OLED, 120Hz, 0.2ms, 500 nits, 16:10\nПроцесор:AMD Ryzen™ 9 8945HS, 4GHz (24MB Cache, up to 5.2 GHz, 8 cores, 16 Threads)\nГрафика:NVIDIA GeForce RTX 4070 8GB GDDR6\nRAM меморија:16GB DDR5X\nДиск (Storage):1TB M.2 2280 PCIe Gen4 SSD\nОперативен систем:Windows 11 Home\nWi-Fi 6E(802.11ax) (Triple band) 2*2 + Bluetooth 5.3\nПорти:\n1x Type C USB 4 support DisplayPort    / power delivery\n1x USB 3.2 Gen 2 Type-C support DisplayPor\n2x USB 3.2 Gen 2 Type-A\n1x card reader (microSD) (UHS-II)\n1x HDMI 2.1 FRL\n1x 3.5mm Combo Audio Jack\nВгадени 3 микрофони\nBacklit Chiclet Keyboard 1-Zone RGB\n4x звучници (4-speaker (dual force woofer) system with Smart Amplifier Technology)\nАудио:Tech AI noise-canceling - Dolby Atmos\nHi-Res certification\nSmart Amp Technology\nSecurity BIOS Administrator Password and User Password Protection\nБатерија:73WHrs, 4S1P, 4-cell Li-ion"
    ],
    "category": "gaming_laptopi"
}
"""

#### Create Prompt Template for Gemini

In [27]:
# Prompt layout: the instruction template first, then the product to process.
# The first placeholder is spelled `tmplate` (sic); the spelling is kept
# because the cells that invoke the chain pass their inputs under that key.
prompt_template = """
{{tmplate}}

You need to extract the features of the provided product:
{{text}}
"""

# The declared input variables must match the placeholders actually used in
# the prompt ("tmplate", not "template"); otherwise the declaration and the
# keys supplied at invoke time disagree.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["tmplate", "text"],
    template_format="jinja2",
)
chain = LLMChain(llm=llm, prompt=prompt)

#### Testing Feature Extraction for a Single Product Text

In [13]:
# Send the example product through the chain once to eyeball the result.
single_product_inputs = {"tmplate": template, "text": text}
structured_data = chain.invoke(single_product_inputs)

# The chain result is a mapping; the generated reply is stored under 'text'.
print(structured_data['text'])

{
    "url": "https://www.neptun.mk/categories/gaming_laptopi/ASUS-ROG-Zephyrus-G14-GA403UI-QS048W-14--OLED-R9-8945HS-16GB-DDR5X-1TB-RTX-4070-8GB-WIN11",
    "title": "ASUS ROG Zephyrus G14 GA403UI-QS048W",
    "warranty_months": 24,
    "regular_price_mkd": 139999,
    "happy_price_mkd": 129999,
    "category": "gaming_laptopi",
    "features":{
        "device_type": "Gaming Laptop",
        "description": "Лаптоп\nДисплеј: 14\" (39.6cm)\nРезолуција:(2880 x 1800) 3К, OLED, 120Hz, 0.2ms, 500 nits, 16:10\nПроцесор:AMD Ryzen™ 9 8945HS, 4GHz (24MB Cache, up to 5.2 GHz, 8 cores, 16 Threads)\nГрафика:NVIDIA GeForce RTX 4070 8GB GDDR6\nRAM меморија:16GB DDR5X\nДиск (Storage):1TB M.2 2280 PCIe Gen4 SSD\nОперативен систем:Windows 11 Home\nWi-Fi 6E(802.11ax) (Triple band) 2*2 + Bluetooth 5.3\nПорти:\n1x Type C USB 4 support DisplayPort    / power delivery\n1x USB 3.2 Gen 2 Type-C support DisplayPor\n2x USB 3.2 Gen 2 Type-A\n1x card reader (microSD) (UHS-II)\n1x HDMI 2.1 FRL\n1x 3.5mm Combo Aud

#### Extract Features for All Products in the Gaming Laptops Category

In [28]:
# Folder holding the product JSON files to be processed (one file per
# product; only names ending in .json are picked up by the loop below).
input_directory = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_with_categories\gaming_laptopi"
# Folder that receives the processed JSON files.
output_directory = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\gaming_laptops\gaming_laptops_new_template"

# Create the output folder (and any missing parents) up front; exist_ok=True
# makes re-running this cell harmless when the folder is already there.
os.makedirs(output_directory, exist_ok=True)

In [29]:
# Run every product JSON file in `input_directory` through the extraction
# chain and write each result to `output_directory` as processed_<name>.
# Files whose model reply cannot be parsed as JSON are reported and skipped
# instead of being dropped silently.
skipped_files = []  # input files for which no output was written

# sorted() makes the processing order deterministic across runs.
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.json'):
        continue

    input_filepath = os.path.join(input_directory, filename)

    # Read the product and close the file before the slow model call.
    with open(input_filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Serialise under its own name so the notebook-level example `text` is not
    # overwritten. ensure_ascii=False keeps Cyrillic as real characters rather
    # than \uXXXX escapes, matching the single-product example.
    product_text = json.dumps(data, ensure_ascii=False)

    # Process the JSON data using the LangChain model.
    structured_data = chain.invoke({"tmplate": template, "text": product_text})
    if 'text' in structured_data:
        try:
            structured_data = json.loads(structured_data['text'])
        except JSONDecodeError as error:
            # The reply was not valid JSON: record it so the file can be
            # retried later, and move on to the next product.
            skipped_files.append(filename)
            print(f"Skipped {filename}: model reply is not valid JSON ({error})", file=sys.stderr)
            continue

    # Determine the output file path.
    output_filename = f"processed_{filename}"
    output_filepath = os.path.join(output_directory, output_filename)

    # Write the processed data; the file is UTF-8, so non-ASCII characters can
    # be stored as-is instead of being escaped.
    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        json.dump(structured_data, outfile, indent=4, ensure_ascii=False)

print(f"Processed JSON files are saved in {output_directory}")
if skipped_files:
    print(f"{len(skipped_files)} file(s) skipped (invalid JSON reply): {skipped_files}", file=sys.stderr)

Processed JSON files are saved in C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\gaming_laptops\gaming_laptops_new_template
