In [1]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import (
    HumanMessage,
)
from typing import List, Optional
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv
import os
import json
from langchain_google_genai import ChatGoogleGenerativeAI
from json import JSONDecodeError
import sys
import stat

  from .autonotebook import tqdm as notebook_tqdm


#### Load the Gemini API

In [2]:
# Load variables from a local .env file into the process environment,
# then read the Gemini key from there so it is never hardcoded in the notebook.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Initialize LangChain model
# The key must be passed as `google_api_key`; `token` is not a documented
# constructor parameter of ChatGoogleGenerativeAI, so the key was presumably
# only being picked up via the GOOGLE_API_KEY environment variable before.
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

#### Example Text

In [20]:
# Example product record, kept as a JSON string exactly as it would appear on disk.
# A raw string is required: in a normal string literal Python would consume the
# `\"` and `\n` escapes itself, leaving unescaped quotes and literal newlines
# inside JSON string values, i.e. an invalid JSON document.
text = r"""
{
    "url": "https://www.neptun.mk/categories/mobilni_telefoni/Nokia-105-Black--1GF019CPA2L03-",
    "title": "NOKIA 105, 1.8\", 1000MAH, BLACK",
    "warranty": "24",
    "regular_price": "1.599",
    "happy_price": "",
    "description": [
        "Мобилен телефон\nЕкран:1.8\" TFT LCD\nРезолуција: 160x120pixels, 111ppi, 4:3\nОтпорност на прав и вода: IP52\n2G мрежа\nРадио\nИзлез за слушалки\nПолнење: Micro-USB\nКапацитет на батеријата: 1000mAh"
    ],
    "category": "mobilni_telefoni"
}
"""

#### Read the Template

In [18]:
# Path to the feature template file (despite the `_dir` suffix, this points at a file).
pat_dir = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\mobile_phones\templates\template2.txt"

# Set the owner read/write permission bits on the template before opening it.
os.chmod(pat_dir, stat.S_IREAD | stat.S_IWRITE)

# Decode explicitly as UTF-8, matching the other file I/O in this notebook,
# instead of relying on the platform's default encoding.
with open(pat_dir, "r", encoding="utf-8") as f:
    template = f.read()

#### Create Prompt Template for Gemini

In [28]:
# Jinja2-style prompt with two placeholders:
#   tmplate - the feature template text (spelled this way because the
#             chain.invoke(...) calls in this notebook pass the key "tmplate")
#   text    - the product record to extract features from
prompt_template = """
{{tmplate}}

If you don't find a feature from the provided template, don't include it.
You need to extract the features of the provided product:
{{text}}
"""
# The declared input variables must match the placeholder names used in the
# template; the previous declaration listed "template", which no placeholder uses.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["tmplate", "text"],
    template_format="jinja2",
)
chain = LLMChain(llm=llm, prompt=prompt)

#### Testing Feature Extraction for a Single Product Text

In [24]:
# Run the chain once on the example product, filling both prompt placeholders.
structured_data = chain.invoke(dict(tmplate=template, text=text))

# The chain's reply is stored under the 'text' key of the result; show it as-is.
print(structured_data['text'])

{
    "url": "https://www.neptun.mk/categories/mobilni_telefoni/Nokia-105-Black--1GF019CPA2L03-",
    "title": "NOKIA 105, 1.8", 1000MAH, BLACK",
    "warranty_months": 24,
    "regular_price_mkd": 1599,
    "happy_price_mkd": null,
    "features":{
      "device_type": "Mobile Phone",
      "screen_size_inches": 1.8,
      "screen_type": "TFT LCD",
      "screen_resolution": "160x120pixels, 111ppi, 4:3",
      "water_resistance_ip_rating": "IP52",
      "feature_1": "2G network",
      "feature_2": "Radio",
      "feature_3": "Headphone jack",
      "usb_type": "Micro-USB",
      "battery_capacity_mah": 1000,
      "category": "mobilni_telefoni"
    }
}


#### Extract Features for All Products in the Mobile Phones Category

In [25]:
# Common root shared by the input and output folders below.
_project_root = r"C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over"

# Folder holding the source product JSON files.
input_directory = _project_root + r"\products_with_categories\mobilni_telefoni"
# Folder that receives the processed JSON files.
output_directory = _project_root + r"\products_categories_fixed\mobile_phones\mobile_phones_new_template"

# Make sure the destination exists; a pre-existing folder is not an error.
os.makedirs(output_directory, exist_ok=True)

In [29]:
def _parse_llm_json(raw_response):
    """Decode the model's reply as JSON, tolerating a surrounding Markdown code fence.

    Args:
        raw_response: The reply text produced by the chain.

    Returns:
        The decoded JSON value.

    Raises:
        JSONDecodeError: If the reply, after removing an optional code fence,
            is not valid JSON.
    """
    cleaned = raw_response.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line (``` or ```json) and everything from the
        # closing fence onwards, leaving only the fenced payload.
        _, _, cleaned = cleaned.partition("\n")
        cleaned = cleaned.rsplit("```", 1)[0]
    return json.loads(cleaned)


# Files whose model reply could not be used, recorded so failures are visible
# instead of being silently dropped.
skipped_files = []

# Loop through all files in the input directory (sorted for a reproducible order)
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.json'):
        continue
    input_filepath = os.path.join(input_directory, filename)

    # Read the product record, closing the file before the slow model call.
    with open(input_filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Use a dedicated name so the notebook-level example `text` is not overwritten.
    # ensure_ascii=False keeps non-ASCII characters readable in the prompt instead
    # of turning them into \uXXXX escape sequences.
    product_text = json.dumps(data, ensure_ascii=False)

    # Process the JSON data using the LangChain model
    response = chain.invoke({"tmplate": template, "text": product_text})
    raw_output = response.get('text')
    if raw_output is None:
        # Without a reply there is nothing meaningful to save for this product.
        skipped_files.append(filename)
        print(f"Skipping {filename}: chain result has no 'text' field")
        continue

    try:
        structured_data = _parse_llm_json(raw_output)
    except JSONDecodeError as err:
        skipped_files.append(filename)
        print(f"Skipping {filename}: model reply is not valid JSON ({err})")
        continue

    # Determine the output file path
    output_filename = f"processed_{filename}"
    output_filepath = os.path.join(output_directory, output_filename)

    # Write the processed data to the output file; the file is opened as UTF-8,
    # so non-ASCII text can be stored verbatim.
    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        json.dump(structured_data, outfile, indent=4, ensure_ascii=False)

print(f"Processed JSON files are saved in {output_directory}")
if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s): {skipped_files}")

Processed JSON files are saved in C:\Users\tomce\OneDrive - UKIM, FINKI\Desktop\Fakultet 3ta Godina\2 Sesti Semestar\0 DATA SCIENCE SEMINARSKA\1 Starting Over\products_categories_fixed\mobile_phones\mobile_phones_new_template
