In [22]:
import pinecone
from dotenv import load_dotenv

# Load the API key from the .env file
load_dotenv()

import os
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
        api_key=''
)


# Define the name of the Pinecone index
index_name = "logblack-esg-bot" # SKT

# Create a new index or retrieve an existing one
if index_name in pc.list_indexes().names():
    index = pc.Index(index_name)
else:
    print("INDEX ERROR!")


In [23]:
from openai import OpenAI

client = OpenAI(api_key = '')

def get_embedding(text, model="text-embedding-3-large"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

# Example usage
text = """환경경영 이행 전담조직을 구축하고 역할과 책임을 공개하고 있는가?"""
embedding = get_embedding(text)
print(embedding)

[-0.009686898440122604, -0.01333430502563715, -0.00643701059743762, -0.014108420349657536, 0.023697683587670326, 0.01498017180711031, 0.0033998293802142143, -0.005003851372748613, -0.009149899706244469, -0.03832915425300598, 0.013669057749211788, -0.06153865531086922, -0.01557993609458208, 0.04008660465478897, -0.0014872074825689197, -0.01346681173890829, 0.016388921067118645, -0.038608115166425705, 0.05311405286192894, 0.02892819046974182, 0.024255603551864624, -0.07068855315446854, -0.05489939823746681, 0.01087945420295, -0.0222331415861845, 0.0036753027234226465, 0.012134775519371033, -0.01216267142444849, 0.022414464503526688, 0.039891332387924194, -0.000237552187172696, -0.00426460662856698, -0.00235024094581604, 0.009101081639528275, -0.0580795481801033, -0.008194460533559322, 0.019583016633987427, -0.042011432349681854, -0.04563791677355766, 0.008815147913992405, -0.027533387765288353, 0.03263836354017258, -0.016430765390396118, 0.010272715240716934, 0.0013503425288945436, 0.015

In [21]:
# Query the embedding from Pinecone
result = index.query(vector=embedding, top_k=5, include_metadata=True, namespace='krx017670-2023-kr')
print(result)

{'matches': [{'id': 'e579985c-bf3a-4d0e-be5e-cde756ff1d98',
              'metadata': {'doc_id': 'b631b995-5944-4e43-a558-b97e31e7bc53',
                           'source_url': 'https://logblack-public.s3.ap-northeast-2.amazonaws.com/KRX_GRI_Example/whole_pdf/whole_pdf/SK텔레콤_2023.pdf',
                           'text': '페이지 제목: 지속가능경영 E.S.G\n'
                                   '키워드: 환경경영, 지속가능성, ESG\n'
                                   '페이지 번호: 51\n'
                                   '핵심 내용:\n'
                                   '- 주요 내용:\n'
                                   '  - 환경경영: 지속 가능한 경영을 위한 환경적 측면 강조\n'
                                   '- 주요 이미지 설명:\n'
                                   '  - 파란색 배경에 "Environmental"과 "환경경영" 텍스트가 '
                                   '포함된 페이지 디자인'},
              'score': 0.4385297,
              'values': []},
             {'id': '63268065-95c3-4f7b-aedd-04e72a72c654',
              'metadata': {'doc_id': 'd64ca951-b71f-4f48-9f77-7e060ea

In [33]:
# 이미지 (pdf의 각 페이지를 image 로 만든 이미지) 도 요약

import base64
import os

from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI

os.environ['OPENAI_API_KEY'] =  'sk-proj-fMXnY9lQV8t71sSwCQSKT3BlbkFJfbwNMG9EJAtcvvIFdz1r'


def encode_image(image_path):
    # 이미지 파일을 base64 문자열로 인코딩합니다.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def image_summarize(img_base64, prompt):
    # 이미지 요약을 생성합니다.
    chat = ChatOpenAI(model="gpt-4o", max_tokens=2048)

    msg = chat.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                ]
            )
        ]
    )
    return msg.content


def generate_img_summary(path):
    """
    이미지에 대한 요약과 base64 인코딩된 문자열을 생성합니다.
    path: Unstructured에 의해 추출된 .jpg 파일 목록의 경로
    """

    # base64로 인코딩된 이미지를 저장할 리스트
    img_base64_list = []

    # 이미지 요약을 저장할 리스트
    image_summaries = []

    # 요약을 위한 프롬프트
    prompt = """You are an assistant tasked with summarizing images for retrieval. \
    These summaries will be embedded and used to retrieve the raw image. \
    Give a concise summary of the image that is well optimized for retrieval."""

    # 이미지에 적용
    base64_image = encode_image(path)
    return (image_summarize(base64_image, prompt))

# 이미지 요약 실행
image_summary = generate_img_summary("../data/reports/SK텔레콤2023/page_images/page_0051.jpg")

print(image_summary)

Title: SK Telecom Annual Report 2022 - Sustainability Management

Summary: 
The image is a page from the SK Telecom Annual Report 2022, focusing on sustainability management. It includes text in Korean discussing environmental management strategies and goals, particularly the '2050 Net Zero' target. The page features an illustration of individuals working on environmental initiatives around the globe, a strategic framework diagram with the goal '2050 Net Zero', and three key initiative areas: climate change response, environmental system enhancement, and green culture creation.

Key elements:
- Title: SK Telecom Annual Report 2022 - Sustainability Management
- Language: Korean
- Focus: Environmental management, '2050 Net Zero' strategy
- Illustration: Individuals working on a globe with eco-friendly symbols
- Diagram: Strategic framework with '2050 Net Zero' goal
- Three key initiatives: Climate change response, environmental system enhancement, green culture creation


In [18]:
load_dotenv()

True

In [4]:
import os

In [10]:
os.chdir("../")

In [9]:
load_dotenv()

True

In [6]:
ls

README.md              data.json              [1m[36mref_and_testing_codes[m[m/
[1m[36mcodes[m[m/                 [1m[36mipynbs[m[m/                [1m[36mvenv[m[m/
[1m[36mdata[m[m/                  krx.csv


In [11]:
# Configuring
from codes.data_handler.lc_docstore_handler.in_memory_docstore_handler import InMemoryDocstoreHandler
from codes.data_handler.lc_retrieverHandler.retriever_handler import RetrieverHandler
from codes.data_handler.report_handler.pdf_image_report_handler import PdfImageReportHandler
from codes.data_handler.summary_handler.gpt_summary_handler import GPTSummaryHandler
from codes.data_handler.lc_vectorstore_handler.pinecone_vectorstore_handler import PineconeVectorstoreHandler

from dotenv import load_dotenv
import pandas as pd
import os

# Load the .env file
load_dotenv()

report_data_dir = "./data/reports/"
report_name = "report.pdf"
file_list_df = pd.read_csv("./data/reports.csv")

target = "SK텔레콤"#"LG에너지솔루션"#
row = file_list_df[file_list_df.company_name == target].iloc[0]

company_name = row["company_name"]
year = row["year"]
url = f"{os.getenv('logblack_url')}{company_name}_{year}.pdf"

# FIXME: TESTING.
#url = "https://www.clickdimensions.com/links/TestPDFfile.pdf"

In [15]:
company = company_name
report_url = url

In [30]:


components = PdfImageReportHandler(
        company_name=company, year=year, report_url=report_url
        ).splitReport(report_data_dir, report_name)
summarized_components = GPTSummaryHandler().summary(components)


File downloaded successfully and saved to ./data/reports/SK텔레콤2023/report.pdf
Saved image to ./data/reports/SK텔레콤2023/page_images, 193 files
source_url summary generated
tables summary generated
page_images_path summary generated


In [42]:
import json
# Writing dictionary to a JSON file
with open("data.json", "w") as json_file:
    json.dump(summarized_components, json_file)

In [13]:
import json

# Reading dictionary from a JSON file
with open("data.json", "r") as json_file:
    summarized_components = json.load(json_file)

In [16]:


# USE LangChain from here.
vectorstore_handler = PineconeVectorstoreHandler(
        company_name=company, year=year, embeddingModel='text-embedding-3-large', postfix="kr"
    ).getStore()




Vector Store Connection Failed, fallback to load.
Pinecone is Cloud based Vector Database, load opeartion Failed.
Vector Store Load Failed, fallback to create


In [17]:
docstore_handler = InMemoryDocstoreHandler()
lc_docstore = docstore_handler.getStore()

lc_retriever = RetrieverHandler(vectorstore_handler, lc_docstore)
lc_retriever.add(summarized_components)
docstore_handler.export_to_file(report_data_dir+company_name+str(year)+"/store_data.json")