In [None]:
import base64
import glob
import json
import os
import re

import streamlit as st
from dotenv import load_dotenv
from markdownify import markdownify as markdown

# Local imports keep their original relative order: later star imports and the
# explicit imports below may shadow names brought in by earlier star imports.
from utils.Classes import GraphState, LayoutAnalyzer
from utils.funcs import *
from utils.extracts import *
from utils.crops import *
from utils.creates import *
from utils.save import save_results

from utils.creates import create_text_trans_summary
from utils.vectordb import build_db
from utils.prompt import summary_prompt, map_prompt, trans_prompt

load_dotenv()

True

In [2]:
# NOTE(review): despite the OPENAI_* name, this holds the value of the
# UPSTAGE_API_KEY environment variable. os.environ.get returns None when the
# variable is unset, so a missing key is not detected here.
OPENAI_API_KEY = os.environ.get("UPSTAGE_API_KEY")

In [3]:
# Build the layout analyzer with the key read from UPSTAGE_API_KEY.
analyzer = LayoutAnalyzer(OPENAI_API_KEY)

In [4]:
# Path of the input PDF; it is stored in the state as 'filepath'.
file_path = './papers/objectvla.pdf'

In [5]:

# Shared pipeline state that every step below reads from and updates.
# NOTE(review): batch_size is presumably the number of pages per split PDF
# (the split output shows a 10-page file and a 1-page file) — confirm in split_pdf.
state = GraphState(filepath=file_path, batch_size=10)

In [6]:
# Split the source PDF; the result is a dict with a 'split_filepaths' list.
split_file_list = split_pdf(state)

총 페이지 수: 11
분할 PDF 생성: ./papers/objectvla_0000_0009.pdf
분할 PDF 생성: ./papers/objectvla_0010_0010.pdf


In [7]:
# Show the dict returned by split_pdf.
split_file_list

{'split_filepaths': ['./papers/objectvla_0000_0009.pdf',
  './papers/objectvla_0010_0010.pdf']}

In [8]:
# Merge the split file paths into the graph state.
state.update(split_file_list)

In [9]:
# Inspect the state after the split paths were merged in.
state

{'filepath': './papers/objectvla.pdf',
 'batch_size': 10,
 'split_filepaths': ['./papers/objectvla_0000_0009.pdf',
  './papers/objectvla_0010_0010.pdf']}

# PDF 파일 분석

In [10]:
# 1.1 Run the layout analyzer and store its base analysis results.
# The analyzer output is saved as JSON files (listed under 'analyzed_files' in the state).
state_out = analyze_layout(analyzer, state)
state.update(state_out)

In [11]:
# Inspect the state after layout analysis ('analyzed_files' has been added).
state

{'filepath': './papers/objectvla.pdf',
 'batch_size': 10,
 'split_filepaths': ['./papers/objectvla_0000_0009.pdf',
  './papers/objectvla_0010_0010.pdf'],
 'analyzed_files': ['./papers/objectvla_0000_0009.json',
  './papers/objectvla_0010_0010.json']}

In [12]:
# 1.2 Extract document metadata.
# These are the page-size parameters of the paper (stored under 'page_metadata').
state_out = extract_page_metadata(state)
state.update(state_out)

In [13]:
# Inspect the state after metadata extraction ('page_metadata' has been added).
state

{'filepath': './papers/objectvla.pdf',
 'batch_size': 10,
 'split_filepaths': ['./papers/objectvla_0000_0009.pdf',
  './papers/objectvla_0010_0010.pdf'],
 'analyzed_files': ['./papers/objectvla_0000_0009.json',
  './papers/objectvla_0010_0010.json'],
 'page_metadata': {0: {'size': [1275, 1650]},
  1: {'size': [1275, 1650]},
  2: {'size': [1275, 1650]},
  3: {'size': [1275, 1650]},
  4: {'size': [1275, 1650]},
  5: {'size': [1275, 1650]},
  6: {'size': [1275, 1650]},
  7: {'size': [1275, 1650]},
  8: {'size': [1275, 1650]},
  9: {'size': [1275, 1650]},
  10: {'size': [1275, 1650]}}}

In [14]:
# 1.3 Extract the HTML content describing the document's structure and contents.
# Information is extracted page by page.
state_out = extract_page_elements(state)
state.update(state_out)

In [15]:
# 1.4 Extract the tags of the document elements on each page.
state_out = extract_tag_elements_per_page(state)
state.update(state_out)

In [16]:
# 1.5 Extract the page numbers.
state_out = page_numbers(state)
state.update(state_out)

In [17]:
# 2.1 Crop the images, save them, and record their locations.
state_out = crop_image(state)
state.update(state_out)

page:0, id:3, path: ./papers/objectvla/3.png
page:3, id:33, path: ./papers/objectvla/33.png
page:4, id:47, path: ./papers/objectvla/47.png
page:4, id:50, path: ./papers/objectvla/50.png
page:4, id:59, path: ./papers/objectvla/59.png
page:4, id:63, path: ./papers/objectvla/63.png
page:5, id:71, path: ./papers/objectvla/71.png
page:10, id:121, path: ./papers/objectvla/121.png
page:10, id:123, path: ./papers/objectvla/123.png


In [18]:
# 2.2 Crop the tables, save them, and record their locations.
state_out = crop_table(state)
state.update(state_out)

page:5, id:76, path: ./papers/objectvla/76.png
page:6, id:85, path: ./papers/objectvla/85.png


In [19]:
# 2.3 Crop the equations, save them, and record their locations.
state_out = crop_equation(state)
state.update(state_out)

In [20]:
# 2.4 Extract the text, save it, and record its location.
state_out = extract_page_text(state)
state.update(state_out)

In [21]:
# Inspect the extracted text; the dict is keyed by integers (presumably page indices).
state["texts"]

{0: '# ObjectVLA: End-to-End Open-World Object Manipulation\nWithout Demonstration \n 2025\nFeb\n28\n[cs.RO]\narXiv:2502.19250v2Minjie Zhu12∗ Yichen Zhu1∗† Jinming Li3 Zhongyi Zhou2\nJunjie Wen2 Xiaoyu Liu3 Chaomin Shen2 Yaxin Peng3 Feifei Feng1\n1Midea Group 2East China Normal University 3Shanghai University\n∗Equal contribution †Corresponding authorFigure 1. A brief illustration of ObjectVLA. Conventional imitation learning focuses on tasks that involve both skills and objects. While\nit performs well on seen objects and sometimes generalizes to similar ones (e.g., objects with changed colors), it typically fails with novel\nobjects. By co-training with image-text data, our approach enables VLA models to generalize to any object present in the image-text\ndataset. Additionally, users can capture object images, automatically generate image-text data, and fine-tune a pre-trained VLA model\nwith minimal resources to learn manipulation on novel objects.',
 1: '# Abstract \n Imitation lea

In [22]:
# Number of entries currently held in html_content.
len(state['html_content'])

115

In [None]:
# Save a Markdown file that carries the same name as the PDF.

pdf_file = state["filepath"]  # path of the source PDF
# Output folder: the PDF path with its extension removed.
output_folder = os.path.splitext(pdf_file)[0]
# Base name without the extension. splitext strips only the final suffix, so a
# dotted name such as "paper.v2.pdf" yields "paper.v2" and stays consistent with
# the folder name derived above (split('.')[0] would have produced "paper").
filename = os.path.splitext(os.path.basename(pdf_file))[0]

md_output_file1 = save_results(output_folder, filename, state['html_content'])

In [26]:
# Path of the Markdown file returned by save_results.
md_output_file1

'./papers/objectvla/objectvla.md'

# 생성 
- 번역 
- 요약

In [33]:
# Model name handed to get_chain / get_translator when the chains are built.
selected_model = 'gpt-4o-mini'

In [34]:
# Build one chain per prompt: text summary (summary_prompt), paper summary
# (map_prompt), and the translator (trans_prompt).
text_summary_chain = get_chain(selected_model, summary_prompt)
paper_summary_chain = get_chain(selected_model, map_prompt)
trans_chain = get_translator(selected_model, trans_prompt)

In [29]:
# Generate the text summaries.
state_out = create_text_summary(text_summary_chain, state)
state.update(state_out)

# Run the map-reduce summary with paper_summary_chain
# (presumably this produces the 'paper_summary' entry read later — confirm in utils).
state_out = map_reduce_summary(paper_summary_chain, state)
state.update(state_out)

# Translate the summaries.
# NOTE(review): trans_chain is rebuilt here with the same arguments used in the
# chain-setup cell, so this line duplicates that construction.
trans_chain = get_translator(selected_model, trans_prompt)
state_out = create_text_trans_summary(trans_chain, state)
state.update(state_out)


In [28]:
# # 원본 번역
# trans_chain = get_translator(selected_model, trans_prompt)
# state_out = create_text_trans(trans_chain, state)
# state.update(state_out)

In [30]:
# Generate the image summaries: first build the data batches, then summarize them.
state_out = create_image_summary_data_batches(state)
state.update(state_out)

state_out = create_image_summary(state)
state.update(state_out)

In [35]:
# Generate the table summaries: first build the data batches, then summarize them.
state_out = create_table_summary_data_batches(state)
state.update(state_out)


state_out = create_table_summary(state)
state.update(state_out)

In [36]:
# Generate the equation summaries: first build the data batches, then summarize them.
state_out = create_equation_summary_data_batches(state)
state.update(state_out)

state_out = create_equation_summary(state)
state.update(state_out)

In [37]:
# 4 Rebuild the tables as Markdown tables.
state_out = create_table_markdown(state)
state.update(state_out)

In [38]:
# Insert each equation summary into html_content as its own <p> element.
# NOTE(review): assumes each key of 'equation_summary' is the index of the source
# element in html_content, so the summary lands right after it — confirm in utils.

# Process keys in ascending numeric order: every insertion shifts the following
# indices by one, and the running offset is only correct when smaller ids come first.
equation_items = sorted(state['equation_summary'].items(), key=lambda item: int(item[0]))
for cnt, (key, value) in enumerate(equation_items, start=1):
    equation_html = f"<p id='{key}_1' data-category='equation' style='font-size:14px'>{value}</p>"
    # Skip entries that are already present so re-running the cell does not
    # insert duplicates; the offset still advances because the slot is taken.
    if equation_html in state['html_content']:
        continue
    state['html_content'].insert(cnt + int(key), equation_html)

In [39]:
# Save again, overwriting the analysis files with the generated content included.
md_output_file = save_results(output_folder, filename, state['html_content'])

HTML 파일이 ./papers/objectvla/objectvla.html에 저장되었습니다.
Markdown 파일이 ./papers/objectvla/objectvla.md에 저장되었습니다.


In [40]:

output_file = output_folder + '/' + filename + "_analy.json"

# Persist the whole pipeline state (PDF structure plus generated content) as JSON.
# json.dump writes integer dict keys as strings, which is why the summary cells
# convert the keys back with int() after reloading this file.
with open(output_file, "w", encoding='utf-8') as file:
    json.dump(state, file, ensure_ascii=False)

# Remove the intermediate split PDFs and layout-analysis JSON files.
# Paths that are already gone are skipped so the cell can be re-run after a
# successful cleanup without raising FileNotFoundError.
for del_file in state['split_filepaths'] + state['analyzed_files']:
    if os.path.exists(del_file):
        os.remove(del_file)

# Remove the HTML file produced along the way (currently disabled).
# os.remove('.'.join(file_path.split('.')[:-1]) + f'/{filename}.html')

In [41]:
# # vectordb 만들기 
# build_db(state)

# 요약 내용 

- 원본 요약 내용 
- 한국어 요약 내용 

구분해서 하자

## 원본 영어 번역

원본 md 파일을 읽어서 한국어로 통번역 


In [None]:
# Translate the saved Markdown paper chunk by chunk and write <filename>_trans.md.
original_paper_md = f"{output_folder}/{filename}.md"
# NOTE(review): load_and_split presumably splits the Markdown by headers (the first
# chunk's metadata is read for 'Header 1' below) — confirm in utils.
new_docs = load_and_split(original_paper_md)
# The first chunk contributes only its 'Header 1' value as an untranslated title;
# its body is not included. All remaining chunks go through trans_chain in one batch.
translated_paragraph = ['# ' + new_docs[0].metadata['Header 1']] + trans_chain.batch(new_docs[1:])
combined_content = "\n".join(translated_paragraph)
# markdown is markdownify's converter, imported at the top of the notebook.
md_output = markdown(combined_content)

trans_paper_md = f"{output_folder}/{filename}_trans.md"
with open(trans_paper_md, "w", encoding="utf-8") as f:
    f.write(md_output)

print(trans_paper_md)

./papers/objectvla/objectvla.md


## 영어 요약

In [65]:
# Load the saved analysis JSON back from disk.
output_file = output_folder + '/'+ filename + "_analy.json"
with open(output_file, "r", encoding='utf-8') as f:
    json_data = json.load(f)

In [81]:
# Assemble the English summary: one section per key of 'texts_summary', followed
# by the figure and table images recorded for that page.
markdown_contents = []  # Markdown fragments, joined when the file is written

names = json_data['section_names']
for i, page_key in enumerate(json_data['texts_summary'].keys()):
    # Keys come back from JSON as strings; section names and the batch 'page'
    # fields are looked up / compared with the integer value.
    page = int(page_key)
    if names[page] == 'References':
        continue
    print(names[page])

    # The first entry carries the whole-paper summary instead of its own text summary.
    if i == 0:
        text_summary = json_data['paper_summary']
    else:
        text_summary = json_data['texts_summary'][page_key]

    markdown_contents.append(f'# {names[page]}')
    markdown_contents.append(text_summary)

    # Figures cropped from this page, referenced by file name only.
    for image_batch in json_data['image_summary_data_batches']:
        if image_batch['page'] == page:
            img_file = os.path.basename(image_batch['image'])
            img_name = os.path.splitext(img_file)[0]
            markdown_contents.append(f'\n ![{img_name}]({img_file}) \n')

    # Table images cropped from this page, referenced by file name only.
    for table_batch in json_data['table_summary_data_batches']:
        if table_batch['page'] == page:
            table_img_file = os.path.basename(table_batch['table'])
            table_img_name = os.path.splitext(table_img_file)[0]
            markdown_contents.append(f'\n ![{table_img_name}]({table_img_file}) \n')



ObjectVLA: End-to-End Open-World Object Manipulation
Without Demonstration
Abstract
1. Introduction
2. Related Work
3. Methodology
4. Experiments
Photo taken by cameras from robot
Question Detecting the bounding box of stick.
Question Detecting the bounding box of yellow dragon.
5. Conclusion
6. Limitation


In [83]:
# Write the English summary Markdown file.

# Join the collected Markdown fragments with newlines and save them as one file.
markdown_file_path = f'{output_folder}/{filename}_summary_en.md'
with open(markdown_file_path, 'w', encoding='utf-8') as summary_file:
    summary_markdown = '\n'.join(markdown_contents)
    summary_file.write(summary_markdown)

## 한국어 요약

In [84]:
# Reload the analysis JSON (the same output_file that the English-summary section reads).
with open(output_file, "r", encoding='utf-8') as f:
    json_data = json.load(f)

In [85]:
# Section names; the summary loop indexes this with the integer page key.
names = json_data['section_names']

In [86]:
# Assemble the Korean summary: one section per key of 'texts_summary', followed
# by the figure and table images recorded for that page.
markdown_contents = []  # Markdown fragments, joined when the file is written

for i, page_key in enumerate(json_data['texts_summary'].keys()):
    # Keys come back from JSON as strings; section names and the batch 'page'
    # fields are looked up / compared with the integer value.
    page = int(page_key)
    if names[page] == 'References':
        continue
    print(names[page])

    # The first entry carries the translated whole-paper summary; every other
    # entry uses its own translated text summary.
    if i == 0:
        text_summary = json_data['paper_trans_summary']
    else:
        text_summary = json_data['texts_trans_summary'][page_key]

    markdown_contents.append(f'# {names[page]}')
    markdown_contents.append(text_summary)

    # Figures cropped from this page, referenced by file name only.
    for image_batch in json_data['image_summary_data_batches']:
        if image_batch['page'] == page:
            img_file = os.path.basename(image_batch['image'])
            img_name = os.path.splitext(img_file)[0]
            markdown_contents.append(f'![{img_name}]({img_file})')

    # Table images cropped from this page, referenced by file name only.
    for table_batch in json_data['table_summary_data_batches']:
        if table_batch['page'] == page:
            table_img_file = os.path.basename(table_batch['table'])
            table_img_name = os.path.splitext(table_img_file)[0]
            markdown_contents.append(f'![{table_img_name}]({table_img_file})')

ObjectVLA: End-to-End Open-World Object Manipulation
Without Demonstration
Abstract
1. Introduction
2. Related Work
3. Methodology
4. Experiments
Photo taken by cameras from robot
Question Detecting the bounding box of stick.
Question Detecting the bounding box of yellow dragon.
5. Conclusion
6. Limitation


In [87]:
# Destination of the Korean summary Markdown file.
markdown_file_path = f'{output_folder}/{filename}_summary_ko.md'

In [88]:
# Join the collected Markdown fragments with newlines and write them to markdown_file_path.
with open(markdown_file_path, 'w', encoding='utf-8') as summary_file:
    summary_markdown = '\n'.join(markdown_contents)
    summary_file.write(summary_markdown)

# 채팅하기