In [36]:
import os
import pymupdf
import json

PDF = "../pdfs"
SPM = "../spms"
CROP = "../crops"
ANNOTATIONS = "../annotations"  

project_id = 10
pdf_path = os.path.join(PDF, f"{project_id}_cs231n_2017_lecture2.pdf")
spm_path = os.path.join(SPM, f"15_page_info.json")
annnotation_path = os.path.join(ANNOTATIONS, f"{project_id}_annotation.json")

with open(spm_path, 'r') as file:
    page_info = json.load(file)

with open(annnotation_path, "r") as file:
    annotations = json.load(file)

output = page_info

doc = pymupdf.open(pdf_path)

for page_num in range(len(doc)):
    page = doc.load_page(page_num)
    text = page.get_text("text")  
    images_info = page.get_image_info(xrefs=True)  
    image_path = os.path.join(CROP, str(project_id), str(page_num + 1)) 
    os.makedirs(image_path, exist_ok=True)

    crop_images = []
    for image_index, img_info in enumerate(images_info):
        xref = img_info['xref']  # 이미지의 xref 값
        base_image = doc.extract_image(xref)  # 이미지 데이터 추출
        image_bytes = base_image["image"]  # 이미지 바이트 데이터
        crop_path = os.path.join(image_path, f"{image_index + 1}.png")  
        crop_images.append(crop_path)
        # 이미지 저장
        with open(crop_path, "wb") as img_file:
            img_file.write(image_bytes)

    output["pages"][str(page_num+1)]["pdf_text"] = text    
    output["pages"][str(page_num+1)]["pdf_images"] = crop_images
    output["pages"][str(page_num+1)]["annotation"] = annotations[str(page_num+1)]

with open(spm_path, 'w') as file:
    json.dump(output, file, indent=4)



In [None]:
ANNOTATIONS = "../annotations"  

annnotation_path = os.path.join(ANNOTATIONS, f"{project_id}_annotation.json")

with open(annnotation_path, "r") as file:
    annotations = json.load(file)




In [40]:

for page_num in range(len(doc)):
    page = doc.load_page(page_num)
    images_info = page.get_image_info(xrefs=True)  
    image_path = os.path.join(CROP, str(project_id), str(page_num + 1)) 
    os.makedirs(image_path, exist_ok=True)

    crop_images = []
    for image_index, img_info in enumerate(images_info):
        xref = img_info['xref']  # 이미지의 xref 값
        base_image = doc.extract_image(xref)  # 이미지 데이터 추출
        image_bytes = base_image["image"]  # 이미지 바이트 데이터
        crop_path = os.path.join(image_path, f"{image_index + 1}.png")  
        crop_images.append(crop_path)
        # 이미지 저장
        with open(crop_path, "wb") as img_file:
            img_file.write(image_bytes)
        
    print(page_num, crop_images)




TypeError: a bytes-like object is required, not 'int'

In [66]:
import os
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

def calculate_clip_similarity(image_dir, search_query):
    # CLIP 모델 및 프로세서 로드
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    # 검색어를 텍스트 토큰으로 변환
    text_inputs = processor(text=[search_query], return_tensors="pt", padding=True).to(device)

    similarities = {}

    # 이미지 디렉터리 내의 모든 이미지 파일에 대해 유사도 계산
    for image_name in os.listdir(image_dir):
        if image_name.lower().endswith('png'):
            image_path = os.path.join(image_dir, image_name)
            print(image_path)
            image = Image.open(image_path)

            # 이미지를 CLIP 모델의 입력 형식으로 변환
            image_inputs = processor(images=image, return_tensors="pt").to(device)

            # CLIP 모델을 사용하여 이미지와 텍스트의 특성 벡터 추출
            with torch.no_grad():
                image_features = model.get_image_features(**image_inputs)
                text_features = model.get_text_features(**text_inputs)

            # 이미지와 텍스트 특성 간의 코사인 유사도 계산
            image_features /= image_features.norm(dim=-1, keepdim=True)
            text_features /= text_features.norm(dim=-1, keepdim=True)
            similarity = (image_features @ text_features.T).item()  # 유사도 계산

            similarities[image_name] = similarity

    # 유사도 결과를 내림차순으로 정렬
    sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

    # 결과 출력
    print("Similarity results:")
    for image_name, similarity in sorted_similarities:
        print(f"{image_name}: {similarity:.4f}")

# 사용 예시
image_dir = "../crops/10/10"  # 이미지가 저장된 디렉터리
search_query = "a standing cat"  # 검색어

calculate_clip_similarity(image_dir, search_query)




../crops/10/10/4.png
../crops/10/10/2.png
../crops/10/10/3.png
../crops/10/10/1.png
Similarity results:
2.png: 0.3069
1.png: 0.2714
4.png: 0.2637
3.png: 0.2379


In [67]:
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("../crops/10/10/2.png")).unsqueeze(0).to(device) # 원하는 이미지 경로로 수정
text = clip.tokenize("a standing cat").to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    # logits_per_image, logits_per_text = model(image, text)
    # probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    # 코사인 유사도를 계산
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = (image_features @ text_features.T).item()  # 유사도 값 계산

# print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]
print(f"Similarity: {similarity}")


Similarity: 0.3068891167640686


In [69]:
from sentence_transformers import SentenceTransformer, util

# 모델 로드
model = SentenceTransformer('all-MiniLM-L6-v2')

# 두 텍스트 정의
text1 = "This is a sample sentence."
text2 = "This sentence is similar to a sample sentence."

# 텍스트 임베딩 계산
embedding1 = model.encode(text1, convert_to_tensor=True)
embedding2 = model.encode(text2, convert_to_tensor=True)

# 코사인 유사도 계산
similarity = util.pytorch_cos_sim(embedding1, embedding2).item()

print(f"Similarity: {similarity}")


Similarity: 0.8880940675735474


In [81]:
from sentence_transformers import SentenceTransformer, util
import torch
import clip
from PIL import Image

# 모델 로드
device = "cuda" if torch.cuda.is_available() else "cpu"
text_model = SentenceTransformer('all-MiniLM-L6-v2')  # Sentence Transformers 모델
clip_model, preprocess = clip.load("ViT-B/32", device=device)

def calculate_similarity(data, query):
    # Query 텍스트 임베딩 계산
    query_embedding = text_model.encode(query, convert_to_tensor=True)

    results = {}

    for page, content in data.items():
        page_result = {}
        
        # text 유사도 계산
        text_embedding = text_model.encode(content["script"], convert_to_tensor=True)
        page_result["script"] = util.pytorch_cos_sim(query_embedding, text_embedding).item()

        # pdf_text 유사도 계산
        pdf_text_embedding = text_model.encode(content["pdf_text"], convert_to_tensor=True)
        page_result["pdf_text"] = util.pytorch_cos_sim(query_embedding, pdf_text_embedding).item()

        # pdf_images 유사도 계산 (CLIP 사용)
        image_similarities = []
        for image_path in content["pdf_images"]:
            image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
            with torch.no_grad():
                image_features = clip_model.encode_image(image)
                image_features /= image_features.norm(dim=-1, keepdim=True)
                
                # Query 텍스트 임베딩도 CLIP을 사용해 계산
                text_tokens = clip.tokenize([query]).to(device)
                text_features = clip_model.encode_text(text_tokens)
                text_features /= text_features.norm(dim=-1, keepdim=True)

                similarity = (image_features @ text_features.T).item()
                image_similarities.append(similarity)

        # 여러 이미지가 있을 경우 평균 유사도 계산
        if image_similarities:
            page_result["pdf_image"] = sum(image_similarities) / len(image_similarities)

        results[page] = page_result

    return results


with open(spm_path, 'r') as file:
    page_info = json.load(file)
# 데이터 예시
data = page_info

query = "administration"  # 검색어

result = {}
# 유사도 계산
similarities = calculate_similarity(data, query)
result["query"] = query
result["similarities"] = similarities

SIM = "../sims"
search_id = 1
sim_path = os.path.join(SIM, str(project_id))
os.makedirs(sim_path, exist_ok=True)
sim_json_path = os.path.join(sim_path, f"{search_id}.json")

with open(sim_json_path, 'w') as file:
    json.dump(result, file, indent=4)

In [82]:
strr = "administration"

len(strr)

14

In [1]:
from PIL import Image
import pytesseract

def perform_ocr_on_image(image_path):
    # 이미지 파일을 열기
    image = Image.open(image_path)
    
    # OCR을 수행하여 이미지에서 텍스트 추출
    extracted_text = pytesseract.image_to_string(image)
    
    # 추출된 텍스트 출력
    print("Extracted Text from OCR:")
    print(extracted_text)

image_path = "../annotations/11/drawing_1_processed.png"  
perform_ocr_on_image(image_path)

Extracted Text from OCR:



In [4]:
import easyocr

# EasyOCR Reader 초기화 (한국어 및 영어 지원)
reader = easyocr.Reader(['en'])

# 이미지 경로 설정
image_path = "../annotations/11/drawing_1_processed.png"  

# 이미지에서 텍스트 추출
result = reader.readtext(image_path)

print(result)
# 결과 출력
for res in result:
    print(res[1])


[([[84, 54], [230, 54], [230, 176], [84, 176]], 'APple', 0.30573700128243714)]
APple


In [3]:
strr = ""

if strr:
    print("True")


In [7]:
import os
import json
from collections import defaultdict

SPM = "../spms"
SIMILARITY = "../similarity"

project_id = 10
search_id = 1
search_type = "keyword" # "semantic" 또는 "keyword"
spm_path = os.path.join(SPM, f"{project_id}_page_info.json")
search_path = os.path.join(SIMILARITY, str(project_id), f"{search_id}_{search_type}.json")
page_from_script = []
page_from_pdf_text = []
page_from_annotation = []

with open(spm_path, 'r') as file:
    page_info = json.load(file)

query = "apple"  # 검색어

result = {}
result["query"] = query
result["source"] = defaultdict(list)

for page, content in page_info["pages"].items():
    # script에서 검색어가 나타나는 페이지
    if query.lower() in content["script"].lower():
        result["source"]["script"].append(page)
    # pdf_text에서 검색어가 나타나는 페이지
    if query.lower() in content["pdf_text"].lower():
        result["source"]["pdf_text"].append(page)
    # annotation에서 검색어가 나타나는 페이지
    if query.lower() in content["annotation"].lower():
        result["source"]["annotation"].append(page)

with open(search_path, 'w') as file:
    json.dump(result, file, indent=4)

In [28]:
import fitz
import os
import json
import pymupdf

PDF = "../pdfs"
SPM = "../spms"
CROP = "../crops"
ANNOTATIONS = "../annotations"

def get_pdf_text_and_image(para_id, pdf_path):
    doc = pymupdf.open(pdf_path)
    page_num = int(para_id) - 1
    page = doc.load_page(page_num)
    text = page.get_text("text")
    images_info = page.get_image_info(xrefs=True)
    image_path = os.path.join(CROP, str(project_id), str(page_num + 1))
    os.makedirs(image_path, exist_ok=True)

    crop_images = []
    for image_index, img_info in enumerate(images_info):
        xref = img_info["xref"]  # 이미지의 xref 값
        base_image = doc.extract_image(xref)  # 이미지 데이터 추출
        image_bytes = base_image["image"]  # 이미지 바이트 데이터
        crop_path = os.path.join(image_path, f"{image_index + 1}.png")
        crop_images.append(crop_path)
        # 이미지 저장
        with open(crop_path, "wb") as img_file:
            img_file.write(image_bytes)
            
    print(text)
    print(crop_images)
    return text, crop_images

In [43]:
project_id = 15

pdf_file = [
        file
        for file in os.listdir(PDF)
        if file.startswith(f"{project_id}_") and file.endswith(".pdf")
    ]    
pdf_path = os.path.join(PDF, pdf_file[0])

pdf_path = '/Users/yikim/Downloads/cs231n_2017_lecture2 (1)-pages.pdf'

matched_file_path = os.path.join(SPM, f"{project_id}_matched_paragraphs.json")
with open(matched_file_path, "r") as matched_file:
    matched_data = json.load(matched_file)

print(pdf_path)

for para_id, paragraph_text in matched_data.items():
    get_pdf_text_and_image(para_id, pdf_path)


/Users/yikim/Downloads/cs231n_2017_lecture2 (1)-pages.pdf
Fei-Fei Li & Justin Johnson & Serena Yeung
Lecture 2 - 
April 6, 2017
Fei-Fei Li & Justin Johnson & Serena Yeung
Lecture 2 - 
April 6, 2017
1

[]
Fei-Fei Li & Justin Johnson & Serena Yeung
Lecture 2 - 
April 6, 2017
Administrative: Piazza
For questions about midterm, poster session, projects, 
use Piazza instead of staff list!
SCPD students: Use your @stanford.edu address to register for Piazza; contact 
scpd-customerservice@stanford.edu for help.
2

[]
Fei-Fei Li & Justin Johnson & Serena Yeung
Lecture 2 - 
April 6, 2017
Administrative: Assignment 1
Out tonight, due 4/18 11:59pm
- K-Nearest Neighbor
- Linear classifiers: SVM, Softmax
- Two-layer neural network
- Image features
3

[]
Fei-Fei Li & Justin Johnson & Serena Yeung
Lecture 2 - 
April 6, 2017
Administrative: Python + Numpy
4
http://cs231n.github.io/python-numpy-tutorial/ 

['../crops/15/4/1.png']
Fei-Fei Li & Justin Johnson & Serena Yeung
Lecture 2 - 
April 6, 2017
Adm

In [5]:
import fitz
import os
import json

project_id = 13
PDF = "../pdfs"
SPM = "../spms"
CROP = "../crops"
ANNOTATIONS = "../annotations"

pdf_file = [
        file
        for file in os.listdir(PDF)
        if file.startswith(f"{project_id}_") and file.endswith(".pdf")
    ]

pdf_path = os.path.join(PDF, pdf_file[0])


def get_pdf_text_and_image(project_id, pdf_path):
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        raise ValueError(f"Failed to open the PDF file: {e}")

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text("text")
        images_info = page.get_images(full=True)
        image_path = os.path.join(CROP, str(project_id), str(page_num + 1))
        os.makedirs(image_path, exist_ok=True)

        crop_images = []
        for image_index, img_info in enumerate(images_info):
            xref = img_info[0]  # 이미지의 xref 값, 첫 번째 요소로 위치를 가져옴
            try:
                base_image = doc.extract_image(xref)  # 이미지 데이터 추출
                image_bytes = base_image["image"]  # 이미지 바이트 데이터
                crop_path = os.path.join(image_path, f"{image_index + 1}.png")
                crop_images.append(crop_path)
                # 이미지 저장
                with open(crop_path, "wb") as img_file:
                    img_file.write(image_bytes)
            except ValueError as e:
                print(f"Skipping invalid xref {xref} on page {page_num + 1}: {e}")
                continue

        return text, crop_images


text, crop_images = get_pdf_text_and_image(project_id, pdf_path)

print(text)
print(crop_images)

Public Economics Lectures
Part 1: Introduction
Raj Chetty and Gregory A. Bruich
Harvard University
Fall 2012
Public Economics Lectures
()
Part 1: Introduction
1 / 49

[]


In [9]:
project_id = 13
spm_path = os.path.join(SPM, f"{project_id}_page_info.json")
with open(spm_path, 'r') as file:
    page_info = json.load(file)

for para_id, para_info in page_info["pages"].items():
    if para_info["gpt_timestamp"]["start"] < para_info["gpt_timestamp"]["end"]:
        print(para_id)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [1]:
print('hi')

hi
