In [1]:
# Import necessary libraries
import os
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List
from dotenv import load_dotenv
import fitz  # PyMuPDF

# Load environment variables
load_dotenv()

  from tqdm.autonotebook import tqdm, trange


True

In [2]:
# Initialize Elasticsearch client.
# The endpoint can be overridden with the ELASTICSEARCH_URL environment variable
# (for example from the .env file read by load_dotenv() above); when it is not
# set, the local development default is used.
es = Elasticsearch([os.environ.get("ELASTICSEARCH_URL", "http://localhost:9200")])

In [3]:
# Load the sentence-embedding model; `embedder.encode` is what the indexing and
# search helpers below call to turn text into vectors.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)




In [23]:
def create_index_if_not_exists(index_name: str, recreate: bool = True, dims: int = 384):
    """Create the Elasticsearch index that stores paper chunks and their embeddings.

    Despite its name, this helper has always dropped an existing index before
    creating a fresh one. That remains the default so existing callers behave
    exactly as before; pass ``recreate=False`` to get the non-destructive
    behaviour the name suggests.

    Uses the module-level ``es`` client.

    Args:
        index_name: Name of the index to create.
        recreate: When True (default), delete an existing index of the same
            name first, discarding its documents. When False, leave an
            existing index untouched and return without creating anything.
        dims: Dimensionality declared for the ``embedding`` dense-vector field.
            It must equal the length of the vectors that will be stored there.

    Returns:
        None.
    """
    if es.indices.exists(index=index_name):
        if not recreate:
            print(f"Index '{index_name}' already exists. Keeping it.")
            return
        print(f"Index '{index_name}' exists. Deleting it...")
        es.indices.delete(index=index_name)

    print(f"Creating new index '{index_name}'...")
    mapping = {
        'mappings': {
            'properties': {
                'paper_id': {'type': 'keyword'},
                'chunk_id': {'type': 'integer'},
                'title': {'type': 'text'},
                'text': {'type': 'text'},
                # Vector field holding one embedding per chunk, compared by cosine similarity.
                'embedding': {
                    'type': 'dense_vector',
                    'dims': dims,
                    'index': True,
                    'similarity': 'cosine'
                }
            }
        }
    }
    es.indices.create(index=index_name, body=mapping)
    print(f"Index '{index_name}' created successfully.")


In [24]:
# (Re)create the index that will hold the paper chunks.
# Note: the helper drops any existing index of this name before creating it.
create_index_if_not_exists(index_name='paper_chunks')

Index 'paper_chunks' exists. Deleting it...
Creating new index 'paper_chunks'...
Index 'paper_chunks' created successfully.


In [25]:
def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The ``get_text()`` result of each page joined together with no
        separator; an empty string when the document has no pages.
    """
    # The context manager closes the document once all pages have been read.
    with fitz.open(file_path) as pdf:
        return "".join(page.get_text() for page in pdf)

In [26]:
# Location of the PDF to ingest. Set the PDF_PATH environment variable (for
# example in .env) to run the notebook on another machine or another paper;
# the original author's local path is kept as the fallback.
pdf_path = os.environ.get("PDF_PATH", "/Users/isaackargar/Downloads/volker-2023-noncontact.pdf")
text = extract_text_from_pdf(pdf_path)
print(f"Extracted {len(text)} characters from the PDF.")


Extracted 35222 characters from the PDF.


In [28]:
def chunk_text(text: str, max_length: int = 500) -> List[str]:
    """Split text into chunks of whole sentences, each at most ``max_length`` characters.

    Sentences are delimited by ``'. '``. Consecutive sentences are packed into
    the current chunk until adding the next one would push the stripped chunk
    past ``max_length``; a new chunk is started at that point.

    Args:
        text: The text to split.
        max_length: Target upper bound on the length of a chunk, in characters.

    Returns:
        The chunks in document order, each stripped of surrounding whitespace.
        No chunk is empty, and empty or whitespace-only input yields ``[]``.
        A single sentence longer than ``max_length`` is not cut; it is
        returned as its own oversized chunk.
    """
    separator = '. '
    sentences = text.split(separator)
    last_index = len(sentences) - 1

    chunks: List[str] = []
    current_chunk = ''
    for position, sentence in enumerate(sentences):
        # split() removed the delimiter between sentences; put it back for every
        # piece except the last one, which never had a delimiter after it.
        piece = sentence + separator if position < last_index else sentence
        if not piece.strip():
            continue
        # Only flush a non-empty chunk, so an oversized first sentence does not
        # produce an empty leading chunk.
        if current_chunk and len(current_chunk) + len(piece.rstrip()) > max_length:
            chunks.append(current_chunk.strip())
            current_chunk = ''
        current_chunk += piece
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

In [29]:
# Split the extracted PDF text into chunks of at most 500 characters
chunks = chunk_text(text=text, max_length=500)
chunk_count = len(chunks)
print(f"Created {chunk_count} chunks from the text.")


Created 87 chunks from the text.


In [30]:
def index_chunks(paper_id: str, title: str, chunks: List[str], index_name: str = "paper_chunks") -> int:
    """Embed the chunks of one paper and bulk-index them, one document per chunk.

    Uses the module-level ``embedder`` to compute the vectors and the
    module-level ``es`` client to send them. Every document gets the id
    ``f"{paper_id}_{idx}"``, so indexing the same paper again reuses the same
    ids.

    Args:
        paper_id: Identifier stored in ``paper_id`` and used as the id prefix.
        title: Title stored with every chunk.
        chunks: Chunk texts, in document order; the position becomes ``chunk_id``.
        index_name: Target index. Defaults to ``"paper_chunks"``, the index the
            notebook has always written to.

    Returns:
        The number of documents reported as successfully indexed by ``bulk``
        (0 when ``chunks`` is empty, in which case nothing is sent).
    """
    chunks = list(chunks)
    if not chunks:
        return 0

    # Encode all chunks in a single call so the model can batch them,
    # instead of invoking it once per chunk.
    embeddings = embedder.encode(chunks)

    actions = [
        {
            "_index": index_name,
            "_id": f"{paper_id}_{idx}",
            "_source": {
                'paper_id': paper_id,
                'chunk_id': idx,
                'title': title,
                'text': chunk,
                # tolist() converts the vector into plain Python floats for JSON.
                'embedding': embedding.tolist()
            }
        }
        for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]
    success_count, _ = bulk(es, actions)
    return success_count

In [31]:
# Index the chunks.
# The file name without its directory and final extension serves as both the
# paper id and the title (e.g. ".../my.paper.v2.pdf" -> "my.paper.v2"); cutting
# at the first "." would truncate names that contain dots.
paper_id = os.path.splitext(os.path.basename(pdf_path))[0]
title = paper_id
index_chunks(paper_id, title, chunks)
print("Chunks indexed in Elasticsearch.")


Chunks indexed in Elasticsearch.


In [36]:
def search_similar_chunks(query: str, top_k: int = 5, index_name: str = "paper_chunks") -> List[dict]:
    """Return the ``top_k`` indexed chunks most similar to ``query``.

    The query is embedded with the module-level ``embedder`` and every document
    of the index is scored (``match_all`` wrapped in ``script_score``) by the
    cosine similarity between the query vector and its ``embedding`` field.

    Args:
        query: Natural-language search text.
        top_k: Maximum number of hits to return.
        index_name: Index to search. Defaults to ``"paper_chunks"``, the index
            the notebook has always queried.

    Returns:
        The list found under ``response['hits']['hits']``; each hit's
        ``_source`` is restricted to ``paper_id``, ``chunk_id``, ``title`` and
        ``text``.

    Raises:
        ValueError: If the query embedding has zero norm, since normalising it
            would produce NaN values that cannot be sent to Elasticsearch.
    """
    query_embedding = embedder.encode(query)

    # Scale the embedding to unit length before sending it.
    norm = np.linalg.norm(query_embedding)
    if norm == 0:
        raise ValueError("Query embedding has zero norm; cannot compute cosine similarity.")
    query_vector = query_embedding / norm

    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                # Cosine similarity lies in [-1, 1]; adding 1.0 shifts it to
                # [0, 2] so the resulting score is never negative.
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": query_vector.tolist()}
            }
        }
    }

    response = es.search(
        index=index_name,
        body={
            "size": top_k,
            "query": script_query,
            "_source": {"includes": ["paper_id", "chunk_id", "title", "text"]}
        }
    )
    return response['hits']['hits']


In [40]:
# Test search: fetch the 10 best-matching chunks and print each one.
test_query = "How this paper tries to solve NDT? Tell me in detail"
similar_chunks = search_similar_chunks(test_query, top_k=10)
# Report the number of hits actually returned rather than a fixed "3",
# which did not match the requested top_k.
print(f"\nTop {len(similar_chunks)} similar chunks for query '{test_query}':")
for i, hit in enumerate(similar_chunks):
    source = hit['_source']
    print(f"\nChunk {i+1}:")
    print(source['text'])
    print("---")


Top 3 similar chunks for query 'How this paper tries to solve NDT? Tell me in detail':

Chunk 1:
6, NOVEMBER 2023
Transactions of the ASME
Downloaded from http://asmedigitalcollection.asme.org/nondestructive/article-pdf/6/4/041002/6992491/nde_6_4_041002.pdf by guest on 05 March 2023
amount of time.
---

Chunk 2:
7
Lay-up
Thickness (mm)
5/4
3.05
4/3
2.4
3/2
1.74
2/1
1.05
Table 2
Artiﬁcial defect dimensions corresponding to the
GLARE 2 material in Fig. 7
Defect ID
Design diameter (mm)
1, 6, 7
3
2, 5, 8
6
3, 4, 9
12
041002-4 / Vol. 6, NOVEMBER 2023
Transactions of the ASME
Downloaded from http://asmedigitalcollection.asme.org/nondestructive/article-pdf/6/4/041002/6992491/nde_6_4_041002.pdf by guest on 05 March 2023
Fig. 6
Veriﬁcation of scaling rule of the dispersion curve
Fig.
---

Chunk 3:
The ﬁnite-difference scheme is based on a
rotated staggered grid. In the case of anisotropy, it is known that
a rotated staggered grid provides better results compared to stag-
gered grids, where num

In [41]:
# Optional step: ask an OpenAI chat model to answer the question from the retrieved chunks.
# Everything runs inside one try block, so any failure (missing package, missing
# API key, request error) is printed instead of stopping the notebook.
try:
    from openai import OpenAI

    # The API key is taken from the OPENAI_API_KEY environment variable.
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Collect the text of every retrieved chunk and separate them with blank lines.
    relevant_texts = [chunk_hit['_source']['text'] for chunk_hit in similar_chunks]
    combined_text = "\n\n".join(relevant_texts)
    prompt = f"""You are an expert assistant. Based on the following excerpts from a research paper, answer the question concisely and accurately.

Question: {test_query}

Excerpts:
{combined_text}

Answer:"""

    # Single-turn conversation: the whole prompt is sent as one user message.
    chat_messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Make sure to use an available model
        messages=chat_messages,
        max_tokens=2000,
        temperature=0.5,
    )
    answer = response.choices[0].message.content
    print("\nGenerated Answer:", answer, sep="\n")
except Exception as exc:
    print(f"Error generating answer: {exc}")


Generated Answer:
The paper addresses Non-Destructive Testing (NDT) by employing a combination of advanced simulation techniques and practical ultrasonic testing methods to detect and characterize defects in composite materials, specifically GLARE 2 material. Here are the detailed approaches outlined in the excerpts:

1. **Finite-Difference Modeling**: The authors utilize a finite-difference scheme based on a rotated staggered grid, which is particularly advantageous for handling anisotropic materials. This method minimizes numerical artifacts that can arise from improper boundary handling, enhancing the accuracy of the simulation results. The approach includes a tensor rotation that corresponds to the lay-up of the composite, allowing for a more precise modeling of wave propagation through the material.

2. **Dispersion Curve Analysis**: The paper calculates dispersion curves as functions of frequency and propagation direction, highlighting the measurable anisotropy within the materi