In [None]:
import os

from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma


In [86]:
assert os.environ["OPENAI_API_KEY"]

In [87]:
loader = TextLoader("lammps_fix_nvt.txt")
documents = loader.load()

In [88]:
documents

[Document(metadata={'source': 'lammps_fix_nvt.txt'}, page_content='These commands perform time integration on Nose-Hoover style non-Hamiltonian equations of motion which are designed to generate positions and velocities sampled from the canonical (nvt), isothermal-isobaric (npt), and isenthalpic (nph) ensembles. This updates the position and velocity for atoms in the group each timestep.\n\nThe thermostatting and barostatting is achieved by adding some dynamic variables which are coupled to the particle velocities (thermostatting) and simulation domain dimensions (barostatting). In addition to basic thermostatting and barostatting, these fixes can also create a chain of thermostats coupled to the particle thermostat, and another chain of thermostats coupled to the barostat variables. The barostat can be coupled to the overall box volume, or to individual dimensions, including the xy, xz and yz tilt dimensions. The external pressure of the barostat can be specified as either a scalar pr

In [106]:
text_splitter = CharacterTextSplitter(
    chunk_size=2000, chunk_overlap=200, separator="\n"
)
chunks = text_splitter.split_documents(documents)

In [107]:
chunks

[Document(metadata={'source': 'lammps_fix_nvt.txt'}, page_content='These commands perform time integration on Nose-Hoover style non-Hamiltonian equations of motion which are designed to generate positions and velocities sampled from the canonical (nvt), isothermal-isobaric (npt), and isenthalpic (nph) ensembles. This updates the position and velocity for atoms in the group each timestep.\nThe thermostatting and barostatting is achieved by adding some dynamic variables which are coupled to the particle velocities (thermostatting) and simulation domain dimensions (barostatting). In addition to basic thermostatting and barostatting, these fixes can also create a chain of thermostats coupled to the particle thermostat, and another chain of thermostats coupled to the barostat variables. The barostat can be coupled to the overall box volume, or to individual dimensions, including the xy, xz and yz tilt dimensions. The external pressure of the barostat can be specified as either a scalar pres

In [108]:
chunks[0].page_content

'These commands perform time integration on Nose-Hoover style non-Hamiltonian equations of motion which are designed to generate positions and velocities sampled from the canonical (nvt), isothermal-isobaric (npt), and isenthalpic (nph) ensembles. This updates the position and velocity for atoms in the group each timestep.\nThe thermostatting and barostatting is achieved by adding some dynamic variables which are coupled to the particle velocities (thermostatting) and simulation domain dimensions (barostatting). In addition to basic thermostatting and barostatting, these fixes can also create a chain of thermostats coupled to the particle thermostat, and another chain of thermostats coupled to the barostat variables. The barostat can be coupled to the overall box volume, or to individual dimensions, including the xy, xz and yz tilt dimensions. The external pressure of the barostat can be specified as either a scalar pressure (isobaric ensemble) or as components of a symmetric stress te

In [109]:
chunks[1].page_content

'The thermostat parameters for fix styles nvt and npt are specified using the temp keyword. Other thermostat-related keywords are tchain, tloop and drag, which are discussed below.\nThe thermostat is applied to only the translational degrees of freedom for the particles. The translational degrees of freedom can also have a bias velocity removed before thermostatting takes place; see the description below. The desired temperature at each timestep is a ramped value during the run from Tstart to Tstop. The Tdamp parameter is specified in time units and determines how rapidly the temperature is relaxed. For example, a value of 10.0 means to relax the temperature in a timespan of (roughly) 10 time units (e.g. \n or fs or ps - see the units command). The atoms in the fix group are the only ones whose velocities and positions are updated by the velocity/position update portion of the integration.'

In [110]:
import numpy as np

chunk_sizes = [len(chunk.page_content) for chunk in chunks]

# 基本的な統計情報
stats = {
    "チャンク数": len(chunks),
    "平均サイズ": np.mean(chunk_sizes),
    "中央値サイズ": np.median(chunk_sizes),
    "最小サイズ": min(chunk_sizes),
    "最大サイズ": max(chunk_sizes),
    "標準偏差": np.std(chunk_sizes),
}
stats

{'チャンク数': 2,
 '平均サイズ': 1325.0,
 '中央値サイズ': 1325.0,
 '最小サイズ': 899,
 '最大サイズ': 1751,
 '標準偏差': 426.0}

In [111]:
import re

sentences1 = re.split(r"(?<=[.!?])\s+", chunks[0].page_content)
sentences2 = re.split(r"(?<=[.!?])\s+", chunks[1].page_content)

sentences1 = [s.strip() for s in sentences1 if s.strip()]
sentences2 = [s.strip() for s in sentences2 if s.strip()]

common_sentences = []
for s1 in sentences1:
    for s2 in sentences2:
        if s1 == s2 and len(s1) > 30:  # 最小長さのしきい値
            common_sentences.append(s1)
common_sentences

['The thermostat parameters for fix styles nvt and npt are specified using the temp keyword.',
 'Other thermostat-related keywords are tchain, tloop and drag, which are discussed below.']

In [112]:
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=chunks, embedding=embeddings, persist_directory="./data"
)

In [113]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [114]:
results_with_scores = vectorstore.similarity_search_with_score(
    query="What is MTK?", k=2
)
print("\n=== スコア付き類似度検索 ===")
for doc, score in results_with_scores:
    print(f"テキスト: {doc.page_content}")
    print(f"類似度スコア: {score}\n")


=== スコア付き類似度検索 ===
テキスト: The equations of motion used are those of Shinoda et al in (Shinoda), which combine the hydrostatic equations of Martyna, Tobias and Klein in (Martyna) with the strain energy proposed by Parrinello and Rahman in (Parrinello). The time integration schemes closely follow the time-reversible measure-preserving Verlet and rRESPA integrators derived by Tuckerman et al in (Tuckerman).
類似度スコア: 0.5385632274571723

テキスト: The thermostat parameters for fix styles nvt and npt are specified using the temp keyword. Other thermostat-related keywords are tchain, tloop and drag, which are discussed below.
類似度スコア: 0.5610930909835947



In [115]:
chunks[0].metadata

{'source': 'lammps_fix_nvt.txt'}

In [116]:
results_mmr = vectorstore.max_marginal_relevance_search(
    query="MTKとはどういう熱浴ですか？",
    k=2,
    fetch_k=3,  # 候補として取得する数
    lambda_mult=0.5,  # 多様性の重み（0-1）
)
print("\n=== MMR検索（多様性考慮） ===")
for doc in results_mmr:
    print(f"MMR検索結果: {doc.page_content}\n")


=== MMR検索（多様性考慮） ===
MMR検索結果: The thermostat parameters for fix styles nvt and npt are specified using the temp keyword. Other thermostat-related keywords are tchain, tloop and drag, which are discussed below.

MMR検索結果: The equations of motion used are those of Shinoda et al in (Shinoda), which combine the hydrostatic equations of Martyna, Tobias and Klein in (Martyna) with the strain energy proposed by Parrinello and Rahman in (Parrinello). The time integration schemes closely follow the time-reversible measure-preserving Verlet and rRESPA integrators derived by Tuckerman et al in (Tuckerman).



In [130]:
chunks_with_metadata = []
for category, text in {
    "fix": "lammps_fix_nvt.txt",
    "pair_style": "lammps_pair_style.txt",
}.items():
    loader = TextLoader(text)
    documents = loader.load()
    text_splitter = CharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separator="\n"
    )
    chunks = text_splitter.split_documents(documents)
    for chunk in chunks:
        chunk.metadata["category"] = category
    chunks_with_metadata += chunks

In [131]:
chunks_with_metadata

[Document(metadata={'source': 'lammps_fix_nvt.txt', 'category': 'fix'}, page_content='These commands perform time integration on Nose-Hoover style non-Hamiltonian equations of motion which are designed to generate positions and velocities sampled from the canonical (nvt), isothermal-isobaric (npt), and isenthalpic (nph) ensembles. This updates the position and velocity for atoms in the group each timestep.'),
 Document(metadata={'source': 'lammps_fix_nvt.txt', 'category': 'fix'}, page_content='The thermostatting and barostatting is achieved by adding some dynamic variables which are coupled to the particle velocities (thermostatting) and simulation domain dimensions (barostatting). In addition to basic thermostatting and barostatting, these fixes can also create a chain of thermostats coupled to the particle thermostat, and another chain of thermostats coupled to the barostat variables. The barostat can be coupled to the overall box volume, or to individual dimensions, including the xy

In [None]:
vectorstore_with_metadata = Chroma.from_documents(
    documents=chunks_with_metadata,
    embedding=embeddings,
    persist_directory="./data_with_metadata",
)

In [133]:
results_with_filter = vectorstore_with_metadata.similarity_search(
    query="Nose-Hoover thermostatとは何ですか？", k=2, filter={"category": "fix"}
)
print("\n=== メタデータフィルタ付き検索 ===")
for doc in results_with_filter:
    print(f"フィルタ検索結果: {doc.page_content}")
    print(f"メタデータ: {doc.metadata}\n")



=== メタデータフィルタ付き検索 ===
フィルタ検索結果: These commands perform time integration on Nose-Hoover style non-Hamiltonian equations of motion which are designed to generate positions and velocities sampled from the canonical (nvt), isothermal-isobaric (npt), and isenthalpic (nph) ensembles. This updates the position and velocity for atoms in the group each timestep.
メタデータ: {'category': 'fix', 'source': 'lammps_fix_nvt.txt'}

フィルタ検索結果: The equations of motion used are those of Shinoda et al in (Shinoda), which combine the hydrostatic equations of Martyna, Tobias and Klein in (Martyna) with the strain energy proposed by Parrinello and Rahman in (Parrinello). The time integration schemes closely follow the time-reversible measure-preserving Verlet and rRESPA integrators derived by Tuckerman et al in (Tuckerman).
The thermostat parameters for fix styles nvt and npt are specified using the temp keyword. Other thermostat-related keywords are tchain, tloop and drag, which are discussed below.
メタデータ: {'ca