In [1]:
import openai
import pandas as pd
import json
from IPython.display import Markdown
import sys
from tqdm import tqdm

# Load prompt template

In [3]:
sys.path.append('..')
from utils import *
from functools import partial

In [4]:
# Bind the template path once so that a bare `prompt_dict()` call is equivalent to
# `load_prompt_template(file='../prompt_template.md')`.
# NOTE(review): `load_prompt_template` arrives through the wildcard `utils` import;
# later cells index its result by task name, so it presumably returns a
# {task name: template text} mapping -- confirm in utils.
prompt_dict = partial(load_prompt_template,file='../prompt_template.md')

In [5]:
def drop_text_after(string):
    """Return the leading part of ``string``, cut at the first ``===`` line.

    The text is split on newline characters and scanned in order.  Scanning
    stops at the first line that contains ``'==='``; that line and everything
    after it are discarded.  Of the lines seen before that point, any line
    containing ``'You should recall that'`` is left out.  The remaining lines
    are joined back together with newline characters.
    """
    kept_lines = []
    for text_line in string.split('\n'):
        # A '===' marker ends the part of the text we want to keep.
        if '===' in text_line:
            break
        # Reminder sentences are filtered out of the kept part.
        if 'You should recall that' in text_line:
            continue
        kept_lines.append(text_line)
    return '\n'.join(kept_lines)


In [6]:
# Each line of the JSONL file is decoded as one JSON document; the decoded
# records are collected, in file order, into `kwargs_jsonl`.
with open('2111.01152.jsonl','r') as f:
    kwargs_jsonl = list(map(json.loads, f))

In [7]:
def prompt_template(descriptor):
    """Return the cleaned prompt template for the task named in ``descriptor``.

    ``descriptor['task']`` is used as the key into the mapping returned by
    ``prompt_dict()``, and the template found there is passed through
    ``drop_text_after`` before being returned.
    """
    all_templates = prompt_dict()
    raw_template = all_templates[descriptor['task']]
    return drop_text_after(raw_template)

In [8]:
# Preview the cleaned template selected by the 'task' entry of the first descriptor.
prompt_template(kwargs_jsonl[0])

'You will be instructed to describe the kinetic term of Hamiltonian in {system} in the {real|momentum} space in the {single-particle|second-quantized} form.   \nThe degrees of freedom of the system are: {degrees_of_freedom}.  \nExpress the Kinetic Hamiltonian {kinetic_symbol} using {variable} which are only on the diagonal terms, and arrange the basis in the order of {order}. [Note that the sublattice degrees of freedom is suppressed for now and will be stated later]\n\nUse the following conventions for the symbols:  \n{definition_of_variables}\n\n'

In [14]:
# Display the first record loaded from the JSONL file.
kwargs_jsonl[0]

{'task': 'Construct Kinetic Hamiltonian (continuum version, single-particle)',
 'system': 'the hole-doped AB-stacked MoTe2/WSe2',
 'real|momentum': 'real',
 'single-particle|second-quantized': 'single-particle',
 'dof': 'valley index (+K and -K valley), layer index (top and bottom layer)',
 'kinetic_symbol': '$H_{Kinetic}(r)$',
 'var': '$E_{\\tau,l}$',
 'order': '(+K,bottom), (+K,top), (-K,bottom), (-K,top)',
 'Note that the sublattice degrees of freedom is suppressed for now and will be stated later|None': '',
 'def_var': '$l$ : layer index   \n$t$ : top layer  \n$b$ : bottom layer  \n$\\tau$ : valley index  \n$+K$ : +K valley  \n$-K$ : -K valley  \n$k$ : momentum operator  \n$E_{\\tau,l}$ : energy dispersion for layer $l$ and valley $\\tau$',
 'answer': '$H_{Kinetic}(r) = \\begin{pmatrix} E_{+K,b} & 0 & 0 & 0 \\\\ 0 & E_{+K,t} & 0 & 0 \\\\ 0 & 0 & E_{-K,b} & 0 \\\\ 0 & 0 & 0 & E_{-K,t} \\end{pmatrix}$',
 'source': {'2111.01152.tex': [[55, 62]]}}

# Load automatically filled prompts

In [17]:
def parse_markdown_to_dict(filename, encoding='utf-8'):
    """Split a markdown file into a ``{heading: body}`` dictionary.

    A line whose stripped text starts with ``"# "`` opens a new section; the
    rest of that stripped line is the section title.  Every following line, up
    to the next such heading, belongs to the section body.

    Args:
        filename: Path of the markdown file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        dict: Maps each title to its body text with surrounding whitespace
        stripped.  A heading with nothing under it maps to ``''``.  Text that
        appears before the first heading is not part of any section and is
        ignored.  If a title occurs more than once, the last occurrence wins.
    """
    with open(filename, 'r', encoding=encoding) as file:
        lines = file.readlines()

    result_dict = {}
    current_title = None
    content_buffer = []

    for line in lines:
        stripped_line = line.strip()

        if stripped_line.startswith("# "):
            if current_title is not None:
                # readlines() keeps each line's own newline, so the pieces are
                # concatenated as-is; joining with '\n' would double every break.
                result_dict[current_title] = ''.join(content_buffer).strip()
            current_title = stripped_line[2:]
            # Always start the new section empty so that text preceding the
            # first heading cannot end up inside the first section.
            content_buffer = []
        else:
            content_buffer.append(line)

    # Store the last section too, even when nothing follows its heading, so
    # that empty sections are treated the same wherever they occur.
    if current_title is not None:
        result_dict[current_title] = ''.join(content_buffer).strip()

    return result_dict

In [21]:
# Parse the markdown file into `filled_dict`, which maps each `# ` heading to the
# text found under it.
# NOTE(review): going by the section title, this file presumably holds the
# automatically filled prompts -- confirm its provenance.
filename = "2111.01152_extractor.md"
filled_dict = parse_markdown_to_dict(filename)


In [22]:
# Inspect the parsed text stored under the kinetic-Hamiltonian heading.
print(filled_dict['Construct Kinetic Hamiltonian (continuum version, single-particle)'])

You will be instructed to describe the kinetic term of Hamiltonian in a system with two layers in the momentum space in the second-quantized form.   

The degrees of freedom of the system are: valleys, layers, and momentum.  

Express the Kinetic Hamiltonian H_{\tau} using variables which are only on the diagonal terms, and arrange the basis in the order of bottom layer and top layer. Note that the sublattice degrees of freedom is suppressed for now and will be stated later.



Use the following conventions for the symbols:  

$\tau=\pm 1$ represents $\pm K$ valleys, $\bm{\kappa}=\frac{4\pi}{3a_M}\left(1,0\right)$ is at a corner of the moiré Brillouin zone. The $2\times 2$ Hamiltonian hybridizes the bottom layer ($\mathfrak{b}$) and top layer ($\mathfrak{t}$), where the off diagonal terms describe the interlayer tunneling $\Delta_{\text{T},\tau}$, and the diagonal terms describe the momentum-shifted kinetic energy with the effective mass $(m_{\mathfrak{b}},m_{\mathfrak{t}})=(0.65,0.35)

In [25]:
# Print the template for the same task exactly as returned by `prompt_dict()`
# (not passed through `drop_text_after`), for comparison with the filled-in text.
print(prompt_dict()['Construct Kinetic Hamiltonian (continuum version, single-particle)'])

You will be instructed to describe the kinetic term of Hamiltonian in {system} in the {real|momentum} space in the {single-particle|second-quantized} form.   
The degrees of freedom of the system are: {degrees_of_freedom}.  
Express the Kinetic Hamiltonian {kinetic_symbol} using {variable} which are only on the diagonal terms, and arrange the basis in the order of {order}. [Note that the sublattice degrees of freedom is suppressed for now and will be stated later]

Use the following conventions for the symbols:  
{definition_of_variables}




# Return placeholders

In [150]:
import re

def extract_filled_values(template_str, filled_str):
    """List the distinct placeholder names that occur in a prompt template.

    A placeholder is a non-empty run of word characters, ``|``, ``-``, ``$`` or
    spaces enclosed in ``{...}`` or ``[...]``, for example ``{system}``,
    ``{single-particle|second-quantized}`` or ``[Note that ...]``.

    Args:
        template_str: Template text to scan.
        filled_str: Not used; kept so that existing calls keep working.

    Returns:
        list: Placeholder contents without their delimiters, in order of first
        appearance, with duplicates removed.
    """
    # Inside a character class `|` is a literal, not a separator.  Writing the
    # alternatives as `...|-|...` made `|-|` a one-character range, so `-` was
    # never matched, and listing `|` in the delimiter classes allowed text such
    # as `|k|` to be picked up as a placeholder.  The hyphen is therefore
    # escaped and only real brackets are accepted as delimiters.
    placeholders = re.findall(r"[{\[]([\w|\-$ ]+)[}\]]", template_str)
    # dict keys keep insertion order, giving an order-preserving de-duplication.
    return list(dict.fromkeys(placeholders))



In [161]:
# Example: list the placeholders of one task's cleaned template.
task_name = "Swap the index to combine Hartree and Fock terms"
template_str = drop_text_after(prompt_dict()[task_name])
# `filled_str` has to be defined before the call below; its assignment had been
# commented out, so the cell only ran when the name was left over in the kernel.
# The filled text for the same task is used, falling back to '' when the parsed
# markdown has no section with that title.
filled_str = filled_dict.get(task_name, '')
results = extract_filled_values(template_str, filled_str)
print(results)

['Hartree_Fock_second_quantized_symbol', 'expected_value', 'expression_Hartree_1', 'expression_Hartree_2']


In [162]:
# pyperclip is imported in this cell because the cell that originally imported
# it sits further down the notebook; without this line a top-to-bottom run
# fails here with NameError.
import pyperclip

# Build a YAML skeleton with one entry per placeholder, each carrying empty
# `LLM`, `human` and `score` fields, then print it and copy it to the clipboard.
string = 'placeholder:\n'
for result in results:
    string += f'    {result}:\n'
    string += '      LLM: \n'
    string += '      human: \n'
    string += '      score: \n'
print(string)
pyperclip.copy(string)

placeholder:
    Hartree_Fock_second_quantized_symbol:
      LLM: 
      human: 
      score: 
    expected_value:
      LLM: 
      human: 
      score: 
    expression_Hartree_1:
      LLM: 
      human: 
      score: 
    expression_Hartree_2:
      LLM: 
      human: 
      score: 



# Validate YAML

In [163]:
import yaml
# Load the YAML annotation file into `kwargs_yaml` using the safe loader.
# Later cells index the result by position and then by key (e.g. 'placeholder').
with open('2111.01152.yaml','r') as f:
    kwargs_yaml = yaml.safe_load(f)

In [165]:
# Display the 'placeholder' mapping of the second YAML entry.
kwargs_yaml[1]['placeholder']

{'Energy_dispersion': {'LLM': '$H_{\\tau}$', 'human': None, 'score': 2},
 'parabolic|Dirac|cos': {'LLM': 'parabolic', 'human': None, 'score': 2},
 'electrons|holes': {'LLM': 'electrons', 'human': 'holes', 'score': 0},
 'In addition, a shift of {momentum_shift} in the momentum {k_symbol} for {shifted_Ek}, respectively.': {'LLM': True,
  'human': None,
  'score': 2},
 'momentum_shift': {'LLM': '$\\bm{\\kappa}=\\frac{4\\pi}{3a_M}\\left(1,0\\right)$',
  'human': '$+\\kappa$ and $-\\kappa$',
  'score': 1},
 'k_symbol': {'LLM': '$\\bm{k}$', 'human': None, 'score': 2},
 'shifted_Ek': {'LLM': '$\\bm{k}-\\tau \\bm{\\kappa}$',
  'human': '$E_{t,+K}$ and $E_{t,-K}$',
  'score': 0},
 'r_symbol': {'LLM': '\\bm{r}', 'human': None, 'score': 2},
 'kinetic_symbol': {'LLM': '$\\hat{\\mathcal{H}}_0$',
  'human': None,
  'score': 2},
 'definition_of_variables': {'LLM': "$\\tau=\\pm 1$ represents $\\pm K$ valleys, $\\bm{\\kappa}=\\frac{4\\pi}{3a_M}\\left(1,0\\right)$  is at a corner of the  moir\\'e Brillo

In [155]:
# Display the tenth YAML entry in full.
kwargs_yaml[9]

{'task': 'Construct interaction Hamiltonian (momentum space)',
 'source': {'2111.01152_SM.tex': [[100, 108]]},
 'placeholder': {'second_int_symbol': {'LLM': '$\\hat{\\mathcal{H}}_{\\text{int}}$',
   'human': None,
   'score': 2},
  'index_of_operator': {'LLM': None,
   'human': 'the valley index and layer index',
   'score': 0},
  'momentum': {'LLM': None, 'human': 'momentum', 'score': 0},
  'For each operator, the total momentum is the sum of moire reciprocal lattice $b_i$ and momentum with in the first BZ $k_i$': {'LLM': False,
   'human': None,
   'score': 2},
  'interaction': {'LLM': None, 'human': None, 'score': None},
  'int_form': {'LLM': None, 'human': None, 'score': None},
  'normalization_factor': {'LLM': None, 'human': None, 'score': None},
  'op': {'LLM': None, 'human': None, 'score': None},
  'definition_of_variables': {'LLM': None, 'human': None, 'score': None}}}

In [103]:
import pyperclip
# spam = pyperclip.paste()

In [104]:
# NOTE(review): looks like a leftover scratch check of clipboard access; it
# replaces the current clipboard contents with the text 'What'.
pyperclip.copy('What')


In [129]:
# Write the cleaned template of every task to ../cmp2.md, one `# <task>` section
# per record of `kwargs_jsonl`.
# The templates are fetched once instead of once per record.
# NOTE(review): assumes `prompt_dict()` returns the same mapping on every call -- confirm in utils.
templates = prompt_dict()
sections = []
for kwargs in kwargs_jsonl:
    body = drop_text_after(templates[kwargs['task']])
    # Make sure each section ends with a newline; otherwise the next `# ` heading
    # would be appended to the last line of this section and no longer start a line.
    if not body.endswith('\n'):
        body += '\n'
    sections.append(f"# {kwargs['task']}\n{body}")
string = ''.join(sections)

with open('../cmp2.md','w') as f:
    f.write(string)