In [22]:
# Generate an ORCA input for a simple calculation request from user
# The locally deployed LLM deepseek-r1:32b should be running with the command 'ollama run deepseek-r1:32b'

import sys, os
import requests
import json
import subprocess
from rdkit import Chem
from rdkit.Chem import AllChem
from urllib.request import urlopen
from urllib.parse import quote
import re
from deep_translator import GoogleTranslator

# This function finds a chemical name in a calculation request sentence
# using the locally deployed LLM deepseek-r1:32b
# The input can be a string or a list of words
def find_chemical_name_from_sentence(sentence):
    """Ask the local LLM to pick a chemical name out of a request.

    The request is posted to the Ollama ``/api/generate`` endpoint on
    localhost together with a JSON schema that asks for a reply of the
    form ``{"name": "<string>"}``.

    Args:
        sentence: The request text, either a string or a list of words
            (a list is joined with single spaces first).

    Returns:
        The lower-cased chemical name, or "" when the server does not
        answer with HTTP 200 or when its reply cannot be decoded into a
        string-valued "name" field.
    """
    API_URL = "http://localhost:11434/api/generate"  # Ollama API port

    headers = {"Content-Type": "application/json"}

    # if input is a list, make it a string
    if isinstance(sentence, list):
        sentence = " ".join(sentence)

    data = {
        "model": "deepseek-r1:32b",  # your loaded deepseek model
        # inputs for the LLM API
        "prompt": "find me a chemical name from the following sentence, respond using JSON:  '"+sentence+"' ",
        "stream": False,  # close stream output
        # JSON schema the reply has to follow
        "format": {
            "type": "object",
            "properties": {
                "name": {
                    "type": "string"
                },
            },
            "required": [
                "name"
            ]
        }
    }

    response = requests.post(API_URL, headers=headers, json=data)
    if response.status_code != 200:
        return ""

    # The model output arrives as a JSON document serialized inside the
    # "response" field of the outer JSON body, hence the two decoding steps.
    # A reply that does not have the expected shape is treated like a failed
    # request instead of crashing the caller.
    try:
        nameresult = json.loads(response.json()["response"])
        return nameresult["name"].lower()
    except (ValueError, KeyError, TypeError, AttributeError):
        return ""


# check if any Chinese character is in some text
def contains_chinese(text):
    """Return True when *text* has at least one character in U+4E00..U+9FFF."""
    return re.search(r'[\u4e00-\u9fff]', text) is not None
    
# translate chinese into english using Google translator
def req2eng(txt):
    """Return *txt* in English.

    Text without any Chinese character is handed back unchanged; anything
    else is sent through GoogleTranslator with automatic source detection.
    """
    if not contains_chinese(txt):
        return txt
    return GoogleTranslator(source='auto', target='en').translate(txt)
    
def split_sentence(sentence, delimiters):
    """Split a sentence into lower-cased words using several delimiters.

    Args:
        sentence: The sentence to split.
        delimiters: A string whose characters are all treated as
            delimiters, e.g. ",| ". Runs of consecutive delimiters count
            as one separator.

    Returns:
        A list of the non-empty, lower-cased words. With an empty
        ``delimiters`` string nothing is split: the result is the whole
        lower-cased sentence as a single item (or an empty list for an
        empty sentence).
    """
    if not delimiters:
        # An empty delimiter set would build the invalid pattern "[]+".
        return [sentence.lower()] if sentence else []

    # Escaping keeps characters such as "-", "^" and "]" literal inside
    # the character class.
    regex_pattern = '[' + re.escape(delimiters) + ']+'
    return [word.lower() for word in re.split(regex_pattern, sentence) if word]

# Using the web interface of NIH to convert chemical IDs into SMILES
def CIRconvert(ids, timeout=30):
    """Look up the SMILES string of a chemical identifier.

    The identifier is URL-quoted and sent to the chemical structure
    service at cactus.nci.nih.gov.

    Args:
        ids: The chemical identifier (for example a compound name).
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The response body decoded as UTF-8, or '' when the lookup fails
        for any reason (bad identifier, network error, HTTP error status,
        timeout, undecodable reply).
    """
    try:
        url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
        # The context manager closes the connection even if reading fails.
        with urlopen(url, timeout=timeout) as reply:
            return reply.read().decode('utf8')
    except Exception:
        # Best-effort lookup: every ordinary failure maps to ''. Unlike a
        # bare "except:", this lets KeyboardInterrupt and SystemExit through.
        return ''

# This function reads a .xyz file into a multi-line string
def read_xyz(path_to_file):
    """Return the contents of an .xyz file without its first two lines.

    Everything after the two header lines is returned as one string with
    the original line endings kept.
    """
    with open(path_to_file) as xyz:
        lines = xyz.readlines()
    return ''.join(lines[2:])

# Key reference
# Recognized keywords, grouped by the calc_plan entry they fill in.
# All entries are lower case; find_key compares them with the words of the
# request by exact membership.
key_ref = {
    "program": ["orca", "psi4", "nwchem", "gaussian", "cp2k"],
    "method": [
        "semi-empirical", "am1", "pm3", "pm6", "cndo", "mndo", "nddo", "indo",
        "zindo", "hf",
        "dft", "lda", "bp", "blyp", "b3lyp", "pbe", "m06l", "tpss", "scan",
        "r2scan", "x3lyp", "bhandhlyp",
        "tpssh", "r2scanh", "wb97x", "camb3lyp", "lc_pbe", "lc_blyp", "wr2scan",
        "mp2",
        "cis", "cis(d)", "cisd",
        "cc2", "ccsd", "ccsd(t)",
    ],
    "basis": [
        "3-21g", "4-31g", "6-31g(d)", "6-31g*", "6-31++g(d,p)", "6-31++g**",
        "6-311g*", "6-311g(d)",
        "6-311g**", "6-311g(d,p)", "6-311++g**", "6-311++g(d,p)",
        "aug-cc-pvtz", "aug-cc-pvdz",
        "cc-pvdz", "cc-pvtz", "def2-ecp", "def2-svp", "def2-tzvp", "iglo-iii",
        "lanl2dz", "lanl2tz",
        "mini", "sapporo-dkh3-dzp-2012", "sapporo-dkh3-tzp-2012", "sto-3g",
        "stuttgart-rsc-1997",
    ],
    "property": [
        "single-point", "geometry", "ir", "infrared", "raman", "uv-vis",
        "uv-visible", "x-ray",
        "xas", "xes", "rixs", "xps", "auger", "epr", "esr", "nmr", "mossbauer",
        "vcd", "roa", "ecd",
        "xcd", "photoelectron",
    ],
    "spec_type": ["absorption", "emission"],
    "xray_edge": ["c1s", "o1s", "n1s"],
}

def find_key(sentence, calc_plan, key_ref):
    """Record every recognized keyword of a request in the plan.

    Args:
        sentence: Iterable of words to inspect.
        calc_plan: Dictionary that is updated in place; for each category
            the last matching word wins.
        key_ref: Mapping with the keys "program", "method", "basis",
            "property", "spec_type" and "xray_edge", each giving the
            accepted words of that category.

    Returns:
        The same ``calc_plan`` object that was passed in.
    """
    categories = ("program", "method", "basis", "property", "spec_type", "xray_edge")
    for token in sentence:
        for category in categories:
            if token in key_ref[category]:
                calc_plan[category] = token
    return calc_plan

def read_textfile(fname):
    """Return all whitespace-separated tokens of a text file as a list."""
    with open(fname, "r") as src:
        return src.read().split()

# write a list of strings into a text file
def write_textfile(list1, fname):
    """Write each item of *list1* on its own line of the file *fname*.

    The file is created or overwritten; items are rendered with
    ``str.format``.
    """
    with open(fname, "w") as sink:
        sink.writelines("{}\n".format(item) for item in list1)
            
def remove_supe_subs(text):
    """Delete superscript and subscript characters from a string.

    Every character in the range U+2070..U+209F is dropped; nothing is
    put in its place.

    Args:
        text: The string to process.

    Returns:
        The string without those characters.
    """
    supe_subs = re.compile(r'[\u2070-\u209F]+')
    return supe_subs.sub('', text)

# This is a calculation request from user in Chinese
# calc_req = "用ORCA及pbe/3-21g方法计算咖啡因的紫外-可见光谱。"
# calc_req = "计算二氧化碳的红外光谱。"
# calc_req = "计算氯仿的拉曼光谱。"
# calc_req = "计算丁醇溶剂中的咖啡因的紫外可见吸收光谱"
# calc_req = "计算水溶剂中的咖啡因的紫外可见吸收光谱"
# calc_req = "计算H2O溶剂中的咖啡因的紫外可见吸收光谱"
calc_req = "在溶剂水中计算咖啡因的红外光谱"


# First translate the calculation request into English
calc_req_en = req2eng(calc_req)

# Then split the sentence into a list of lower-cased words
delimiters = ",;./ "
calc_req_en = split_sentence(calc_req_en, delimiters)

# No solvent unless the request has a solvent clause. Without this default,
# a request that names no solvent would leave solv_name undefined and the
# later `solv_name != ""` check would raise NameError.
solv_name = ""

# cut the solvent description if it exists: everything from the first "in"
# onwards is treated as the solvent clause
if ("in" in calc_req_en) and (("solvent" in calc_req_en) or ("solvents" in calc_req_en)):
    in_idx = calc_req_en.index("in")
    sol_desc = calc_req_en[in_idx:] # The solvent description part
    # replace 'aqueous' with 'water'. A fix for inaccurate translations
    sol_desc = [w.replace('aqueous', 'water') for w in sol_desc]
    calc_req_en = calc_req_en[:in_idx] # The other part
    solv_name = (find_chemical_name_from_sentence(sol_desc)).lower()
    solv_name = remove_supe_subs(solv_name) # get rid of super/subscript formatting
    # read the ORCA supported solvent names in
    orca_solvents = read_textfile("ORCA_solvents.txt")
    if solv_name not in orca_solvents:
        print("Solvent not recognized or not supported!")
        sys.exit()

# Ask the LLM which chemical system the request is about
name = find_chemical_name_from_sentence(calc_req_en)

# Collect the keywords that are spelled out in the request
calc_plan = {}
find_key(calc_req_en, calc_plan, key_ref)

# Default values for everything the request left unspecified
calc_plan.setdefault("program", "orca")
calc_plan.setdefault("method", "b3lyp")
calc_plan.setdefault("basis", "def2-svp")
calc_plan.setdefault("property", "single-point")
calc_plan.setdefault("geom", {"type" : "xyz", "unit" : "angstrom"})
calc_plan.setdefault("charge", "0")
calc_plan.setdefault("spin", "1")

# Alternative values: map synonyms onto the spelling used from here on
if calc_plan["method"] == "dft":
    calc_plan["method"] = "b3lyp"

calc_plan["basis"] = {
    "6-31g(d)": "6-31g*",
    "6-31++g(d,p)": "6-31++g**",
    "6-311g(d)": "6-311g*",
    "6-311g(d,p)": "6-311g**",
    "6-311++g(d,p)": "6-311++g**",
}.get(calc_plan["basis"], calc_plan["basis"])

calc_plan["property"] = {
    "uv-visible": "uv-vis",
    "infrared": "ir",
}.get(calc_plan["property"], calc_plan["property"])

# Settings that follow from the requested property
if calc_plan["property"] in ("uv-vis", "ecd", "xas", "xcd"):
    calc_plan["n_ex_states"] = "30"  # default to calculate 30 excited states
elif calc_plan["property"] == "ir":
    calc_plan["calc_type"] = "opt freq"
elif calc_plan["property"] == "raman":
    calc_plan["calc_type"] = "opt numfreq"

if solv_name != "":
    calc_plan["solvent"] = solv_name

# The default spectroscopy type is absorption
if calc_plan["property"] in ("uv-vis", "x-ray", "xas"):
    calc_plan.setdefault("spec_type", "absorption")

# Chemical name: stop here when the LLM found none
if name == "":
    print("No chemical system found in the calculation request!")
    sys.exit()
calc_plan["sys_name"] = name
    
def Input_gen_orca(plan):
    """Build an ORCA input file for a calculation plan.

    The chemical name is resolved to SMILES with CIRconvert, turned into
    a 3-D structure with RDKit (hydrogens added, embedded, MMFF
    optimized) and combined with the keyword line and optional
    ``%TDDFT`` / ``%elprop`` blocks derived from the plan.

    Args:
        plan: Dictionary with the keys "sys_name", "method", "basis",
            "property", "geom" (with a "type" entry), "charge" and
            "spin"; "calc_type", "solvent" and "n_ex_states" are
            optional. All values that end up in the input are strings.

    Returns:
        The ORCA input as one multi-line string.

    Raises:
        SystemExit: After printing a message, when the plan has no
            "sys_name" or no 3-D structure can be generated for it.
    """
    if "sys_name" not in plan:
        print("No chemical system found in the calculation request!")
        sys.exit()
    name = plan["sys_name"]

    # Generate the 3-D structure from the chemical name.
    # STOUT only works for IUPAC names, so the web lookup is used instead:
    #     proc = subprocess.Popen(['./stout_call.sh', name], stdout=subprocess.PIPE)
    #     smiles = (proc.stdout.read().decode("utf-8")).rstrip('\n')
    smiles = CIRconvert(name).strip()
    # CIRconvert returns '' when the lookup fails; MolFromSmiles returns
    # None for a string it cannot parse.
    mol = Chem.MolFromSmiles(smiles) if smiles else None
    if mol is None:
        print("Could not obtain a molecular structure for '" + name + "'!")
        sys.exit()
    mol = Chem.AddHs(mol)
    # EmbedMolecule returns the new conformer id, or -1 when embedding fails
    if AllChem.EmbedMolecule(mol) < 0:
        print("Could not generate 3-D coordinates for '" + name + "'!")
        sys.exit()
    AllChem.MMFFOptimizeMolecule(mol)

    # Generate the 3D coordinates through a temp file; the file is removed
    # even if writing or reading it fails.
    xyz_file = "TMP.xyz"
    try:
        Chem.MolToXYZFile(mol, xyz_file)
        coords = read_xyz(xyz_file)
    finally:
        if os.path.exists(xyz_file):
            os.remove(xyz_file)

    # Method section: "!method basis [calc_type] [CPCM(solvent)]"
    method_line = '!' + plan["method"] + ' ' + plan['basis']
    if "calc_type" in plan:
        method_line += ' ' + plan['calc_type']
    if "solvent" in plan:
        method_line += " CPCM(" + plan["solvent"] + ")"
    method_line += '\n'

    # Optional block: elprop for Raman takes precedence over TDDFT
    if plan["property"] == "raman":
        extra_block = '\n%elprop\nPolar  1\nend\n'
    elif 'n_ex_states' in plan:
        extra_block = '\n%TDDFT\n   NROOTS   ' + plan['n_ex_states'] + '\nEND\n'
    else:
        extra_block = ''

    # Molecular geometry section, taken from the plan that was passed in
    # rather than from the module-level calc_plan
    geom_block = ('\n*' + plan["geom"]["type"] + ' ' + plan["charge"] + ' '
                  + plan["spin"] + '\n' + coords + '*')

    return method_line + extra_block + geom_block
                     
                                              
# Only ORCA input generation exists so far; stop for any other program
if calc_plan["program"] != "orca":
    print("Program input generation not implemented yet!")
    sys.exit()

input_str = Input_gen_orca(calc_plan)
print(input_str)

!b3lyp def2-svp opt freq CPCM(water)

*xyz 0 1
C      3.244061   -0.716923    0.262808
N      2.135757    0.203652    0.247770
C      2.210311    1.562365    0.405955
N      1.019789    2.125245    0.342673
C      0.163898    1.085612    0.137388
N     -1.200556    1.172257    0.003076
C     -1.857908    2.464218    0.076690
C     -1.941474    0.008204   -0.203843
O     -3.169465    0.030057   -0.330926
N     -1.228468   -1.207798   -0.263182
C     -2.019076   -2.403740   -0.478871
C      0.156322   -1.348032   -0.132657
O      0.755130   -2.418033   -0.187887
C      0.817664   -0.107324    0.074355
H      4.174134   -0.165197    0.421934
H      3.281785   -1.232087   -0.700226
H      3.094222   -1.428845    1.078263
H      3.145040    2.086232    0.562494
H     -1.465918    3.110247   -0.715275
H     -2.940629    2.378705   -0.046982
H     -1.652441    2.914589    1.052991
H     -2.748760   -2.498740    0.332018
H     -2.563597   -2.304505   -1.423675
H     -1.409820   -3.310160   -0.