In [2]:
# Generate an ORCA input for a simple calculation request from user
# The locally deployed LLM deepseek-r1:32b should be running with the command 'ollama run deepseek-r1:32b'

import sys, os
import requests
import json
import subprocess
from rdkit import Chem
from rdkit.Chem import AllChem
from urllib.request import urlopen
from urllib.parse import quote
import re
from deep_translator import GoogleTranslator

# This function finds a chemical name out of a calculation request sentence 
# using the LLM deepseek:32b (locally deployed)
# Input could be a string or a list
def find_chemical_name_from_sentence(sentence):
    API_URL = "http://localhost:11434/api/generate"  # Ollama API port

    headers = {
        "Content-Type": "application/json"
    }
    
    # if input is a list, make it a string
    if isinstance(sentence, list):
        sentence = " ".join(sentence)
        
    data = {
        "model": "deepseek-r1:32b",  # your loaded deepseek model
        # inputs for the LLM API
        "prompt": "find me a chemical name from the following sentence, respond using JSON:  '"+sentence+"' ",
        "stream": False,  # close stream output
        "format": {
            "type": "object",
            "properties": {
            "name": {
                "type": "string"
            },
            },
            "required": [
            "name"
            ]
        }
    }

    response = requests.post(API_URL, headers=headers, json=data)

    if response.status_code == 200:
        result = response.json()
        resp =  result["response"];
        nameresult = json.loads(resp);
        return nameresult["name"];
    else:
        return ""


# check if any Chinese character is in some text
def contains_chinese(text):
    chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
    return bool(chinese_pattern.search(text))
    
# translate chinese into english using Google translator
def req2eng(txt):
    if contains_chinese(txt):
        translator = GoogleTranslator(source='auto', target='en')
        return translator.translate(txt)
    else:
        return txt
    
def split_sentence(sentence, delimiters):
  """
  Splits a sentence into a list of words using multiple delimiters.

  Args:
    sentence: The sentence to split.
    delimiters: A string containing the delimiters to split by, 
                e.g., ",| ".

  Returns:
    A list of words.
  """
  regex_pattern = '[' + re.escape(delimiters) + ']+'
  words = re.split(regex_pattern, sentence)
  return [word for word in words if word]  

# Using the web interface of NIH to convert chemical IDs into SMILES
def CIRconvert(ids):
    try:
        url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
        ans = urlopen(url).read().decode('utf8')
        return ans
    except:
        return ''

# This function read a .xyz file into a multi-line string
def read_xyz(path_to_file):
    with open(path_to_file) as f:
        f.readline()       # strip the first two lines
        f.readline()       
        data=''.join(line for line in f)
    return data

# Key reference
key_ref = {
    "program": ["orca", "psi4", "nwchem", "gaussian", "cp2k"],
    "method": ["semi-empirical", "am1", "pm3", "pm6", "cndo", "mndo", "nddo", "indo", "zindo", "hf", \
               "dft", "lda", "bp", "blyp", "b3lyp", "pbe", "m06l", "tpss", "scan", "r2scan", "x3lyp", "bhandhlyp", \
               "tpssh", "r2scanh", "wb97x", "camb3lyp", "lc_pbe", "lc_blyp", "wr2scan",
              "mp2", \
              "cis", "cis(d)", "cisd", \
              "cc2", "ccsd", "ccsd(t)"],
    "basis": ["3-21g", "4-31g", "6-31g(d)", "6-31g*", "6-31++g(d,p)", "6-31++g**", "6-311g*", "6-311g(d)", \
              "6-311g**", "6-311g(d,p)", "6-311++g**", "6-311++g(d,p)", "aug-cc-pvtz", "aug-cc-pvdz", \
              "cc-pvdz", "cc-pvtz", "def2-ecp", "def2-svp", "def2-tzvp", "iglo-iii", "lanl2dz", "lanl2tz", \
              "mini", "sapporo-dkh3-dzp-2012", "sapporo-dkh3-tzp-2012", "sto-3g", "stuttgart-rsc-1997"],
    "property": ["single-point", "geometry", "ir", "infrared", "raman", "uv-vis", "uv-visible", "x-ray", \
                 "xas", "xes", "rixs", "xps", "auger", "epr", "esr", "nmr", "mossbauer", "vcd", "roa", "ecd", \
                 "xcd", "photoelectron"],
    "spec_type": ["absorption", "emission"],
    "xray_edge": ["c1s", "o1s", "n1s"]
}

def find_key(sentence, calc_plan, key_ref):
    for word in sentence:
        if word.lower() in key_ref["program"]:
            calc_plan.update({'program' : word.lower()})
        if word.lower() in key_ref["method"]:
            calc_plan.update({'method' : word.lower()})
        if word.lower() in key_ref["basis"]:
            calc_plan.update({'basis' : word.lower()}) 
        if word.lower() in key_ref["property"]:
            calc_plan.update({'property' : word.lower()}) 
        if word.lower() in key_ref["spec_type"]:
            calc_plan.update({'spec_type' : word.lower()})
        if word.lower() in key_ref["xray_edge"]:
            calc_plan.update({'xray_edge' : word.lower()})
            
    return calc_plan

# This is a calculation request from user in Chinese
calc_req = "用ORCA及pbe/3-21g方法计算咖啡因的紫外-可见光谱。"

# First translate the calculation request into English
calc_req_en = req2eng(calc_req)

# Then split the sentence into a word array
delimiters = ",;./ "
calc_req_en = split_sentence(calc_req_en, delimiters)

# Get the chemical system from the calculation request
name = find_chemical_name_from_sentence(calc_req_en)

# Find keys in the calculation request
calc_plan = {}
find_key(calc_req_en, calc_plan, key_ref)

# print(calc_req_en)

# Default values
if "program" not in calc_plan:
    calc_plan.update({"program" : "orca"})

if "method" not in calc_plan:
    calc_plan.update({"method" : "b3lyp"})
        
if "basis" not in calc_plan:
    calc_plan.update({"basis" : "6-31g*"})
        
if "property" not in calc_plan:
    calc_plan.update({"property" : "single-point"})
    
if "spec_type" not in calc_plan:
    calc_plan.update({"spec_type" : "absorption"})
                     
if "geom" not in calc_plan:
    calc_plan.update({"geom" : {"type" : "xyz", "unit" : "angstrom"}})
                     
if "charge" not in calc_plan:
    calc_plan.update({"charge" : "0"})
                     
if "spin" not in calc_plan:
    calc_plan.update({"spin" : "1"})
                     
# Alternative values
if calc_plan["method"] == "dft":
    calc_plan["method"] = "b3lyp"
    
if calc_plan["basis"] == "6-31g(d)":
    calc_plan["basis"] = "6-31g*"
    
if calc_plan["basis"] == "6-31++g(d,p)":
    calc_plan["basis"] = "6-31++g**"
    
if calc_plan["basis"] == "6-311g(d)":
    calc_plan["basis"] = "6-311g*"
    
if calc_plan["basis"] == "6-311g(d,p)":
    calc_plan["basis"] = "6-311g**"
    
if calc_plan["basis"] == "6-311++g(d,p)":
    calc_plan["basis"] = "6-311++g**"
    
if calc_plan["property"] == "uv-visible":
    calc_plan["property"] = "uv-vis"

if calc_plan["property"] == "uv-vis" or "ecd" or "xas" or "xcd":
    calc_plan.update({"n_ex_states" : "30"}) # default to calculate 30 excited states
    
# Chemical name
if name == "":
    print("No chemical system found in the calculation request!")
    sys.exit()
else:
    calc_plan.update({"sys_name" : name})

def Input_gen_orca(plan):
# Generate the 3-D structure from the chemical name
    if "sys_name" in plan:
        name = plan["sys_name"]
    else:
        print("No chemical system found in the calculation request!")
        sys.exit()
                     
# Cenerate the 3D coordinates and save them into a temp file  
# STOUT only works for IUPAC names
#     proc = subprocess.Popen(['./stout_call.sh', name], stdout=subprocess.PIPE)
#     smiles = (proc.stdout.read().decode("utf-8")).rstrip('\n')
    smiles = CIRconvert(name)
    mol = Chem.MolFromSmiles(smiles)  
    mol = Chem.AddHs(mol)
    AllChem.EmbedMolecule(mol)
    AllChem.MMFFOptimizeMolecule(mol)
    Chem.MolToXYZFile(mol,"TMP.xyz")

# Create the method section of the ORCA calculation
    tmp_str1 = '!'+plan["method"]+' '+plan['basis']+'\n' # the method line
# TDDFT section
    if 'n_ex_states' in plan:
        tmp_str2 = '\n%TDDFT\n   NROOTS   '+plan['n_ex_states']+'\nEND\n'
    else:
        tmp_str2 =''
# molecular geom section
    tmp_str3 = '\n*'+calc_plan["geom"]["type"]+' '+calc_plan["charge"]+' '+calc_plan["spin"]+'\n'+read_xyz("TMP.xyz")+'*'
    
    os.remove("TMP.xyz")
    return tmp_str1 + tmp_str2 + tmp_str3 # return the input file as a multiple line string
                     
                                              
if calc_plan["program"] == "orca":
    input_str = Input_gen_orca(calc_plan)
    print(input_str)
else:
    print("Program input generation not implemented yet!")
    sys.exit()

!pbe 3-21g

%TDDFT
   NROOTS   30
END

*xyz 0 1
C      3.316780    0.269967    0.012299
N      1.991169    0.829558   -0.062767
C      1.675661    2.162466   -0.084936
N      0.373058    2.352604   -0.157412
C     -0.151991    1.095919   -0.181845
N     -1.485816    0.775054   -0.254208
C     -2.484547    1.826611   -0.313022
C     -1.865111   -0.567582   -0.270301
O     -3.049548   -0.910503   -0.334353
N     -0.834620   -1.529360   -0.210056
C     -1.252505   -2.917491   -0.228104
C      0.533983   -1.254167   -0.135608
O      1.413316   -2.108943   -0.084072
C      0.815091    0.138834   -0.125111
H      3.388265   -0.335337    0.919354
H      4.051987    1.077935    0.049919
H      3.487271   -0.341785   -0.877051
H      2.423272    2.944939   -0.046536
H     -3.499021    1.423166   -0.367483
H     -2.403304    2.447238    0.584996
H     -2.304871    2.440830   -1.201091
H     -1.913261   -3.101114    0.625529
H     -1.815511   -3.107484   -1.147850
H     -0.409746   -3.611355   -0