In [1]:
# Settings
# Path of the MOL file to convert. It is passed to mol2cif() in the last cell
# and is also read as a module-level name by generate_cif_text() to build the
# CIF "data_" block name.
file_name = 'the input .mol file'

In [2]:
import os
from datetime import date

import numpy as np
# Date captured once, when this cell runs; generate_cif_text() writes it as
# the _audit_creation_date entry of the CIF header.
today = date.today()

In [3]:
def get_atoms_and_bonds(data):
    """Extract the atom and bond records from the text of a MOL file.

    The counts line is taken to be the FIRST line that splits into exactly
    11 whitespace-separated tokens whose first two tokens are non-negative
    integers (atom count, bond count). The ``n_atoms`` lines after it are
    read as atoms and the following ``n_bonds`` lines as bonds; everything
    else is ignored.

    Only the first counts line is honoured. Later lines that also happen to
    have 11 tokens (for example an ``M  CHG`` property line listing four
    charges) are therefore not mistaken for a new counts line.

    Parameters
    ----------
    data : str
        Full text of the MOL file.

    Returns
    -------
    atoms : dict[str, list[float]]
        Maps ``"<symbol><1-based atom index>"`` (e.g. ``"C1"``) to ``[x, y, z]``.
    bonds : list[list[str]]
        One list of raw string tokens per bond line, in file order.
        Both containers are empty when no counts line is found.
    """
    atoms = {}
    bonds = []
    counts_ind = None  # line index of the counts line, once seen
    n_atoms = 0
    n_bonds = 0

    for current_ind, line in enumerate(data.split('\n')):
        cols = line.split()

        if counts_ind is None:
            # Still looking for the counts line; header lines are skipped.
            if len(cols) == 11 and cols[0].isdigit() and cols[1].isdigit():
                n_atoms = int(cols[0])
                n_bonds = int(cols[1])
                counts_ind = current_ind
            continue

        offset = current_ind - counts_ind  # 1-based position after the counts line
        if offset <= n_atoms:
            # Atom line: x, y, z, element symbol, ...
            atoms[f'{cols[3]}{offset}'] = [float(j) for j in cols[:3]]
        elif offset <= n_atoms + n_bonds:
            bonds.append(cols)
    return atoms, bonds

In [4]:
def get_cell_extents(atoms_dict):#alpha,beta,gamma always assumed to be 90degrees
    """Return the (min, max) coordinate span of the atoms along x, y and z.

    Parameters
    ----------
    atoms_dict : dict
        Maps an atom label to a sequence whose first three items are the
        x, y and z coordinates.

    Returns
    -------
    tuple
        ``(x_range, y_range, z_range)``, each a ``(min, max)`` tuple.
    """
    positions = list(atoms_dict.values())
    extents = []
    for axis in range(3):
        along_axis = [position[axis] for position in positions]
        extents.append((min(along_axis), max(along_axis)))
    return tuple(extents)

In [5]:
def scale_and_translate_coordinates(atoms):
    """Map atom coordinates into the unit box spanned by their own extents.

    Each coordinate is shifted by the axis minimum and divided by the axis
    extent (max - min) reported by ``get_cell_extents``, so values land in
    ``[0, 1]``.

    An axis with zero extent (every atom shares the same coordinate, as in a
    planar molecule whose z values are all equal) would otherwise divide by
    zero; on such an axis every atom is given the value ``0.0``.

    Parameters
    ----------
    atoms : dict
        Maps an atom label to its ``[x, y, z]`` coordinates.

    Returns
    -------
    dict
        New dict with the same labels, each mapped to a list of scaled
        coordinates. The input dict is not modified.
    """
    axis_ranges = get_cell_extents(atoms)

    output_atoms = {}
    for label, coords in atoms.items():
        scaled = []
        for (low, high), coord in zip(axis_ranges, coords):
            span = high - low
            # Guard the degenerate (flat) axis instead of raising ZeroDivisionError.
            scaled.append((coord - low) / span if span != 0 else 0.0)
        output_atoms[label] = scaled
    return output_atoms

In [6]:
def remove_numeric(S):
    """Return ``S`` with every digit character removed."""
    return ''.join(ch for ch in S if not ch.isdigit())


def keep_numeric(S):
    """Return only the digit characters of ``S``, in their original order."""
    return ''.join(ch for ch in S if ch.isdigit())

In [7]:
def get_atoms_block_text(atoms):
    """Render the atom rows of the CIF atom-site loop.

    Coordinates are first passed through ``scale_and_translate_coordinates``.
    Each row reads ``<label> <label without digits> <x> <y> <z>`` with the
    three coordinates printed to five decimals. Rows are newline-separated
    and the result has no trailing newline.

    Parameters
    ----------
    atoms : dict
        Maps an atom label to its ``[x, y, z]`` coordinates.

    Returns
    -------
    str
    """
    fractional = scale_and_translate_coordinates(atoms)
    rows = []
    for label, coords in fractional.items():
        rows.append(
            f"{label} {remove_numeric(label)} {coords[0]:6.5f} {coords[1]:6.5f} {coords[2]:6.5f}"
        )
    return "\n".join(rows)

In [8]:
def get_bonds_block_text(atoms,bonds):
    """Render the bond rows of the CIF bond loop.

    Every token of a bond record is converted to ``int``. The first two
    values are 1-based positions into the insertion order of ``atoms`` and
    the third selects the bond type letter (1 -> S, 2 -> D, 3 -> T, 4 -> A).
    The distance is the Euclidean norm between the two atoms' coordinates
    as stored in ``atoms``, printed to five decimals.

    Parameters
    ----------
    atoms : dict
        Maps an atom label to its ``[x, y, z]`` coordinates.
    bonds : list
        One sequence of integer-like tokens per bond.

    Returns
    -------
    str
        Rows ``<label1> <label2> <distance> . <type>`` joined by newlines,
        without a trailing newline.
    """
    # "S" single, "D" double, "T" triple, "A" aromatic
    type_codes = ('S','D','T','A')
    labels = list(atoms)

    rows = []
    for record in bonds:
        fields = [int(token) for token in record]
        first = labels[fields[0]-1]
        second = labels[fields[1]-1]
        separation = np.linalg.norm(np.array(atoms[first]) - np.array(atoms[second]))
        rows.append(f"{first} {second} {separation:7.5f} . {type_codes[fields[2]-1]}")
    return "\n".join(rows)

In [9]:
def generate_cif_text(atoms,bonds):
    """Build the complete CIF text for one set of atoms and bonds.

    The cell is written as P1 with all angles fixed at 90 degrees and edge
    lengths equal to the coordinate extents returned by ``get_cell_extents``.
    Only the five atom-site columns (label, type symbol, fract x/y/z) are
    written; optional columns such as U_iso, adp type, occupancy or charge
    are not emitted.

    Reads two module-level names: ``file_name`` (used for the ``data_`` block
    name) and ``today`` (written as ``_audit_creation_date``).

    Parameters
    ----------
    atoms : dict
        Maps an atom label to its ``[x, y, z]`` coordinates.
    bonds : list
        Bond records as accepted by ``get_bonds_block_text``.

    Returns
    -------
    str
        The CIF document. The loop headers written here are the exact
        strings ``clean_of_XeRn`` searches for, so they must stay in sync.
    """
    x_range, y_range, z_range = get_cell_extents(atoms)
    atoms_block = get_atoms_block_text(atoms)
    bonds_block = get_bonds_block_text(atoms,bonds)

    # Name the data block after the file's base name minus its final
    # extension. Splitting on the first '.' instead gives an empty name for
    # "./a.mol" and drags directory parts into the name for "dir.v2/a.mol".
    block_name = os.path.splitext(os.path.basename(file_name))[0]

    cif_text = f"""data_{block_name}
_audit_creation_date              {today}
_audit_creation_method            'mol2cif'
_symmetry_space_group_name_H-M    'P1'
_symmetry_Int_Tables_number       1
_symmetry_cell_setting            triclinic
loop_
_symmetry_equiv_pos_as_xyz
  x,y,z
_cell_length_a                    {x_range[1] - x_range[0]}
_cell_length_b                    {y_range[1] - y_range[0]}
_cell_length_c                    {z_range[1] - z_range[0]}
_cell_angle_alpha                 90.000000
_cell_angle_beta                  90.000000
_cell_angle_gamma                 90.000000
loop_
_atom_site_label
_atom_site_type_symbol
_atom_site_fract_x
_atom_site_fract_y
_atom_site_fract_z
{atoms_block}
loop_
_geom_bond_atom_site_label_1
_geom_bond_atom_site_label_2
_geom_bond_distance
_geom_bond_site_symmetry_2
_ccdc_geom_bond_type
{bonds_block}
    """
    return cif_text

In [10]:
def clean_of_XeRn(cif_contents):
    """Remove Xe/Rn atoms from CIF text and relabel the atoms bonded to Xe.

    The text must contain the atom-site and bond loop headers exactly as
    ``generate_cif_text`` writes them; they are used to locate the two blocks.

    Processing rules:
      * A bond row that mentions "Xe" but not "Rn" marks both of its labels
        as terminal.
      * Every bond row mentioning "Xe" or "Rn" is dropped.
      * Atom rows whose type symbol is "Xe" or "Rn" are dropped (a terminal
        atom is kept unless its type symbol is "Xe").
      * Each kept terminal atom is relabelled ``X<digits of its old label>``
        in its atom row and in the two label columns of every kept bond row.

    Relabelling is done by exact token match and only in label columns, so a
    terminal ``P1`` does not alter ``P10`` or the ``'P1'`` space-group entry
    in the header.

    Parameters
    ----------
    cif_contents : str
        CIF text containing both loop headers.

    Returns
    -------
    str
        Header, atom-site loop and bond loop reassembled with the cleaned
        rows. Everything before the atom-site loop is returned unchanged.
    """
    atoms_header = """loop_
_atom_site_label
_atom_site_type_symbol
_atom_site_fract_x
_atom_site_fract_y
_atom_site_fract_z
"""
    bonds_header = """
loop_
_geom_bond_atom_site_label_1
_geom_bond_atom_site_label_2
_geom_bond_distance
_geom_bond_site_symmetry_2
_ccdc_geom_bond_type
"""

    def relabel(label):
        # "X" + the digits of the label (same extraction as keep_numeric),
        # inlined so this function does not depend on another cell.
        return "X" + ''.join(ch for ch in label if ch.isdigit())

    pieces = cif_contents.split(atoms_header)
    preamble = pieces[0]
    atoms_block = pieces[1].split(bonds_header)[0]
    bonds_block = cif_contents.split(bonds_header)[1]
    bond_lines = bonds_block.split('\n')

    # Pass 1: every label on an Xe bond (that is not also an Rn bond) is terminal.
    # Collected before rewriting so bond rows listed earlier are renamed too.
    terminal_atoms = set()
    for line in bond_lines:
        if "Xe" in line and "Rn" not in line:
            terminal_atoms.update(line.split()[:2])

    # Pass 2: drop Xe/Rn bonds and rename terminal labels in the rows that stay.
    kept_bonds = []
    for line in bond_lines:
        if "Xe" in line or "Rn" in line:
            continue
        cols = line.split()
        if len(cols) >= 2 and (cols[0] in terminal_atoms or cols[1] in terminal_atoms):
            cols[:2] = [relabel(c) if c in terminal_atoms else c for c in cols[:2]]
            line = ' '.join(cols)
        # Rows needing no rename (including blank/whitespace rows) stay verbatim.
        kept_bonds.append(line)
    new_bonds_block = '\n'.join(kept_bonds)

    kept_atoms = []
    for line in atoms_block.split('\n'):
        if line.strip() == "":
            continue
        cols = line.split()
        if cols[0] in terminal_atoms and cols[1] != "Xe":
            kept_atoms.append(f"{relabel(cols[0])} {cols[1]} {cols[2]} {cols[3]} {cols[4]}")
        elif cols[1] != "Xe" and cols[1] != "Rn":
            kept_atoms.append(f"{cols[0]} {cols[1]} {cols[2]} {cols[3]} {cols[4]}")
    new_atoms_block = '\n'.join(kept_atoms)

    return preamble + atoms_header + new_atoms_block + bonds_header + new_bonds_block

In [11]:
def mol2cif(file_name, output_file_name = ""):
    """Convert a MOL file on disk into a CIF file.

    The MOL text is parsed with ``get_atoms_and_bonds``, rendered with
    ``generate_cif_text`` and post-processed with ``clean_of_XeRn``.

    NOTE(review): ``generate_cif_text`` takes the CIF ``data_`` block name
    from the module-level ``file_name``, not from this function's argument,
    so the two only agree when this is called with that same value.

    Parameters
    ----------
    file_name : str
        Path of the MOL file to read.
    output_file_name : str, optional
        Path of the CIF file to write. When empty (the default) the input
        path with its final extension replaced by ``.cif`` is used.
    """
    if output_file_name == "":
        # splitext removes only the last extension and keeps directory parts,
        # so "./mols/a.b.mol" becomes "./mols/a.b.cif" rather than ".cif".
        output_file_name = os.path.splitext(file_name)[0] + '.cif'
    with open(file_name, 'r') as f:
        data = f.read()
    atoms,bonds = get_atoms_and_bonds(data)
    # Build the whole text before opening the output, so a conversion error
    # does not leave behind an empty or truncated CIF file.
    cif_text = clean_of_XeRn(generate_cif_text(atoms,bonds))
    with open(output_file_name,'w') as f:
        f.write(cif_text)

In [12]:
# Convert the file named by the module-level `file_name` setting; the output
# path is left to mol2cif's default.
if __name__ == "__main__":
    mol2cif(file_name)