In [16]:
import pandas as pd

# Load the raw density table; rows containing any missing value are discarded.
df = pd.read_csv('./data-density.csv', sep=',')
df = df.dropna() if df.isnull().values.any() else df

# Row masks: T_K inside the closed interval [298, 298.15] and P_MPa exactly 0.1.
temperature_in_window = df['T_K'].between(298, 298.15)
pressure_matches = df['P_MPa'].eq(0.1)

# Build the working table in one chain:
#   - keep only the rows selected by both masks,
#   - keep the first row for every distinct 'smiles' value,
#   - remove the columns that are not needed any more,
#   - rename the density and SMILES columns,
#   - renumber the rows from 0 without keeping the old index.
data = (
    df.loc[temperature_in_window & pressure_matches]
    .drop_duplicates(subset='smiles')
    .drop(columns=['cation', 'T_K', 'P_MPa', 'anion'])
    .rename(columns={'d_kg*m-3': 'density', 'smiles': 'SMILES'})
    .reset_index(drop=True)
)
data

Unnamed: 0,IL,density,SMILES
0,"[azp-2o1,1][ntf2]",1415.6,COCC[N+]1(C)CCCCCC1.FC(F)(F)S(=O)(=O)[N-]S(=O)...
1,"[azp-2o1,1][otf]",1291.6,COCC[N+]1(C)CCCCCC1.[O-]S(=O)(=O)C(F)(F)F
2,"[azp-2o1,1][tfa]",1217.9,COCC[N+]1(C)CCCCCC1.[O-]C(=O)C(F)(F)F
3,"[azp-2o1,2o1][ntf2]",1339.3,COCC[N+]1(CCOC)CCCCCC1.FC(F)(F)S(=O)(=O)[N-]S(...
4,"[azp-4,1][ntf2]",1366.1,CCCC[N+]1(C)CCCCCC1.FC(F)(F)S(=O)(=O)[N-]S(=O)...
...,...,...,...
1168,"[turea-6,(2,0),(2,0)][ntf2]",1320.7,CCCCCC[S+]=C(NCC)NCC.FC(F)(F)S(=O)(=O)[N-]S(=O...
1169,[turea-6][ntf2],1424.5,CCCCCC[S+]=C(N)N.FC(F)(F)S(=O)(=O)[N-]S(=O)(=O...
1170,"[turea-8,(1,0),(1,0)][ntf2]",1314.5,CCCCCCCC[S+]=C(NC)NC.FC(F)(F)S(=O)(=O)[N-]S(=O...
1171,"[turea-8,(1,1),(1,1)][ntf2]",1296.7,CCCCCCCC[S+]=C(N(C)C)N(C)C.FC(F)(F)S(=O)(=O)[N...


In [17]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

# Parse every SMILES string into two RDKit molecules: the first '.'-separated
# fragment is treated as the cation and the second as the anion (fragments past
# the second one, if any, are ignored, exactly as before).
molla = []  # anion molecules, one per row of `data`
mollc = []  # cation molecules, one per row of `data`

# Iterate over the values positionally instead of looking rows up by label, so
# the loop does not depend on `data` having a 0..n-1 index.
for row_position, smiles in enumerate(data['SMILES'].to_numpy()):
    fragments = smiles.split('.')
    if len(fragments) < 2:
        raise ValueError(
            f"Row {row_position}: expected 'cation.anion' SMILES, got {smiles!r}"
        )
    moltempc = Chem.MolFromSmiles(fragments[0])
    moltempa = Chem.MolFromSmiles(fragments[1])
    # MolFromSmiles signals a parse failure by returning None; fail here with
    # the offending row rather than later inside the fingerprint code.
    if moltempc is None or moltempa is None:
        raise ValueError(
            f"Row {row_position}: RDKit could not parse SMILES {smiles!r}"
        )
    molla.append(moltempa)
    mollc.append(moltempc)

data['Molecule Anion'] = molla
data['Molecule Cation'] = mollc
data[:3]

Unnamed: 0,IL,density,SMILES,Molecule Anion,Molecule Cation
0,"[azp-2o1,1][ntf2]",1415.6,COCC[N+]1(C)CCCCCC1.FC(F)(F)S(=O)(=O)[N-]S(=O)...,<rdkit.Chem.rdchem.Mol object at 0x7f970698dd20>,<rdkit.Chem.rdchem.Mol object at 0x7f970698da10>
1,"[azp-2o1,1][otf]",1291.6,COCC[N+]1(C)CCCCCC1.[O-]S(=O)(=O)C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7f970698c0b0>,<rdkit.Chem.rdchem.Mol object at 0x7f970698dd90>
2,"[azp-2o1,1][tfa]",1217.9,COCC[N+]1(C)CCCCCC1.[O-]C(=O)C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7f970698de00>,<rdkit.Chem.rdchem.Mol object at 0x7f970698de70>


In [18]:
import numpy as np

NBITS = 4096  # number of bits requested for each Morgan fingerprint (nBits)
nB = 2        # second positional argument of the fingerprint call (Morgan radius)


def _morgan_bit_table(mols, radius, n_bits):
    """Compute Morgan bit-vector fingerprints for ``mols`` and tabulate them.

    Args:
        mols: iterable of RDKit molecules.
        radius: radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
        n_bits: fingerprint length passed as ``nBits``.

    Returns:
        A 5-tuple ``(bit_infos, fingerprints, arrays, by_row, frame)``:
        ``bit_infos`` holds the ``bitInfo`` dict filled for each molecule,
        ``fingerprints`` the RDKit bit vectors, ``arrays`` their numpy
        conversions, ``by_row`` a dict keyed by the stringified row position,
        and ``frame`` the DataFrame built from that dict (one row per molecule).
    """
    bit_infos = []
    fingerprints = []
    for mol in mols:
        bit_info = {}
        fingerprints.append(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits, bitInfo=bit_info)
        )
        bit_infos.append(bit_info)

    arrays = []
    for fingerprint in fingerprints:
        # Placeholder array; ConvertToNumpyArray fills it from the RDKit vector
        # (the resulting table has one column per fingerprint bit).
        bits = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fingerprint, bits)  # RDKit type -> numpy vector
        arrays.append(bits)

    # Helper dictionary: stringified row position -> fingerprint vector.
    by_row = {str(position): bits for position, bits in enumerate(arrays)}
    # Build the DataFrame from the dictionary, one row per key.
    frame = pd.DataFrame.from_dict(by_row, orient='index')
    return bit_infos, fingerprints, arrays, by_row, frame


# Anion and cation fingerprints go through the same pipeline.
moldalist, fpa, fpnpa, fpdicta, fpdfa = _morgan_bit_table(molla, nB, NBITS)
moldclist, fpc, fpnpc, fpdictc, fpdfc = _morgan_bit_table(mollc, nB, NBITS)

# Anion columns first, cation columns second.
fpdf = pd.concat([fpdfa, fpdfc], axis=1)

# Column names: 'a<bit>' for anion bits followed by 'c<bit>' for cation bits.
nazwyKolumn = (
    [f'a{bit}' for bit in range(fpdfa.shape[1])]
    + [f'c{bit}' for bit in range(fpdfc.shape[1])]
)
fpdf.columns = nazwyKolumn

fpdf.index = data.index  # use the same row index as `data`
fpdf[:3]

Unnamed: 0,a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,...,c4086,c4087,c4088,c4089,c4090,c4091,c4092,c4093,c4094,c4095
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

def _write_bit_gallery(out, heading, label, mols, bit_infos, n_bits):
    """Write one HTML section with a drawing for every fingerprint bit that occurs.

    For each bit index in ``range(n_bits)`` the first molecule whose ``bitInfo``
    contains that bit and that can be drawn is rendered; bits that occur in no
    molecule produce no output.

    Args:
        out: writable text file object.
        heading: HTML line printed once at the top of the section.
        label: prefix placed before the bit index in each ``<h3>`` header.
        mols: RDKit molecules.
        bit_infos: ``bitInfo`` dicts, parallel to ``mols``.
        n_bits: number of bit indices to scan.
    """
    print(heading, file=out)
    for bit in range(n_bits):
        for mol, bit_info in zip(mols, bit_infos):
            # Skip molecules that do not set this bit instead of relying on the
            # draw call failing for them.
            if bit not in bit_info:
                continue
            try:
                drawing = Draw.DrawMorganBit(mol, bit, bit_info, whichExample=0, useSVG=True)
                # Read the SVG text before writing anything, so a failure here
                # cannot leave a header without a picture.
                svg_text = drawing.data
            except Exception:
                # Best effort, as before: if this molecule cannot be drawn,
                # try the next one that has the bit.
                continue
            print(f"<h3> {label}{bit} </h3>", file=out)
            print(svg_text, file=out)
            break


# Mode 'w' (not 'a'): re-running the cell rewrites the gallery instead of
# appending a second copy of it to the same file.
with open(f'fp-{NBITS}-{nB}.html', 'w') as hmtl:
    _write_bit_gallery(hmtl, '<h1> CATIONS </h1>', 'C', mollc, moldclist, NBITS)
    _write_bit_gallery(hmtl, '<h1> ANIONS </h1> ', 'A', molla, moldalist, NBITS)
    

In [20]:
# Put the fingerprint columns next to the property table, then discard the
# SMILES text and the RDKit molecule objects before exporting.
columns_to_discard = ['SMILES', 'Molecule Anion', 'Molecule Cation']
dane = pd.concat([data, fpdf], axis=1).drop(columns=columns_to_discard)

# Semicolon-separated export; the file name records the fingerprint settings.
dane.to_csv(f'dane-{NBITS}-{nB}.csv', sep=';')
dane.head(3)

Unnamed: 0,IL,density,a0,a1,a2,a3,a4,a5,a6,a7,...,c4086,c4087,c4088,c4089,c4090,c4091,c4092,c4093,c4094,c4095
0,"[azp-2o1,1][ntf2]",1415.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[azp-2o1,1][otf]",1291.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[azp-2o1,1][tfa]",1217.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
