In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from numpy import ndarray
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# tensorflow related libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler

# loading SMILES data using Chainer Chemistry
from chainer_chemistry.datasets.molnet import get_molnet_dataset
from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset
from chainer_chemistry.dataset.preprocessors import GGNNPreprocessor, construct_atomic_number_array

# import necessary libraries
import os
import glob

from rdkit import Chem

In [None]:
# Collect every CSV file of the current generation from the training-set folder.
# The list is sorted because glob returns files in an unspecified order, and the
# row order of data_gen feeds the seeded train/test split further down.
path = './../data/trainingsets/highgap_outliergen_trans1/'
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# Read each file once and concatenate a single time at the end; calling
# pd.concat inside the loop re-copies all previously loaded rows every time.
frames = []
for f in csv_files:
    # read the csv file
    df = pd.read_csv(f)
    frames.append(df)
    # print the location and filename
    print('Location:', f)
    # os.path.basename honours the platform separator; splitting on "\\"
    # only isolates the file name on Windows.
    print('File Name:', os.path.basename(f))

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contains no CSV file.
data_gen = pd.concat(frames, axis=0) if frames else pd.DataFrame({})

In [None]:
# Canonicalise every generated SMILES with RDKit and keep only the rows that
# RDKit can parse.
preprocessor = GGNNPreprocessor()
#atom_num = construct_atomic_number_array()
#data_gen = pd.read_csv('./outliers17.csv')
data_gen0 = data_gen.copy()  # copy of the frame taken before any row is dropped

gen_smiles = []  # canonical SMILES of the rows that parsed
idx = []         # positional indices of those rows in data_gen
for i, smile in enumerate(data_gen['SMILES']):
    try:
        mol = Chem.MolFromSmiles(smile, sanitize=True)
        # MolFromSmiles signals an unparsable string by returning None
        # rather than raising, so check for it explicitly.
        canonical = None if mol is None else Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Catch Exception (not a bare except) so Ctrl-C / kernel interrupts
        # still stop the loop; e.g. a non-string entry ends up here.
        canonical = None
    if canonical is None:
        # report the entry that is being dropped
        print (smile)
        continue
    gen_smiles.append(canonical)
    idx.append(i)

# dtype=int keeps the indexer valid even when no SMILES parsed
# (np.array([]) would otherwise be a float array).
idx = np.array(idx, dtype=int)
data_gen = data_gen.iloc[idx].reset_index(drop=True)

In [None]:
# Compare the canonical SMILES computed above with the SMILES stored in the
# frame; True means every stored SMILES was already in canonical form.
print ('Test all gen smiles are canonical:', 
       sum(gen_smiles==data_gen['SMILES'])==data_gen.shape[0])
print (gen_smiles[0])

#data_gen = data_gen.drop_duplicates(subset=['SMILES'], keep='first').reset_index(drop=True)
# From here on gen_smiles is the SMILES column of the frame (a Series).
gen_smiles = data_gen['SMILES']

# Pick whichever gap column the CSV files provide. An explicit column check
# replaces the bare `except:`, which would also have hidden unrelated errors.
if 'DFT_gap' in data_gen.columns:
    DFT_gap = data_gen['DFT_gap']
else:
    DFT_gap = data_gen['HOMO-LUMO gap']

In [None]:
# use glob to get all the csv files from previous generation
# put the previous files in the same folder
# NOTE(review): this is the same folder that the current generation is loaded
# from earlier in the notebook -- confirm that this is intended.
path = './../data/trainingsets/highgap_outliergen_trans1/'
# sorted: glob returns files in an unspecified order
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# Read each file once and concatenate a single time at the end instead of
# re-copying the accumulated frame on every iteration.
previous_frames = []
for f in csv_files:
    # read the csv file
    df = pd.read_csv(f)
    previous_frames.append(df)
    # print the location and filename
    #print('Location:', f)
    # os.path.basename works with both "/" and "\\" separators; splitting on
    # "\\" only isolates the file name on Windows.
    print('File Name:', os.path.basename(f))

# pd.concat raises on an empty list, so fall back to an empty frame.
data_gen_previus = (pd.concat(previous_frames, axis=0)
                    if previous_frames else pd.DataFrame({}))

data_gen = data_gen.reset_index(drop=True)

In [None]:
# find the repetitives from train
#previous_rep = pd.merge(data_gen, data_gen_previus, on = 'SMILES', how = 'inner')
#print ("Same generated SMILES compared to pubqc: \n{}".format(previous_rep))

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# is executed. It reads `previous_rep`, whose only visible definition is itself
# commented out, so it cannot be re-enabled without restoring that merge first.
"""
rep_smiles = previous_rep['SMILES']
for i in range(data_gen.shape[0]):
    if (data_gen['SMILES'].loc[i] in list(rep_smiles)):
        print (i)
        data_gen = data_gen.drop(i)
"""

In [None]:
#data_gen = data_gen.drop_duplicates(subset=['SMILES'], keep='first').reset_index(drop=True)

# Re-derive the SMILES and gap columns from the (re-indexed) frame.
gen_smiles = data_gen['SMILES']

# Prefer the DFT gap and fall back to the predicted gap when the DFT column is
# absent. An explicit column check replaces the bare `except:`, which would
# also have hidden unrelated errors.
if 'DFT_gap' in data_gen.columns:
    DFT_gap = data_gen['DFT_gap']
else:
    DFT_gap = data_gen['pred_gap']

In [None]:
# Sanity check: print the frame shape followed by the lengths of the SMILES and
# gap columns, one value per line.
for size in (data_gen.shape, len(gen_smiles), len(DFT_gap)):
    print (size)

In [None]:
# Restore the two tokenizer pickles stored in the 60000_train_regular_pubqc folder.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.

# tokenizer_ : the object whose texts_to_sequences() is used for encoding below
with open('./../data/trainingsets/60000_train_regular_pubqc/tokenizer_object.pickle', 'rb') as tokenizer_object_file:
    tokenizer_ = pickle.load(tokenizer_object_file)

# tokenizer : the second pickled tokenizer artefact
with open('./../data/trainingsets/60000_train_regular_pubqc/tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
# One-hot encode every generated SMILES: canonicalise, append the '.' end
# marker, tokenise, pad to 40 tokens, expand to 27 classes and add a trailing
# channel axis, giving one (40, 27, 1) array per molecule.
encoded_smiles = []
for smile in gen_smiles:
    print (smile)
    mol = Chem.MolFromSmiles(smile, sanitize=True)
    canonical_with_dot = Chem.MolToSmiles(mol, canonical=True) + '.'
    token_ids = tokenizer_.texts_to_sequences([canonical_with_dot])
    padded_ids = pad_sequences(token_ids, maxlen = 40, padding = 'post')
    one_hot = to_categorical(padded_ids, num_classes=27)
    # drop the leading batch axis of size 1 and append a channel axis
    SHAPE = list(one_hot.shape[1:])+[1]
    encoded_smiles.append(one_hot.reshape(SHAPE))

# stack the per-molecule arrays into a single array
X_smiles = np.array (encoded_smiles)

In [None]:
# Distribution of the gap values of the generated molecules (100 bins)
sns.histplot(data=DFT_gap, bins=100)

In [None]:
#val_accurate.to_csv('gen_new_noscreen_all_joback.csv', index = False)
preprocessor = GGNNPreprocessor()

# NOTE: pickle.load can execute arbitrary code -- only load files you trust.

# pubqc reference set: encoded SMILES, SMILES strings and gap values
with open('./../data/trainingsets/image.pickle', 'rb') as image_file:
    pubqc_bundle = pickle.load(image_file)
X_smiles_pubqc, SMILES_pubqc, gap_pubqc = pubqc_bundle

# tokenizer artefacts stored in the 60000_train_regular_pubqc folder
with open('./../data/trainingsets/60000_train_regular_pubqc/tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
# entry 0 is set to a blank -- presumably the padding index; TODO confirm
tokenizer[0] = ' '

with open('./../data/trainingsets/60000_train_regular_pubqc/tokenizer_object.pickle', 'rb') as tokenizer_object_file:
    tokenizer_ = pickle.load(tokenizer_object_file)

In [None]:
# Canonicalise the pubqc SMILES so they can be matched against the generated
# SMILES when looking for duplicates.
SMILES_pubqc_can = []
kept_idx = []  # positions of the pubqc entries that were canonicalised
for i, s in enumerate(SMILES_pubqc):
    try:
        # s[:-1] drops the last character of the stored string (the '.' marker)
        m = Chem.MolFromSmiles (s[:-1])
        if m is None:
            # MolFromSmiles returns None for a string it cannot parse
            print ('Could not parse pubqc SMILES:', s)
            continue
        ss = Chem.MolToSmiles(m)
    except Exception as error:
        print (error)
        continue
    SMILES_pubqc_can.append(ss)
    kept_idx.append(i)
#SMILES = SMILES.astype('str')
print ('First SMILES in pubqc', SMILES_pubqc[0])
print (np.array(SMILES_pubqc).shape)
print (SMILES_pubqc.shape)
SMILES_pubqc_can = np.array(SMILES_pubqc_can)

# Keep only the gap values of the entries that were canonicalised. Without this
# a single skipped SMILES leaves the two columns with different lengths (or
# pairs SMILES with the wrong gap).
gap_pubqc_kept = np.asarray(gap_pubqc)[np.array(kept_idx, dtype=int)]

data_pubqc = pd.DataFrame({'SMILES': SMILES_pubqc_can, 'gap': gap_pubqc_kept})

In [None]:
# Inner join on SMILES: the rows of data_gen whose SMILES also appear in data_pubqc
database_samples_rep = data_gen.merge(data_pubqc, on = 'SMILES', how = 'inner')
print ( "Same generated SMILES compared to pubqc: \n{}".format(database_samples_rep))

In [None]:
# Positions (0-based, in gen_smiles order) of the generated SMILES that also
# occur in pubqc.
rep_smiles = database_samples_rep['SMILES']

# Build the lookup once; the original rebuilt list(rep_smiles) and scanned it
# linearly for every generated molecule.
rep_smiles_lookup = set(rep_smiles)

# enumerate() walks the Series positionally, which matches how the resulting
# indices are used to slice the X_smiles array, and does not depend on the
# Series carrying a 0..n-1 index.
rep_idx = [i for i, smile in enumerate(gen_smiles) if smile in rep_smiles_lookup]

In [None]:
# Positions of the generated molecules that are NOT duplicated in pubqc:
# every position minus the duplicate positions collected in rep_idx.
all_positions = list(range(len(gen_smiles)))
idx = np.setdiff1d(all_positions, rep_idx)
idx.shape

In [None]:
# Set to False to keep the molecules that also occur in pubqc.
# The original cell assigned the "keep" variant and then immediately overwrote
# it with the "remove" variant (copying X_smiles for nothing), and the "keep"
# variant never defined DFT_gap_norep0, which the next cell needs.
REMOVE_REPLICATES = True

if REMOVE_REPLICATES:
    # idx: positions of the non-duplicated molecules. .iloc selects the Series
    # positionally, consistent with the positional slice of the X_smiles array.
    X_smiles_norep = X_smiles[idx]
    gen_smiles_norep = gen_smiles.iloc[idx]
    DFT_gap_norep0 = DFT_gap.iloc[idx]
else:
    X_smiles_norep = X_smiles.copy()
    gen_smiles_norep = gen_smiles.copy()
    DFT_gap_norep0 = DFT_gap.copy()

# DFT_gap_norep0 keeps the unrounded values (a Series);
# DFT_gap_norep holds the same values rounded to 2 decimals as a NumPy array.
DFT_gap_norep = np.round(np.asarray(DFT_gap_norep0), 2)

In [None]:
# Export the molecules whose rounded gap is at least 9.8 to ./temp.csv.
# The mask is a plain boolean array, so both Series are filtered positionally.
high_gap_mask = np.asarray(DFT_gap_norep) >= 9.8
higher_9p8_gap =  DFT_gap_norep0[high_gap_mask]      # unrounded gap values
higher_9p8_SMILES =  gen_smiles_norep[high_gap_mask]
temp = pd.DataFrame({})
temp['SMILES'] = higher_9p8_SMILES
temp['DFT_gap'] = higher_9p8_gap
temp.to_csv('./temp.csv', index=False)

gen_smiles_norep = gen_smiles_norep.reset_index(drop=True)
# DFT_gap_norep is built as a NumPy array, which has no reset_index(); the
# original call raised AttributeError. np.asarray yields the same positional
# 0..n-1 layout whether the input is an array or a Series.
DFT_gap_norep = np.asarray(DFT_gap_norep)

In [None]:
# Append the '.' end marker to every non-duplicated SMILES
gen_smiles0 = np.array([smile + '.' for smile in gen_smiles_norep])

In [None]:
# The same SMILES with the last character (the '.' marker) removed again
SMILES_nodot = np.array ([dotted[:-1] for dotted in gen_smiles0])

In [None]:
# subsampling: seeded random 80/20 split into train and test
np.random.seed(420)
n_samples = len(DFT_gap_norep)

# train positions are drawn without replacement; test gets every remaining position
idx = np.random.choice(n_samples, int(n_samples * 0.8), replace = False)
idx_test = np.setdiff1d(list(range(n_samples)), idx)

X_smiles_train, X_smiles_test = X_smiles_norep[idx], X_smiles_norep[idx_test]
SMILES_train, SMILES_test = gen_smiles0[idx], gen_smiles0[idx_test]
y_train, y_test = DFT_gap_norep[idx], DFT_gap_norep[idx_test]

In [None]:
# Overlay the train and test gap distributions (as percentages) on one axes.
# Labels plus a legend are needed to tell the two histograms apart.
ax = sns.histplot(y_train, bins=50, stat='percent', label='train')
sns.histplot(y_test, bins=50, stat='percent', label='test', ax=ax)
ax.legend();

In [None]:
# need DFT evaluation for creating the dataset.

# Write the full set, the train split and the test split, each as a
# (encoded SMILES, SMILES strings, gap values) tuple.
for pickle_path, payload in (
    ('./../data/trainingsets/highgap_outliergen_trans1/image.pickle',
     (X_smiles_norep, gen_smiles0, DFT_gap_norep)),
    ('./../data/trainingsets/highgap_outliergen_trans1/image_train.pickle',
     (X_smiles_train, SMILES_train, y_train)),
    ('./../data/trainingsets/highgap_outliergen_trans1/image_test.pickle',
     (X_smiles_test, SMILES_test, y_test)),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

