In [8]:
from sklearn.kernel_ridge import KernelRidge
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import sqlite3
import numpy as np
import csv
import math
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from sklearn.model_selection import cross_validate

In [18]:
def Transform_Data(filepath='../../data/qm9.csv', n_train=5000, n_test=5000):
    """Build fingerprint features and gap targets, split into train and test sets.

    Reads SMILES strings and gap values with ``readCSV``, converts every
    parsable molecule to a MACCS-keys bit vector, and slices the first
    ``n_train`` samples for training and the following ``n_test`` samples
    for testing.

    Parameters
    ----------
    filepath : str
        CSV file handed to ``readCSV``.
    n_train : int
        Number of leading samples used as the training set.
    n_test : int
        Number of samples taken right after the training slice as the test set.

    Returns
    -------
    tuple
        ``(X, Z, w3, y3)``: training features, test features, test targets,
        training targets -- note that the test targets come *before* the
        training targets in the returned tuple.
    """
    SMILESdata, GapData = readCSV(filepath)

    # Features and targets are collected side by side so they stay aligned
    # without having to delete entries from the gap dictionary afterwards.
    FeatureVector = []
    targets = []
    for mol_id, smiles in SMILESdata.items():
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # The SMILES string could not be turned into a molecule; drop
            # both its features and its target.
            continue
        fp = MACCSkeys.GenMACCSKeys(mol)
        FeatureVector.append(split(fp.ToBitString()))
        targets.append(GapData[mol_id])

    split_end = n_train + n_test
    X = np.array(FeatureVector[:n_train])
    Z = np.array(FeatureVector[n_train:split_end])
    print(X.shape)
    print(Z.shape)

    y3 = np.array(targets[:n_train])
    w3 = np.array(targets[n_train:split_end])
    return X, Z, w3, y3

In [19]:
def split(bitString):
    """Convert a string of digit characters into a list of integers.

    Each character of ``bitString`` is passed to ``int`` individually, so
    ``"0110"`` becomes ``[0, 1, 1, 0]``.
    """
    return list(map(int, bitString))

def readCSV(filepath):
    """Load SMILES strings and gap values from a CSV file, keyed by molecule id.

    The file must have a header row containing the columns ``mol_id``,
    ``smiles`` and ``gap``.

    Returns
    -------
    tuple of dict
        ``(smiles_by_id, gap_by_id)``: the first maps ``mol_id`` to the SMILES
        string, the second maps ``mol_id`` to the gap converted to ``float``.
    """
    smiles_by_id = {}
    gap_by_id = {}

    with open(filepath, newline = '') as handle:
        for record in csv.DictReader(handle):
            mol_id = record['mol_id']
            smiles_by_id[mol_id] = record['smiles']
            gap_by_id[mol_id] = float(record['gap'])

    return smiles_by_id, gap_by_id
    

In [26]:
def Create_Model(plot=False):
    """Fit a Laplacian-kernel ridge regressor on the gap data and report test metrics.

    The train/test arrays come from ``Transform_Data``. The model is fitted on
    the training split, evaluated on the test split, and R^2, RMSE and MAE
    are printed.

    Parameters
    ----------
    plot : bool
        When true, show a scatter plot of the test targets against the
        predictions. Off by default so the function stays non-interactive.

    Returns
    -------
    dict
        The computed metrics under the keys ``'r2'``, ``'rmse'`` and ``'mae'``.
    """
    model3 = KernelRidge(kernel='laplacian')
    X, Z, w3, y3 = Transform_Data()
    print("X: ", X)
    print("y3: ", y3)
    model3.fit(X, y3)
    predict3 = model3.predict(Z)

    if plot:
        # x-axis carries the true test targets, y-axis the model predictions;
        # the labels are set to match what is actually plotted.
        pyplot.scatter(w3, predict3)
        pyplot.xlabel('Test Data')
        pyplot.ylabel('Predict Data')
        pyplot.title('Gap')
        pyplot.show()

    R_squared3 = r2_score(w3, predict3)
    RMSE3 = math.sqrt(mean_squared_error(w3, predict3))
    MAE3 = mean_absolute_error(w3, predict3)
    print("The R^2 score of GAP model is: ",R_squared3)
    print("The RMSE score of GAP model is: ", RMSE3)
    print("The MAE score of GAP model is: ", MAE3)

    # Hand the metrics back so callers can use them without parsing stdout.
    return {'r2': R_squared3, 'rmse': RMSE3, 'mae': MAE3}

In [27]:
# Entry point: run the train/evaluate pipeline only when __name__ is
# "__main__", so importing these definitions elsewhere does not trigger it.
if __name__ == "__main__":
    Create_Model()

(5000, 167)
(5000, 167)
X:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]]
y3:  [0.5048 0.3399 0.3615 ... 0.1936 0.1738 0.1668]
The R^2 score of GAP model is:  0.799187862981124
The RMSE score of GAP model is:  0.021262721519055183
The MAE score of GAP model is:  0.015984104240918173
