In [17]:
from matplotlib import pyplot
import numpy as np
import csv
import sqlite3
import pandas as pd

In [60]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.kernel_ridge import KernelRidge

In [19]:
from rdkit import Chem
from rdkit.Chem import MACCSkeys

In [55]:
def Transform_Data(filepath='../data/qm9.csv', train_size=75000, test_size=25000):
    """Build MACCS-key feature matrices and HOMO/LUMO targets from a CSV file.

    Each SMILES string returned by ``readCSV`` is parsed with RDKit. Molecules
    that fail to parse are skipped, together with their HOMO/LUMO values, so
    that row ``i`` of the features always matches entry ``i`` of the targets.
    The first ``train_size`` usable molecules form the training split and the
    next ``test_size`` form the test split; any remaining molecules are unused.

    Parameters
    ----------
    filepath : str
        CSV file passed to ``readCSV``.
    train_size : int
        Number of leading molecules placed in the training split.
    test_size : int
        Number of molecules, taken right after the training split, placed in
        the test split.

    Returns
    -------
    tuple
        ``(X, Z, w1, w2, y1, y2)`` where ``X``/``Z`` are the train/test
        fingerprint matrices, ``w1``/``w2`` are the *test* HOMO/LUMO targets
        and ``y1``/``y2`` are the *train* HOMO/LUMO targets. Note the test
        targets come before the train targets in the tuple.

    Side effects
    ------------
    Prints the shapes of ``X``, ``Z``, ``y1`` and ``w1``.
    """
    SMILESdata, HomoData, LumoData = readCSV(filepath)

    FeatureVector = []
    homo_values = []
    lumo_values = []
    for mol_id, smiles in SMILESdata.items():
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles returns None for SMILES it cannot parse; drop those
        # molecules from both the features and the targets.
        if mol is None:
            continue
        fpBits = MACCSkeys.GenMACCSKeys(mol).ToBitString()
        FeatureVector.append(split(fpBits))
        # Collect targets in the same pass so they stay aligned with features.
        homo_values.append(HomoData[mol_id])
        lumo_values.append(LumoData[mol_id])

    split_end = train_size + test_size
    X = np.array(FeatureVector[:train_size])
    Z = np.array(FeatureVector[train_size:split_end])
    print(X.shape)
    print(Z.shape)

    y1 = np.array(homo_values[:train_size])
    y2 = np.array(lumo_values[:train_size])
    w1 = np.array(homo_values[train_size:split_end])
    w2 = np.array(lumo_values[train_size:split_end])
    print(y1.shape)
    print(w1.shape)
    return X, Z, w1, w2, y1, y2

def split(bitString):
    """Convert a string of digit characters into a list of ints, one per character."""
    return list(map(int, bitString))

def readCSV(filepath):
    """Read a CSV file into three dicts keyed by the ``mol_id`` column.

    The file must have a header row containing the columns ``mol_id``,
    ``smiles``, ``homo`` and ``lumo``.

    Returns
    -------
    tuple of dict
        ``(smiles, homo, lumo)``: ``smiles`` maps each ``mol_id`` to its
        SMILES string; ``homo`` and ``lumo`` map it to the corresponding
        column parsed as ``float``.
    """
    smiles_by_id, homo_by_id, lumo_by_id = {}, {}, {}
    with open(filepath, newline = '') as handle:
        for record in csv.DictReader(handle):
            mol_id = record['mol_id']
            smiles_by_id[mol_id] = record['smiles']
            homo_by_id[mol_id] = float(record['homo'])
            lumo_by_id[mol_id] = float(record['lumo'])
    return smiles_by_id, homo_by_id, lumo_by_id

In [57]:
# Smoke-run the data pipeline: Transform_Data prints the shapes of the
# train/test arrays; the arrays it returns are not kept here.
Transform_Data()
print()

(75000, 167)
(25000, 167)
(75000,)
(25000,)



In [65]:
def CreateModel():
    """Fit a Laplacian-kernel ridge regressor on HOMO values and report test metrics.

    The data comes from ``Transform_Data``. The model is trained on the
    training fingerprints against the training HOMO targets and evaluated
    against the held-out HOMO targets.

    Prints the raw predictions followed by the R^2, RMSE and MAE scores.
    Returns ``None``.

    NOTE(review): kernel ridge regression works on a dense
    n_train x n_train kernel matrix; with the default 75,000 training rows
    that is likely tens of GB of memory -- confirm this is feasible on the
    target machine or reduce the training size.
    """
    model = KernelRidge(kernel="laplacian")
    # Transform_Data returns (X_train, X_test, homo_test, lumo_test,
    # homo_train, lumo_train); only the HOMO targets are used here.
    X, Z, w1, _, y1, _ = Transform_Data()
    model.fit(X, y1)
    predict = model.predict(Z)
    print(predict)
    # scikit-learn metrics take (y_true, y_pred); the order matters for R^2.
    R_squared = r2_score(w1, predict)
    # mean_squared_error returns the MSE, so take the root to get the RMSE.
    RMSE = np.sqrt(mean_squared_error(w1, predict))
    MAE = mean_absolute_error(w1, predict)
    print("The R^2 score is: ", R_squared)
    print("The RMSE score is: ", RMSE)
    print("The MAE score is: ", MAE)

In [None]:
# CreateModel calls Transform_Data itself, so the array shapes are printed
# again before the model is fitted.
CreateModel()

(75000, 167)
(25000, 167)
(75000,)
(25000,)
