# Training OPEP2 

This notebook is intended for users to retrain the RF/NN ensemble model, referred to as OPEP2, to predict the PCE of non-fullerene acceptor (NFA) and donor pairs for organic solar cells. Previous calculations on each acceptor and donor are required, such as GFN2-xTB, sTD-DFT-xTB, and single-point xTB for solvation in water and hexane.

Import packages

In [None]:
import math
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import linalg
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

## Descriptors

### Parse GFN2-xTB output file for polarizability and dipole moment

Polarizability is in units of $au^3$ and dipole moment is in units of Debye

In [None]:
def parse_GFN2(filename, data):
    '''
    Parses a GFN2-xTB output file for the dipole moment and the polarizability

    Parameters
    -----------
    filename: str
        path to output file
    data: list
        list of descriptors to add to

    Returns
    -------
    data: list
        the same list, extended in place with [dipole_moment, polarizability]

    Raises
    ------
    ValueError
        if the dipole moment or the polarizability cannot be located in the file
    '''
    dipole_moment = None
    polarizability = None

    with open(filename, 'r', encoding = 'utf-8') as file:
        for line in file:
            if 'molecular dipole' in line:
                # the value is the last field on the third line after the header
                for _ in range(3):
                    line = next(file, '')
                fields = line.split()
                if fields:
                    dipole_moment = float(fields[-1])

            elif 'Mol. C8AA' in line:
                # the value is the last field on the line following 'Mol. C8AA'
                fields = next(file, '').split()
                if fields:
                    polarizability = float(fields[-1])

    # fail with a clear message instead of an UnboundLocalError further down
    if dipole_moment is None or polarizability is None:
        raise ValueError('Could not find dipole moment and/or polarizability in ' + str(filename))

    data.extend([dipole_moment, polarizability])

    return data

### Parse sTD-DFT-xTB output files (*.stda) for energy levels and absorption spectrum

Extracts/computes the following descriptors:  
1. HOMO (eV)
2. HOMO-1 (eV)
3. LUMO (eV)
4. LUMO+1 (eV)
5. Fundamental Band Gap (eV) (LUMO-HOMO)
6. deltaHOMO - difference in energy between HOMO and HOMO-1 (eV)
7. deltaLUMO - difference in energy between LUMO and LUMO+1 (eV)
8. Optical Band Gap (eV) - energy of the first transition, among the first 12 transitions, with an oscillator strength greater than 0.5 (falls back to the first transition if none qualifies)
9. Max abs (nm) - transition with the strongest absorption
10. summed_oscs - Sum of oscillator strengths (unitless)
11. area_spectra - area under the absorption spectrum curve using trapezoidal rule integration
12. area_sim_solar_spectra - area under the curve of the simulated spectrum multiplied by the normalized solar spectrum
13. chemical_potential (eV): $(HOMO+LUMO)/2$
14. electrophilicity (eV): $\mu^2 / (2\eta)$, where $\mu$ is the chemical potential and $\eta = LUMO - HOMO$


In [None]:
def parse_sTDA(filename, data):
    '''
    Parses through sTD-DFT-xTB output files

    Parameters
    -----------
    filename: str
        path to output file
    data: list
        list of descriptors to add to

    Returns
    -------
    data: list
        the same list, extended in place with (in this order) HOMO, HOMOminus1,
        LUMO, LUMOplus1, fundbg, deltaHOMO, deltaLUMO, opt_bg, max_abs,
        summed_oscs, area_spectra, area_sim_solar_spectra, chemical_potential,
        electrophilicity

    Raises
    ------
    ValueError
        if the frontier-orbital section or the excitation table is missing
    '''
    excitation_header = 'excitation energies, transition moments and TDA amplitudes'

    oscs = []
    wavelength = []
    energyEV = []
    HOMO = HOMOminus1 = LUMO = LUMOplus1 = None

    with open(filename, 'r', encoding = 'utf-8') as file:
        line = file.readline()
        while line:
            if 'ordered frontier orbitals' in line:
                # HOMO-1 is read from the 11th line after the section header
                for _ in range(11):
                    line = file.readline()
                HOMOminus1 = float(line.split()[1])

                line = file.readline()
                HOMO = float(line.split()[1])

                # one line between the HOMO and LUMO rows is skipped without parsing
                file.readline()
                line = file.readline()
                LUMO = float(line.split()[1])

                line = file.readline()
                LUMOplus1 = float(line.split()[1])

            elif excitation_header in line:
                # skip the line after the header, then read rows until a blank line or EOF
                file.readline()
                line = file.readline()
                while line.strip():
                    fields = line.split()
                    oscs.append(float(fields[3]))
                    wavelength.append(float(fields[2]))
                    energyEV.append(float(fields[1]))
                    line = file.readline()

            line = file.readline()

    if HOMO is None:
        raise ValueError('No frontier orbital section found in ' + str(filename))
    if len(oscs) == 0:
        # previously this only printed a message and returned None, leaving `data`
        # short and silently misaligned with the expected columns
        raise ValueError('No excitations found in ' + str(filename))

    deltaHOMO = abs(HOMOminus1 - HOMO)
    deltaLUMO = abs(LUMO - LUMOplus1)
    fundbg = abs(HOMO - LUMO)

    chemical_potential = (HOMO + LUMO)/2
    hardness = LUMO - HOMO
    # https://xtb-docs.readthedocs.io/en/latest/sp.html#global-electrophilicity-index
    # mu^2 / (2 * eta): the parentheses matter, `x / 2*eta` evaluates as (x / 2) * eta
    electrophilicity = chemical_potential**2 / (2 * hardness)

    summed_oscs = np.sum(oscs)

    # Opt bg is the energy of the first transition within the first 12 transitions
    # with an oscillator strength greater than 0.5 (defaults to the first transition)
    opt_bg = round(energyEV[0], 2)
    for i in range(min(12, len(oscs))):
        if oscs[i] > 0.5:
            opt_bg = round(energyEV[i], 2)
            break

    # max abs is the tallest peak in the spectrum (first one wins on ties)
    strongest = max(range(len(oscs)), key=oscs.__getitem__)
    max_abs = wavelength[strongest]

    # Creates full spectrum
    (spectraEV, spectraNM, spectraIntensity) = spectra(energyEV, oscs)

    # Calculates the area under the curve using trapz rule for integration
    # (the sample points are given explicitly, so no dx is needed)
    area_spectra = np.trapz(spectraIntensity, spectraNM)

    # Calculates the area under the curve of the simulated spectrum multiplied by the normalized solar spectrum
    area_sim_solar_spectra = solar_integrated_desc(spectraNM, spectraIntensity)

    outputs = [HOMO, HOMOminus1, LUMO, LUMOplus1, fundbg, deltaHOMO, deltaLUMO, opt_bg, max_abs, summed_oscs, area_spectra, area_sim_solar_spectra, chemical_potential, electrophilicity]
    data.extend(outputs)

    return data

### Calculate the integration of the absorption spectra using trapezoidal integration

Multiplies the molecule's absorption spectrum by the normalized solar spectrum. This new curve is then integrated with trapezoidal integration, and the resulting area is the descriptor used.

In [None]:
def spectra(etens, etoscs, low = 0.5, high = 10.0, resolution = 0.01, smear = 0.04):
    """
    Build a Lorentzian-broadened spectrum on a regular energy grid

    Parameters
    ----------
    etens: list
        transition energies in units of eV
    etoscs: list
        oscillator strengths, one per transition
    low: float
        first grid energy in eV
    high: float
        upper end of the grid in eV
    resolution: float
        grid spacing in eV
    smear: float
        base broadening; the width used for each peak is scaled by its
        oscillator strength

    Returns
    -------
    Three lists of equal length: the grid in eV, the grid converted to nm,
    and the summed intensity at each grid point
    """
    n_points = int((high - low) / resolution) + 1

    grid_eV = []
    grid_nm = []
    grid_intensity = []
    for step in range(n_points):
        grid_energy = float(step * resolution + low) # units of eV

        # sum the contribution of every transition at this grid energy
        total = 0.0
        for k in range(len(etens)):
            width = smear / 0.2 * (-0.046 * etoscs[k] + 0.20)
            offset = etens[k] - grid_energy
            total = total + etoscs[k] * width**2 / (offset**2 + width**2)

        grid_eV.append(grid_energy)
        # NOTE(review): a photon wavelength in nm is normally 1239.84193 / E(eV);
        # this multiplies instead. Left as-is because every spectrum-derived
        # descriptor depends on the current values - confirm the intent.
        grid_nm.append(grid_energy * 1239.84193)
        grid_intensity.append(total)

    return grid_eV, grid_nm, grid_intensity

def custom_round(x, base=5):
    """Round x to the nearest multiple of base and return it as a float"""
    n_multiples = round(float(x) / base)
    return float(base * n_multiples)

def solar_integrated_desc(spectraNM, spectraIntensity):
    """
    Area under the simulated spectrum weighted by the solar spectrum

    Each simulated wavelength is snapped to the wavelength grid of the solar
    spectrum file, the simulated intensity is multiplied by the solar value
    found there, and the product curve is integrated with the trapezoidal rule.

    Parameters
    ----------
    spectraNM: list
        wavelengths of the simulated spectrum in nm
    spectraIntensity: list
        simulated intensity at each wavelength

    Returns
    -------
    float
        area under the solar-weighted spectrum
    """
    # indexed by the 'wavelength' column; the last column of each row is used as the solar value
    solar = pd.read_csv('Solar_radiation_spectrum.csv', index_col = 'wavelength')
    weighted_intensities = []

    # the 1.5AM solar spectra does not have constant increments of wavelengths
    for wavelength, intensity in zip(spectraNM, spectraIntensity):
        if 280 <= wavelength < 400:
            grid_step = 0.5
        elif 400 <= wavelength < 1700:
            grid_step = 1
        elif 1700 <= wavelength < 1702:
            grid_step = 2
        elif 1702 <= wavelength <= 4000:
            grid_step = 5
        else:
            grid_step = None

        if grid_step is None:
            # outside the tabulated 280-4000 nm range: give the point zero weight
            # instead of silently reusing the solar value of the previous point
            weighted_intensities.append(0.0)
            continue

        solar_intensity = solar.loc[custom_round(wavelength, grid_step)].iloc[-1]
        weighted_intensities.append(float(solar_intensity) * intensity)

    # sample points are passed explicitly, so no dx is needed
    area_altered_spectra = np.trapz(weighted_intensities, spectraNM)

    return area_altered_spectra

### Descriptor to calculate the overlap between an acceptor and donor absorption spectra

Multiplies the simulated absorption spectra of the donor and acceptors using the sTD-DFT-xTB stda files. When multiplied, the only peaks left are where there is absorption overlap. For an ideal OSC, we want the materials to absorb in different regions to maximize the number of photons absorbed from the sun. This overlap is integrated and the area is the descriptor used for the models. Theoretically, the smaller the area, the higher the PCE.

Simpson's integration was tried but sometimes led to a negative area. Trapezoidal integration fit our type of data better.

In [None]:
def _read_transitions(path):
    """Return (energies in eV, oscillator strengths) read from the excitation table of an .stda file"""
    energies = []
    strengths = []
    with open(path, 'r', encoding = 'utf-8') as file:
        line = file.readline()
        while line:
            if 'excitation energies, transition moments and TDA amplitudes' in line:
                # skip the line after the header, then read rows until a blank line or EOF
                file.readline()
                line = file.readline()
                while line.strip():
                    fields = line.split()
                    strengths.append(float(fields[3]))
                    energies.append(float(fields[1]))
                    line = file.readline()
            line = file.readline()
    return energies, strengths


def overlap(acceptor, donor, data):
    """
    Area of the overlap between the acceptor and donor simulated spectra

    Both spectra are simulated on the same default grid, multiplied point by
    point, and the product is integrated with the trapezoidal rule.

    Parameters
    ----------
    acceptor: str
        path to the acceptor sTD-DFT-xTB output file
    donor: str
        path to the donor sTD-DFT-xTB output file
    data: list
        list of descriptors to add to

    Returns
    -------
    data: list
        the same list, extended in place with the overlap area
    """
    acc_energies, acc_oscs = _read_transitions(acceptor)
    don_energies, don_oscs = _read_transitions(donor)

    # Creates full spectra
    (acc_spectraEV, acc_spectraNM, acc_spectraIntensity) = spectra(acc_energies, acc_oscs)
    (don_spectraEV, don_spectraNM, don_spectraIntensity) = spectra(don_energies, don_oscs)

    # only regions where both molecules absorb survive the multiplication
    overlapped_spectra_intensities = [
        don_value * acc_value
        for don_value, acc_value in zip(don_spectraIntensity, acc_spectraIntensity)
    ]

    # sample points are passed explicitly, so no dx is needed
    area_altered_spectra = np.trapz(overlapped_spectra_intensities, don_spectraNM)

    data.extend([area_altered_spectra])

    return data

### Calculates the number of atoms in the conjugation path

In [None]:
def getPiSystemSize(mol):
    """Return the number of atoms in the largest pi system found by pi_system, after removing hydrogens and kekulizing"""
    heavy_mol = AllChem.RemoveHs(mol)
    AllChem.Kekulize(heavy_mol)
    # grow a pi system from every atom and keep the largest one
    candidates = [
        pi_system(heavy_mol, atom.GetIdx(), [atom.GetIdx()])
        for atom in heavy_mol.GetAtoms()
    ]
    return len(max(candidates, key=len))

def pi_system(mol, current, seen):
    """
    Recursively collect atom indices reachable from `current` through bonds
    that are conjugated or have a bond order greater than 1

    `seen` is extended in place with each newly visited atom index and is
    also returned.
    """
    start = mol.GetAtomWithIdx(current)
    for nbr in start.GetNeighbors():
        nbr_idx = nbr.GetIdx()
        if nbr_idx in seen:
            continue
        bond = mol.GetBondBetweenAtoms(start.GetIdx(), nbr_idx)
        if bond.GetIsConjugated() or bond.GetBondTypeAsDouble() > 1:
            seen.append(nbr_idx)
            pi_system(mol, nbr_idx, seen)
    return seen

def pi_sys_size(filename, molecule, data):
    """
    Append the size of the largest pi system of the molecule in a .mol file

    Parameters
    ----------
    filename: str
        path to the .mol file
    molecule: str
        not used by this function
    data: list
        list of descriptors to add to
    """
    structure = AllChem.MolFromMolFile(filename)
    data.extend([getPiSystemSize(structure)])
    return data

### Calculate planarity of pi system

https://github.com/rdkit/rdkit/tree/master/Contrib/PBF

Modified from 
J. Chem. Inf. Model. 2012, 52, 10, 2516–2525
https://pubs.acs.org/doi/10.1021/ci300293f

In [None]:
def GetBestFitPlane(pts, weights=None):
    """
    Fit a plane through a set of 3D points

    Returns a length-4 array [a, b, c, d]: (a, b, c) is the eigenvector of the
    point scatter matrix with the smallest eigenvalue and d = -normal . centroid.
    `weights` is accepted but not used.
    """
    n_pts = len(pts)

    # centroid of the points
    centroid = np.sum(pts, 0)
    centroid /= n_pts

    # symmetric 3x3 matrix of averaged coordinate products about the centroid
    scatter = np.zeros((3, 3), np.double)
    for pt in pts:
        offset = pt - centroid
        for i in range(3):
            scatter[i, i] += offset[i] * offset[i]
            for j in range(i + 1, 3):
                scatter[i, j] += offset[i] * offset[j]
                scatter[j, i] += offset[i] * offset[j]
    scatter /= n_pts

    eigvals, eigvecs = linalg.eigh(scatter)

    # the eigenvector belonging to the smallest eigenvalue is the plane normal
    smallest = np.argsort(eigvals)[0]
    normal = eigvecs[:, smallest]

    plane = np.zeros((4, ), np.double)
    plane[:3] = normal
    plane[3] = -1 * normal.dot(centroid)

    return plane


def PBFRD(mol, largest_pi_system, confId=-1):
    """
    Mean distance of the given atoms from their best-fit plane

    Returns 0 when the conformer is not 3D. The higher the number, the less
    planar the set of atoms is.
    """
    conformer = mol.GetConformer(confId)
    if not conformer.Is3D():
        return 0

    coords = np.array([list(conformer.GetAtomPosition(idx)) for idx in largest_pi_system])
    plane = GetBestFitPlane(coords)

    # length of the plane normal, used to turn the plane equation into a distance
    normal_length = np.dot(plane[:3], plane[:3])
    normal_length = normal_length**0.5

    # add up the distance from the plane for each point
    total = 0.0
    for pt in coords:
        total += np.abs(pt.dot(plane[:3]) + plane[3])

    total /= normal_length
    total /= len(coords)

    return total

def planarity(filename, data):
    """Append the planarity (PBFRD) of the largest pi system of the molecule in a .mol file to data"""
    heavy_mol = Chem.RemoveHs(Chem.MolFromMolFile(filename))
    Chem.Kekulize(heavy_mol)

    # grow a pi system from every atom and keep the largest one
    candidates = [
        pi_system(heavy_mol, atom.GetIdx(), [atom.GetIdx()])
        for atom in heavy_mol.GetAtoms()
    ]
    backbone = max(candidates, key=len)

    data.extend([PBFRD(heavy_mol, backbone)])

    return data

### RDKit descriptors

In [None]:
def rdkit_descriptors(filename, data):
    """
    Append six RDKit descriptors of the molecule in a .mol file to data, in the order:
    NumRotatableBonds, MolLogP, TPSA, NumHAcceptors, NumHDonors, RingCount
    """
    mol = Chem.MolFromMolFile(filename)
    calculators = (
        Descriptors.NumRotatableBonds,
        Descriptors.MolLogP,
        Descriptors.TPSA,
        Descriptors.NumHAcceptors,
        Descriptors.NumHDonors,
        Descriptors.RingCount,
    )
    data.extend([calculate(mol) for calculate in calculators])
    return data

## Morgan Fingerprint Counts

In [None]:
def numpy_2_fp(array):
    """Copy the values of a 1D array, cast to int, into a UIntSparseIntVect of the same length"""
    sparse_fp = DataStructs.cDataStructs.UIntSparseIntVect(len(array))
    for position, count in enumerate(array):
        sparse_fp[position] = int(count)
    return sparse_fp
data = []
def morgan_fp_counts(filename, data):
    """
    Append the 2048 hashed Morgan fingerprint counts (radius 2) of the molecule in a .mol file to data
    """
    mol = Chem.MolFromMolFile(filename)
    hashed_counts = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=2048)

    # NOTE(review): counts go through an int8 buffer - presumably no bit ever
    # exceeds 127 for these molecules; confirm
    counts = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(hashed_counts, counts)

    data.extend(list(numpy_2_fp(counts)))
    return data

### Solvation energy

https://xtb-docs.readthedocs.io/en/latest/gbsa.html

In [None]:
def solvation(filename, data):
    '''
    Parses an xTB output file for the solvation free energy

    Parameters
    -----------
    filename: str
        path to output file
    data: list
        list of descriptors to add to

    Returns
    -------
    data: list
        the same list, extended in place with the solvation energy

    Raises
    ------
    ValueError
        if no '-> Gsolv' line is present in the file
    '''
    solvation_energy = None

    with open(filename, 'r', encoding = 'utf-8') as file:
        for line in file:
            if '-> Gsolv' in line:
                # the value is the fourth whitespace-separated field; first match wins
                solvation_energy = float(line.split()[3])
                break

    # fail with a clear message instead of an UnboundLocalError
    if solvation_energy is None:
        raise ValueError('No Gsolv entry found in ' + str(filename))

    data.extend([solvation_energy])

    return data

### Offsets in energy levels

1. HOMO offset between donor and acceptor
2. LUMO offset between donor and acceptor
3. difference in energy between the donor's HOMO and the acceptor's LUMO (thought to be the energy of the CT state)

In [None]:
def _read_homo_lumo(path):
    """Return (HOMO, LUMO) read from the frontier-orbital section of an .stda file"""
    with open(path, 'r', encoding = 'utf-8') as file:
        for line in file:
            if 'ordered frontier orbitals' in line:
                try:
                    # the HOMO is read from the 12th line after the section header
                    for _ in range(12):
                        line = next(file, '')
                    homo = float(line.split()[1])
                    # one line is skipped without parsing, the next one holds the LUMO
                    next(file, '')
                    lumo = float(next(file, '').split()[1])
                except IndexError:
                    raise ValueError('Truncated frontier orbital section in ' + str(path))
                return homo, lumo
    raise ValueError('No frontier orbital section found in ' + str(path))


def energy_offsets(acceptor, donor, data):
    '''
    Energy-level offsets between a donor and an acceptor

    Parameters
    -----------
    acceptor: str
        path to the acceptor sTD-DFT-xTB output file
    donor: str
        path to the donor sTD-DFT-xTB output file
    data: list
        list of descriptors to add to

    Returns
    -------
    data: list
        the same list, extended in place with
        [donor HOMO - acceptor HOMO, donor LUMO - acceptor LUMO, acceptor LUMO - donor HOMO]

    Raises
    ------
    ValueError
        if the frontier-orbital section is missing or truncated in either file
    '''
    acc_HOMO, acc_LUMO = _read_homo_lumo(acceptor)
    don_HOMO, don_LUMO = _read_homo_lumo(donor)

    HOMO_offset = don_HOMO - acc_HOMO
    LUMO_offset = don_LUMO - acc_LUMO
    DonHOMO_accLUMO = acc_LUMO - don_HOMO

    data.extend([HOMO_offset, LUMO_offset, DonHOMO_accLUMO])

    return data

# Create dataframe with all descriptors

Build a new dataframe with one row per experimental donor-acceptor pair, containing the single-molecule descriptors of both molecules plus the pair descriptors (e.g. the HOMO offset between acceptor and donor).

Import the experimental dataset

In [None]:
# Experimental donor-acceptor pairs; the cell below reads the acceptor, donor, reference,
# Voc, Jsc, FF and PCE from this table by column position (1, 2, 4, 5, 6, 7, 8)
experimental = pd.read_csv('filtered_experimental_PCE_params.csv')

In [None]:
column_names = ['acceptor', 'donor', 'reference', 'Voc', 'Jsc', 'FF', 'PCE', 'A-HOMO', 'A-HOMOminus1', 'A-LUMO', 'A-LUMOplus1', 'A-fundbg', 'A-deltaHOMO', 'A-deltaLUMO', 'A-opt_bg', 'A-max_abs', 'A-summed_oscs', 'A-area_spectra', 'A-area_sim_solar_spectra', 'A-chemical_potential', 'A-electrophilicity', 'A-pi_sys_size', 'A-num_rot_bonds', 'A-MolLogP', 'A-TPSA', 'A-NumHAcceptors', 'A-NumHDonors', 'A-RingCount','A-planarity','A-dipole_moment', 'A-polarizability', 'A-SolvationEnergy_water', 'A-SolvationEnergy_hexane', 'D-HOMO', 'D-HOMOminus1', 'D-LUMO', 'D-LUMOplus1', 'D-fundbg', 'D-deltaHOMO', 'D-deltaLUMO', 'D-opt_bg', 'D-max_abs', 'D-summed_oscs', 'D-area_spectra', 'D-area_sim_solar_spectra', 'D-chemical_potential', 'D-electrophilicity', 'D-pi_sys_size', 'D-num_rot_bonds', 'D-MolLogP', 'D-TPSA', 'D-NumHAcceptors', 'D-NumHDonors', 'D-RingCount',  'D-planarity','D-dipole_moment', 'D-polarizability', 'D-SolvationEnergy_water', 'D-SolvationEnergy_hexane', 'AD-overlap', 'AD-HOMOoffset', 'AD-LUMOoffset', 'DHOMO_ALUMO_offset']

# add column names for 2048 bit morgan fingerprints
column_names.extend('A-ECFP_' + str(bit) for bit in range(2048))
column_names.extend('D-ECFP_' + str(bit) for bit in range(2048))


def _molecule_descriptors(role, name, row):
    """
    Append all single-molecule descriptors of `name` to `row`, in column order.

    `role` is the calculation sub-directory ('acceptors' or 'donors').
    Returns the paths of the .stda and .mol files, which the pair descriptors reuse.
    """
    # paths to the GFN2-xTB, sTD-DFT-xTB, and xTB calculations
    base = '../Calculations/' + role + '/'
    stda_path = base + 'sTDDFT-xTB/' + name + '.stda'
    mol_path = base + 'GFN2/' + name + '.mol'
    gfn2_path = base + 'GFN2/' + name + '.out'
    solv_water_path = base + 'xtb_solvation_water/' + name + '.out'
    solv_hexane_path = base + 'xtb_solvation_hexane/' + name + '.out'

    # sTD-DFT-xTB output: HOMO, HOMOminus1, LUMO, LUMOplus1, fundbg, deltaHOMO, deltaLUMO, opt_bg,
    # max_abs, summed_oscs, area_spectra, area_sim_solar_spectra, chemical_potential, electrophilicity
    parse_sTDA(stda_path, row)

    # optimized geometry (.mol): pi system size, RDKit descriptors, planarity
    pi_sys_size(mol_path, name, row)
    rdkit_descriptors(mol_path, row)
    planarity(mol_path, row)

    # GFN2-xTB output: dipole_moment, polarizability
    parse_GFN2(gfn2_path, row)

    # solvation free energy in water, then in hexane
    solvation(solv_water_path, row)
    solvation(solv_hexane_path, row)

    return stda_path, mol_path


# one list per successfully processed pair; the DataFrame is built once at the end
rows = []

for x in range(len(experimental)):
    record = experimental.iloc[x]
    acceptor = record.iloc[1]
    donor = record.iloc[2]
    reference = record.iloc[4]
    Voc = record.iloc[5]
    Jsc = record.iloc[6]
    FF = record.iloc[7]
    PCE = record.iloc[8]

    row = [acceptor, donor, reference, Voc, Jsc, FF, PCE]

    try:
        acc_stda, acc_mol = _molecule_descriptors('acceptors', acceptor, row)
        don_stda, don_mol = _molecule_descriptors('donors', donor, row)

        # calculate overlap between acceptor and donor
        overlap(acc_stda, don_stda, row) #area_altered_spectra

        # Energy offsets between donor and acceptor
        energy_offsets(acc_stda, don_stda, row)

        # For morgan fingerprint counts
        morgan_fp_counts(acc_mol, row)
        morgan_fp_counts(don_mol, row)

        # guards against a parser that silently added fewer values than expected
        if len(row) != len(column_names):
            raise ValueError('expected ' + str(len(column_names)) + ' values, got ' + str(len(row)))

    except Exception as err:
        # skip pairs with missing or unparsable calculations, but say why
        print(acceptor)
        print(donor)
        print('  skipped: ' + repr(err))
        continue

    rows.append(row)

data = pd.DataFrame(rows, columns = column_names)
data.head()

# Training OPEP2 model

Let's look at our dataset

In [None]:
# size of the full dataset
n_pairs = len(data)
print('Number of donor-acceptor pairs is: ' + str(n_pairs))

# keep only the pairs whose experimental PCE exceeds 10%
high_pce_mask = data['PCE'] > 10
data_highPCE = data[high_pce_mask]
n_high_pce_pairs = len(data_highPCE)
print('Number of donor-acceptor pairs with a PCE greater than 10% is: ' + str(n_high_pce_pairs))

We will first standardize the descriptors

In [None]:
# Select columns by name rather than by position: in the dataframe built above,
# 'PCE' is column 6 and the 56 continuous descriptors are columns 7-62, so the
# positional slices 7:8 / 8:64 / 64: would use 'A-HOMO' as the target.
descriptor_labels = data_highPCE.loc[:, 'A-HOMO':'DHOMO_ALUMO_offset'].columns
fingerprint_labels = [label for label in data_highPCE.columns if label.startswith(('A-ECFP_', 'D-ECFP_'))]

# .copy() so the standardization below does not write into a slice of data_highPCE
X_unstandardized = data_highPCE[descriptor_labels].copy()
y = data_highPCE[['PCE']]

#standardize
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the scaling - confirm this is acceptable
x_labels = X_unstandardized.columns
scaler = StandardScaler().fit(X_unstandardized[x_labels])
# despite its name, X_unstandardized holds the standardized values from here on
X_unstandardized[x_labels] = scaler.transform(X_unstandardized[x_labels])

# fingerprint counts are used as they are, without standardization
fps = data_highPCE[fingerprint_labels]
X = pd.concat([X_unstandardized, fps], axis=1)

Now we will randomly split the dataset into 80% training and 20% testing sets

In [None]:
# 80/20 split; random_state is fixed so the same rows land in each set on every run
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=14)

This is how we will evaluate the performance of the model, using cross-validation to calculate R2, MAE, and RMSE

In [None]:
def model_metrics(model, X, y, cv=5):
    """
    Cross-validated R^2, MAE and RMSE of a model, printed as mean +/- SEM

    Parameters
    ----------
    model: estimator
        scikit-learn estimator to evaluate
    X: array-like
        features
    y: array-like
        target; a single-column frame or array is flattened to 1D
    cv: int or cross-validation splitter
        passed straight to cross_val_score

    Returns
    -------
    r2, SEM_r2, RMSE, SEM_RMSE, each rounded to 2 decimals
    """
    target = np.ravel(y)

    def _mean_and_sem(fold_scores):
        # standard error of the mean over the folds actually evaluated
        # (previously hard-coded to 5 folds regardless of `cv`)
        mean = round(float(fold_scores.mean()), 2)
        sem = round(float(fold_scores.std() / math.sqrt(len(fold_scores))), 2)
        return mean, sem

    score = cross_val_score(model, X, target, cv=cv, scoring = 'r2')
    r2, SEM_r2 = _mean_and_sem(score)
    print('R^2: ' + str(r2) + ' +/- ' +  str(SEM_r2))

    # scikit-learn reports errors as negative scores; flip the sign back
    score = cross_val_score(model, X, target, cv=cv, scoring = 'neg_mean_absolute_error')
    MAE, SEM_MAE = _mean_and_sem(np.multiply(score, -1))
    print('MAE: ' + str(MAE) + ' +/- ' +  str(SEM_MAE))

    score = cross_val_score(model, X, target, cv=cv, scoring = 'neg_mean_squared_error')
    RMSE, SEM_RMSE = _mean_and_sem(np.sqrt(score * -1))
    print('RMSE: ' + str(RMSE) + ' +/- ' +  str(SEM_RMSE))

    return r2, SEM_r2, RMSE, SEM_RMSE

This is where we will train the model. The hyperparameter tuning was previously performed in a separate notebook.

In [None]:
# Base learners with previously tuned hyperparameters; both are seeded for reproducibility
rf_model =RandomForestRegressor(n_estimators=80,max_depth=8, min_samples_split=2, min_samples_leaf=2, max_features=57, random_state=20)
ann_model = MLPRegressor(hidden_layer_sizes = (50, 50, 50), learning_rate = 'adaptive', random_state=14, max_iter=500)

# The ensemble is a VotingRegressor over the two base learners, fitted on the training set
em_rf_ann = VotingRegressor([('rf', rf_model), ('ann', ann_model)])
em_rf_ann.fit(x_train, y_train.values.ravel())

# NOTE(review): model_metrics runs 5-fold cross-validation on whatever data it is
# given, so the 'test set' numbers come from models cross-validated within the
# test set, not from the ensemble fitted above - confirm this is the intended metric
print('training set')
R2_train, R2_SEM_train, RMSE_train, RMSE_SEM_train = model_metrics(em_rf_ann, x_train, y_train, cv=5)
print('test set')
R2_test, R2_SEM_test, RMSE_test, RMSE_SEM_test = model_metrics(em_rf_ann, x_test, y_test, cv=5)

# Predictions of the fitted ensemble, used for the parity plot
em_rf_ann_predictions_test = em_rf_ann.predict(x_test)
em_rf_ann_predictions_train = em_rf_ann.predict(x_train)

We can plot the predictions vs experimental:

In [None]:
# Parity plot: predicted vs experimental PCE for the training and test sets
fig, ax = plt.subplots(figsize = (5.5, 5))
ax.scatter(em_rf_ann_predictions_train, y_train, color = '#882255')
ax.scatter(em_rf_ann_predictions_test, y_test, color = '#44AA99')

# x=y line
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()])]  # max of both axes

# now plot both limits against each other
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)

ax.set_ylabel('Experimental PCE (%)', labelpad=10, weight='bold', size=16)
ax.set_xlabel('Predicted PCE (%)', labelpad=10, weight='bold', size=16)
ax.tick_params(axis = 'x', labelsize=14)
ax.tick_params(axis = 'y', labelsize=14)

# raw strings: '\p' is not a valid escape sequence in a regular string literal
text_train = 'Training Set\n$R^2$=' + str(R2_train) + r'$\pm$' + str(R2_SEM_train) + '\nRMSE=' + str(RMSE_train)+ r'$\pm$' + str(RMSE_SEM_train)
text_test = 'Test Set\n$R^2$=' + str(R2_test) + r'$\pm$' + str(R2_SEM_test) + '\nRMSE=' + str(RMSE_test)+ r'$\pm$' + str(RMSE_SEM_test)

# annotations are positioned in data coordinates
ax.text(15.4, 11.5, text_train, ha='left', size=13, color='#882255')
ax.text(15.4, 9.5, text_test, ha='left', size=13, color='#44AA99')
ax.set_aspect('equal')
plt.tight_layout()

plt.show()