In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from rdkit import Chem
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [2]:
# Load the raw table from the Excel workbook.
df = pd.read_excel('data/HTL_TABLE.xlsx')

# Drop incomplete rows *jointly*. Calling dropna() on each column separately
# removes different rows from each list whenever only one of the two values is
# missing, which silently shifts every later label onto the wrong molecule.
df_complete = df.dropna(subset=['SMILES', 'PCE (%)'])
smiles_list = df_complete['SMILES'].tolist()
labels = df_complete['PCE (%)'].tolist()

In [21]:
# Deduplicate molecules by canonical SMILES, keeping first-seen order.
# `labels` is assumed to be index-aligned with `smiles_list`; when the same
# canonical molecule occurs more than once, only the label of its first
# occurrence is kept.
unique_smiles = []
unique_labels = []
seen_canonical = set()  # O(1) membership test; the lists only preserve order
for i, smiles in enumerate(smiles_list):
    # Make sure SMILES is a string
    if not isinstance(smiles, str):
        continue
    molecule = Chem.MolFromSmiles(smiles)
    # Skip entries for which no molecule could be created
    if molecule is None:
        continue
    canonical_smiles = Chem.MolToSmiles(molecule, canonical=True)
    if canonical_smiles in seen_canonical:
        continue
    seen_canonical.add(canonical_smiles)
    unique_smiles.append(canonical_smiles)
    unique_labels.append(labels[i])

In [22]:
from deepchem.feat import RDKitDescriptors

# Compute one row of RDKit descriptors per deduplicated molecule.
featurizer = RDKitDescriptors()
features = featurizer.featurize(unique_smiles)
print(f"Number of generated molecular descriptors: {features.shape[1]}")

# Drop the features containing invalid values. A descriptor column is kept only
# if every entry is finite: checking for NaN alone would let +/-inf values
# through as if they were valid.
features = features[:, np.isfinite(features).all(axis=0)]
print(f"Number of molecular descriptors without invalid values: {features.shape[1]}")

Number of generated molecular descriptors: 209
Number of molecular descriptors without invalid values: 197


In [23]:
from sklearn.feature_selection import VarianceThreshold

# Fit the selector on the descriptor matrix, then keep only the columns it
# retains (threshold=0.0 targets the zero-variance descriptors).
selector = VarianceThreshold(threshold=0.0)
selector.fit(features)
features = selector.transform(features)
print(f"Number of molecular descriptors after removing zero-variance features: {features.shape[1]}")

Number of molecular descriptors after removing zero-variance features: 145


In [24]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X = features
X_train, X_test, y_train, y_test = train_test_split(
    X,
    unique_labels,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Baseline XGBoost regressor: 10 boosting rounds, fixed seed.
xgb_reg = xgb.XGBRegressor(
    n_estimators=10,
    random_state=0,
)

In [26]:
from sklearn.metrics import mean_squared_error, r2_score

def train_test_model(model, X_train, y_train, X_test, y_test):
    """
    Fit ``model`` on the training split and print RMSE and R^2 for both splits.

    Inputs:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: features and targets used for fitting.
        X_test, y_test: held-out features and targets used for evaluation.

    Returns nothing; the four metrics are printed with three decimals.
    """
    model.fit(X_train, y_train)

    # Score each split in turn (train first, then test).
    rmse = {}
    r2 = {}
    splits = (("train", X_train, y_train), ("test", X_test, y_test))
    for split_name, X_split, y_split in splits:
        y_hat = model.predict(X_split)
        # RMSE is the square root of the mean squared error
        rmse[split_name] = mean_squared_error(y_split, y_hat) ** 0.5
        r2[split_name] = r2_score(y_split, y_hat)

    print(f"RMSE on train set: {rmse['train']:.3f}, test set: {rmse['test']:.3f}.")
    print(f"R^2 on train set: {r2['train']:.3f}, test set: {r2['test']:.3f}.\n")


# Fit the baseline XGBoost regressor and report its train/test RMSE and R^2
print("Evaluating XGBoost model.")
train_test_model(
    model=xgb_reg,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)


Evaluating XGBoost model.
RMSE on train set: 1.429, test set: 3.442.
R^2 on train set: 0.890, test set: 0.215.



In [27]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter; the search evaluates every combination.
# Extend this mapping to tune further parameters.
param_grid = dict(
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 500, 1000],
    reg_lambda=[0.01, 0.1, 1],
)

# 5-fold cross-validation on the training split only, ranked by R^2.
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best-scoring parameter combination found by the search
best_params = grid_search.best_params_

print("Best parameters found: ", best_params)

Best parameters found:  {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500, 'reg_lambda': 1}


In [28]:
# Rebuild the regressor with the tuned hyperparameters and evaluate it on the
# held-out split. The grid search scored copies of a base estimator created
# with random_state=0, so the seed is set explicitly here as well; otherwise
# the final model is not the configuration that was actually selected.
# (best_params comes last so a tuned random_state, if ever added, would win.)
model = xgb.XGBRegressor(**{"random_state": 0, **best_params})
train_test_model(model, X_train, y_train, X_test, y_test)

RMSE on train set: 1.090, test set: 3.306.
R^2 on train set: 0.936, test set: 0.275.

