In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Read data
# The train and test splits are two sheets of the same workbook. Passing a
# list to `sheet_name` returns a dict of DataFrames keyed by sheet name, so
# the file is opened and parsed once instead of once per sheet.
DATA_PATH = '10fold-data/homonuclear-159-fold6-train-test.xlsx'
sheets = pd.read_excel(DATA_PATH, sheet_name=['Train', 'Test'])
train_df = sheets['Train']
test_df = sheets['Test']

# Column 1 is used as the regression target and columns 2 onward as the
# feature matrix. Column 0 is not used (presumably an identifier - confirm).
y_train = train_df.iloc[:, 1].values
X_train = train_df.iloc[:, 2:]
y_test = test_df.iloc[:, 1].values
X_test = test_df.iloc[:, 2:]

In [None]:
# Random forest regression (RFR)
# Tune the number of trees with a 10-fold cross-validated grid search.
rf_model = RandomForestRegressor(random_state=2)
param_grid = {'n_estimators': [*range(200, 350, 10)]}
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

# Score the selected model on both splits: Pearson r and RMSE.
y_pred_train = best_rf_model.predict(X_train)
y_pred_test = best_rf_model.predict(X_test)
train_r = pearsonr(y_train, y_pred_train)[0]
test_r = pearsonr(y_test, y_pred_test)[0]
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("Random Forest (Best Model):")
print(f"  Train_r: {train_r:.2f}, Test_r: {test_r:.2f}")
print(f"  Train RMSE: {train_rmse:.2f}, Test RMSE: {test_rmse:.2f}")

# The legend entries carry the metrics so the figure stands on its own.
train_label = f"Training: r/{train_r:.2f}, RMSE/{train_rmse:.2f}"
test_label = f"Test: r/{test_r:.2f}, RMSE/{test_rmse:.2f}"

# Parity plot: predicted vs. experimental values, with a dashed y = x line
# drawn across the range of the training targets.
fig, ax = plt.subplots(figsize=(6, 6))
diagonal = [np.min(y_train), np.max(y_train)]
ax.plot(diagonal, diagonal, color='black', linestyle='--', linewidth=1)
ax.scatter(y_train, y_pred_train, color='#8DB8F1', label=train_label, s=30)
ax.scatter(y_test, y_pred_test, color='#F47575', label=test_label, s=30)
ax.set_xlabel(r'Experimental lg$(k_1)$', fontsize=16)
ax.set_ylabel(r'Predicted lg$(k_1)$', fontsize=16)
ax.tick_params(axis='both', labelsize=15)
ax.legend(fontsize=15)
plt.show()

In [None]:
# Gradient boosting regression (GBR)
# Grid-search the number of boosting stages and the learning rate with
# 10-fold cross-validation.
gbr_model = GradientBoostingRegressor(random_state=2)
param_grid = {
    'n_estimators': [*range(100, 200, 10)],
    'learning_rate': [0.08, 0.10, 0.12],
}
grid_search = GridSearchCV(
    estimator=gbr_model,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)
best_gbr_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

# Score the selected model on both splits: Pearson r and RMSE.
y_pred_train = best_gbr_model.predict(X_train)
y_pred_test = best_gbr_model.predict(X_test)
train_r = pearsonr(y_train, y_pred_train)[0]
test_r = pearsonr(y_test, y_pred_test)[0]
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("Gradient Boosting Regressor (Best Model):")
print(f"  Train_r: {train_r:.2f}, Test_r: {test_r:.2f}")
print(f"  Train RMSE: {train_rmse:.2f}, Test RMSE: {test_rmse:.2f}")

# The legend entries carry the metrics so the figure stands on its own.
train_label = f"Training: r/{train_r:.2f}, RMSE/{train_rmse:.2f}"
test_label = f"Test: r/{test_r:.2f}, RMSE/{test_rmse:.2f}"

# Parity plot: predicted vs. experimental values, with a dashed y = x line
# drawn across the range of the training targets.
fig, ax = plt.subplots(figsize=(6, 6))
diagonal = [np.min(y_train), np.max(y_train)]
ax.plot(diagonal, diagonal, color='black', linestyle='--', linewidth=1)
ax.scatter(y_train, y_pred_train, color='#8DB8F1', label=train_label, s=40)
ax.scatter(y_test, y_pred_test, color='#F47575', label=test_label, s=40)
ax.set_xlabel(r'Experimental lg$(k_1)$', fontsize=16)
ax.set_ylabel(r'Predicted lg$(k_1)$', fontsize=16)
ax.tick_params(axis='both', labelsize=15)
ax.legend(fontsize=15)
plt.show()

In [None]:
# Multiple linear regression (MLR)
# Standardise the features using statistics learned from the training split
# only, then apply the same transform to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ordinary least squares on the standardised features (no hyperparameters).
mlr_model = LinearRegression().fit(X_train_scaled, y_train)

# Score the fitted model on both splits: Pearson r and RMSE.
y_pred_train = mlr_model.predict(X_train_scaled)
y_pred_test = mlr_model.predict(X_test_scaled)
train_r = pearsonr(y_train, y_pred_train)[0]
test_r = pearsonr(y_test, y_pred_test)[0]
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("Multiple Linear Regression (Best Model):")
print(f"  Train_r: {train_r:.2f}, Test_r: {test_r:.2f}")
print(f"  Train RMSE: {train_rmse:.2f}, Test RMSE: {test_rmse:.2f}")

# The legend entries carry the metrics so the figure stands on its own.
train_label = f"Training: r/{train_r:.2f}, RMSE/{train_rmse:.2f}"
test_label = f"Test: r/{test_r:.2f}, RMSE/{test_rmse:.2f}"

# Parity plot: predicted vs. experimental values, with a dashed y = x line
# drawn across the range of the training targets.
fig, ax = plt.subplots(figsize=(6, 6))
diagonal = [np.min(y_train), np.max(y_train)]
ax.plot(diagonal, diagonal, color='black', linestyle='--', linewidth=1)
ax.scatter(y_train, y_pred_train, color='#8DB8F1', label=train_label, s=40)
ax.scatter(y_test, y_pred_test, color='#F47575', label=test_label, s=40)
ax.set_xlabel(r'Experimental lg$(k_1)$', fontsize=16)
ax.set_ylabel(r'Predicted lg$(k_1)$', fontsize=16)
ax.tick_params(axis='both', labelsize=15)
ax.legend(fontsize=15)
plt.show()

In [None]:
# SVR
# Support-vector regression is not scale-invariant, so the features are
# standardised before they reach the SVR. The scaler lives inside a Pipeline:
# GridSearchCV then re-fits it on the training part of every CV fold (the
# held-out fold never influences the scaling statistics), and the fitted
# pipeline applies the same scaling automatically whenever it predicts.
svr_model = SVR()
svr_pipeline = Pipeline([('scaler', StandardScaler()), ('svr', svr_model)])

# Parameters of a pipeline step are addressed as '<step name>__<parameter>'.
param_grid = {
    'svr__kernel': ['rbf', 'linear'],
    'svr__C': [0.5, 1.0, 1.5],
    'svr__gamma': ['scale', 'auto'],
    'svr__epsilon': [0.05, 0.1, 0.15, 0.20]
}
grid_search = GridSearchCV(estimator=svr_pipeline, param_grid=param_grid, cv=10, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
# best_svr_model is the whole fitted pipeline (scaler + SVR), so it must be
# given the raw, unscaled feature frames.
best_svr_model = grid_search.best_estimator_
# Drop the 'svr__' step prefix so the printout shows plain SVR parameter names.
best_svr_params = {name.split('__', 1)[-1]: value for name, value in grid_search.best_params_.items()}
print("Best hyperparameters:", best_svr_params)

# predict
y_pred_train = best_svr_model.predict(X_train)
y_pred_test = best_svr_model.predict(X_test)
# Pearson correlation (the p-value is discarded) and RMSE on both splits.
train_r, _ = pearsonr(y_train, y_pred_train)
test_r, _ = pearsonr(y_test, y_pred_test)
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("Support Vector Regression (Best Model):")
print(f"  Train_r: {train_r:.2f}, Test_r: {test_r:.2f}")
print(f"  Train RMSE: {train_rmse:.2f}, Test RMSE: {test_rmse:.2f}")
# The legend entries carry the metrics so the figure stands on its own.
train_label = f"Training: r/{train_r:.2f}, RMSE/{train_rmse:.2f}"
test_label = f"Test: r/{test_r:.2f}, RMSE/{test_rmse:.2f}"

# Parity plot: predicted vs. experimental values, with a dashed y = x line
# drawn across the range of the training targets.
plt.figure(figsize=(6, 6))
plt.plot([np.min(y_train), np.max(y_train)], [np.min(y_train), np.max(y_train)], color='black', linestyle='--', linewidth=1)
plt.scatter(y_train, y_pred_train, color='#8DB8F1', label=train_label, s=30)
plt.scatter(y_test, y_pred_test, color='#F47575', label=test_label, s=30)
plt.xlabel(r'Experimental lg$(k_1)$', fontsize=16)
plt.ylabel(r'Predicted lg$(k_1)$', fontsize=16)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize=15)
plt.show()