In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# Load dataset
df = pd.read_csv("../data/data_prediction_updated.csv")

# Drop unwanted columns
X = df.drop(columns=["perovskite_composition", "passivating_molecule", "passivating_molecule_cleaned", "passivating_molecule_SMILES", "treated_pce"])
y = df["treated_pce"]

# Handle missing values (if any)
X = X.fillna(X.mean())

# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model Initialization
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Neural Network": MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate models
print("Regression Results:")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"{name} RMSE (Treated PCE): {rmse:.4f}")

# Store predictions
predictions = pd.DataFrame({
    "Actual Treated PCE": y_test.values,
    "Random Forest Predictions": models["Random Forest"].predict(X_test),
    "Neural Network Predictions": models["Neural Network"].predict(X_test),
    "Gradient Boosting Predictions": models["Gradient Boosting"].predict(X_test)
})

# Save results to CSV
predictions.to_csv("model_predictions.csv", index=False)

print("Predictions saved to 'model_predictions.csv'.")


Regression Results:
Random Forest RMSE (Treated PCE): 2.1492
Neural Network RMSE (Treated PCE): 8.3017




Gradient Boosting RMSE (Treated PCE): 2.2734
Predictions saved to 'model_predictions.csv'.




In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from joblib import load

# Load trained models
rf_model = load("random_forest_model.joblib")
nn_model = load("neural_network_model.joblib")
gb_model = load("gradient_boosting_model.joblib")

# Load permutation dataset
permutation_df = pd.read_csv("permutation_df.csv")

# Drop non-numeric columns
X_perm = permutation_df.drop(columns=["passivating_molecule", "perovskite_composition"])

# Handle missing values (if any)
X_perm = X_perm.fillna(X_perm.mean())

# Standardize using the same scaler used during training
scaler = load("scaler.joblib")  # Load the scaler used in training
X_perm_scaled = scaler.transform(X_perm)

# Make predictions
predictions = pd.DataFrame({
    "passivating_molecule": permutation_df["passivating_molecule"],
    "perovskite_composition": permutation_df["perovskite_composition"],
    "Random Forest Predictions": rf_model.predict(X_perm_scaled),
    "Neural Network Predictions": nn_model.predict(X_perm_scaled),
    "Gradient Boosting Predictions": gb_model.predict(X_perm_scaled),
})

# Save results
predictions.to_csv("permutation_predictions.csv", index=False)

print("Predictions saved to 'permutation_predictions.csv'.")


FileNotFoundError: [Errno 2] No such file or directory: 'random_forest_model.joblib'