In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
df_expanded = pd.read_csv('../data/df_expanded.csv')

# Select relevant features and target variables
X = df_expanded[['perovskite_composition', 'passivating_molecule']]  # Features
y = df_expanded[['treated_pce', 'treated_voc']]  # Targets

# Drop rows with missing target variables (treated_pce, treated_voc)
df_expanded_clean = df_expanded.dropna(subset=['treated_pce', 'treated_voc'])

# Now you can proceed with the cleaned dataset
X = df_expanded_clean[['perovskite_composition', 'passivating_molecule']]
y = df_expanded_clean[['treated_pce', 'treated_voc']]

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: One-hot encode the 'passivating_molecule' and 'perovskite_composition'
preprocessor = ColumnTransformer(
    transformers=[
        ('molecule', OneHotEncoder(handle_unknown='ignore'), ['passivating_molecule']),  # Handle unknown molecules
        ('composition', OneHotEncoder(handle_unknown='ignore'), ['perovskite_composition'])  # Handle unknown compositions
    ]
)

# Build the pipeline with MLPRegressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42))  # Neural Network with 100 nodes in 1 hidden layer
])

# Train the model
model.fit(X_train, y_train)

# Evaluate the model (optional)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Squared Error: 61.41598403786389
R-squared: -68.50054675511504


In [4]:
# Load the composition and passivating molecule permutations CSV
df_permutations = pd.read_csv('../data/composition_passivator_permutations.csv')

# Select relevant features (first 10 rows) from the permutations CSV
X_permutations_first_10 = df_permutations[['perovskite_composition', 'passivating_molecule']].head(10)

# Make predictions on the first 10 permutations
y_permutations_pred_first_10 = model.predict(X_permutations_first_10)

# Convert predictions into a DataFrame for better visualization
predictions_first_10_df = pd.DataFrame(y_permutations_pred_first_10, columns=['predicted_treated_pce', 'predicted_treated_voc'])

# Concatenate the original permutations with the predictions
df_first_10_with_predictions = pd.concat([df_permutations.head(10), predictions_first_10_df], axis=1)

# Show the predictions
print(df_first_10_with_predictions)

# Optionally: Evaluate model performance on the original training/testing set
# Evaluate on the test set if you haven't already
y_pred = model.predict(X_test)  # Assuming X_test and y_test are already defined in previous steps
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f'Mean Squared Error (Test Set): {mse}')
print(f'R-squared (Test Set): {r2}')


   Unnamed: 0                   perovskite_composition  \
0           0                 FA0.83Cs0.17Pb0.5Sn0.5I3   
1           1                         Cs0.05FA0.95PbI3   
2           2                               CH3NH3PbI3   
3           3                         FA0.98Cs0.02PbI3   
4           4                formamidinium lead iodide   
5           5                           MA0.7FA0.3PbI3   
6           6    formamidinium lead triiodide (FAPbI3)   
7           7     Cs0.05(MA0.10FA0.85)Pb(I0.90Br0.10)3   
8           8  (FAPbI3)0.77 (MAPbBr3)0.14 (CsPbI3)0.09   
9           9                (FAPbI3)0.97(MAPbBr3)0.03   

                                passivating_molecule  predicted_treated_pce  \
0  2,2',6,6'-bis(4-methoxy-2,4,6-trimethylphenyl)...               8.434920   
1                                           PEA2ZnX4              16.873306   
2                         phenylethylammonium iodide              13.512776   
3                   m-TiO2, QD-SnO2, and paa-