# Training RF and Gradient Boosting models

In [None]:
import pandas as pd
import re

# Load the CSV file
file_path = "perovskite_database_query.csv"
data = pd.read_csv(file_path)

# Strip stray whitespace from the header names *before* selecting columns,
# so that a header such as " Cell_architecture" still matches the list below
# instead of being silently filtered out.
data.columns = data.columns.str.strip()

# Define the columns to keep
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture',
    'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients',
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients',
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness',
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list',
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list',
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence',
    # JV performance metrics (includes the target column)
    'JV_default_PCE', 'JV_default_Voc', 'JV_default_Jsc', 'JV_default_FF'
]

# Filter columns to keep only those that exist in the dataset.
# .copy() makes `data` an independent frame, so the column assignments made
# later in this script do not operate on a slice of the raw frame.
existing_columns = [col for col in columns_to_keep if col in data.columns]
data = data[existing_columns].copy()

# Function to clean molecule names
def clean_molecule_name(name):
    """Split a raw ion string into a list of molecule tokens.

    Every run of characters other than ASCII letters, digits, whitespace,
    '-', '(' and ')' is treated as a separator. Tokens that consist only of
    digits are discarded.

    Args:
        name: Raw ion string (must be a ``str``).

    Returns:
        List of the remaining tokens, in their original order.
    """
    separated = re.sub(r'[^a-zA-Z0-9\s\-()]+', ' ', name.strip())
    collapsed = re.sub(r'\s+', ' ', separated).strip()

    tokens = []
    for token in collapsed.split():
        # Purely numeric tokens are coefficients, not molecule names.
        looks_numeric = token.replace('.', '', 1).isdigit()
        if not looks_numeric:
            tokens.append(token)
    return tokens

# Function to clean and convert coefficients to floats
def clean_and_convert_coefficient(coefficient):
    """Parse one coefficient string into a float, falling back to 0.0.

    Commas are removed, then every character other than digits, '.', 'e',
    'E' and '-' is dropped before conversion.

    Args:
        coefficient: Raw coefficient text (must be a ``str``).

    Returns:
        The parsed float, or 0.0 when nothing numeric remains or the
        remaining text is not a valid float literal.
    """
    try:
        without_commas = coefficient.replace(',', '').strip()
        numeric_text = re.sub(r'[^0-9.eE-]', '', without_commas)
        if not numeric_text:
            return 0.0
        return float(numeric_text)
    except ValueError:
        return 0.0

# Function to normalize coefficients
def normalize_coefficients(cell):
    """Rescale the coefficients in one cell so that they sum to 1.

    The cell is split on ';' and '|', each piece is parsed as a float, and
    the values are divided by their total. The result is re-joined with ';'
    using three decimals (any '|' separators are therefore replaced by ';').

    Args:
        cell: A coefficient cell. Strings are parsed directly; other
            non-null scalars (e.g. a value pandas read as int/float) are
            converted with ``str`` first instead of raising ``TypeError``.

    Returns:
        The normalised ';'-joined string, or ``cell`` unchanged when it is
        null, contains a non-numeric piece, or its total is not positive.
    """
    if not pd.notna(cell):
        return cell

    # Numeric cells would make re.split raise TypeError; stringify them.
    text = cell if isinstance(cell, str) else str(cell)
    try:
        coefficients = [float(x.strip()) for x in re.split(r'[;|]', text) if x.strip()]
    except ValueError:
        return cell

    total_sum = sum(coefficients)
    if total_sum > 0:
        return ';'.join(f"{val / total_sum:.3f}" for val in coefficients)
    return cell

# Normalize coefficients in each column
coefficient_columns = [
    'Perovskite_composition_a_ions_coefficients',
    'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions_coefficients'
]

for col in coefficient_columns:
    data[col] = data[col].apply(normalize_coefficients)

# Collect the set of unique molecule tokens found in the a/b/c ion columns.
# Only the ion names are needed here; coefficients are parsed further below.
# NOTE(review): a missing ion cell is stringified to 'nan' by str(), which
# clean_molecule_name returns as the token 'nan' -- so such rows add a 'nan'
# molecule column. Confirm whether that is intended.
unique_molecules = set()
for index, row in data.iterrows():
    for column_group in ['a', 'b', 'c']:
        ions_column = f'Perovskite_composition_{column_group}_ions'
        unique_molecules.update(clean_molecule_name(str(row.get(ions_column, ""))))

# Create columns for each unique molecule and initialize to zero.
# Iterating in sorted order keeps the column order of the output file
# reproducible between runs (set iteration order of strings is not).
for molecule in sorted(unique_molecules):
    data[molecule] = 0.0

# Populate the molecule columns with coefficients
for index, row in data.iterrows():
    for column_group in ['a', 'b', 'c']:
        ions_column = f'Perovskite_composition_{column_group}_ions'
        coefficients_column = f'Perovskite_composition_{column_group}_ions_coefficients'
        ions = clean_molecule_name(str(row.get(ions_column, "")))
        coefficients = [clean_and_convert_coefficient(c) for c in str(row.get(coefficients_column, "")).split(';')]
        # Divide by the per-site total; fall back to 1 to avoid dividing by zero.
        coefficient_sum = sum(coefficients)
        total_coeff = coefficient_sum if coefficient_sum != 0 else 1
        # zip() stops at the shorter list, so unmatched ions/coefficients are ignored.
        for ion, coeff in zip(ions, coefficients):
            data.at[index, ion] += coeff / total_coeff

# Create a new column 'Layer Type' to indicate if the row is multilayered or single-layered
ion_columns = [
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients',
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients'
]

# A '|' in any ion or coefficient cell marks the row as multi-layered.
data['Layer Type'] = data.apply(
    lambda row: 'Multi-layered Perovskite' if any('|' in str(row[col]) for col in ion_columns) else 'Single-layered Perovskite',
    axis=1
)

# Add combined ions and coefficients columns (a, b, c joined with ',')
data['combined_ions'] = data.apply(
    lambda row: f"{row.get('Perovskite_composition_a_ions', '')},{row.get('Perovskite_composition_b_ions', '')},{row.get('Perovskite_composition_c_ions', '')}",
    axis=1
)

data['combined_coefficients'] = data.apply(
    lambda row: f"{row.get('Perovskite_composition_a_ions_coefficients', '')},{row.get('Perovskite_composition_b_ions_coefficients', '')},{row.get('Perovskite_composition_c_ions_coefficients', '')}",
    axis=1
)

# Convert combined columns to lists and clean coefficients
import re

def convert_to_list(entry):
    """Turn a delimited string into a list of trimmed, non-empty items.

    Args:
        entry: A string delimited by ';', '|' or ',', an existing list, or
            any other value.

    Returns:
        The split items for a string, the list itself for a list, and an
        empty list for anything else.
    """
    if isinstance(entry, list):
        return entry
    if not isinstance(entry, str):
        return []
    # ';', '|' and ',' are all treated as the same separator.
    pieces = re.split(r'[;|,]', entry)
    return [piece.strip() for piece in pieces if piece.strip()]

def safe_convert_to_float(entry):
    """Convert ``entry`` to a float, returning 0.0 when that is not possible.

    Args:
        entry: Any value; typically a numeric string.

    Returns:
        ``float(entry)``, or 0.0 if the conversion raises ``ValueError``
        (bad text) or ``TypeError`` (e.g. ``None`` or a list).
    """
    try:
        return float(entry)
    except (TypeError, ValueError):
        return 0.0

def generate_combined_sites(row):
    """Build the list of site labels ('a', 'b', 'c') for one row.

    For each site, the label is repeated ``max(number of ion tokens,
    number of ';'-separated coefficient pieces)`` times, where both counts
    are taken from the row's ``Perovskite_composition_<site>_ions`` and
    ``..._ions_coefficients`` cells.

    Args:
        row: A mapping/Series supporting ``.get`` for the six perovskite
            composition columns; missing columns count as empty strings.

    Returns:
        A flat list of site labels ordered a, then b, then c.
    """
    # NOTE(review): when a site's ion count and coefficient count differ, the
    # labels will not line up one-to-one with a flat coefficient list that
    # has one entry per coefficient -- confirm against how callers zip them.
    sites = []
    for site in ('a', 'b', 'c'):
        ions_col = f'Perovskite_composition_{site}_ions'
        coeff_col = f'{ions_col}_coefficients'
        num_ions = len(clean_molecule_name(str(row.get(ions_col, ""))))
        num_coefficients = len(str(row.get(coeff_col, "")).split(';'))
        sites.extend([site] * max(num_ions, num_coefficients))
    return sites

def _coefficients_to_floats(raw_entry):
    """Split a combined coefficient entry and convert each item to a float."""
    return [safe_convert_to_float(item) for item in convert_to_list(raw_entry)]


# Turn the comma-joined strings into lists, then label every entry with its site.
data['combined_ions'] = data['combined_ions'].apply(convert_to_list)
data['combined_coefficients'] = data['combined_coefficients'].apply(_coefficients_to_floats)
data['combined_sites'] = data.apply(generate_combined_sites, axis=1)

def clean_coefficients(coefficients):
    """Coerce every entry of ``coefficients`` to a float.

    Floats are kept as they are, strings made of digits with at most one
    '.' are parsed, and every other value becomes 0.0.

    Args:
        coefficients: Iterable of raw coefficient values.

    Returns:
        A new list of floats with the same length as the input.
    """
    def _as_float(value):
        if isinstance(value, float):
            return value
        if isinstance(value, str) and value.replace('.', '', 1).isdigit():
            return float(value)
        return 0.0

    return [_as_float(value) for value in coefficients]

def normalize_coefficients_within_cell(row):
    """Normalise a row's coefficients so each site (a, b, c) sums to 1.

    Coefficients are paired with site labels positionally (``zip``, so the
    shorter of the two lists decides how many pairs are used), grouped by
    site, and each group is divided by its own total.

    Args:
        row: A mapping/Series with 'combined_coefficients' (list of values)
            and 'combined_sites' (list of 'a'/'b'/'c' labels).

    Returns:
        A flat list of normalised floats ordered site a, then b, then c.
        A site whose total is not positive yields 0.0 for each of its
        entries. Entries with any other site label are dropped, and values
        that cannot be converted to float count as 0.0.
    """
    coefficients = row['combined_coefficients']
    sites = row['combined_sites']

    per_site = {'a': [], 'b': [], 'c': []}
    for coeff, site in zip(coefficients, sites):
        if site not in per_site:
            continue
        try:
            value = float(coeff)
        except (TypeError, ValueError):
            # None / malformed text contribute nothing to the site total.
            value = 0.0
        per_site[site].append(value)

    normalized_coeffs = []
    for site in ('a', 'b', 'c'):
        site_coeffs = per_site[site]
        total = sum(site_coeffs)
        normalized_coeffs.extend(
            coeff / total if total > 0 else 0.0 for coeff in site_coeffs
        )
    return normalized_coeffs

# Coerce every combined coefficient to a float, then renormalise them per
# site. The second step reads 'combined_sites', so it must run after that
# column has been created.
data['combined_coefficients'] = data['combined_coefficients'].apply(clean_coefficients)
data['combined_coefficients'] = data.apply(normalize_coefficients_within_cell, axis=1)

# Drop the original ion columns (errors='ignore' tolerates columns that are
# already absent).
data = data.drop(columns=ion_columns, errors='ignore')

# Save the modified DataFrame without the index column.
output_file_path = 'data_with_layer_type_and_combined.csv'
data.to_csv(output_file_path, index=False)
print("CSV file with layer type information modified and saved as:", output_file_path)


In [None]:
# Vectorised features: Word2Vec embeddings plus the molecule columns and 'JV_default_Voc', 'JV_default_Jsc', 'JV_default_FF'
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import csv
import warnings

# Suppress warnings for cleaner output.
# NOTE: called without a category or module filter, this ignores every
# warning category in this process from here on, not just a specific one.
warnings.filterwarnings("ignore")

# 1. Load and Filter Data
def load_and_filter_data(file_path):
    """
    Read the CSV at ``file_path`` and keep only the rows whose
    'Cell_architecture' equals 'nip' (ignoring case and surrounding spaces).

    Returns the filtered DataFrame with a fresh 0..n-1 index.
    """
    frame = pd.read_csv(file_path)
    architecture = frame['Cell_architecture'].str.strip().str.lower()
    is_nip = architecture == 'nip'
    return frame[is_nip].reset_index(drop=True)

# 2. Define Layer Columns
def define_layer_columns():
    """
    Return the mapping from stack-sequence column names to layer names,
    in the order the layers are parsed.
    """
    column_layer_pairs = (
        ('Cell_stack_sequence', 'Cell'),
        ('Substrate_stack_sequence', 'Substrate'),
        ('ETL_stack_sequence', 'ETL'),
        ('HTL_stack_sequence', 'HTL'),
        ('Backcontact_stack_sequence', 'Backcontact'),
        ('Add_lay_back_stack_sequence', 'Add_Lay_Back'),
        ('Encapsulation_stack_sequence', 'Encapsulation'),
    )
    return dict(column_layer_pairs)

# 3. Parse Sequences from Multiple Columns
def parse_sequences_from_columns(dataframe, layer_columns):
    """
    Build one material sequence per row from the layer-specific columns.

    Each cell is split into sub-layers on ' | ' and into materials on '; '.
    Null or blank cells (and columns absent from the row) are skipped.

    Returns a tuple ``(sequences, material_layer_map, layer_names)`` where
    ``sequences`` is a list of material lists (one per row),
    ``material_layer_map`` maps material -> {layer name: occurrence count},
    and ``layer_names`` lists the values of ``layer_columns``.
    """
    layer_names = list(layer_columns.values())
    sequences = []
    material_layer_map = {}

    for _, record in dataframe.iterrows():
        tokens = []
        for column, layer_name in layer_columns.items():
            raw = record.get(column, "")
            if pd.isna(raw) or not raw.strip():
                continue
            for sub_layer in raw.split(' | '):
                for piece in sub_layer.split('; '):
                    material = piece.strip()
                    if not material:
                        continue
                    tokens.append(material)
                    # Count how often this material occurs in this layer.
                    layer_counts = material_layer_map.setdefault(material, {})
                    layer_counts[layer_name] = layer_counts.get(layer_name, 0) + 1
        sequences.append(tokens)

    return sequences, material_layer_map, layer_names

# 4. Train Word2Vec Model
def train_word2vec(sequences, vector_size=50, window=5, min_count=1, workers=4, sg=1):
    """
    Fit and return a Word2Vec model on ``sequences``, forwarding the given
    hyperparameters unchanged to the Word2Vec constructor.
    """
    hyperparameters = {
        'vector_size': vector_size,
        'window': window,
        'min_count': min_count,
        'workers': workers,
        'sg': sg,
    }
    return Word2Vec(sentences=sequences, **hyperparameters)

# 5. Aggregate Embeddings for Each Sample
def aggregate_embeddings(sequences, model, vector_size=50):
    """
    Return one feature vector per sequence: the mean of the embeddings of
    its materials that are present in ``model.wv``.

    A sequence with no known materials (including an empty one) is
    represented by a zero vector of length ``vector_size``.
    """
    def _mean_vector(materials):
        known_vectors = [model.wv[name] for name in materials if name in model.wv]
        if not known_vectors:
            return np.zeros(vector_size)
        return np.mean(known_vectors, axis=0)

    return np.array([_mean_vector(materials) for materials in sequences])

# 6. Prepare Features and Targets
def prepare_features_targets(aggregated_features, dataframe, target_column='JV_default_PCE'):
    """
    Prepares the feature matrix and target vector for model training.

    Features are the aggregated embeddings followed by the listed molecule
    and JV columns that exist in ``dataframe`` (missing values -> 0.0).
    Rows whose target is NaN are dropped.

    Args:
        aggregated_features: 2-D array with one row per row of ``dataframe``.
        dataframe: Source frame holding the molecule/JV columns and target.
        target_column: Name of the column to predict.

    Returns:
        Tuple ``(X, y)`` of numpy arrays with matching first dimension.

    Raises:
        KeyError: if ``target_column`` is not in ``dataframe``.
    """
    # Molecule columns as a list. The last three entries are JV metrics that
    # are deliberately used as extra features.
    molecule_columns = [
        '(DAP)', '(PEI)', '(ThFA)', 'Sn', 'S', 'Tb', 'Sm', 'TN', '(PPA)', '(PDMA)', '(FEA)',
        '(PyrEA)', 'OA', '(PBA)', '(PTA)', '(CPEA)', '(TEA)', '(mF1PEA)', 'FA', '(BI)', 'IM',
        '(oF1PEA)', '(PA)', '(iPA)', 'Mg', 'Y', 'PR', '(PF6)', '(ODA)', 'F', 'BU', '(Ada)',
        'Ca', 'NEA', '(SCN)', '(N-EtPy)', 'HA', '(MIC1)', 'Br', '(AVA)', '((CH3)3S)', '(BIM)',
        'Mn', 'MA', '(4AMP)', '(A43)', '(CH3)3S', '(PPEA)', '(F5PEA)', '(C4H9N2H6)', '(5-AVAI)',
        'Sr', '(DMA)', 'CA', 'Al', '(NH4)', '(4AMPY)', 'PN', 'Sb', '(PDA)', '(ALA)', 'Nb', 'Te',
        'TA', '(MTEA)', '(Cl-PEA)', '(iso-BA)', '(DPA)', '(BYA)', 'DA', 'Bi', '(HTAB)', 'AN',
        'NMABr', '(CHMA)', '(F3EA)', 'In', '(6-ACA)', 'GU', '(ImEA)', '(HEA)', 'IA', 'Aa',
        '(APMim)', '(C8H17NH3)', '(Br-PEA)', 'PMA', '(MIC2)', '(PGA)', 'I', '(5-AVA)', '(PEA)',
        'K', '(BEA)', '(PMA)', 'Eu', 'Cl', '(3AMP)', '(F-PEA)', '(C6H4NH2)', '(CH3ND3)', '(4FPEA)',
        '(DAT)', '(Anyl)', '(TBA)', '(4ApyH)', 'Ba', '(pF1PEA)', '(TMA)', 'Rb', '(3AMPY)', '(IEA)',
        '(nan)', '(NMA)', 'Ni', '(pFPEA)', '(BE)', '(EU-pyP)', '(PyEA)', '(BzDA)', 'Co', '(Ace)',
        'Hg', 'Pb', '(EDA)', '(oFPEA)', 'Bn', '(f-PEA)', '(C4H9NH3)', '(CIEA)', '(mFPEA)', 'BA',
        'DI', '(HdA)', '(PDA)', '(GABA)', 'Cu', 'PA', '(DMA)', 'Na', '(EPA)', '(OdA)', '(THM)',
        'Ge', '(HDA)', '(BF4)', '(FPEA)', '(MIC3)', 'GA', '(ThMA)', 'Cs', '(BZA)', 'Au', '(H-PEA)',
        'Ag', '(SCN)', '(TFEA)', 'EA', 'FPEAI', 'Fe', '(n-C3H7NH3)', '(BdA)', '(EDA)', 'BDA', 'Cr',
        'Pt', 'Ti', '(C6H13NH3)', '(HAD)', 'Li', '(BDA)', 'O', 'La', 'Zn', 'JV_default_Voc', 'JV_default_Jsc','JV_default_FF'
    ]

    # The list above repeats a few names; selecting a column twice would
    # duplicate that feature, so keep the first occurrence only. The target
    # itself must never be used as a feature.
    molecule_columns = [
        col for col in dict.fromkeys(molecule_columns) if col != target_column
    ]

    # Keep only columns that exist in the dataframe
    existing_molecule_columns = [col for col in molecule_columns if col in dataframe.columns]
    missing_columns = [col for col in molecule_columns if col not in dataframe.columns]
    if missing_columns:
        print(f"The following specified molecule columns are not in the dataframe and will be skipped: {missing_columns}")

    # Extract the specified molecule columns and treat missing values as 0.0
    molecule_features = dataframe[existing_molecule_columns].fillna(0.0)

    # Combine embeddings and molecule features, clearing any remaining NaNs
    combined_features = np.hstack([aggregated_features, molecule_features.values])
    combined_features = np.nan_to_num(combined_features, nan=0.0)

    # Pair features and target by position rather than by index label, so a
    # dataframe whose index is not 0..n-1 cannot misalign the rows.
    target_values = dataframe[target_column].to_numpy()
    has_target = pd.notna(target_values)

    # Drop rows where target is NaN
    dropped = int((~has_target).sum())
    if dropped > 0:
        print(f"Dropped {dropped} samples due to NaN in target '{target_column}'.")

    X = combined_features[has_target]
    y = target_values[has_target]

    return X, y

# 7. Train and Evaluate Models
def train_evaluate_models_with_grid_search(X, y, cv=3):
    """
    Trains and evaluates models using GridSearchCV for hyperparameter tuning.

    A RandomForestRegressor and a GradientBoostingRegressor are each tuned
    with an R2-scored grid search.

    Args:
        X: Feature matrix.
        y: Target vector.
        cv: Cross-validation setting passed to GridSearchCV.

    Returns:
        List with one dict per model containing 'Model', 'Parameters'
        (best grid parameters), 'MAE', 'MSE', 'R2' and 'CV_R2'.
        'MAE'/'MSE'/'R2' are in-sample: the best estimator predicts the same
        X it was fitted on, so they overstate generalisation performance.
        'CV_R2' is the grid search's mean cross-validated R2 for the best
        parameters and is the figure to use when comparing models.
    """
    models = {
        'RandomForest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 150],
                'max_depth': [None, 10],
                'min_samples_split': [2, 4],
                'min_samples_leaf': [1, 2]
            }
        },
        'GradientBoosting': {
            'model': GradientBoostingRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 150],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 4],
                'min_samples_split': [2, 4]
            }
        }
    }
    results = []

    for model_name, config in models.items():
        print(f"Training {model_name}...")

        grid_search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            cv=cv,
            scoring='r2',
            n_jobs=-1,
            verbose=0
        )

        grid_search.fit(X, y)

        best_params = grid_search.best_params_
        best_estimator = grid_search.best_estimator_
        # Held-out estimate: mean R2 over the CV folds for the best parameters.
        cv_r2 = grid_search.best_score_

        # In-sample predictions (same data the estimator was fitted on).
        y_pred = best_estimator.predict(X)

        mae = mean_absolute_error(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)

        results.append({
            'Model': model_name,
            'Parameters': best_params,
            'MAE': mae,
            'MSE': mse,
            'R2': r2,
            'CV_R2': cv_r2
        })

        print(f"{model_name} Best Params: {best_params}")
        print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, R2: {r2:.4f}")
        print(f"Cross-validated R2: {cv_r2:.4f}")

    return results

# 8. Save Results
def save_results_to_csv(results, filename='model_results.csv'):
    """
    Write ``results`` (a list of dicts) to ``filename`` as CSV, using the
    keys of the first dict as the header. Nothing is written when
    ``results`` is empty.
    """
    if not results:
        print("No results to save.")
        return

    header = list(results[0])
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=header)
        writer.writeheader()
        writer.writerows(results)
    print(f"Results saved to {filename}")

# 9. Main Execution Function
def main():
    """Run the pipeline: load data, embed materials, train models, save results."""
    # Dataset written by the preprocessing step
    dataset_path = 'data_with_layer_type_and_combined.csv'

    frame = load_and_filter_data(dataset_path)
    print(f"Loaded data with {frame.shape[0]} samples.")

    # Tokenise the stack-sequence columns; only the sequences are used below.
    layer_map = define_layer_columns()
    tokenized_sequences, _material_layer_map, _layer_names = parse_sequences_from_columns(frame, layer_map)
    print("Parsed sequences from columns.")

    embedding_model = train_word2vec(tokenized_sequences)
    print("Trained Word2Vec model.")

    sample_embeddings = aggregate_embeddings(tokenized_sequences, embedding_model)
    print("Aggregated embeddings for each sample.")

    X, y = prepare_features_targets(sample_embeddings, frame)
    print(f"Prepared feature matrix with shape {X.shape} and target vector with shape {y.shape}.")

    # Tune, evaluate and persist the model results
    save_results_to_csv(train_evaluate_models_with_grid_search(X, y))


if __name__ == "__main__":
    main()
