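# Exported from a notebook cell (the original "In [None]:" marker is kept here as a comment so the file parses).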
import numpy as np
import time
import re
import argparse
import sys
import keras
from keras.preprocessing import sequence
from keras.models import Sequential, Model, load_model
from keras.optimizers import Adam
from keras.layers import Dense, Dropout, Activation, Input, Reshape, BatchNormalization
from keras.layers import (
    Conv1D,
    GlobalAveragePooling1D,
    MaxPooling1D,
    GlobalAveragePooling1D,
    Reshape,
    AveragePooling1D,
    Flatten,
    Concatenate,
)
from keras import backend
from keras.callbacks import TensorBoard, LearningRateScheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# Command-line interface.
# Parsing happens at import time; the resulting ``args`` namespace is a
# module-level global.
parser = argparse.ArgumentParser(description="ML framework")

parser.add_argument(
    "--data_dir",
    type=str,
    help="path to file containing DOS and targets",
)
parser.add_argument(
    "--run_mode",
    default=0,
    type=int,
    help="run regular (0) or 5-fold CV (1) (default: 0)",
)
parser.add_argument(
    "--split_ratio", default=0.2, type=float, help="train/test ratio (default:0.2)"
)
parser.add_argument(
    "--epochs", default=60, type=int, help="number of total epochs to run (default:60)"
)
parser.add_argument(
    "--batch_size", default=32, type=int, help="batch size (default:32)"
)
parser.add_argument(
    "--orbitals", default=9, type=int, help="number of orbitals"
)
parser.add_argument(
    "--seed",
    default=0,
    type=int,
    help="seed for data split(epochs), 0=random (default:0)",
)
parser.add_argument(
    "--save_model",
    default=0,
    type=int,
    help="save model (1) or not (0) (default: 0)",
)
parser.add_argument(
    "--load_model",
    default=0,
    type=int,
    # The previous text read "(0)or not (1)", which contradicted both the
    # default of 0 and the 1-means-yes convention used by --save_model.
    help="load pre-trained model (1) or not (0) (default: 0)",
)
args = parser.parse_args(sys.argv[1:])
# One "<key: key: [v1, v2, ...]" entry inside a stringified site-DOS cell.
_DOS_ENTRY_PATTERN = re.compile(r"<(.*?): (.*?): \[(.*?)\]")


def _parse_dos_string(input_str):
    """Parse one stringified site-DOS cell into ``{orbital: {spin: [floats]}}``.

    Every ``<A: B: [v1, v2, ...]`` match of the pattern contributes one value
    list, keyed first by the text captured for ``A`` and then by the text
    captured for ``B`` (each reduced to the part after its first ``": "`` when
    it contains one).  Non-string cells and empty value lists yield no values
    instead of raising.
    """
    if not isinstance(input_str, str):
        # A missing cell is read back by pandas as NaN (a float), not a str.
        return {}

    parsed = {}
    for orbital_raw, spin_raw, values_raw in _DOS_ENTRY_PATTERN.findall(input_str):
        orbital = orbital_raw.split(": ")[1] if ": " in orbital_raw else orbital_raw
        spin = spin_raw.split(": ")[1] if ": " in spin_raw else spin_raw
        # Skip empty tokens so that "[]" parses to an empty list.
        values = [float(tok) for tok in values_raw.split(",") if tok.strip()]
        parsed.setdefault(orbital, {})[spin] = values
    return parsed


def _dos_channels(parsed):
    """Flatten ``{orbital: {spin: values}}`` into a list of value lists."""
    return [values for spins in parsed.values() for values in spins.values()]


def _pad_dos(channel_lists, n_points, n_channels):
    """Zero-pad every sample to ``(n_points, n_channels)`` and stack them."""
    padded = np.zeros((len(channel_lists), n_points, n_channels), dtype=float)
    for i, channels in enumerate(channel_lists):
        for j, values in enumerate(channels):
            padded[i, : len(values), j] = values
    return padded


def load_data(
    data_dir=None,
    initial_target_col="initial_VFE",
    final_target_col="final_VFE",
):
    """Load the site-DOS features and their targets from a tabular file.

    Args:
        data_dir: Path of the data file.  Defaults to ``args.data_dir``
            (the ``--data_dir`` command-line option).
        initial_target_col: Column holding the target for the initial site.
        final_target_col: Column holding the target for the final site.
            NOTE(review): the previous code referenced undefined variables
            ``initial_VFE`` / ``final_VFE``; the real column names are not
            visible in this file, so these defaults must be confirmed.

    Returns:
        ``(X_dos, Y_vfedata)`` where ``X_dos`` has shape
        ``(2, n_samples, n_points, n_channels)`` and ``Y_vfedata`` has shape
        ``(2, n_samples)``.  Index 0 is the initial site, index 1 the final
        site.  Both sites are zero-padded to one common
        ``(n_points, n_channels)`` shape.

    Raises:
        ValueError: If no data path is available.
        KeyError: If a required column is missing from the file.
    """
    import pandas as pd  # local import: pandas is not among the file-level imports

    if data_dir is None:
        data_dir = args.data_dir
    if not data_dir:
        raise ValueError("no data file given; pass --data_dir or a data_dir argument")

    # NOTE(review): the on-disk format is not shown anywhere in this file;
    # CSV is assumed because the DOS cells are parsed from strings.
    df = pd.read_csv(data_dir)

    required = ("initial site dos", "final site dos", initial_target_col, final_target_col)
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"columns {missing} not found in {data_dir!r}; "
            f"available columns: {list(df.columns)}"
        )

    initial_channels = [
        _dos_channels(_parse_dos_string(cell)) for cell in df["initial site dos"]
    ]
    final_channels = [
        _dos_channels(_parse_dos_string(cell)) for cell in df["final site dos"]
    ]

    # Use one common padded shape for both sites.
    every_sample = initial_channels + final_channels
    n_channels = max((len(channels) for channels in every_sample), default=0)
    n_points = max(
        (len(values) for channels in every_sample for values in channels),
        default=0,
    )

    X_dos = np.stack(
        [
            _pad_dos(initial_channels, n_points, n_channels),
            _pad_dos(final_channels, n_points, n_channels),
        ]
    )
    Y_vfedata = np.stack(
        [
            df[initial_target_col].to_numpy(dtype=float),
            df[final_target_col].to_numpy(dtype=float),
        ]
    )
    return X_dos, Y_vfedata
    
def preprocess_data(X_dos, Y_vfedata, test_size=None, random_state=None):
    """Split the paired initial/final data into train/test sets and scale X.

    Args:
        X_dos: Indexable pair; ``X_dos[0]`` holds the initial-site inputs and
            ``X_dos[1]`` the final-site inputs.  Each is read as a 3-D array
            whose last axis (``shape[2]``) is the feature axis for scaling.
        Y_vfedata: Indexable pair of targets, ``[0]`` initial and ``[1]`` final.
        test_size: Fraction held out for testing.  Defaults to
            ``args.split_ratio`` (``--split_ratio``, 0.2 unless overridden).
        random_state: Seed for the split.  Defaults to ``args.seed``
            (``--seed``), where 0 means "do not seed".  The previous code
            ignored both options and always used 0.2 / 42.

    Returns:
        An 8-tuple: ``(X_initial_train, X_initial_test, Y_initial_train,
        Y_initial_test, X_final_train, X_final_test, Y_final_train,
        Y_final_test)``.
    """
    if test_size is None:
        test_size = args.split_ratio
    if random_state is None:
        # --seed 0 is documented as "random", i.e. leave the split unseeded.
        random_state = args.seed if args.seed != 0 else None

    # Split all four arrays in one call so that sample i of the initial set
    # stays paired with sample i of the final set, even with an unseeded
    # split.  (Two independent calls only line up when both use the same
    # fixed seed.)
    (
        X_initial_train, X_initial_test,
        X_final_train, X_final_test,
        Y_initial_train, Y_initial_test,
        Y_final_train, Y_final_test,
    ) = train_test_split(
        X_dos[0], X_dos[1], Y_vfedata[0], Y_vfedata[1],
        test_size=test_size, random_state=random_state,
    )

    def _scale(train, test):
        # Fit on the training portion only; apply the same statistics to test.
        scaler = StandardScaler()
        n_features = train.shape[2]
        train_scaled = scaler.fit_transform(
            train.reshape(-1, n_features)
        ).reshape(train.shape)
        test_scaled = scaler.transform(
            test.reshape(-1, n_features)
        ).reshape(test.shape)
        return train_scaled, test_scaled

    X_initial_train, X_initial_test = _scale(X_initial_train, X_initial_test)
    X_final_train, X_final_test = _scale(X_final_train, X_final_test)

    return (
        X_initial_train, X_initial_test, Y_initial_train, Y_initial_test,
        X_final_train, X_final_test, Y_final_train, Y_final_test
    )

def dos_featurizer(input_shape=(1573, 5447)):
    """Build the Conv1D feature extractor shared by both DOS inputs.

    Args:
        input_shape: Per-sample input shape passed to ``Input``.  The default
            is the value that used to be hard-coded here.

    Returns:
        A Keras ``Model`` with three Conv1D + AveragePooling1D stages followed
        by ``Flatten``, so its output is one flat vector per sample.
    """
    input_dos = Input(shape=input_shape)
    x = input_dos
    for n_filters in (32, 64, 128):
        x = Conv1D(filters=n_filters, kernel_size=5, padding="same", activation="relu")(x)
        x = AveragePooling1D(pool_size=2, strides=2, padding="same")(x)
    output = Flatten()(x)
    return Model(inputs=input_dos, outputs=output)


def build_model(input_shape=(1573, 5447)):
    """Build and compile the two-input, two-output regression model.

    Both inputs go through the same ``dos_featurizer`` instance (shared
    weights).  Their features are concatenated, passed through dropout, and
    fed to two linear heads named ``initial_energy`` and ``final_energy``.

    Args:
        input_shape: Per-sample shape of each DOS input.  The default is the
            value that used to be hard-coded here.

    Returns:
        The compiled Keras ``Model``.
    """
    input_initial_dos = Input(shape=input_shape)
    input_final_dos = Input(shape=input_shape)

    shared_conv = dos_featurizer(input_shape)
    initial_features = shared_conv(input_initial_dos)
    final_features = shared_conv(input_final_dos)

    # The featurizer already returns flat vectors, so no extra Flatten is
    # needed after concatenating them.
    merged_features = Concatenate(axis=-1)([initial_features, final_features])
    merged_features = Dropout(0.2)(merged_features)

    output_initial = Dense(1, activation="linear", name="initial_energy")(merged_features)
    output_final = Dense(1, activation="linear", name="final_energy")(merged_features)

    model = Model(
        inputs=[input_initial_dos, input_final_dos],
        outputs=[output_initial, output_final],
    )
    model.compile(
        optimizer=Adam(),
        loss="mean_squared_error",
        # Keyed by output name so that each head reports its own MAE.
        metrics={"initial_energy": ["mae"], "final_energy": ["mae"]},
    )
    return model


def _as_target_pair(targets):
    """Return ``targets`` as a two-element list, one entry per model output.

    A list/tuple of length two is taken to be ``[initial, final]`` already.
    Anything else is treated as one target array and duplicated for both
    outputs, which is what ``train_model`` always did before.
    """
    if isinstance(targets, (list, tuple)) and len(targets) == 2:
        return list(targets)
    return [targets, targets]


def train_model(model, X_train, Y_train, X_test, Y_test, epochs=10, batch_size=32):
    """Fit ``model`` and validate on the held-out data.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X_train: Training inputs, passed to ``fit`` unchanged.  For the model
            from ``build_model`` this is ``[X_initial, X_final]``.
        Y_train: Either ``[Y_initial, Y_final]`` or a single array that is
            used for both outputs.
        X_test: Validation inputs, same layout as ``X_train``.
        Y_test: Validation targets, same layout as ``Y_train``.
        epochs: Number of training epochs (default 10, as before).
        batch_size: Mini-batch size (default 32, as before).

    Returns:
        Whatever ``model.fit`` returns.
    """
    return model.fit(
        X_train,
        _as_target_pair(Y_train),
        validation_data=(X_test, _as_target_pair(Y_test)),
        epochs=epochs,
        batch_size=batch_size,
    )


def main():
    """Load the data, build the model, train it and report the elapsed time.

    ``--epochs`` and ``--batch_size`` are honoured here.  ``--run_mode``,
    ``--save_model``, ``--load_model`` and ``--orbitals`` are parsed but not
    acted on by this script.
    """
    start_time = time.time()

    X_dos, Y_vfedata = load_data()

    # preprocess_data returns eight arrays: train/test X and Y for the
    # initial site, then the same four for the final site.
    (
        X_initial_train, X_initial_test, Y_initial_train, Y_initial_test,
        X_final_train, X_final_test, Y_final_train, Y_final_test,
    ) = preprocess_data(X_dos, Y_vfedata)

    input_shape = tuple(X_initial_train.shape[1:])
    if tuple(X_final_train.shape[1:]) != input_shape:
        # The shared featurizer needs identically shaped inputs.
        raise ValueError(
            "initial and final DOS inputs must share one per-sample shape, got "
            f"{input_shape} and {tuple(X_final_train.shape[1:])}"
        )

    model = build_model(input_shape=input_shape)
    train_model(
        model,
        [X_initial_train, X_final_train],
        [Y_initial_train, Y_final_train],
        [X_initial_test, X_final_test],
        [Y_initial_test, Y_final_test],
        epochs=args.epochs,
        batch_size=args.batch_size,
    )

    print(f"--- {time.time() - start_time:.1f} seconds elapsed ---")

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()