### Data Collection
1. Electronic distribution of the elements of the periodic table
2. Simulations from AFLOW: components, stoichiometry, thermal conductivity

In [None]:
import pickle
import pandas as pd
import numpy as np

# Per-element electron data, later looked up by element symbol (`elec_data[col]`).
# NOTE(review): unpickling can execute arbitrary code - only load a file you trust.
with open("matrix_dic.pkl", mode="rb") as pickle_file:
    elec_data = pickle.load(pickle_file)

Unify the dictionary of DataFrames into a single DataFrame

In [None]:

# Load the AFLOW results. The loop below treats the unpickled object as a
# mapping of source key -> DataFrame.
# NOTE(review): read_pickle can execute arbitrary code - only load a file you trust.
data_aflow = pd.read_pickle("data_AFLOW.pkl")

# Columns kept from every AFLOW frame
AFLOW_COLUMNS = ['species', 'stoichiometry', 'agl_thermal_conductivity_300K']

# Unifying the DataFrames: one trimmed frame per key, tagged with where it came from.
# `.assign` returns a new frame, so the frames inside `data_aflow` are left untouched.
simulation_frames = [
    df[AFLOW_COLUMNS].assign(source=key)
    for key, df in data_aflow.items()
]

simulation_data = (
    pd.concat(simulation_frames, ignore_index=True)
    .rename(columns={'agl_thermal_conductivity_300K': 'thermal_conductivity'})
)

## Splitting the species and stoichiometry data
# Both columns are handled as comma-separated strings: spaces are removed and
# each entry is split into a list of string tokens.
# NOTE(review): assumes every entry is a string; non-string entries would become NaN - confirm.
simulation_data["species_list"] = simulation_data["species"].str.replace(" ","").str.split(",")
simulation_data["stoich_list"] = simulation_data["stoichiometry"].str.replace(" ","").str.split(",")


## Create the table for input from the simulated data

In [95]:
## Element symbols in order of first appearance across all simulated entries
unique_elements = simulation_data['species_list'].explode().unique().tolist()
unique_elements.append('Thermal_conductivity')  # kept in the list, but never written into the rows below

# Only real elements become composition columns. The thermal conductivity is
# left out, as in the original cell where that assignment was commented out;
# adding it would add one more column to the table built from df_composition.
element_columns = [elem for elem in unique_elements if elem != 'Thermal_conductivity']

## Build rows: one dict per simulated entry, one key per element
row = []

for species_list, stoich_list in zip(simulation_data['species_list'],
                                     simulation_data['stoich_list']):
    # Every element starts at 0 so species absent from this entry stay at 0
    row_dict = dict.fromkeys(element_columns, 0.0)

    # The stoichiometry tokens are strings (result of str.split); store them as
    # floats so each column has a single numeric dtype instead of mixing str and int.
    for element, compo in zip(species_list, stoich_list):
        row_dict[element] = float(compo)

    row.append(row_dict)

# Build the frame once from the collected records
df_composition = pd.DataFrame(row)

Combine the composition DataFrame with the electron-distribution matrix of each element to build the input for the VAE

In [110]:
# Pieces of the final table, collected in column order and concatenated once at the end
blocks = []
elec_col_names = []
n_rows = len(df_composition)

for col in df_composition.columns:
    # The composition column itself always goes in first
    blocks.append(df_composition[[col]])

    # Columns without an electron matrix are copied as is
    if col not in elec_data or elec_data[col] is None:
        continue

    # Flatten the element's matrix and scale it to unit L2 norm
    # (the small constant keeps an all-zero matrix from dividing by zero)
    flat = np.asarray(elec_data[col]).ravel()
    flat = flat / (np.linalg.norm(flat) + 1e-8)

    # Column names for this element's flattened matrix
    flat_cols = [f"{col}_m_{i}" for i in range(len(flat))]

    # Same flat vector repeated on every row of df_composition
    flat_block = pd.DataFrame(
        [flat for _ in range(n_rows)],
        columns=flat_cols,
        index=df_composition.index,
    )

    # Order per element: composition column (added above), then its matrix columns
    blocks.append(flat_block)
    elec_col_names.extend(flat_cols)

# Final dataframe: everything horizontally concatenated, in order
df_new = pd.concat(blocks, axis=1)

df_new.to_pickle("Input_Data_VAE.pkl")

VAEs for the composition and electronic structure

In [None]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from VAE_Model import VAE

## Training

In [None]:
def train_vae(X_np, latent_dim=16, beta=1.0, lr=1e-3, batch_size=128, epochs=200, device=None):
    """Train a VAE on the rows of ``X_np`` and return the trained model.

    Parameters
    ----------
    X_np : array-like, indexed as (n_samples, n_features)
        Training data; converted to a float32 tensor. ``X.shape[1]`` is used
        as the model's ``input_dim``.
    latent_dim : int
        Forwarded to ``VAE`` as ``latent_dim``.
    beta : float
        Forwarded to ``vae_loss`` as ``beta``.
    lr : float
        Learning rate of the Adam optimizer.
    batch_size : int
        Mini-batch size; batches are reshuffled every epoch.
    epochs : int
        Number of full passes over the data.
    device : str or torch.device, optional
        Where to train. Defaults to "cuda" when available, otherwise "cpu".

    Returns
    -------
    The trained model, still in training mode (call ``.eval()`` before inference).

    Notes
    -----
    NOTE(review): ``vae_loss`` is not defined or imported in the visible notebook
    cells; it must be in scope before this function is called. Presumably it
    lives next to ``VAE`` in ``VAE_Model`` - confirm and import it.
    """
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    X = torch.tensor(X_np, dtype=torch.float32)
    loader = DataLoader(TensorDataset(X), batch_size=batch_size, shuffle=True)
    n = len(loader.dataset)

    model = VAE(input_dim=X.shape[1], latent_dim=latent_dim).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    model.train()
    for epoch in range(1, epochs + 1):
        # Sample-weighted running sums, so the logged values are per-sample averages
        total, rtot, ktot = 0.0, 0.0, 0.0
        for (xb,) in loader:
            xb = xb.to(device)
            x_hat, mu, logvar, _ = model(xb)
            loss, recon, kl = vae_loss(xb, x_hat, mu, logvar, beta=beta)

            opt.zero_grad()
            loss.backward()
            opt.step()

            total += loss.item() * xb.size(0)
            rtot += recon.item() * xb.size(0)
            ktot += kl.item() * xb.size(0)

        if epoch % 20 == 0 or epoch == 1:
            print(f"Epoch {epoch:4d} | loss {total/n:.6f} | recon {rtot/n:.6f} | KL {ktot/n:.6f}")

    # Return only after ALL epochs have run. The original `return` sat inside the
    # epoch loop, so training stopped after the first epoch.
    return model
    

In [None]:
# Use every column of the combined table as VAE input, cast to float32
df_X = df_new
X_np = df_X.to_numpy().astype("float32")

# Train with the default hyper-parameters and keep only the weights on disk
model = train_vae(X_np)
torch.save(obj=model.state_dict(), f="vae_model_AFLOW.pth")

#### Saved model
Load the saved model

Find the input dimension

In [1]:
import torch

# NOTE(review): absolute, machine-specific path - the cell only runs where this file exists.
model_path = "/Users/linarojas/Desktop/Research/Papers/Combinatorial_Ternary/Hybrid-Experimental-Data-Driven-Workflow/Models/VAE/vae_model_AFLOW.pth"
state = torch.load(model_path, map_location="cpu")

# Print only the first entry of the state dict: the second dimension of its
# shape is the input width the network was trained with.
for name, weight in list(state.items())[:1]:
    print(name, weight.shape)

enc.0.weight torch.Size([256, 2204])


In [4]:
import sys

# Put the project root on the import path so the `Models.VAE.VAE_Model` import resolves.
# NOTE(review): absolute, machine-specific path.
project_root = "/Users/linarojas/Desktop/Research/Papers/Combinatorial_Ternary/Hybrid-Experimental-Data-Driven-Workflow"
sys.path.append(project_root)

In [8]:
import torch
from Models.VAE.VAE_Model import VAE

device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): absolute, machine-specific path - the cell only runs where this file exists.
model_path = "/Users/linarojas/Desktop/Research/Papers/Combinatorial_Ternary/Hybrid-Experimental-Data-Driven-Workflow/Models/VAE/vae_model_AFLOW.pth"

# The architecture must match the checkpoint; 2204 is the input width printed
# from the checkpoint's first weight (enc.0.weight).
model = VAE(
    input_dim = 2204,
    latent_dim = 16,
    hidden_dims = (256,128)
)

# Read the weights onto the CPU first so the file also opens on machines without a GPU
state = torch.load(model_path, map_location=torch.device("cpu"))

# Copy the trained weights into the model. The original cell loaded `state`
# but never applied it, so the model kept its random initial weights.
model.load_state_dict(state)

model.to(device)
model.eval()

VAE(
  (enc): Sequential(
    (0): Linear(in_features=2204, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
  )
  (mu): Linear(in_features=128, out_features=16, bias=True)
  (logvar): Linear(in_features=128, out_features=16, bias=True)
  (dec): Sequential(
    (0): Linear(in_features=16, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=2204, bias=True)
  )
)

Extract encoder/latent variables
1. Get the information
2. Load the data and extract the encoder and latent variable

In [None]:
# NOTE(review): the original cell iterated over a `dataloader` that no visible
# cell defines. It is built here from X_np (the matrix the VAE was trained on);
# confirm this is the data you want to encode.
# shuffle=False keeps the latent rows in the same order as the rows of X_np.
dataloader = DataLoader(
    TensorDataset(torch.tensor(X_np, dtype=torch.float32)),
    batch_size=128,
    shuffle=False,
)

latents = []

with torch.no_grad():
    # A TensorDataset loader yields 1-tuples, hence the unpacking
    for (x,) in dataloader:   # x indexed as (batch, input_dim)
        x = x.to(device)

        h = model.enc(x)
        mu = model.mu(h)        # posterior mean, indexed as (batch, latent_dim)

        latents.append(mu.cpu())

In [None]:
# Stack the per-batch latent means along the sample axis and convert to NumPy
stacked_latents = torch.cat(latents, dim=0)
Z = stacked_latents.numpy()
print(Z.shape)  # (N_samples, latent_dim)

Plot the latent space. Since the latent dimension is 16, I apply PCA to reduce it to 2 dimensions for visualization.

In [None]:
# `plt` was used here without ever being imported in the notebook, which raises
# NameError on a fresh kernel.
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the latent vectors onto their first two principal components
Z_pca = PCA(n_components=2).fit_transform(Z)

plt.figure(figsize=(6,5))
plt.scatter(Z_pca[:,0], Z_pca[:,1], s=10, alpha=0.7)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Latent space (PCA)")
plt.show()

## Latent space for regression

In [None]:
def encode_to_latent(model, X_np, device=None, use_mu=True):
    """Encode the rows of ``X_np`` into the VAE latent space.

    Parameters
    ----------
    model : module exposing ``enc``, ``mu``, ``logvar`` and ``reparameterize``
        The model is switched to eval mode.
    X_np : array-like, indexed as (n_samples, n_features)
        Inputs; converted to a float32 tensor.
    device : str or torch.device, optional
        When omitted, the device of the model's parameters is used, so the
        inputs always land where the model already is. When given, the model
        is moved to that device.
    use_mu : bool
        If True, return the posterior mean ``mu``; otherwise return
        ``model.reparameterize(mu, logvar)``.

    Returns
    -------
    numpy.ndarray
        One latent vector per input row.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device
        else:
            # Parameter-free model: fall back to the original default rule
            device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        # Keep model and inputs on the same device
        model.to(device)

    model.eval()
    X = torch.tensor(X_np, dtype=torch.float32).to(device)

    # No gradients are needed for encoding. Without this, the outputs require
    # grad and `.numpy()` raises a RuntimeError.
    with torch.no_grad():
        h = model.enc(X)
        mu = model.mu(h)
        if use_mu:
            return mu.cpu().numpy()

        logvar = model.logvar(h)
        z = model.reparameterize(mu, logvar)
        return z.cpu().numpy()

In [None]:
# Deterministic latent features (posterior means) for the regression step
Z = encode_to_latent(
    model,
    X_np,
    use_mu=True,
)