In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, RegressorMixin
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.patheffects as path_effects
import math
import random
from scipy import stats

In [None]:
# Load the raw table and give every column an explicit dtype.
file_path = 'your_data.csv'
data = pd.read_csv(file_path)

# Columns treated as categorical; every other column is parsed as a number.
categorical_columns = ['Formula', 'Crystal type', 'Inversion_Symmetry', 'Magnetic']
non_numeric = set(categorical_columns) | {'Formula'}
numerical_columns = [name for name in data.columns if name not in non_numeric]

data = data.astype({name: 'category' for name in categorical_columns})

# errors='coerce' turns unparseable entries into NaN instead of raising.
for name in numerical_columns:
    data[name] = pd.to_numeric(data[name], errors='coerce')

In [None]:
# Regression target and the model inputs (everything except the target and
# the 'Formula' identifier column).
target = 'HSE06_Band_Gap'
features = [name for name in data.columns if name not in (target, 'Formula')]

# Split the inputs into the categorical ones and the remaining numeric ones.
categorical_columns = ['Crystal type', 'Inversion_Symmetry', 'Magnetic']
numerical_columns = [name for name in features if name not in categorical_columns]

print("Categorical_columns：", categorical_columns)
print("Numerical_columns：", numerical_columns)

One-hot encoding

In [None]:
# Replace each categorical column with indicator columns; drop_first=True
# omits the first level of every column.
data_encoded = pd.get_dummies(data=data, columns=categorical_columns, drop_first=True)
encoded_preview = data_encoded.head()
print(encoded_preview)

In [None]:
# Rank the encoded columns by absolute correlation with the target and draw a
# heatmap of the mutual correlations of the strongest ones.
target = 'HSE06_Band_Gap'
features = [col for col in data_encoded.columns if col != target and col != 'Formula']

# Features first, target last, so the target takes part in the correlation matrix.
data_numeric = data_encoded[features + [target]]

corr_matrix = data_numeric.corr()

# Keep every column whose |correlation| with the target exceeds the threshold.
# The target column is ranked together with the features here.
threshold = 0.15
target_corr = corr_matrix[target].abs().sort_values(ascending=False)
top_features = target_corr[target_corr > threshold].index.tolist()

print(top_features)

plt.figure(figsize=(16, 14))
sns.set(style='white')

corr_subset = corr_matrix.loc[top_features, top_features]

# Upper-triangle mask; only takes effect if `mask=mask` is passed to sns.heatmap below.
mask = np.triu(np.ones_like(corr_subset, dtype=bool))

# White -> blue-violet colour ramp.
cmap = sns.blend_palette([(255/255, 255/255, 255/255), (70/255, 50/255, 180/255)], as_cmap=True)

heatmap = sns.heatmap(
    corr_subset,
    # annot=True,
    # fmt=".2f",
    cmap=cmap,
    linewidths=0.1,
    linecolor='white',
    # mask=mask,
    cbar_kws={"shrink": .8},
    square=True,
    annot_kws={"size": 12},
    vmin=-1, vmax=1
)

plt.title('Correlation Heatmap of Top Features', fontsize=28, fontweight='bold', pad=20)
plt.xticks(rotation=45, ha='right', fontsize=20)
plt.yticks(rotation=0, fontsize=20)

cbar = heatmap.collections[0].colorbar
cbar.ax.tick_params(labelsize=20)

# Outline any text drawn on the axes (e.g. cell annotations when annot=True)
# in white. Uses the `path_effects` alias imported at the top of the notebook;
# the name `mpl` is never imported, so `mpl.patheffects` raised NameError as
# soon as there was any text to style.
for text in heatmap.texts:
    text.set_path_effects([
        path_effects.Stroke(linewidth=1, foreground='white'),
        path_effects.Normal()
    ])

plt.tight_layout()
# To export the figure, call plt.savefig('correlation_heatmap.png', dpi=1200)
# here, before plt.show(); saving after show() typically writes an empty canvas.
plt.show()

In [None]:
# Standardize every input column; the target is kept unscaled.
feature_frame = data_numeric.drop(columns=[target])
scaler = StandardScaler()
X_scaled = scaler.fit(feature_frame).transform(feature_frame)
y = data_numeric[target]

In [None]:
# Univariate feature selection: keep the columns with the highest
# f_regression score against the target.
# The requested count is capped at the number of available columns so that a
# narrower dataset does not make SelectKBest reject (or warn about) k=50.
k_best = min(50, X_scaled.shape[1])
selector = SelectKBest(score_func=f_regression, k=k_best)
X_selected = selector.fit_transform(X_scaled, y)

# Map the boolean support mask back onto the column names of the scaled frame.
selected_feature_names = np.array(data_numeric.drop([target], axis=1).columns)[selector.get_support()]

print("Nums：", X_selected.shape[1])
print("Fets：", selected_feature_names.tolist())

In [None]:
# Tabulate the score of every selected feature, highest first.
support_mask = selector.get_support()
feature_scores = selector.scores_
feature_importance = pd.DataFrame(
    {
        'Feature': selected_feature_names,
        'Score': feature_scores[support_mask],
    }
)
ranked_importance = feature_importance.sort_values(by='Score', ascending=False)
print(ranked_importance)

In [None]:
# Project the selected features onto the principal components that together
# explain 99% of the variance.
pca = PCA(n_components=0.99, random_state=42)
X_pca = pca.fit_transform(X_selected)
n_components_kept = X_pca.shape[1]
print("PCA：", n_components_kept)

# Row positions are split alongside X and y so each sample can be traced back
# to its row in the encoded table.
indices = np.arange(data_encoded.shape[0])

split_parts = train_test_split(X_pca, y, indices, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test, train_indices, test_indices = split_parts

Autoencoder

In [None]:
class AdvancedAutoencoder(nn.Module):
    """Fully connected autoencoder with batch normalisation and dropout.

    The encoder maps ``input_dim -> 512 -> 256 -> encoding_dim`` and the
    decoder mirrors it back to ``input_dim``.

    Args:
        input_dim: Number of features per sample.
        encoding_dim: Size of the bottleneck representation.
        output_activation: Optional module applied to the decoder output.
            ``None`` (default) leaves the output linear, so real-valued
            inputs such as standardized or PCA-projected features (which
            contain negative values) can actually be reconstructed. Pass
            ``nn.Sigmoid()`` only for inputs scaled to ``[0, 1]``.
    """

    def __init__(self, input_dim, encoding_dim=20, output_activation=None):
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, encoding_dim),
            nn.BatchNorm1d(encoding_dim),
            nn.LeakyReLU()
        )

        # The final slot is always occupied (nn.Identity when no activation is
        # requested) so the Sequential keeps the same length and the same
        # parameter names in its state_dict either way.
        final_activation = output_activation if output_activation is not None else nn.Identity()

        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, input_dim),
            final_activation
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction (same shape as ``x``)."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


# Seed torch's global RNG before building the model so that weight
# initialisation, dropout masks and DataLoader shuffling are repeatable,
# matching the fixed random_state=42 used for the PCA and the data split.
torch.manual_seed(42)

# The autoencoder consumes the PCA-projected training matrix.
input_dim_auto = X_train.shape[1]
encoding_dim = 30  # bottleneck width; overrides the class default of 20
autoencoder = AdvancedAutoencoder(input_dim=input_dim_auto, encoding_dim=encoding_dim)

# Mean-squared reconstruction error, optimised with Adam plus a small L2 penalty.
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.0001, weight_decay=1e-6)


In [None]:
# Convert both splits to float32 tensors.
X_train_tensor, X_test_tensor = (
    torch.tensor(split, dtype=torch.float32) for split in (X_train, X_test)
)

# An autoencoder reconstructs its own input, so each dataset pairs a tensor
# with itself as (input, target).
train_dataset = TensorDataset(X_train_tensor, X_train_tensor)
test_dataset = TensorDataset(X_test_tensor, X_test_tensor)

# Only the training batches are shuffled.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Training budget and early-stopping tolerance (epochs without improvement).
num_epochs, patience = 200, 15

# Early-stopping bookkeeping.
best_val_loss = float('inf')
patience_counter = 0

# Per-epoch loss curves.
history_loss, history_val_loss = [], []

In [None]:
# Train the autoencoder with early stopping on the loss measured on
# `test_loader`, then restore the best weights seen.
# NOTE(review): early stopping is driven by `test_loader`, so that split
# influences model selection — consider a separate validation split.

# Snapshot of the best weights; stays None until the first improvement.
best_model_state = None

for epoch in range(num_epochs):
    # ---- training pass ----
    autoencoder.train()
    running_loss = 0.0
    # The batch variable is deliberately NOT called `data`: that name holds the
    # source DataFrame and would be silently overwritten by the loop.
    for batch, _ in train_loader:
        optimizer.zero_grad()
        outputs = autoencoder(batch)
        loss = criterion(outputs, batch)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size so the epoch
        # figure is a per-sample average even when the last batch is short.
        running_loss += loss.item() * batch.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)

    # ---- evaluation pass (no gradients, dropout/batch-norm in eval mode) ----
    autoencoder.eval()
    val_running_loss = 0.0
    with torch.no_grad():
        for batch, _ in test_loader:
            outputs = autoencoder(batch)
            loss = criterion(outputs, batch)
            val_running_loss += loss.item() * batch.size(0)
    val_epoch_loss = val_running_loss / len(test_loader.dataset)

    history_loss.append(epoch_loss)
    history_val_loss.append(val_epoch_loss)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Val Loss: {val_epoch_loss:.4f}')

    if val_epoch_loss < best_val_loss:
        best_val_loss = val_epoch_loss
        patience_counter = 0
        # state_dict() hands back references to the live tensors, which later
        # optimizer steps keep modifying in place. Clone them so the snapshot
        # really holds the weights of this epoch.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in autoencoder.state_dict().items()
        }
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Restore the best checkpoint. The guard covers the case where no epoch ever
# improved on the initial best_val_loss (e.g. the loss was NaN throughout).
if best_model_state is not None:
    autoencoder.load_state_dict(best_model_state)

In [None]:
# Run both splits through the encoder half only, in eval mode and without
# gradient tracking, and keep the bottleneck activations as NumPy arrays.
autoencoder.eval()
with torch.no_grad():
    X_train_encoded, X_test_encoded = (
        autoencoder.encoder(split_tensor).numpy()
        for split_tensor in (X_train_tensor, X_test_tensor)
    )


In [None]:
# Scatter the first two encoded dimensions of the training set, coloured by
# the training target values.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    x=X_train_encoded[:, 0],
    y=X_train_encoded[:, 1],
    hue=y_train,
    palette='viridis',
)
ax.set_title("Encoded Features of Training Set")
ax.set_xlabel("Encoded Feature 1")
ax.set_ylabel("Encoded Feature 2")
ax.legend()
plt.show()

In [None]:
# Decode the stored training encodings and report the mean squared
# difference from the original training inputs.
encoded_train_tensor = torch.tensor(X_train_encoded, dtype=torch.float32)
X_train_reconstructed = autoencoder.decoder(encoded_train_tensor).detach().numpy()

residual = X_train_tensor.numpy() - X_train_reconstructed
reconstruction_loss = np.mean(np.square(residual))
print(f"Training Reconstruction Loss: {reconstruction_loss:.4f}")

In [None]:
# Persist the encoded features and the targets for downstream modelling.
# The encoded matrices are NumPy arrays (they come from `.numpy()`), which
# have no `.to_csv` method, so they are wrapped in DataFrames with explicit
# column names first. The targets are already pandas objects.
encoded_columns = [f'encoded_{i}' for i in range(X_train_encoded.shape[1])]

pd.DataFrame(X_train_encoded, columns=encoded_columns).to_csv('Encoded_train_data.csv', index=False)
pd.DataFrame(X_test_encoded, columns=encoded_columns).to_csv('Encoded_test_data.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)