# Analysis of Feature Abstractions from Deep Learning Models  

This notebook investigates the feature abstractions captured in the penultimate layer of predictive deep learning models.  
By examining these intermediate representations, we explore how the models transform raw inputs into higher-level abstractions that ultimately support bandgap prediction.  


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


Model loading

In [None]:
# Restore the trained deep-learning model from its saved .keras file.
modelo = tf.keras.models.load_model(
    'Best_Models/July_21_211501_2024_0.019fgsm_0.0001lr_1200_0.0_32_4_hidden_600__b/July_21_211501_2024_0.019fgsm_0.0001lr_1200_0.0_32_4_hidden_600.keras'
)
# The model is only used for inference here, so its weights are frozen.
modelo.trainable = False
# Print the layer-by-layer summary of the loaded architecture.
modelo.summary()

The model is truncated at the `add_1` layer, just before the output layer, so that the cropped model returns the activations of the penultimate layer.

In [None]:
# Sub-model sharing the loaded model's inputs but ending at the 'add_1' layer,
# so that calling it yields that layer's activations instead of the final output.
modelCropped = tf.keras.models.Model(
    inputs=modelo.inputs,
    outputs=modelo.get_layer('add_1').output,
    name='Add',
)

Load the test-set features and target values, then compute the penultimate-layer activations for the test inputs.

In [None]:
# Read the saved test-set feature and target arrays.
xtest = np.load('data/patolli_generated_data/patolli_xtest.npy')
ytest = np.load('data/patolli_generated_data/patolli_ytest.npy')
print(xtest.shape, ytest.shape)

# Feed the slice at index 0 of the second axis of xtest through the cropped model
# to obtain the 'add_1' layer activations for every test sample.
Add_layer_output = modelCropped.predict(
    xtest[:, 0, :],
    batch_size=32,
    verbose=1,
)

Feature Abstraction Analysis from the Penultimate Layer

In [None]:

def pca_analysis_and_plot(data, n_components=10, max_plot_components=4):
    """
    Applies PCA to the data, prints explained variance ratios, and scatter-plots
    the leading principal components against each other.

    Parameters:
    - data: The input data for PCA (passed unchanged to ``PCA.fit_transform``).
    - n_components: The number of principal components to keep.
    - max_plot_components: Upper bound on how many leading components are plotted
      pairwise. Default is 4 (at most 6 scatter plots). Fewer are plotted when the
      PCA produced fewer components.

    Returns:
    - The PCA-transformed data as returned by ``PCA.fit_transform``.
    """
    pca = PCA(n_components=n_components)  # Reduce to n_components dimensions
    X_pca = pca.fit_transform(data)

    print("Explained variance ratio:", pca.explained_variance_ratio_)

    print("Cumulative explained variance:", np.cumsum(pca.explained_variance_ratio_))

    # Clamp to the number of components actually computed; a fixed count of 4
    # would index past the last column whenever fewer than 4 components exist.
    n_plotted = min(max_plot_components, X_pca.shape[1])

    for i in range(n_plotted):
        for j in range(i + 1, n_plotted):  # Ensure j > i to avoid repetition
            plt.figure(figsize=(8, 6))
            plt.scatter(X_pca[:, i], X_pca[:, j])
            plt.xlabel(f'Principal Component {i+1}')
            plt.ylabel(f'Principal Component {j+1}')
            plt.title(f'PC{i+1} vs PC{j+1}')
            plt.show()
    return X_pca

# Project the 'add_1' activations onto 120 principal components and report the result's shape.
x_pca = pca_analysis_and_plot(data=Add_layer_output, n_components=120)
print(x_pca.shape)

In [None]:
def plot_elbow_method(data, max_clusters=20):
    """
    Draws the Elbow Method curve: KMeans inertia (labelled WCSS) against the
    number of clusters, for every cluster count from 1 to ``max_clusters``.

    Parameters:
    - data: The input data for KMeans clustering.
    - max_clusters: The maximum number of clusters to test. Default is 20.
    """
    cluster_counts = range(1, max_clusters + 1)

    # One KMeans fit per candidate cluster count; keep only the fitted inertia.
    inertias = [
        KMeans(n_clusters=k, random_state=0).fit(data).inertia_
        for k in cluster_counts
    ]

    plt.figure(figsize=(8, 6))
    plt.plot(cluster_counts, inertias, marker='o')
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.xticks(np.arange(max_clusters + 1))
    plt.grid(False)
    plt.show()

# Elbow curve over 1..20 clusters on the PCA-reduced activations.
plot_elbow_method(data=x_pca, max_clusters=20)

In [None]:
from sklearn.metrics import silhouette_score

def plot_silhouette_scores(data, min_clusters=2, max_clusters=20):
    """
    Plots the silhouette score of a KMeans clustering for each cluster count
    between ``min_clusters`` and ``max_clusters`` (inclusive), to help pick the
    number of clusters.

    Parameters:
    - data: The input data for KMeans clustering.
    - min_clusters: The minimum number of clusters to test. Default is 2.
    - max_clusters: The maximum number of clusters to test. Default is 20.
    """
    cluster_counts = range(min_clusters, max_clusters + 1)

    scores = []
    for k in cluster_counts:
        # Fit once per candidate count and score the resulting label assignment.
        fitted = KMeans(n_clusters=k, random_state=0).fit(data)
        scores.append(silhouette_score(data, fitted.labels_))

    plt.figure(figsize=(8, 6))
    plt.plot(cluster_counts, scores, marker='o')
    plt.title('Silhouette Score')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.xticks(np.arange(max_clusters + 1))
    plt.grid(False)
    plt.show()

# Silhouette curve over 2..20 clusters on the PCA-reduced activations.
plot_silhouette_scores(data=x_pca, min_clusters=2, max_clusters=20)


In [None]:
def compute_gap_statistic(X, k_max, B=10, random_state=0):
    """
    Computes the gap statistic of KMeans clusterings for k = 1..k_max.

    For each k, the log of the KMeans inertia on ``X`` is compared with the mean
    log inertia obtained on ``B`` reference datasets that contain no cluster
    structure. The reference datasets are drawn uniformly inside the per-feature
    bounding box of ``X`` so that their dispersion is on the same scale as the
    data; sampling from the unit hypercube instead would make the comparison
    depend on the arbitrary scale and offset of ``X``.

    Parameters:
    - X: 2-D array-like of shape (n_samples, n_features) to cluster.
    - k_max: The largest number of clusters to evaluate.
    - B: Number of reference datasets to generate. Default is 10.
    - random_state: Seed for the reference-data generator, so that repeated runs
      give the same curve. Default is 0. Pass None for a fresh random draw.

    Returns:
    - gap: Array of length k_max with the gap value for each k.
    - ks: The ``range(1, k_max + 1)`` of evaluated cluster counts.
    """
    X = np.asarray(X)
    ks = range(1, k_max + 1)
    rng = np.random.default_rng(random_state)

    # Log dispersion of the real data for every candidate k.
    log_wcss = np.log(
        np.array([KMeans(n_clusters=k, random_state=0).fit(X).inertia_ for k in ks])
    )

    # Bounding box of the observed data, one (min, max) pair per feature.
    lower = X.min(axis=0)
    upper = X.max(axis=0)

    # Dispersion of each reference dataset for every candidate k.
    ref_wcss = np.zeros((B, k_max))
    for b in range(B):
        ref_data = rng.uniform(lower, upper, size=X.shape)
        for k in ks:
            kmeans = KMeans(n_clusters=k, random_state=42).fit(ref_data)
            ref_wcss[b, k - 1] = kmeans.inertia_

    # Gap(k) = mean over references of log(W_ref(k)) - log(W(k)).
    gap = np.mean(np.log(ref_wcss), axis=0) - log_wcss

    return gap, ks

# Evaluate the gap statistic for 1..30 clusters on the PCA-reduced activations.
max_clusters = 30
gap, ks = compute_gap_statistic(x_pca, max_clusters)

# Gap value against the number of clusters, with one x tick per integer.
plt.plot(ks, gap, marker='o')
plt.title('Gap Statistic vs. Number of Clusters')
plt.xlabel('Number of clusters')
plt.ylabel('Gap Statistic')
plt.xticks(np.arange(max_clusters + 1))
plt.grid(False)
plt.show()


In [None]:
def Kmeans_clustering(x,y,n_clusters=3):
    """
    Clusters ``x`` with KMeans and scatter-plots the first two columns coloured by cluster.

    Parameters:
    - x: 2-D data to cluster; its columns are labelled PC1, PC2, ... in order.
    - y: Not used by this function.
    - n_clusters: Number of KMeans clusters. Default is 3.

    Returns:
    - None. The function only produces a plot.
    """
    column_names = [f'PC{idx+1}' for idx in range(x.shape[-1])]
    reduced_data = pd.DataFrame(x, columns=column_names)

    model = KMeans(n_clusters=n_clusters,init='k-means++',tol=0.001, random_state=0,verbose=0)
    model.fit_predict(reduced_data)
    labels = model.labels_

    # Attach the cluster assignment only after fitting, so it is never used as a feature.
    reduced_data['Cluster'] = labels

    plt.figure(figsize=(10, 7))
    for cluster_id in np.unique(labels):
        members = reduced_data[reduced_data['Cluster'] == cluster_id]
        plt.scatter(members['PC1'], members['PC2'], label=f'Cluster {cluster_id}')

    plt.title('KMeans Clustering')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend()
    plt.show()

    return None

# Cluster the PCA-reduced activations into 7 groups and plot them on PC1/PC2.
Kmeans_clustering(x_pca, ytest, n_clusters=7)


In [None]:
from minisom import MiniSom
def train_and_plot_som(data, x_size=3, y_size=2, input_len=1200, sigma=0.5, learning_rate=0.001, 
                       random_seed=0, num_iterations=1000):
    """
    Trains a Self-Organizing Map (SOM) and plots the resulting clusters and centroids.

    Each sample is assigned to its winning neuron, and the neuron's grid position
    is flattened into a single cluster index. Samples and neuron weight vectors
    are then drawn using their first two components.

    Parameters:
    - data: The input data for SOM training (2-D, at least two columns).
    - x_size: Number of neurons in the x-direction of the SOM grid. Default is 3.
    - y_size: Number of neurons in the y-direction of the SOM grid. Default is 2.
    - input_len: Length of the input vector. Default is 1200.
    - sigma: Spread of the Gaussian function. Default is 0.5.
    - learning_rate: Learning rate of the SOM. Default is 0.001.
    - random_seed: Seed for random number generation. Default is 0.
    - num_iterations: Number of iterations for training. Default is 1000.
    """
    # Initialize and train the SOM
    som = MiniSom(x_size, y_size, input_len, sigma=sigma, learning_rate=learning_rate, random_seed=random_seed)
    som.random_weights_init(data)
    som.train_random(data, num_iterations, verbose=True)

    # Grid coordinates of each sample's winning neuron, flattened to one index per sample
    winner_coordinates = np.array([som.winner(x) for x in data]).T
    cluster_index = np.ravel_multi_index(winner_coordinates, (x_size, y_size))

    # Plotting the clusters
    plt.figure(figsize=(10, 7))
    for c in np.unique(cluster_index):
        in_cluster = cluster_index == c
        plt.scatter(data[in_cluster, 0], data[in_cluster, 1],
                    label=f'Cluster {c}')

    # Plotting centroids: flatten the neuron grid so that every weight vector is
    # drawn in a single scatter call, which gives exactly one 'Centroid' legend
    # entry instead of one per grid row.
    weights = som.get_weights()
    centroids = weights.reshape(-1, weights.shape[-1])
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x',
                s=100, linewidths=3, color='k', label='Centroid')

    plt.title('SOM Clusters and Centroids')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.legend()
    plt.grid(False)
    plt.show()


# Train a 2x2 SOM on the PCA-reduced activations; input_len follows the data's column count.
train_and_plot_som(
    x_pca,
    x_size=2,
    y_size=2,
    input_len=x_pca.shape[-1],
    sigma=0.05,
    learning_rate=0.1,
    random_seed=0,
    num_iterations=10000,
)
