This neural network takes in labelled embeddings and learns the relationship between an embedding and its FG label classification. Saliency maps are then used to study which parts of the embedding contribute to which FG label, and more.

Load the labelled embedding data

In [1]:
import pandas as pd

# --- Configuration ---------------------------------------------------------
# Number of leading columns of the table that are used as model input features
n_features = 5

# Fractions handed as test_size to the two train_test_split calls further down
traintest_fraction = 0.5  # should be large enough to have >1 members in each class
testval_fraction = 0.2

# --- Data ------------------------------------------------------------------
# Location of the labelled-embedding CSV, loaded below into a pandas DataFrame
embslabeled_datafilepath = '../data/embs/model1-10000/layer6/Oembs/embstransform.csv'
embslabeled_data = pd.read_csv(embslabeled_datafilepath, sep=',')

Visualize the distribution of the labels

In [2]:
import seaborn as sns
# Bar chart of how many rows carry each value of the 'ldalabel' column.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError for
# seaborn, so `sns` is never bound; seaborn must be installed for this plot
# (and for the later cell that reuses `sns`) to run.
sns.countplot(x='ldalabel', data=embslabeled_data)

ModuleNotFoundError: No module named 'seaborn'

Remove classes that have too few members (the code below drops every class with fewer than three rows; e.g. water, methanol, formaldehyde... in certain classification systems)

In [3]:
import numpy as np

#class_counts is a pandas Series: index = label value, value = number of rows
#carrying that label in the ldalabel column
class_counts = embslabeled_data['ldalabel'].value_counts()

#labels of the under-populated classes.
#NOTE: the threshold is "fewer than 3 members"; the variable name less_than_2
#is kept unchanged only so that anything referring to it keeps working
less_than_2 = class_counts[class_counts < 3].index
print(less_than_2)

#drop every row that belongs to one of those classes
#(boolean indexing with isin; ~ selects the rows NOT satisfying the condition).
#.copy() makes the result an independent DataFrame, so the column assignment
#further down no longer triggers pandas' SettingWithCopyWarning
embslabeled_data_proc = embslabeled_data[~embslabeled_data['ldalabel'].isin(less_than_2)].copy()

#the labels that remain, sorted
unique_labels = np.sort(embslabeled_data_proc['ldalabel'].unique())

#Removing classes can leave gaps in the label values, so build a dictionary
#that sends each remaining label to its position i in the sorted list,
#giving a gap-free 0..N-1 labelling
label_map = {label: i for i, label in enumerate(unique_labels)}

#apply the label map to the ldalabel column
embslabeled_data_proc['ldalabel'] = embslabeled_data_proc['ldalabel'].map(label_map)

#distribution of the re-indexed labels (needs `sns` from the earlier seaborn import)
sns.countplot(x='ldalabel', data=embslabeled_data_proc)


print('count_label_21', embslabeled_data_proc['ldalabel'].value_counts()[21])

Int64Index([6, 14, 0], dtype='int64')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


NameError: name 'sns' is not defined

Split data into training, validation, testing

In [4]:
from sklearn.model_selection import train_test_split

#The input features are the first n_features columns of the table
X = embslabeled_data_proc.iloc[:, 0:n_features]
#The target is the re-indexed class label. It is selected by column name rather
#than by a hard-coded position, so it is guaranteed to be the same column the
#class counts and the number of classes are derived from
y = embslabeled_data_proc['ldalabel']

#First split: hold out a traintest_fraction share of the rows, stratified by label.
#Second split: of that held-out part, a testval_fraction share becomes the
#validation set and the remainder stays as the test set (again stratified)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=traintest_fraction,stratify=y,random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test,y_test,test_size=testval_fraction,stratify=y_test,random_state=30)

Define hyperparameters

In [5]:
# Number of passes over the training loader in the training loop
EPOCHS = 100
# Batch size used by the train / validation / test DataLoaders
BATCH_SIZE = 100
# Learning rate handed to the Adam optimizer
LEARNING_RATE = 0.001
# Input width of the network: the number of feature columns selected for X
NUM_FEATURES = n_features
# Output width of the network: one score per label that survived the filtering
NUM_CLASSES = len(unique_labels)


Define the multiclass classification neural network model

In [6]:
import torch
import torch.nn as nn

class MulticlassClassification(nn.Module):
    """Fully connected classifier: num_feature -> 512 -> 128 -> 64 -> num_class.

    Every hidden layer is followed by batch normalisation and a ReLU.
    Dropout (p=0.2) is applied after the second and third hidden blocks only.
    The output layer returns raw class scores; no softmax is applied here.
    """

    def __init__(self, num_feature, num_class):
        super().__init__()

        # Linear layers, declared in network order
        self.layer_1 = nn.Linear(in_features=num_feature, out_features=512)
        self.layer_2 = nn.Linear(in_features=512, out_features=128)
        self.layer_3 = nn.Linear(in_features=128, out_features=64)
        self.layer_out = nn.Linear(in_features=64, out_features=num_class)

        # Shared activation / regularisation modules
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

        # One batch-norm per hidden layer, sized to that layer's output
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)

    def forward(self, x):
        """Map a (batch, num_feature) input to (batch, num_class) raw scores."""
        # Block 1: linear -> batch-norm -> ReLU (no dropout)
        hidden = self.relu(self.batchnorm1(self.layer_1(x)))
        # Blocks 2 and 3: linear -> batch-norm -> ReLU -> dropout
        hidden = self.dropout(self.relu(self.batchnorm2(self.layer_2(hidden))))
        hidden = self.dropout(self.relu(self.batchnorm3(self.layer_3(hidden))))
        # Output projection
        return self.layer_out(hidden)



Define the Dataset class which will be used conjointly with DataLoader to efficiently load batches

In [7]:
from torch.utils.data import Dataset, WeightedRandomSampler


#Define the dataset class that will initialize the X,y dataset 
#As either train, val, test_dataset
#This class is cool because it can be used conjugtly with DataLoader 
#To efficiently load batches
class ClassifierDataset(Dataset):
    """Dataset that pairs each feature row with the label at the same index.

    X_data and y_data are stored exactly as given and are indexed in parallel.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The sample count is taken from the feature container
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label


# Floating-point type used for every feature tensor
dtype = torch.float32


def _as_dataset(features_df, labels_series):
    """Wrap a pandas feature table and label column in a ClassifierDataset of tensors."""
    features = torch.from_numpy(features_df.values).type(dtype)
    labels = torch.from_numpy(labels_series.values).long()
    return ClassifierDataset(features, labels)


train_dataset = _as_dataset(X_train, y_train)
val_dataset = _as_dataset(X_val, y_val)
test_dataset = _as_dataset(X_test, y_test)



Important tangent: we need to define a weighted sampler that gives samples from smaller classes a higher sampling weight, so that those classes are better represented in the training batches

In [8]:
#Recount the members of every class after embslabeled_data has been processed
#as above. value_counts() orders its result by descending count, NOT by label,
#so sort_index() is required here: position i of the weight tensor must hold
#the count of label i, because the tensor is indexed by label below and is
#also passed to the loss as per-class weights.
class_counts = embslabeled_data_proc['ldalabel'].value_counts().sort_index()

#Indexing the weights by label is only valid for gap-free labels 0..N-1
assert list(class_counts.index) == list(range(len(class_counts))), \
    "labels must be re-indexed to 0..N-1 before building class weights"

#Targets of the training set, one integer label per training sample
target_list = torch.tensor([int(t) for _, t in train_dataset])

#Class weight = 1 / class count, so rarer classes get a larger weight
class_weights = 1./torch.tensor(class_counts.to_numpy(), dtype=torch.float)

#Per-sample weight: look up the weight of each training sample's class
class_weights_all = class_weights[target_list]


#Define the weighted random sampler: draws as many samples per epoch as the
#training set holds, with replacement, proportionally to the per-sample weights
weighted_sampler = WeightedRandomSampler(
    weights=class_weights_all,
    num_samples=len(class_weights_all),
    replacement=True
)




Define the train_loader, val_loader and test_loader which will load the data in batches

In [9]:
from torch.utils.data import DataLoader

# Training batches are drawn through the weighted sampler
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=weighted_sampler)

# Validation and test batches use the same batch size, with no sampler given
val_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE) for split in (val_dataset, test_dataset)
)


Define the multi-class accuracy function, which takes in a batch of predictions (y_pred) and the matching batch of true labels (y_test)

In [10]:
def multi_acc(y_pred, y_test):
    """Return the accuracy of a batch as a whole-number percentage (0-d tensor).

    y_pred holds one row of class scores per sample; y_test holds the
    matching integer class labels.
    """
    # Log-softmax over the class dimension; the predicted class of a sample
    # is the position of the largest entry in its row
    log_probs = torch.log_softmax(y_pred, dim=1)
    predicted = torch.max(log_probs, dim=1).indices

    # 1.0 where the prediction matches the label, 0.0 elsewhere
    hits = (predicted == y_test).float()
    fraction_correct = hits.sum() / len(hits)

    # Express as a percentage, rounded to an integer value
    return torch.round(fraction_correct * 100)

In [11]:

import torch.optim as optim
from tqdm.notebook import tqdm

#Training and validation both run on the CPU
device = torch.device("cpu")

#Build the network and place it on the device
model = MulticlassClassification(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
model.to(device)

#Class-weighted cross-entropy loss and Adam optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

#Per-epoch history: mean of the per-batch accuracy / loss values
accuracy_stats = {'train': [], "val": []}
loss_stats = {'train': [], "val": []}

print("Begin training.")
for epoch in tqdm(range(1,EPOCHS+1)):

    # ---------------- training phase ----------------
    train_epoch_loss = 0
    train_epoch_acc = 0

    model.train()
    for X_train_batch, y_train_batch in train_loader:
        #move the batch onto the device
        X_train_batch = X_train_batch.to(device)
        y_train_batch = y_train_batch.to(device)

        #clear gradients left over from the previous step
        optimizer.zero_grad()

        #forward pass, loss and batch accuracy
        y_train_pred = model(X_train_batch)
        train_loss = criterion(y_train_pred,y_train_batch)
        train_acc = multi_acc(y_train_pred,y_train_batch)

        #backward pass and parameter update
        train_loss.backward()
        optimizer.step()

        #accumulate the batch statistics for this epoch
        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()

    # ---------------- validation phase ----------------
    val_epoch_loss = 0
    val_epoch_acc = 0

    model.eval()
    #no gradients are tracked while validating
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch = X_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)

            y_val_pred = model(X_val_batch)

            val_loss = criterion(y_val_pred, y_val_batch)
            val_acc = multi_acc(y_val_pred, y_val_batch)

            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()

    # ---------------- bookkeeping ----------------
    #epoch value = accumulated batch values divided by the number of batches
    n_train_batches = len(train_loader)
    n_val_batches = len(val_loader)
    loss_stats['train'].append(train_epoch_loss/n_train_batches)
    loss_stats['val'].append(val_epoch_loss/n_val_batches)
    accuracy_stats['train'].append(train_epoch_acc/n_train_batches)
    accuracy_stats['val'].append(val_epoch_acc/n_val_batches)

    print(f'Epoch {epoch+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f} | Train Acc: {train_epoch_acc/len(train_loader):.3f}| Val Acc: {val_epoch_acc/len(val_loader):.3f}')


Begin training.


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 001: | Train Loss: 1.07663 | Val Loss: 1.21638 | Train Acc: 59.879| Val Acc: 27.857
Epoch 002: | Train Loss: 0.46322 | Val Loss: 0.77634 | Train Acc: 75.636| Val Acc: 30.500
Epoch 003: | Train Loss: 0.34105 | Val Loss: 0.67038 | Train Acc: 78.591| Val Acc: 32.643
Epoch 004: | Train Loss: 0.27112 | Val Loss: 0.65039 | Train Acc: 80.848| Val Acc: 32.357
Epoch 005: | Train Loss: 0.23500 | Val Loss: 0.59991 | Train Acc: 81.591| Val Acc: 33.071
Epoch 006: | Train Loss: 0.23294 | Val Loss: 0.60432 | Train Acc: 80.818| Val Acc: 34.857
Epoch 007: | Train Loss: 0.22059 | Val Loss: 0.65446 | Train Acc: 82.273| Val Acc: 33.714
Epoch 008: | Train Loss: 0.21346 | Val Loss: 0.55643 | Train Acc: 82.303| Val Acc: 35.286
Epoch 009: | Train Loss: 0.22095 | Val Loss: 0.50667 | Train Acc: 83.500| Val Acc: 38.500
Epoch 010: | Train Loss: 0.20592 | Val Loss: 0.51306 | Train Acc: 83.000| Val Acc: 45.000
Epoch 011: | Train Loss: 0.20583 | Val Loss: 0.50548 | Train Acc: 84.424| Val Acc: 45.357
Epoch 012:

Saliency Study

Prepare the data belonging to one class at a time and compute its saliency maps

In [12]:
import torch
import torch.nn as nn
from captum.attr import Saliency


# Mean saliency per class.
# Each row holds the n_features mean input gradients of one class, followed by
# three values read from columns 133, 134 and 135 of that class's first row.

# The model is put in evaluation mode once, and a single Saliency object is
# reused for every sample (neither depends on the class being processed).
model.eval()
saliency = Saliency(model)

# Row 0 is an all-zero placeholder. It is kept on purpose: a later cell
# removes the first row of means_saliency before the matrix is analysed.
class_rows = [np.zeros(n_features+3)]

for _class in range(NUM_CLASSES):

    # All rows whose label (column 133) equals the current class
    Xy_eval = embslabeled_data_proc[embslabeled_data_proc.iloc[:,133] == _class]
    X_eval = Xy_eval.iloc[:,0:n_features]
    y_eval = Xy_eval.iloc[:,133]

    # Per-class metadata, taken from the first row of the class
    color_y = Xy_eval.iloc[0,134]
    marker_y = Xy_eval.iloc[0,135]
    ldalabel_y = Xy_eval.iloc[0,133]

    eval_dataset = ClassifierDataset(torch.from_numpy(X_eval.values).type(dtype), torch.from_numpy(y_eval.values).long())
    eval_loader = DataLoader(dataset=eval_dataset, batch_size=1)

    # One saliency map per sample, collected in a list and stacked once
    per_sample_maps = []
    for X_eval_batch, _ in eval_loader:

        # Independent copy of the sample that tracks gradients
        # (torch.tensor(existing_tensor) is discouraged by PyTorch)
        input_data = X_eval_batch.clone().detach().requires_grad_(True)

        # Gradient of the current class's output with respect to the input.
        # Saliency computes this itself, so no separate loss / backward pass
        # is needed (it would only accumulate unused gradients).
        saliency_map = saliency.attribute(input_data,target=_class,abs=False)

        # float64 keeps the averaging precision of the original accumulation
        per_sample_maps.append(saliency_map.detach().cpu().numpy().astype(np.float64))

    saliency_maps = np.vstack(per_sample_maps)

    mean_saliency_forthis_class = np.mean(saliency_maps,axis=0)
    mean_saliency_forthis_class = np.hstack((mean_saliency_forthis_class,ldalabel_y,color_y,marker_y))
    class_rows.append(mean_saliency_forthis_class)

# Final table: placeholder row followed by one row per class
means_saliency = np.vstack(class_rows)



bef [ 0.08330302  1.23542741 -1.47281274  0.78042333 -0.22600995]
aft [ 8.33030236e-02  1.23542741e+00 -1.47281274e+00  7.80423329e-01
 -2.26009949e-01  0.00000000e+00  7.00000000e+00  1.67475200e+07]




bef [-0.18955577  0.57104083 -0.05912879  0.32964634  0.21362102]
aft [-1.89555774e-01  5.71040826e-01 -5.91287895e-02  3.29646336e-01
  2.13621018e-01  1.00000000e+00  1.10000000e+01  1.67769600e+07]




bef [-0.32842306 -0.03065422 -0.80085608 -1.39765748 -0.44264568]
aft [-3.28423058e-01 -3.06542243e-02 -8.00856075e-01 -1.39765748e+00
 -4.42645675e-01  2.00000000e+00  1.00000000e+00  6.73732200e+06]




bef [-0.59899834 -0.22641683 -0.8796634  -0.23121758 -0.91690324]
aft [-5.98998340e-01 -2.26416832e-01 -8.79663397e-01 -2.31217581e-01
 -9.16903236e-01  3.00000000e+00  1.50000000e+01  3.97809700e+06]




bef [-0.8116795  -0.79953939 -0.7884131   1.42098654 -1.12047175]
aft [-8.11679501e-01 -7.99539391e-01 -7.88413103e-01  1.42098654e+00
 -1.12047175e+00  4.00000000e+00  9.00000000e+00  1.00258800e+07]




bef [-0.03812138  0.10823903  0.09791816 -1.13576734 -1.08831313]
aft [-3.81213753e-02  1.08239030e-01  9.79181632e-02 -1.13576734e+00
 -1.08831313e+00  5.00000000e+00  2.00000000e+00  1.13932540e+07]
bef [-0.12185964  0.08073783  0.04723216 -0.07309917 -0.87333118]
aft [-1.21859635e-01  8.07378315e-02  4.72321616e-02 -7.30991728e-02
 -8.73331183e-01  6.00000000e+00  5.00000000e+00  6.26652800e+06]




bef [-0.12534745  0.07725732  0.14164998 -0.19544933 -1.14312897]
aft [-1.25347452e-01  7.72573151e-02  1.41649984e-01 -1.95449331e-01
 -1.14312897e+00  7.00000000e+00  1.30000000e+01  8.90034600e+06]
bef [ 0.79765427 -0.46972378  0.29647861  1.60795973 -1.47328702]
aft [ 7.97654271e-01 -4.69723775e-01  2.96478610e-01  1.60795973e+00
 -1.47328702e+00  8.00000000e+00  1.30000000e+01  4.28694500e+06]




bef [ 0.2935596  -0.76812825  0.19897345  2.51037422 -1.86712784]
aft [ 2.93559602e-01 -7.68128250e-01  1.98973453e-01  2.51037422e+00
 -1.86712784e+00  9.00000000e+00  3.00000000e+00  6.59198100e+06]




bef [-0.49645403 -0.96702043  0.51487412 -1.05049753  0.09753011]
aft [-4.96454031e-01 -9.67020428e-01  5.14874116e-01 -1.05049753e+00
  9.75301149e-02  1.00000000e+01  7.00000000e+00  1.57876600e+07]
bef [ 0.34824706 -0.25229758  0.77133481  0.26016119  1.86415666]
aft [ 3.48247061e-01 -2.52297583e-01  7.71334806e-01  2.60161188e-01
  1.86415666e+00  1.10000000e+01  5.00000000e+00  1.67667200e+07]




bef [ 0.08347426  2.25184652 -2.41087377 -2.0402628  -1.13587044]
aft [ 8.34742567e-02  2.25184652e+00 -2.41087377e+00 -2.04026280e+00
 -1.13587044e+00  1.20000000e+01  1.10000000e+01  1.67584650e+07]




bef [-1.05193841 -0.74484513 -0.2047871   0.06988304 -1.26961428]
aft [-1.05193841e+00 -7.44845132e-01 -2.04787099e-01  6.98830393e-02
 -1.26961428e+00  1.30000000e+01  1.00000000e+00  1.67702730e+07]




bef [ 0.4366788   1.16590284 -0.94284366  0.61007726 -0.61481461]
aft [ 4.36678803e-01  1.16590284e+00 -9.42843656e-01  6.10077260e-01
 -6.14814614e-01  1.40000000e+01  3.00000000e+00  1.67169470e+07]




bef [ 1.045968   -0.33225498  0.14204668 -2.05279416  0.52437029]
aft [ 1.04596800e+00 -3.32254978e-01  1.42046681e-01 -2.05279416e+00
  5.24370289e-01  1.50000000e+01  1.50000000e+01  1.67387400e+07]




bef [-1.33190516 -3.34033616 -1.14005482  2.92309501  0.56534286]
aft [-1.33190516e+00 -3.34033616e+00 -1.14005482e+00  2.92309501e+00
  5.65342860e-01  1.60000000e+01  3.00000000e+00  1.30471730e+07]




bef [ 1.44113251  0.07267799 -0.85621606  0.31304795 -0.69622087]
aft [ 1.44113251e+00  7.26779930e-02 -8.56216061e-01  3.13047953e-01
 -6.96220875e-01  1.70000000e+01  5.00000000e+00  1.45246370e+07]




bef [ 1.14233987 -0.43838522 -1.35263806  4.76788886 -0.66860901]
aft [ 1.14233987e+00 -4.38385219e-01 -1.35263806e+00  4.76788886e+00
 -6.68609007e-01  1.80000000e+01  5.00000000e+00  8.38873600e+06]
bef [ 1.36522333 -0.17242349  1.78657395 -4.23765283  1.83251629]
aft [ 1.36522333e+00 -1.72423489e-01  1.78657395e+00 -4.23765283e+00
  1.83251629e+00  1.90000000e+01  9.00000000e+00  1.43812030e+07]




bef [-2.00403701  0.72483358  2.90160164  1.17436398 -2.70977255]
aft [-2.00403701e+00  7.24833581e-01  2.90160164e+00  1.17436398e+00
 -2.70977255e+00  2.00000000e+01  7.00000000e+00  1.22116670e+07]
bef [-0.32583115 -1.59119911 -1.03237908  0.59798536  2.23468998]
aft [-3.25831149e-01 -1.59119911e+00 -1.03237908e+00  5.97985361e-01
  2.23468998e+00  2.10000000e+01  7.00000000e+00  1.08242340e+07]




In [15]:
# Write the per-class mean saliency table to a comma-separated file.
# NOTE(review): when the cells run in order, means_saliency still starts with
# its all-zero placeholder row here (that row is only removed in the next
# cell), so the saved file begins with a row of zeros — confirm this is intended.
np.savetxt('../data/saliency/model1qm9-0-10000Oembs/means_saliency_per_class.csv',means_saliency,delimiter=',')

In [16]:
# Drop the all-zero placeholder row that heads means_saliency.
# The check makes this cell safe to re-run: the first row is removed only
# while it is still entirely zero, so a second execution no longer deletes
# the row of a real class.
if means_saliency.shape[0] > 0 and not means_saliency[0].any():
    means_saliency = np.delete(means_saliency,0,0)


In [17]:

# Cosine similarity between the mean saliency vectors of every pair of classes.

# Only the first n_features columns are saliency values; the three trailing
# columns (label, colour, marker) are metadata and are excluded. Using
# n_features instead of a literal 5 keeps this in step with the feature count
# chosen at the top of the notebook.
saliency_vectors = means_saliency[:, 0:n_features]

# Create the inner product matrix
inner_product_matrix = np.dot(saliency_vectors, saliency_vectors.T)

# Calculate the magnitudes of the rows in the original matrix
row_magnitudes = np.linalg.norm(saliency_vectors, axis=1)


# Normalize the inner product matrix: entry (i, j) is divided by |row i| * |row j|
normalized_matrix = inner_product_matrix / (row_magnitudes[:, np.newaxis] * row_magnitudes)

print(normalized_matrix)

[[ 1.00000000e+00 -3.77282575e-01 -2.26902674e-01 -1.64474916e-02
  -4.77162838e-01 -2.24862826e-01 -2.93427389e-01 -1.28744838e-01
  -3.31091049e-02 -7.05672913e-01  1.52296594e-01  1.70658570e-01
  -3.51679702e-01  5.50290325e-01 -5.57675293e-01 -1.45237195e-01
  -1.70483873e-01  2.80714111e-01 -3.96362000e-01  1.28223818e-01
  -5.13152361e-02]
 [-3.77282575e-01  1.00000000e+00  6.66963807e-01 -1.46253106e-01
   7.43237981e-01  3.22404140e-01  3.51353402e-01 -5.11653340e-01
  -5.28086280e-01  4.42458262e-01 -5.40389519e-01  7.53749826e-01
   3.21213943e-01 -1.51506486e-03  5.38028871e-01 -3.52287558e-01
   2.57486721e-02 -6.44062642e-01  3.70055392e-01 -2.68593330e-01
  -1.64395828e-01]
 [-2.26902674e-01  6.66963807e-01  1.00000000e+00  6.26878963e-01
   5.14789514e-01  6.47763559e-01  6.04594432e-01  1.00767232e-01
   2.04318803e-01  9.40344539e-02 -8.67300867e-01  5.27508113e-01
   8.09718307e-01  2.82208360e-01 -1.98046386e-01  1.96708244e-01
   1.65385403e-01  1.55616856e-02 -4.1

In [18]:
# Append the last three columns of means_saliency to the normalized matrix,
# then write the combined table to a comma-separated file
class_metadata = means_saliency[:, -3:]
labeled_matrix = np.column_stack((normalized_matrix, class_metadata))

np.savetxt('../saliency_innerproduct.csv', labeled_matrix, delimiter=',')