In [1]:
from graph import Graph
from part import Part

# Util libraries
import pickle
from typing import List, Set, Dict, Tuple

# ML libraries
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

In [2]:
with open('data/graphs.dat', 'rb') as file:
    all_graphs: List[Graph] = pickle.load(file)
    X_train, X_temp, y_train, y_temp = train_test_split(list(map(lambda g: g.get_parts(), all_graphs)), all_graphs, test_size=0.3, random_state=0)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)


print(X_train[0])
print(y_train[0])
print(len(y_train))
print(y_train[0].get_edges())

{Part(PartID=119, FamilyID=46), Part(PartID=38, FamilyID=3), Part(PartID=118, FamilyID=0), Part(PartID=83, FamilyID=2), Part(PartID=127, FamilyID=48), Part(PartID=686, FamilyID=8)}
<graph.Graph object at 0x14e235310>
7811
{Node(NodeID=0, Part=Part(PartID=118, FamilyID=0)): [Node(NodeID=2, Part=Part(PartID=119, FamilyID=46)), Node(NodeID=1, Part=Part(PartID=127, FamilyID=48)), Node(NodeID=3, Part=Part(PartID=38, FamilyID=3)), Node(NodeID=5, Part=Part(PartID=686, FamilyID=8)), Node(NodeID=4, Part=Part(PartID=83, FamilyID=2))], Node(NodeID=1, Part=Part(PartID=127, FamilyID=48)): [Node(NodeID=0, Part=Part(PartID=118, FamilyID=0))], Node(NodeID=2, Part=Part(PartID=119, FamilyID=46)): [Node(NodeID=0, Part=Part(PartID=118, FamilyID=0))], Node(NodeID=3, Part=Part(PartID=38, FamilyID=3)): [Node(NodeID=0, Part=Part(PartID=118, FamilyID=0))], Node(NodeID=4, Part=Part(PartID=83, FamilyID=2)): [Node(NodeID=0, Part=Part(PartID=118, FamilyID=0))], Node(NodeID=5, Part=Part(PartID=686, FamilyID=8)): [N

In [3]:
# Prepare the data for embedding:
# 1. Map parts to indices.
# 2. Create a training set for the embedding. Each training sample will consist of a unique part idx and a unique part neighbor idx

# returns a dictionary of all part ids in the training set and the size of the dictionary
# the value determines the index
def map_parts_to_index(X_train: List[Set[Part]]) -> (Dict[Part, int], int):
    parts_list = [part for parts in X_train for part in parts]
    parts_dict = {}
    for i, part in enumerate(parts_list):
        if part.get_part_id() not in parts_dict:
            parts_dict[part.get_part_id()] = i
    return parts_dict, len(parts_dict)

# returns a list of tuples. Each tuple contains an idx that represents a part and an idx that represents one of its neighbors
def create_embedding_training_set(X_train: List[Set[Part]], graphs: List[Graph]) -> List[Tuple[int, int]]:
    training_set = []
    mapped_parts, _ = map_parts_to_index(X_train)
    # iterate through each graph in the training set
    for graph in graphs:
        # iterate through all edges in a graph. One part is the key, its neighbors are the values
        for node in graph.get_edges():
            part = node.get_part()
            part_index = mapped_parts[part.get_part_id()]
            # iterate through all neighbors af a part. 
            for neighbor_node in graph.get_edges()[node]:
                neighbor_part = neighbor_node.get_part()
                neighbor_part_index = mapped_parts[neighbor_part.get_part_id()]
                training_set.append((part_index, neighbor_part_index))

    return training_set




In [4]:
# Create neural network to learn the embeddings

# Train on an efficient device if possible
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")


# Define the network architecture
class EmbeddingNetwork(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        super(EmbeddingNetwork, self).__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    
    def forward(self, x):
        embedded = self.embeddings(x)
        out = self.output_layer(embedded)
        # Note: No softmax here when using nn.CrossEntropyLoss
        return out
    

# Define the model
vocab_size = map_parts_to_index(X_train)[1] # Number of different parts in the training set
embedding_dim = 10 # Number of embedding dimension
model = EmbeddingNetwork(vocab_size, embedding_dim)
model.to(device)

print("Vocab size: ", vocab_size)
print("Embedding Dimensions: ", embedding_dim)

print("Model: ", model)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss() # Cross-entropy loss
optimizer = torch.optim.Adam(model.parameters()) # Example optimizer

# Train the model
train_dataset = create_embedding_training_set(X_train, y_train)
print("Size of training set: ", len(train_dataset))
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

num_epochs = 300
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}")
    for i, (parts, neighbors) in enumerate(train_loader):
        optimizer.zero_grad()
        parts = parts.to(device)
        neighbors = neighbors.to(device)
        output = model(parts)
        loss = criterion(output, neighbors)
        loss.backward()
        optimizer.step()

# Extract the embeddings from the model and save them in pytorch format
embeddings = model.embeddings.weight.data
detached_embeddings = embeddings.detach().cpu()
print(len(embeddings))
print(embeddings)
torch.save(detached_embeddings, './models/embeddings.pt')



Using mps device
Vocab size:  1079
Embedding Dimensions:  10
Model:  EmbeddingNetwork(
  (embeddings): Embedding(1079, 10)
  (output_layer): Linear(in_features=10, out_features=1079, bias=True)
)
Size of training set:  103332
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87


In [8]:
# Example code to load the embeddings in a new environment
loaded_weights = torch.load('./models/embeddings.pt')

# If you're using a specific device in your new model/environment, move the weights to that device
# For example, if you're using MPS in your new setup
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
loaded_weights = loaded_weights.to(device)
print("Example embedding for one part: ", loaded_weights[-1]) 
print("Embedding dimensions: ", len(loaded_weights[0]))
print("Vocabulary size :", len(loaded_weights))

Example embedding for one part:  tensor([ 3.3273, -3.5478, -2.4442,  4.1097,  3.7043,  0.4720,  3.2587,  3.2172,
         0.7442,  1.5733], device='mps:0')
Embedding dimensions:  10
Vocabulary size : 1079


In [6]:
# check equivalence implementation
part1 = Part(202, 2)
part2 = Part(202, 2)
part3 = Part(203, 3)
print(f"Part1: id={id(part1)}, hash={hash(part1)}")
print(f"Part2: id={id(part2)}, hash={hash(part2)}")

print(part1.equivalent(part2))

print(part1.get_part_id() == part2.get_part_id())  
print(part1.get_family_id() == part2.get_family_id()) 
print(Part(202, 2) == Part(202, 2)) 
print(Part(202, 2).__eq__(Part(202, 2)))#
print(part1 == part2)
print(part1 == part3)

Part1: id=5871143136, hash=-7479318313413528548
Part2: id=5871143568, hash=-7479318313413528548
True
True
True
False
NotImplemented
False
False
