In [None]:
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.cluster import KMeans
import torch
from torch import nn
from torch.optim import Adam
import networkx as nx
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.utils import from_networkx
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Define the autoencoder model
class Autoencoder(nn.Module):
    """Single-hidden-layer autoencoder.

    The encoder is ``Linear(input_dim, encoding_dim)`` followed by ReLU; the
    decoder is ``Linear(encoding_dim, input_dim)`` followed by Sigmoid, so
    every reconstructed value lies between 0 and 1.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        encoder_layers = [nn.Linear(input_dim, encoding_dim), nn.ReLU()]
        decoder_layers = [nn.Linear(encoding_dim, input_dim), nn.Sigmoid()]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decoder(self.encoder(x))

# Step 1: Collect the paths of every CSV file under 'data' (recursively).
# The paths are sorted because os.walk yields directory entries in an
# arbitrary, platform-dependent order; sorting keeps the processing order
# (and therefore the order of `results`) stable between runs.
data_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk('data')
    for file in files
    if file.endswith('.csv')
)

In [None]:
# Seed shared by every stochastic step (PCA solver, t-SNE, UMAP, weight
# initialisation, KMeans) so that re-running the cell gives the same output.
SEED = 42
# Upper bound on the number of KMeans clusters per embedding.
MAX_CLUSTERS = 10

# One dict per dataset: raw data, four embeddings and four cluster labelings.
results = []

for file in data_files:
    # Create a dictionary for each dataset
    result = {'dataset_name': file}

    # Load the dataset; the raw frame is kept alongside the derived results
    dataset = pd.read_csv(file)
    result['data'] = dataset

    # One-Hot Encoding for categorical features
    dataset = pd.get_dummies(dataset)

    # Handle missing values by filling them with the mean of each column
    dataset = dataset.fillna(dataset.mean())

    # Ensure all data is numeric
    dataset = dataset.apply(pd.to_numeric, errors='coerce')

    # Convert boolean columns to integers
    for col in dataset.select_dtypes(include='bool').columns:
        dataset[col] = dataset[col].astype(int)

    # Fill any remaining NaN values that could result from the conversion
    dataset = dataset.fillna(0)

    # PCA for numerical data
    pca = PCA(n_components=3, random_state=SEED)
    result['pca_embeddings'] = pca.fit_transform(dataset)

    # t-SNE for numerical data
    tsne = TSNE(n_components=3, random_state=SEED)
    result['t-SNE_embeddings'] = tsne.fit_transform(dataset)

    # UMAP for numerical data
    umap = UMAP(n_components=3, random_state=SEED)
    result['umap_embeddings'] = umap.fit_transform(dataset)

    # Define the size of the encoded representations
    encoding_dim = 32

    # Define the autoencoder model (seeded so the initial weights do not
    # depend on how many datasets were processed before this one)
    torch.manual_seed(SEED)
    autoencoder = Autoencoder(dataset.shape[1], encoding_dim)

    # Define the optimizer and loss function
    optimizer = Adam(autoencoder.parameters())
    criterion = nn.MSELoss()

    # Convert the dataset to PyTorch tensors
    dataset_torch = torch.tensor(dataset.values, dtype=torch.float32)

    # Normalize the data to be between 0 and 1 using the global min/max.
    # A constant dataset has zero range; dividing by it would fill the tensor
    # with NaN and break training and KMeans, so map it to zeros instead.
    data_min = dataset_torch.min()
    data_range = dataset_torch.max() - data_min
    if data_range > 0:
        dataset_torch = (dataset_torch - data_min) / data_range
    else:
        dataset_torch = torch.zeros_like(dataset_torch)

    # Train the autoencoder (full-batch, 50 optimisation steps)
    autoencoder.train()
    for epoch in range(50):
        optimizer.zero_grad()
        outputs = autoencoder(dataset_torch)
        loss = criterion(outputs, dataset_torch)
        loss.backward()
        optimizer.step()

    # Switch the model to evaluation mode
    autoencoder.eval()

    # Generate the embeddings
    with torch.no_grad():
        result['autoencoder_embeddings'] = autoencoder.encoder(dataset_torch).numpy()

    # Clusterize the embeddings. KMeans needs n_clusters <= n_samples, so the
    # cluster count is capped by the number of rows. fit_predict refits the
    # estimator on every call, so one instance can be reused.
    n_clusters = min(MAX_CLUSTERS, len(dataset))
    kmeans = KMeans(n_clusters=n_clusters, random_state=SEED)
    result['pca_cluster'] = kmeans.fit_predict(result['pca_embeddings'])
    result['t-SNE_cluster'] = kmeans.fit_predict(result['t-SNE_embeddings'])
    result['umap_cluster'] = kmeans.fit_predict(result['umap_embeddings'])
    result['autoencoder_cluster'] = kmeans.fit_predict(result['autoencoder_embeddings'])

    # Append the result to the results list
    results.append(result)


In [None]:
# Show a compact per-dataset summary instead of echoing every raw DataFrame
# and embedding array stored in `results`.
pd.DataFrame(
    [
        {
            'dataset_name': entry['dataset_name'],
            'n_rows': entry['data'].shape[0],
            'n_columns': entry['data'].shape[1],
        }
        for entry in results
    ]
)

In [None]:
# # Load the DAGs from txt files
# dag_files = []
# for root, dirs, files in os.walk('dags'):
#     for file in files:
#         if file.endswith('.txt'):
#             dag_files.append(os.path.join(root, file))
# 
# # Initialize the graph embeddings dictionary
# graph_embeddings = {}
# 
# for file in dag_files:
#     with open(file, 'r') as f:
#         edges = [tuple(line.strip().split()) for line in f]
# 
#     # Create a directed graph
#     G = nx.DiGraph()
#     G.add_edges_from(edges)
# 
#     # Convert node labels to integers
#     mapping = {node: idx for idx, node in enumerate(G.nodes())}
#     G = nx.relabel_nodes(G, mapping)
# 
#     # Generate embeddings using Node2Vec
#     node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)
#     model = node2vec.fit(window=10, min_count=1, batch_words=4)
# 
#     # Save the embeddings for each node
#     embeddings = {node: model.wv[str(node)] for node in G.nodes()}
#     graph_embeddings[file] = embeddings
# 
# # Now `graph_embeddings` contains the node embeddings for each DAG

In [None]:
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
import numpy as np
from sklearn.cluster import KMeans
import torch
from torch import nn
from torch.optim import Adam
import networkx as nx
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.utils import from_networkx
import plotly.express as px

# Define GCN model
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network.

    ``conv1`` maps the node features to ``hidden_dim`` channels and ``conv2``
    maps those to ``num_classes`` output channels.
    """

    def __init__(self, num_node_features, hidden_dim, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)

    def forward(self, data):
        """Apply conv1 -> ReLU -> dropout -> conv2 to ``data.x`` over ``data.edge_index``."""
        node_features, edge_index = data.x, data.edge_index
        hidden = F.relu(self.conv1(node_features, edge_index))
        # Dropout is only applied while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        return self.conv2(hidden, edge_index)

# Function to load graphs from txt files
def load_graphs_from_txt(directory):
    """Return the paths of all ``.txt`` files found under ``directory``.

    The directory tree is walked recursively; only the paths are collected,
    the files themselves are not opened.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.txt')
    ]

# Load graph file paths from the directory. Sorted because os.walk yields
# entries in a platform-dependent order, which would otherwise change the
# order of the stacked embeddings between machines.
graph_files = sorted(load_graphs_from_txt('graphs'))

# Seed for weight initialisation, dropout and KMeans so the cell is reproducible.
SEED = 42

# List of (file path, per-node embedding matrix) pairs, one entry per graph
graph_embeddings = []

for file in graph_files:
    # Each usable line is "<source> <target>". Blank lines and lines with
    # fewer than two tokens are skipped (they would make add_edges_from
    # raise); tokens after the second are ignored.
    edges = []
    with open(file, 'r') as f:
        for line in f:
            tokens = line.split()
            if len(tokens) >= 2:
                edges.append((tokens[0], tokens[1]))

    # Create a directed graph
    G = nx.DiGraph()
    G.add_edges_from(edges)

    # A file without any edge yields a graph without nodes; there is nothing
    # to embed, so skip it instead of failing inside the model.
    if G.number_of_nodes() == 0:
        print(f"Skipping {file}: no edges found")
        continue

    # Create a mapping from node labels to numeric indices
    mapping = {node: idx for idx, node in enumerate(G.nodes())}
    G = nx.relabel_nodes(G, mapping)

    # Add dummy node features.
    # NOTE(review): every node gets the same constant feature vector and the
    # same dummy label below, so the training objective carries no task
    # information -- confirm this is intended.
    for i in G.nodes:
        G.nodes[i]['feature'] = [1.0] * 10

    data = from_networkx(G)

    # Convert node features to tensor
    data.x = torch.tensor([G.nodes[i]['feature'] for i in G.nodes], dtype=torch.float)

    # Initialize model, optimizer, and loss function
    torch.manual_seed(SEED)
    model = GCN(num_node_features=10, hidden_dim=16, num_classes=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    # Dummy labels (class 0 for every node); built once, not once per epoch
    dummy_labels = torch.zeros(G.number_of_nodes(), dtype=torch.long)

    # Training loop
    model.train()
    for epoch in range(200):
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, dummy_labels)
        loss.backward()
        optimizer.step()

    # Get GCN embeddings: the (pre-activation) output of the first conv layer
    model.eval()
    with torch.no_grad():
        embeddings = model.conv1(data.x, data.edge_index).numpy()

    # Append embeddings to the list
    graph_embeddings.append((file, embeddings))

if not graph_embeddings:
    raise ValueError("No usable graph files were found under 'graphs'")

# Stack the per-node embeddings of all graphs for clustering and plotting
all_embeddings = np.vstack([emb for _, emb in graph_embeddings])
# KMeans requires n_clusters <= n_samples
n_clusters = min(3, len(all_embeddings))
kmeans = KMeans(n_clusters=n_clusters, random_state=SEED)
clusters = kmeans.fit_predict(all_embeddings)

# Create one "<file>-<node index>" label per stacked embedding row
embedding_labels = []
for file, embeddings in graph_embeddings:
    embedding_labels.extend(f"{file}-{j}" for j in range(embeddings.shape[0]))

# Plot embeddings with Plotly
def plot_3d_scatter(embeddings, clusters, labels, title):
    """Show an interactive 3-D scatter of the first three embedding columns.

    Points are coloured by ``clusters`` and annotated with ``labels``.
    """
    xs, ys, zs = embeddings[:, 0], embeddings[:, 1], embeddings[:, 2]
    scatter_kwargs = dict(x=xs, y=ys, z=zs, color=clusters, title=title, text=labels)
    px.scatter_3d(**scatter_kwargs).show()

# Render the stacked GCN embeddings, coloured by their KMeans cluster
plot_3d_scatter(
    embeddings=all_embeddings,
    clusters=clusters,
    labels=embedding_labels,
    title="GCN Embeddings for Graphs",
)

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.cluster import KMeans
import torch
from torch import nn
from torch.optim import Adam
import networkx as nx
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.utils import from_networkx
import plotly.express as px

# Define the autoencoder model
class Autoencoder(nn.Module):
    """Autoencoder with one linear layer on each side of the bottleneck.

    Encoder: Linear + ReLU down to ``encoding_dim``.
    Decoder: Linear + Sigmoid back up to ``input_dim``.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, encoding_dim), nn.ReLU())
        self.decoder = nn.Sequential(nn.Linear(encoding_dim, input_dim), nn.Sigmoid())

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        latent = self.encoder(x)
        return self.decoder(latent)

# Define GCN model
class GCN(nn.Module):
    """Graph convolutional network made of two ``GCNConv`` layers."""

    def __init__(self, num_node_features, hidden_dim, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)

    def forward(self, data):
        """Run ``data.x`` through conv1, ReLU, dropout and conv2 using ``data.edge_index``."""
        features, edge_index = data.x, data.edge_index
        activated = F.relu(self.conv1(features, edge_index))
        # Dropout follows the module's train/eval state.
        dropped = F.dropout(activated, training=self.training)
        return self.conv2(dropped, edge_index)

# Function to load graphs from txt files
def load_graphs_from_txt(directory):
    """Recursively collect the paths of the ``.txt`` files below ``directory``."""
    txt_paths = []
    for root, _dirs, names in os.walk(directory):
        txt_paths.extend(os.path.join(root, name) for name in names if name.endswith('.txt'))
    return txt_paths

# Collect the dataset paths. Sorted because os.walk yields entries in a
# platform-dependent order.
data_files = sorted(
    os.path.join(root, name)
    for root, dirs, files in os.walk('data/bnlearn_data')
    for name in files
    if name.endswith('.csv')
)

# Seed shared by every stochastic step so the cell is reproducible.
SEED = 42

# Initialize the lists (all three are filled once per dataset, in the same order)
data_embeddings = []   # (file, pca, tsne, umap, autoencoder) per-row embeddings
graph_embeddings = []  # per-node GCN embedding matrix of the matching graph
file_names = []

for file in data_files:
    # Load the dataset
    dataset = pd.read_csv(file)

    # One-Hot Encoding for categorical features
    dataset = pd.get_dummies(dataset)

    # Handle missing values by filling them with the mean of each column
    dataset = dataset.fillna(dataset.mean())

    # Ensure all data is numeric
    dataset = dataset.apply(pd.to_numeric, errors='coerce')

    # Convert boolean columns to integers
    for col in dataset.select_dtypes(include='bool').columns:
        dataset[col] = dataset[col].astype(int)

    # Fill any remaining NaN values that could result from the conversion
    dataset = dataset.fillna(0)

    # PCA for numerical data
    pca = PCA(n_components=3, random_state=SEED)
    pca_embeddings = pca.fit_transform(dataset)

    # t-SNE for numerical data
    tsne = TSNE(n_components=3, random_state=SEED)
    tsne_embeddings = tsne.fit_transform(dataset)

    # UMAP for numerical data
    umap = UMAP(n_components=3, random_state=SEED)
    umap_embeddings = umap.fit_transform(dataset)

    # Define the size of the encoded representations
    encoding_dim = 3

    # Define the autoencoder model
    torch.manual_seed(SEED)
    autoencoder = Autoencoder(dataset.shape[1], encoding_dim)

    # Define the optimizer and loss function
    optimizer = Adam(autoencoder.parameters())
    criterion = nn.MSELoss()

    # Convert the dataset to PyTorch tensors
    dataset_torch = torch.tensor(dataset.values, dtype=torch.float32)

    # Normalize the data to be between 0 and 1 using the global min/max.
    # A constant dataset has zero range; dividing by it would produce NaN,
    # so map it to zeros instead.
    data_min = dataset_torch.min()
    data_range = dataset_torch.max() - data_min
    if data_range > 0:
        dataset_torch = (dataset_torch - data_min) / data_range
    else:
        dataset_torch = torch.zeros_like(dataset_torch)

    # Train the autoencoder (full-batch, 50 optimisation steps)
    autoencoder.train()
    for epoch in range(50):
        optimizer.zero_grad()
        outputs = autoencoder(dataset_torch)
        loss = criterion(outputs, dataset_torch)
        loss.backward()
        optimizer.step()

    # Switch the model to evaluation mode
    autoencoder.eval()

    # Generate the embeddings
    with torch.no_grad():
        autoencoder_embeddings = autoencoder.encoder(dataset_torch).numpy()

    # Append the embeddings to the list
    data_embeddings.append((file, pca_embeddings, tsne_embeddings, umap_embeddings, autoencoder_embeddings))

    # Load the corresponding graph: same path with a .txt extension.
    # splitext only touches the extension, whereas str.replace would also
    # rewrite a '.csv' occurring in a directory name.
    graph_file = os.path.splitext(file)[0] + '.txt'
    edges = []
    with open(graph_file, 'r') as f:
        for line in f:
            # Each usable line is "<source> <target>"; blank or short lines
            # are skipped because they would make add_edges_from raise.
            tokens = line.split()
            if len(tokens) >= 2:
                edges.append((tokens[0], tokens[1]))

    # Create a directed graph
    G = nx.DiGraph()
    G.add_edges_from(edges)
    if G.number_of_nodes() == 0:
        raise ValueError(f"No edges found in graph file {graph_file}")

    # Create a mapping from node labels to numeric indices
    mapping = {node: idx for idx, node in enumerate(G.nodes())}
    G = nx.relabel_nodes(G, mapping)

    # Add dummy node features
    for i in G.nodes:
        G.nodes[i]['feature'] = [1.0] * 10

    data = from_networkx(G)

    # Convert node features to tensor
    data.x = torch.tensor([G.nodes[i]['feature'] for i in G.nodes], dtype=torch.float)

    # Initialize model, optimizer, and loss function
    torch.manual_seed(SEED)
    model = GCN(num_node_features=10, hidden_dim=16, num_classes=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    # Dummy labels (class 0 for every node); built once, not once per epoch
    dummy_labels = torch.zeros(G.number_of_nodes(), dtype=torch.long)

    # Training loop
    model.train()
    for epoch in range(200):
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, dummy_labels)
        loss.backward()
        optimizer.step()

    # Get GCN embeddings: the (pre-activation) output of the first conv layer
    model.eval()
    with torch.no_grad():
        gcn_embeddings = model.conv1(data.x, data.edge_index).numpy()

    # Append embeddings to the graph embeddings list
    graph_embeddings.append(gcn_embeddings)
    file_names.append(file)

if not graph_embeddings:
    raise ValueError("No CSV datasets were found under 'data/bnlearn_data'")

# Per-node embeddings of all graphs stacked into one matrix
all_graph_embeddings = np.vstack(graph_embeddings)

# Cluster the graphs, not the nodes: KMeans on the stacked per-node matrix
# returns one label per node, which cannot be indexed by file position.
# Mean-pool each graph's node embeddings into one vector per file so that
# graph_clusters[i] really belongs to file_names[i].
graph_level_embeddings = np.vstack([emb.mean(axis=0) for emb in graph_embeddings])
# KMeans requires n_clusters <= n_samples (here: number of graphs)
n_graph_clusters = min(10, len(graph_level_embeddings))
kmeans = KMeans(n_clusters=n_graph_clusters, random_state=SEED)
graph_clusters = kmeans.fit_predict(graph_level_embeddings)

# Create a mapping from file name to cluster
file_to_cluster = dict(zip(file_names, graph_clusters))

# Plot embeddings with Plotly
def plot_3d_scatter(embeddings, file, title):
    """Show a 3-D scatter of ``embeddings`` coloured by the cluster of ``file``.

    The cluster id is looked up in the module-level ``file_to_cluster``
    mapping and applied to every point of the plot.
    """
    point_colors = [file_to_cluster[file]] * len(embeddings)
    figure = px.scatter_3d(
        x=embeddings[:, 0],
        y=embeddings[:, 1],
        z=embeddings[:, 2],
        color=point_colors,
        title=title,
    )
    figure.show()

# Show one figure per embedding method for every dataset
for entry in data_embeddings:
    file = entry[0]
    titled_embeddings = (
        (entry[1], f"PCA Embeddings: {file}"),
        (entry[2], f"t-SNE Embeddings: {file}"),
        (entry[3], f"UMAP Embeddings: {file}"),
        (entry[4], f"Autoencoder Embeddings: {file}"),
    )
    for method_embeddings, plot_title in titled_embeddings:
        plot_3d_scatter(method_embeddings, file, plot_title)


In [17]:
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.cluster import KMeans
import torch
from torch import nn
from torch.optim import Adam
import networkx as nx
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.utils import from_networkx
import plotly.express as px

# Define the autoencoder model
class Autoencoder(nn.Module):
    """Shallow autoencoder: Linear+ReLU encoder, Linear+Sigmoid decoder."""

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        to_latent = nn.Linear(input_dim, encoding_dim)
        self.encoder = nn.Sequential(to_latent, nn.ReLU())
        from_latent = nn.Linear(encoding_dim, input_dim)
        self.decoder = nn.Sequential(from_latent, nn.Sigmoid())

    def forward(self, x):
        """Return the decoder output for the encoded ``x``."""
        return self.decoder(self.encoder(x))

# Define GCN model
class GCN(nn.Module):
    """Two stacked ``GCNConv`` layers with ReLU and dropout in between."""

    def __init__(self, num_node_features, hidden_dim, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)

    def forward(self, data):
        """Return conv2's output for the node features and edges held by ``data``."""
        x, edge_index = data.x, data.edge_index
        # First layer, non-linearity, then dropout (active only in training mode)
        x = F.dropout(F.relu(self.conv1(x, edge_index)), training=self.training)
        return self.conv2(x, edge_index)

# Function to load graphs from txt files
def load_graphs_from_txt(directory):
    """Walk ``directory`` recursively and return the path of every ``.txt`` file."""
    return [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(directory)
        for filename in filenames
        if filename.endswith('.txt')
    ]

# Collect the dataset paths. Sorted because os.walk yields entries in a
# platform-dependent order.
data_files = sorted(
    os.path.join(root, name)
    for root, dirs, files in os.walk('data/bnlearn_data')
    for name in files
    if name.endswith('.csv')
)

# Seed shared by every stochastic step so the cell is reproducible.
SEED = 42

# Initialize the lists (all three are filled once per dataset, in the same order)
data_embeddings = []   # (file, pca, tsne, umap, autoencoder) mean-pooled vectors
graph_embeddings = []  # mean-pooled GCN embedding of the matching graph
file_names = []

for file in data_files:
    # Load the dataset
    dataset = pd.read_csv(file)

    # One-Hot Encoding for categorical features
    dataset = pd.get_dummies(dataset)

    # Handle missing values by filling them with the mean of each column
    dataset = dataset.fillna(dataset.mean())

    # Ensure all data is numeric
    dataset = dataset.apply(pd.to_numeric, errors='coerce')

    # Convert boolean columns to integers
    for col in dataset.select_dtypes(include='bool').columns:
        dataset[col] = dataset[col].astype(int)

    # Fill any remaining NaN values that could result from the conversion
    dataset = dataset.fillna(0)

    # Every embedding below is reduced to one 3-D vector per dataset by
    # averaging over the rows.
    # NOTE(review): PCA centres its input, so the row-mean of its transformed
    # output is ~0 for every dataset and the pooled PCA vector carries almost
    # no dataset-specific signal. Confirm whether a different pooling is
    # wanted (the other methods may be affected in a similar way).

    # PCA for numerical data
    pca = PCA(n_components=3, random_state=SEED)
    pca_embeddings = pca.fit_transform(dataset).mean(axis=0)

    # t-SNE for numerical data
    tsne = TSNE(n_components=3, random_state=SEED)
    tsne_embeddings = tsne.fit_transform(dataset).mean(axis=0)

    # UMAP for numerical data
    umap = UMAP(n_components=3, random_state=SEED)
    umap_embeddings = umap.fit_transform(dataset).mean(axis=0)

    # Define the size of the encoded representations
    encoding_dim = 3

    # Define the autoencoder model
    torch.manual_seed(SEED)
    autoencoder = Autoencoder(dataset.shape[1], encoding_dim)

    # Define the optimizer and loss function
    optimizer = Adam(autoencoder.parameters())
    criterion = nn.MSELoss()

    # Convert the dataset to PyTorch tensors
    dataset_torch = torch.tensor(dataset.values, dtype=torch.float32)

    # Normalize the data to be between 0 and 1 using the global min/max.
    # A constant dataset has zero range; dividing by it would produce NaN,
    # so map it to zeros instead.
    data_min = dataset_torch.min()
    data_range = dataset_torch.max() - data_min
    if data_range > 0:
        dataset_torch = (dataset_torch - data_min) / data_range
    else:
        dataset_torch = torch.zeros_like(dataset_torch)

    # Train the autoencoder (full-batch, 50 optimisation steps)
    autoencoder.train()
    for epoch in range(50):
        optimizer.zero_grad()
        outputs = autoencoder(dataset_torch)
        loss = criterion(outputs, dataset_torch)
        loss.backward()
        optimizer.step()

    # Switch the model to evaluation mode
    autoencoder.eval()

    # Generate the embeddings
    with torch.no_grad():
        autoencoder_embeddings = autoencoder.encoder(dataset_torch).mean(axis=0).numpy()

    # Append the embeddings to the list
    data_embeddings.append((file, pca_embeddings, tsne_embeddings, umap_embeddings, autoencoder_embeddings))

    # Load the corresponding graph: same path with a .txt extension.
    # splitext only touches the extension, whereas str.replace would also
    # rewrite a '.csv' occurring in a directory name.
    graph_file = os.path.splitext(file)[0] + '.txt'
    edges = []
    with open(graph_file, 'r') as f:
        for line in f:
            # Each usable line is "<source> <target>"; blank or short lines
            # are skipped because they would make add_edges_from raise.
            tokens = line.split()
            if len(tokens) >= 2:
                edges.append((tokens[0], tokens[1]))

    # Create a directed graph
    G = nx.DiGraph()
    G.add_edges_from(edges)
    if G.number_of_nodes() == 0:
        raise ValueError(f"No edges found in graph file {graph_file}")

    # Create a mapping from node labels to numeric indices
    mapping = {node: idx for idx, node in enumerate(G.nodes())}
    G = nx.relabel_nodes(G, mapping)

    # Add dummy node features
    for i in G.nodes:
        G.nodes[i]['feature'] = [1.0] * 10

    data = from_networkx(G)

    # Convert node features to tensor
    data.x = torch.tensor([G.nodes[i]['feature'] for i in G.nodes], dtype=torch.float)

    # Initialize model, optimizer, and loss function
    torch.manual_seed(SEED)
    model = GCN(num_node_features=10, hidden_dim=16, num_classes=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    # Dummy labels (class 0 for every node); built once, not once per epoch
    dummy_labels = torch.zeros(G.number_of_nodes(), dtype=torch.long)

    # Training loop
    model.train()
    for epoch in range(200):
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, dummy_labels)
        loss.backward()
        optimizer.step()

    # Get GCN embeddings: first conv layer output, mean-pooled over the nodes
    model.eval()
    with torch.no_grad():
        gcn_embeddings = model.conv1(data.x, data.edge_index).mean(axis=0).numpy()

    # Append embeddings to the graph embeddings list
    graph_embeddings.append(gcn_embeddings)
    file_names.append(file)

if not graph_embeddings:
    raise ValueError("No CSV datasets were found under 'data/bnlearn_data'")

# Convert to numpy arrays for clustering
data_embeddings_np = np.array([emb[1:] for emb in data_embeddings])  # Skip the file name
graph_embeddings_np = np.array(graph_embeddings)

# Cluster the graph-level embeddings (one row per dataset/graph).
# KMeans requires n_clusters <= n_samples, so cap it by the number of graphs.
n_graph_clusters = min(10, len(graph_embeddings_np))
kmeans = KMeans(n_clusters=n_graph_clusters, random_state=SEED)
graph_clusters = kmeans.fit_predict(graph_embeddings_np)

# Create a mapping from file name to cluster
file_to_cluster = dict(zip(file_names, graph_clusters))

# Extract the per-method data embeddings (rows are in the same order as graph_clusters)
pca_embs = np.array([emb[1] for emb in data_embeddings])
tsne_embs = np.array([emb[2] for emb in data_embeddings])
umap_embs = np.array([emb[3] for emb in data_embeddings])
autoencoder_embs = np.array([emb[4] for emb in data_embeddings])
# Plot embeddings with Plotly and save them as HTML files
def save_3d_scatter(embeddings, clusters, title, filename):
    """Write a 3-D scatter plot of ``embeddings`` to ``filename`` as HTML.

    Args:
        embeddings: 2-D array; columns 0, 1 and 2 are used as x, y and z.
        clusters: per-point values used to colour the markers.
        title: figure title.
        filename: output location passed to ``fig.write_html``. When it is a
            path, its parent directory is created if it does not exist yet.
    """
    fig = px.scatter_3d(
        x=embeddings[:, 0],
        y=embeddings[:, 1],
        z=embeddings[:, 2],
        color=clusters,
        title=title
    )
    # write_html does not create missing directories, so a fresh checkout
    # without e.g. result_plots/ would fail here. Only paths are handled;
    # any other object is passed through unchanged.
    if isinstance(filename, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(filename))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    fig.write_html(filename)

# Write one HTML scatter plot per embedding method, all coloured by graph cluster
plot_jobs = [
    (pca_embs, "PCA Embeddings", "result_plots/pca_embeddings.html"),
    (tsne_embs, "t-SNE Embeddings", "result_plots/tsne_embeddings.html"),
    (umap_embs, "UMAP Embeddings", "result_plots/umap_embeddings.html"),
    (autoencoder_embs, "Autoencoder Embeddings", "result_plots/autoencoder_embeddings.html"),
]
for method_embs, plot_title, out_path in plot_jobs:
    save_3d_scatter(method_embs, graph_clusters, plot_title, out_path)


In [20]:
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.cluster import KMeans
import torch
from torch import nn
from torch.optim import Adam
import networkx as nx
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.utils import from_networkx
import plotly.express as px

# Define the autoencoder model
class Autoencoder(nn.Module):
    """Minimal autoencoder with a single linear layer on each side.

    ``encoder`` applies Linear then ReLU; ``decoder`` applies Linear then
    Sigmoid, so the reconstruction is bounded to the 0..1 range.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        encoder_layers = (nn.Linear(input_dim, encoding_dim), nn.ReLU())
        self.encoder = nn.Sequential(*encoder_layers)
        decoder_layers = (nn.Linear(encoding_dim, input_dim), nn.Sigmoid())
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Reconstruct ``x`` by passing it through the encoder and the decoder."""
        bottleneck = self.encoder(x)
        reconstruction = self.decoder(bottleneck)
        return reconstruction

# Define GCN model
class GCN(nn.Module):
    """Two-layer GCN: ``conv1`` (features -> hidden) and ``conv2`` (hidden -> classes)."""

    def __init__(self, num_node_features, hidden_dim, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)

    def forward(self, data):
        """Compute per-node outputs from ``data.x`` and ``data.edge_index``."""
        node_features, edge_index = data.x, data.edge_index
        hidden = self.conv1(node_features, edge_index)
        hidden = F.relu(hidden)
        # Dropout is a no-op unless the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        return self.conv2(hidden, edge_index)

# Function to load graphs from txt files
def load_graphs_from_txt(directory):
    """Return a list with the path of each ``.txt`` file under ``directory`` (recursive)."""
    collected = []
    for current_dir, _subdirs, filenames in os.walk(directory):
        txt_names = [name for name in filenames if name.endswith('.txt')]
        collected += [os.path.join(current_dir, name) for name in txt_names]
    return collected

# Collect the dataset paths. Sorted because os.walk yields entries in a
# platform-dependent order.
data_files = sorted(
    os.path.join(root, name)
    for root, dirs, files in os.walk('data/bnlearn_data')
    for name in files
    if name.endswith('.csv')
)

# Seed shared by every stochastic step so the cell is reproducible.
SEED = 42

# Initialize the lists (all three are filled once per dataset, in the same order)
data_embeddings = []   # (file, pca, tsne, umap, autoencoder) mean-pooled vectors
graph_embeddings = []  # mean-pooled GCN embedding of the matching graph
file_names = []

for file in data_files:
    # Load the dataset
    dataset = pd.read_csv(file)

    # One-Hot Encoding for categorical features
    dataset = pd.get_dummies(dataset)

    # Handle missing values by filling them with the mean of each column
    dataset = dataset.fillna(dataset.mean())

    # Ensure all data is numeric
    dataset = dataset.apply(pd.to_numeric, errors='coerce')

    # Convert boolean columns to integers
    for col in dataset.select_dtypes(include='bool').columns:
        dataset[col] = dataset[col].astype(int)

    # Fill any remaining NaN values that could result from the conversion
    dataset = dataset.fillna(0)

    # Every embedding below is reduced to one 3-D vector per dataset by
    # averaging over the rows.
    # NOTE(review): PCA centres its input, so the row-mean of its transformed
    # output is ~0 for every dataset and the pooled PCA vector carries almost
    # no dataset-specific signal. Confirm whether a different pooling is
    # wanted (the other methods may be affected in a similar way).

    # PCA for numerical data
    pca = PCA(n_components=3, random_state=SEED)
    pca_embeddings = pca.fit_transform(dataset).mean(axis=0)

    # t-SNE for numerical data
    tsne = TSNE(n_components=3, random_state=SEED)
    tsne_embeddings = tsne.fit_transform(dataset).mean(axis=0)

    # UMAP for numerical data
    umap = UMAP(n_components=3, random_state=SEED)
    umap_embeddings = umap.fit_transform(dataset).mean(axis=0)

    # Define the size of the encoded representations
    encoding_dim = 3

    # Define the autoencoder model
    torch.manual_seed(SEED)
    autoencoder = Autoencoder(dataset.shape[1], encoding_dim)

    # Define the optimizer and loss function
    optimizer = Adam(autoencoder.parameters())
    criterion = nn.MSELoss()

    # Convert the dataset to PyTorch tensors
    dataset_torch = torch.tensor(dataset.values, dtype=torch.float32)

    # Normalize the data to be between 0 and 1 using the global min/max.
    # A constant dataset has zero range; dividing by it would produce NaN,
    # so map it to zeros instead.
    data_min = dataset_torch.min()
    data_range = dataset_torch.max() - data_min
    if data_range > 0:
        dataset_torch = (dataset_torch - data_min) / data_range
    else:
        dataset_torch = torch.zeros_like(dataset_torch)

    # Train the autoencoder (full-batch, 50 optimisation steps)
    autoencoder.train()
    for epoch in range(50):
        optimizer.zero_grad()
        outputs = autoencoder(dataset_torch)
        loss = criterion(outputs, dataset_torch)
        loss.backward()
        optimizer.step()

    # Switch the model to evaluation mode
    autoencoder.eval()

    # Generate the embeddings
    with torch.no_grad():
        autoencoder_embeddings = autoencoder.encoder(dataset_torch).mean(axis=0).numpy()

    # Append the embeddings to the list
    data_embeddings.append((file, pca_embeddings, tsne_embeddings, umap_embeddings, autoencoder_embeddings))

    # Load the corresponding graph: same path with a .txt extension.
    # splitext only touches the extension, whereas str.replace would also
    # rewrite a '.csv' occurring in a directory name.
    graph_file = os.path.splitext(file)[0] + '.txt'
    edges = []
    with open(graph_file, 'r') as f:
        for line in f:
            # Each usable line is "<source> <target>"; blank or short lines
            # are skipped because they would make add_edges_from raise.
            tokens = line.split()
            if len(tokens) >= 2:
                edges.append((tokens[0], tokens[1]))

    # Create a directed graph
    G = nx.DiGraph()
    G.add_edges_from(edges)
    if G.number_of_nodes() == 0:
        raise ValueError(f"No edges found in graph file {graph_file}")

    # Create a mapping from node labels to numeric indices
    mapping = {node: idx for idx, node in enumerate(G.nodes())}
    G = nx.relabel_nodes(G, mapping)

    # Add dummy node features
    for i in G.nodes:
        G.nodes[i]['feature'] = [1.0] * 10

    data = from_networkx(G)

    # Convert node features to tensor
    data.x = torch.tensor([G.nodes[i]['feature'] for i in G.nodes], dtype=torch.float)

    # Initialize model, optimizer, and loss function
    torch.manual_seed(SEED)
    model = GCN(num_node_features=10, hidden_dim=16, num_classes=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    # Dummy labels (class 0 for every node); built once, not once per epoch
    dummy_labels = torch.zeros(G.number_of_nodes(), dtype=torch.long)

    # Training loop
    model.train()
    for epoch in range(200):
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, dummy_labels)
        loss.backward()
        optimizer.step()

    # Get GCN embeddings: first conv layer output, mean-pooled over the nodes
    model.eval()
    with torch.no_grad():
        gcn_embeddings = model.conv1(data.x, data.edge_index).mean(axis=0).numpy()

    # Append embeddings to the graph embeddings list
    graph_embeddings.append(gcn_embeddings)
    file_names.append(file)

if not graph_embeddings:
    raise ValueError("No CSV datasets were found under 'data/bnlearn_data'")

# Convert to numpy arrays for clustering
data_embeddings_np = np.array([emb[1:] for emb in data_embeddings])  # Skip the file name
graph_embeddings_np = np.array(graph_embeddings)

# Cluster the graph-level embeddings (one row per dataset/graph).
# KMeans requires n_clusters <= n_samples, so cap it by the number of graphs.
n_graph_clusters = min(3, len(graph_embeddings_np))
kmeans = KMeans(n_clusters=n_graph_clusters, random_state=SEED)
graph_clusters = kmeans.fit_predict(graph_embeddings_np)

# Create a mapping from file name to cluster
file_to_cluster = dict(zip(file_names, graph_clusters))

# Extract the per-method data embeddings and the dataset names
# (rows are in the same order as graph_clusters)
pca_embs = np.array([emb[1] for emb in data_embeddings])
tsne_embs = np.array([emb[2] for emb in data_embeddings])
umap_embs = np.array([emb[3] for emb in data_embeddings])
autoencoder_embs = np.array([emb[4] for emb in data_embeddings])
dataset_names = np.array([emb[0] for emb in data_embeddings])

# Plot embeddings with Plotly and save them as HTML files
def save_3d_scatter(embeddings, clusters, dataset_names, title, filename):
    """Write a labelled 3-D scatter plot of ``embeddings`` to ``filename`` as HTML.

    Args:
        embeddings: 2-D array; columns 0, 1 and 2 are used as x, y and z.
        clusters: per-point values used to colour the markers.
        dataset_names: per-point text shown next to each marker.
        title: figure title.
        filename: output location passed to ``fig.write_html``. When it is a
            path, its parent directory is created if it does not exist yet.
    """
    fig = px.scatter_3d(
        x=embeddings[:, 0],
        y=embeddings[:, 1],
        z=embeddings[:, 2],
        color=clusters,
        text=dataset_names,
        title=title
    )
    # write_html does not create missing directories, so a fresh checkout
    # without e.g. result_plots/ would fail here. Only paths are handled;
    # any other object is passed through unchanged.
    if isinstance(filename, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(filename))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    fig.write_html(filename)

# Write one labelled HTML scatter plot per embedding method, coloured by graph cluster
plot_jobs = [
    (pca_embs, "PCA Embeddings", "result_plots/pca_embeddings.html"),
    (tsne_embs, "t-SNE Embeddings", "result_plots/tsne_embeddings.html"),
    (umap_embs, "UMAP Embeddings", "result_plots/umap_embeddings.html"),
    (autoencoder_embs, "Autoencoder Embeddings", "result_plots/autoencoder_embeddings.html"),
]
for method_embs, plot_title, out_path in plot_jobs:
    save_3d_scatter(method_embs, graph_clusters, dataset_names, plot_title, out_path)


In [22]:
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, FastICA, FactorAnalysis
from sklearn.manifold import TSNE, Isomap
from umap import UMAP
from sklearn.cluster import KMeans
import torch
from torch import nn
from torch.optim import Adam
import networkx as nx
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.utils import from_networkx
import plotly.express as px

# Define the autoencoder model
class Autoencoder(nn.Module):
    """One-layer encoder (Linear + ReLU) paired with a one-layer decoder (Linear + Sigmoid)."""

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, encoding_dim), nn.ReLU())
        self.decoder = nn.Sequential(nn.Linear(encoding_dim, input_dim), nn.Sigmoid())

    def forward(self, x):
        """Return the reconstruction obtained by decoding the encoded ``x``."""
        return self.decoder(self.encoder(x))

# Define GCN model
class GCN(nn.Module):
    """Graph convolutional network with one hidden ``GCNConv`` layer and one output layer."""

    def __init__(self, num_node_features, hidden_dim, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)

    def forward(self, data):
        """Return the conv2 output for the graph described by ``data.x`` / ``data.edge_index``."""
        inputs, edge_index = data.x, data.edge_index
        hidden = F.relu(self.conv1(inputs, edge_index))
        # Dropout is applied only when self.training is True.
        hidden = F.dropout(hidden, training=self.training)
        outputs = self.conv2(hidden, edge_index)
        return outputs

def load_graphs_from_txt(directory):
    """Recursively collect the paths of all ``.txt`` files under ``directory``.

    Despite the name, no file is opened or parsed here: the function only
    returns the list of matching paths, in ``os.walk`` traversal order.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.txt')
    ]

# Recursively gather the path of every CSV file stored under data/bnlearn_data
data_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('data/bnlearn_data')
    for name in names
    if name.endswith('.csv')
]

# Seed the random number generators so repeated runs of this cell produce the
# same embeddings (t-SNE, UMAP, FastICA and the network initialisations are
# all stochastic).
SEED = 42
torch.manual_seed(SEED)

# Initialize the lists. They are filled in lock-step: entry i of each list
# refers to the same dataset file.
data_embeddings = []
graph_embeddings = []
file_names = []

for file in data_files:
    # ------------------------------------------------------------------
    # Tabular data: load and turn everything into a numeric matrix
    # ------------------------------------------------------------------
    dataset = pd.read_csv(file)

    # One-Hot Encoding for categorical features
    dataset = pd.get_dummies(dataset)

    # Handle missing values by filling them with the mean of each column
    dataset = dataset.fillna(dataset.mean())

    # Ensure all data is numeric
    dataset = dataset.apply(pd.to_numeric, errors='coerce')

    # Convert boolean columns to integers
    for col in dataset.select_dtypes(include='bool').columns:
        dataset[col] = dataset[col].astype(int)

    # Fill any remaining NaN values that could result from the conversion
    dataset = dataset.fillna(0)

    # ------------------------------------------------------------------
    # Dimensionality reduction: each method projects the rows to 3-D and the
    # projection is mean-pooled over rows into a single 3-vector per dataset.
    # NOTE(review): PCA centres the data before projecting, so the column
    # means of its output are ~0 for every dataset (FastICA and
    # FactorAnalysis presumably behave the same way) -- confirm that
    # mean-pooling is the intended dataset-level summary.
    # ------------------------------------------------------------------
    pca = PCA(n_components=3, random_state=SEED)
    pca_embeddings = pca.fit_transform(dataset).mean(axis=0)

    tsne = TSNE(n_components=3, random_state=SEED)
    tsne_embeddings = tsne.fit_transform(dataset).mean(axis=0)

    umap = UMAP(n_components=3, random_state=SEED)
    umap_embeddings = umap.fit_transform(dataset).mean(axis=0)

    ica = FastICA(n_components=3, random_state=SEED)
    ica_embeddings = ica.fit_transform(dataset).mean(axis=0)

    fa = FactorAnalysis(n_components=3, random_state=SEED)
    fa_embeddings = fa.fit_transform(dataset).mean(axis=0)

    isomap = Isomap(n_components=3)
    isomap_embeddings = isomap.fit_transform(dataset).mean(axis=0)

    # ------------------------------------------------------------------
    # Autoencoder embedding
    # ------------------------------------------------------------------
    # Define the size of the encoded representations
    encoding_dim = 3

    autoencoder = Autoencoder(dataset.shape[1], encoding_dim)
    optimizer = Adam(autoencoder.parameters())
    criterion = nn.MSELoss()

    # Convert the dataset to PyTorch tensors
    dataset_torch = torch.tensor(dataset.values, dtype=torch.float32)

    # Normalize the data to be between 0 and 1 using the global (whole-matrix)
    # min and max. A constant-valued matrix has zero range; dividing by it
    # would fill the input with NaN, so map that case to all zeros instead.
    value_min = dataset_torch.min()
    value_range = dataset_torch.max() - value_min
    if value_range > 0:
        dataset_torch = (dataset_torch - value_min) / value_range
    else:
        dataset_torch = torch.zeros_like(dataset_torch)

    # Train the autoencoder (full-batch, 50 gradient steps)
    for epoch in range(50):
        autoencoder.train()
        optimizer.zero_grad()
        outputs = autoencoder(dataset_torch)
        loss = criterion(outputs, dataset_torch)
        loss.backward()
        optimizer.step()

    # Switch the model to evaluation mode
    autoencoder.eval()

    # Generate the embeddings: encode every row, then mean-pool over rows
    with torch.no_grad():
        autoencoder_embeddings = autoencoder.encoder(dataset_torch).mean(axis=0).numpy()

    # Append the embeddings to the list (file name first, then one vector per method)
    data_embeddings.append((file, pca_embeddings, tsne_embeddings, umap_embeddings, ica_embeddings, fa_embeddings, isomap_embeddings, autoencoder_embeddings))

    # ------------------------------------------------------------------
    # Graph: the edge list lives next to the CSV, with a .txt extension
    # ------------------------------------------------------------------
    # Swap only the extension; str.replace would also rewrite any '.csv'
    # that happens to appear in a directory name.
    graph_file = os.path.splitext(file)[0] + '.txt'
    with open(graph_file, 'r') as f:
        # One whitespace-separated edge per line. Blank lines are skipped
        # because an empty tuple makes add_edges_from raise.
        edges = [tuple(line.split()) for line in f if line.strip()]

    # Create a directed graph
    G = nx.DiGraph()
    G.add_edges_from(edges)

    # Create a mapping from node labels to numeric indices
    mapping = {node: idx for idx, node in enumerate(G.nodes())}
    G = nx.relabel_nodes(G, mapping)

    # Add dummy node features (every node gets the same constant vector)
    for i in G.nodes:
        G.nodes[i]['feature'] = [1.0] * 10

    data = from_networkx(G)

    # Convert node features to tensor
    data.x = torch.tensor([G.nodes[i]['feature'] for i in G.nodes], dtype=torch.float)

    # Initialize model, optimizer, and loss function
    model = GCN(num_node_features=10, hidden_dim=16, num_classes=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    # Dummy labels (class 0 for every node); built once, not once per epoch
    dummy_labels = torch.zeros(G.number_of_nodes(), dtype=torch.long)

    # Training loop
    model.train()
    for epoch in range(200):
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, dummy_labels)
        loss.backward()
        optimizer.step()

    # Get GCN embeddings: output of the first convolution, mean-pooled over nodes
    model.eval()
    with torch.no_grad():
        gcn_embeddings = model.conv1(data.x, data.edge_index).mean(axis=0).numpy()

    # Append embeddings to the graph embeddings list
    graph_embeddings.append(gcn_embeddings)
    file_names.append(file)

# Convert to numpy arrays for clustering
data_embeddings_np = np.array([emb[1:] for emb in data_embeddings])  # Skip the file name
graph_embeddings_np = np.array(graph_embeddings)

if len(graph_embeddings_np) == 0:
    raise ValueError("No graph embeddings were computed; nothing to cluster.")

# Cluster the datasets by their graph (GCN) embeddings. KMeans cannot fit more
# clusters than samples, so cap the cluster count when fewer than 3 datasets
# are available. n_init and random_state are pinned so the cluster assignment
# is reproducible across runs and library versions.
n_clusters = min(3, len(graph_embeddings_np))
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
graph_clusters = kmeans.fit_predict(graph_embeddings_np)

# Create a mapping from file name to cluster
file_to_cluster = dict(zip(file_names, graph_clusters))

# Split the per-dataset tuples into one array per reduction method.
# Tuple layout: (file, pca, tsne, umap, ica, fa, isomap, autoencoder)
pca_embs = np.array([emb[1] for emb in data_embeddings])
tsne_embs = np.array([emb[2] for emb in data_embeddings])
umap_embs = np.array([emb[3] for emb in data_embeddings])
ica_embs = np.array([emb[4] for emb in data_embeddings])
fa_embs = np.array([emb[5] for emb in data_embeddings])
isomap_embs = np.array([emb[6] for emb in data_embeddings])
autoencoder_embs = np.array([emb[7] for emb in data_embeddings])
dataset_names = np.array([emb[0] for emb in data_embeddings])

def save_3d_scatter(embeddings, clusters, dataset_names, title, filename):
    """Plot 3-D embeddings with Plotly and save the figure as an HTML file.

    Args:
        embeddings: 2-D array indexed as ``embeddings[:, k]``; columns 0, 1
            and 2 are used as the x, y and z coordinates.
        clusters: per-point values passed to Plotly as the marker colour.
        dataset_names: per-point labels passed to Plotly as the text.
        title: figure title.
        filename: destination passed to ``fig.write_html``. When it is a
            path, missing parent directories are created first.
    """
    # write_html does not create directories, so a fresh checkout without the
    # output folder would fail. Only touch the filesystem for real paths;
    # anything else is handed to write_html untouched.
    if isinstance(filename, (str, os.PathLike)):
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    fig = px.scatter_3d(
        x=embeddings[:, 0],
        y=embeddings[:, 1],
        z=embeddings[:, 2],
        color=clusters,
        text=dataset_names,
        title=title
    )
    fig.write_html(filename)

# Save one HTML scatter per embedding method (PCA, t-SNE, UMAP, ICA, Factor
# Analysis, Isomap, Autoencoder); every plot uses the same graph clusters.
for plot_title, method_embs, html_path in (
    ("PCA Embeddings", pca_embs, "result_plots/pca_embeddings.html"),
    ("t-SNE Embeddings", tsne_embs, "result_plots/tsne_embeddings.html"),
    ("UMAP Embeddings", umap_embs, "result_plots/umap_embeddings.html"),
    ("ICA Embeddings", ica_embs, "result_plots/ica_embeddings.html"),
    ("Factor Analysis Embeddings", fa_embs, "result_plots/fa_embeddings.html"),
    ("Isomap Embeddings", isomap_embs, "result_plots/isomap_embeddings.html"),
    ("Autoencoder Embeddings", autoencoder_embs, "result_plots/autoencoder_embeddings.html"),
):
    save_3d_scatter(method_embs, graph_clusters, dataset_names, plot_title, html_path)
