### **`DA324 - Data Mining Project`**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import SpectralEmbedding, LocallyLinearEmbedding, Isomap, MDS, TSNE
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Load the three project inputs from the Kaggle dataset directory.
# adjacency: square node-by-node 0/1 matrix; attributes: per-node feature table.
adjacency = pd.read_csv('/kaggle/input/graph-nodes-data/adjacency.csv')
attributes = pd.read_excel('/kaggle/input/graph-nodes-data/attributes.xlsx')
# NOTE(review): `seeds` is rebound to a hard-coded list further down, so this
# DataFrame is never used afterwards — confirm the two agree.
seeds = pd.read_excel('/kaggle/input/graph-nodes-data/seed.xlsx')

In [None]:
# Inspect the row index and the column labels of the adjacency frame.
print(f"adjacency.index = {adjacency.index}")
print(f"adjacency.columns = {adjacency.columns}")
# read_csv leaves the column labels as strings ('0', '1', ...); cast them to int
# so that columns can be addressed with the same integer ids as the rows.
adjacency.columns = adjacency.columns.astype(int)

adjacency.index = RangeIndex(start=0, stop=11952, step=1)
adjacency.columns = Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '11942', '11943', '11944', '11945', '11946', '11947', '11948', '11949',
       '11950', '11951'],
      dtype='object', length=11952)


In [None]:
# Preview the first rows of the adjacency matrix.
adjacency.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11942,11943,11944,11945,11946,11947,11948,11949,11950,11951
0,0,1,0,0,1,1,0,0,0,0,...,1,1,1,1,1,1,0,0,1,1
1,1,0,0,0,1,0,1,1,0,1,...,1,1,1,1,1,1,1,0,1,1
2,0,0,0,0,0,1,0,0,0,1,...,0,0,1,1,1,0,0,0,0,1
3,0,0,0,0,1,0,0,0,1,1,...,1,0,0,0,0,1,0,1,1,1
4,1,1,0,1,0,0,0,0,1,1,...,1,1,1,0,1,1,0,1,1,1


In [None]:
# Summarise shape, dtypes and memory footprint of the adjacency frame.
adjacency.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11952 entries, 0 to 11951
Columns: 11952 entries, 0 to 11951
dtypes: int64(11952)
memory usage: 1.1 GB


In [None]:
# Summarise shape, dtypes and memory footprint of the attribute table.
attributes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11952 entries, 0 to 11951
Columns: 103 entries, 0 to 102
dtypes: float64(103)
memory usage: 9.4 MB


In [None]:
# Preview the first rows of the attribute table.
attributes.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,93,94,95,96,97,98,99,100,101,102
0,-6.3128,-5.5851,-3.0679,12.988,9.0026,-5.5361,-1.0356,10.165,-8.6108,4.3228,...,-2.8601,-24.768,18.213,7.3551,44.382,18.659,-6.0441,2.941,29.01,-32.906
1,1.3902,-0.44661,-3.7137,-12.842,4.4787,4.8796,3.0049,-5.9097,-7.3532,13.825,...,-29.517,-35.92,7.0253,-12.039,43.115,-71.516,-25.207,-6.4823,96.844,-67.232
2,-8.3044,-3.4569,6.5153,-12.009,-6.762,-1.2466,-5.9627,9.9527,-6.2527,12.321,...,13.855,17.979,-25.794,16.491,13.304,-25.527,-52.493,79.977,8.9831,30.085
3,-6.3335,-5.8373,-1.502,-5.0176,6.4915,-25.894,11.604,5.1414,-2.569,6.2234,...,35.811,18.597,-27.58,12.57,10.744,-38.588,10.753,2.941,29.01,-32.906
4,5.6422,-2.6502,4.1389,-5.6228,0.74712,2.7475,-6.3798,9.0944,-13.554,10.946,...,7.1254,-38.921,-67.478,29.187,27.917,13.592,19.594,31.267,7.3044,-84.121


In [None]:
# Step 1: Extract the adjacency values as a plain NumPy array so they can be
# handed to torch in the next step.
adjacency_matrix = adjacency.to_numpy()

In [None]:
# Step 2: Move the adjacency matrix onto the compute device.
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
adjacency_tensor = torch.tensor(adjacency_matrix, device=device)

In [None]:
# Step 3: Build the unnormalised graph Laplacian L = D - A.
node_degrees = adjacency_tensor.sum(dim=1)      # row sums of A
degree_tensor = torch.diag(node_degrees)        # D: degrees on the diagonal
laplacian_tensor = degree_tensor - adjacency_tensor
degree_tensor

tensor([[5846,    0,    0,  ...,    0,    0,    0],
        [   0, 6999,    0,  ...,    0,    0,    0],
        [   0,    0, 3428,  ...,    0,    0,    0],
        ...,
        [   0,    0,    0,  ..., 5982,    0,    0],
        [   0,    0,    0,  ...,    0, 9141,    0],
        [   0,    0,    0,  ...,    0,    0, 8469]], device='cuda:0')

In [None]:
# eigh needs a floating-point input; the Laplacian built above is integer-valued.
laplacian_tensor = laplacian_tensor.float()

# Step 4: Symmetric eigendecomposition, reading the upper triangle of the matrix.
decomposition = torch.linalg.eigh(laplacian_tensor, UPLO='U')
eigenvalues = decomposition.eigenvalues
eigenvectors = decomposition.eigenvectors
print(eigenvalues.shape)
print(eigenvalues)

torch.Size([11952])
tensor([-2.1529e-04,  3.9998e+00,  4.9713e+00,  ...,  1.1070e+04,
         1.1085e+04,  1.1178e+04], device='cuda:0')


In [None]:
d = 103  # Dimension of spectral embedding
# Laplacian eigenmaps embeds nodes with the eigenvectors of the d SMALLEST
# non-trivial eigenvalues. After sorting in ascending order, position 0 is the
# (near-)zero eigenvalue whose eigenvector is the uninformative constant vector,
# so it is skipped and positions 1..d are kept. The previous slice [-d-1:-1]
# took the largest eigenvalues instead and skipped the wrong end of the spectrum.
embedding_indices = torch.argsort(eigenvalues)[1:d + 1]
selected_eigenvectors = eigenvectors[:, embedding_indices]

In [None]:
# Step 6: The selected eigenvectors (one column per dimension) are the embedding;
# each row is the embedding vector of one node.
spectral_embedding = selected_eigenvectors

# Show the embedding under a heading.
print("Spectral embedding:", spectral_embedding, sep="\n")

Spectral embedding:
tensor([[ 9.0968e-05, -8.6991e-05,  9.5898e-05,  ..., -9.0604e-05,
         -8.8916e-05,  9.0048e-05],
        [ 9.6204e-05, -8.9045e-05,  1.0622e-04,  ..., -9.6435e-05,
         -9.0890e-05,  9.3975e-05],
        [-4.3267e-05, -9.7918e-05,  9.5097e-05,  ..., -9.1566e-05,
         -9.0223e-05,  9.0955e-05],
        ...,
        [ 8.8916e-05, -9.3041e-05,  9.5539e-05,  ..., -8.9981e-05,
         -8.8893e-05,  9.0116e-05],
        [ 1.0058e-04, -6.6687e-05,  9.7574e-05,  ..., -8.8018e-05,
         -8.6935e-05,  8.7079e-05],
        [ 9.0392e-05, -7.3977e-05,  8.6271e-05,  ..., -8.7131e-05,
         -8.4125e-05,  8.5648e-05]], device='cuda:0')


In [None]:
# Embedding vector (first row of the eigenvector matrix) for node 0.
spectral_embedding[0]

tensor([ 9.0968e-05, -8.6991e-05,  9.5898e-05,  9.4064e-05, -9.2624e-05,
        -9.5889e-05,  8.9805e-05,  9.1975e-05,  9.3498e-05, -9.5701e-05,
        -1.4250e-04,  2.5296e-06, -1.2204e-04, -5.6332e-06,  9.3310e-05,
         8.9928e-05,  9.0722e-05,  9.2840e-05, -9.8520e-05, -1.5752e-04,
         5.6956e-06,  3.1722e-06, -8.9581e-05, -8.6320e-05,  1.3634e-04,
         1.0871e-06,  8.4795e-05, -9.3072e-05, -1.0483e-04, -1.3198e-04,
         3.4207e-06,  9.2167e-05,  1.4965e-04,  1.6738e-05,  7.9311e-06,
         7.7677e-05,  8.6025e-05, -8.8377e-05, -8.8158e-05, -9.0739e-05,
         1.2122e-04,  3.6062e-06,  9.1138e-05, -8.8438e-05,  9.1661e-05,
        -9.3222e-05,  9.1324e-05,  8.9295e-05, -8.6634e-05, -9.0191e-05,
         9.1199e-05, -9.1266e-05,  1.2271e-04,  1.2834e-05, -9.4394e-05,
         1.2574e-04, -1.3817e-07, -9.0083e-05,  8.9885e-05, -9.1667e-05,
         9.0570e-05, -9.1719e-05,  9.4016e-05, -8.9333e-05,  9.0005e-05,
        -9.3614e-05,  9.0551e-05, -8.7763e-05,  9.1

In [None]:
# scikit-learn works on host-side NumPy arrays, so bring the embedding back
# from the compute device before converting it.
spectral_embedding_cpu = spectral_embedding.to('cpu')
embedding_np = spectral_embedding_cpu.numpy()

In [None]:
# Standardise every embedding dimension to zero mean and unit variance.
scaler = StandardScaler()
scaler.fit(embedding_np)
scaled_features1 = scaler.transform(embedding_np)

In [None]:
# Standardise the node attributes the same way, with a fresh scaler
# (this rebinds `scaler`; the embedding scaler is no longer reachable).
attributes_np = attributes.to_numpy()
scaler = StandardScaler()
scaler.fit(attributes_np)
scaled_features2 = scaler.transform(attributes_np)

In [None]:
# Fuse the structural and attribute views by element-wise addition. This only
# works because both arrays have the same number of columns.
# NOTE(review): despite its name, `concatenated_data` is a SUM, not a
# concatenation — column j of the embedding is added to attribute j. The
# alternative, np.concatenate((scaled_features1, scaled_features2), axis=1),
# would keep the two feature sets separate; confirm which is intended.
concatenated_data = np.add(scaled_features1, scaled_features2)

In [None]:
# Hard-coded seed nodes: 10 groups of 3 node indices each. Each group is used
# below as a row selector to build one initial K-Means centroid.
# NOTE(review): this rebinds `seeds`, replacing the DataFrame read from
# seed.xlsx earlier — confirm these values match that file.
seeds = [
    [9897, 6314, 2080],
    [5863, 5838, 9831],
    [2193, 5757, 6346],
    [9219, 5958, 9646],
    [6345, 10950, 2174],
    [1942, 935, 4907],
    [1529, 9833, 7709],
    [4566, 902, 5633],
    [8938, 7423, 1853],
    [5793, 10805, 2258]
]

In [None]:
# First 10952 rows (the training portion). The slice previously ended at 10953,
# one row past the 10952/1000 train/test boundary used by the split below.
concatenated_data_ = concatenated_data[:10952]

In [None]:
# One initial centroid per seed group: the mean feature vector of its nodes.
centroids = [concatenated_data[seeds[i]].mean(axis=0) for i in range(10)]

In [None]:
# Stack the per-group mean vectors into a single 2-D array (one row per group).
centroids = np.stack(centroids, axis=0)

In [None]:
# Sanity check: expect (number of seed groups, number of features).
centroids.shape

(10, 103)

In [None]:
# Run K-Means starting from the seed-derived `centroids` instead of a random
# initialisation; n_init=1 because the starting point is fixed.
kmeans = KMeans(n_clusters=10, init=centroids, n_init=1)
kmeans.fit(concatenated_data)

In [None]:
# Cluster index assigned by K-Means to each row of concatenated_data.
cluster_labels = kmeans.labels_

In [None]:
# Inspect the label vector.
cluster_labels

array([6, 7, 0, ..., 1, 0, 1], dtype=int32)

In [None]:
# Print the K-Means label of every seed node, walking the groups in order.
# Nodes of the same group would ideally share one label.
for seed_node in (member for group in seeds for member in group):
    print(cluster_labels[seed_node])

0
7
2
5
6
8
4
2
6
4
3
9
4
2
4
5
3
7
6
8
3
7
5
0
8
4
8
1
9
6


In [None]:
# Split features and K-Means labels at row 10952: everything before it is the
# training set, the remaining rows form the unlabeled test set.
n_train = 10952
train_features, test_features = concatenated_data[:n_train], concatenated_data[n_train:]
train_labels = cluster_labels[:n_train]

In [None]:
# Sanity check on the fused feature matrix: (nodes, features).
concatenated_data.shape

(11952, 103)

In [None]:
class SimpleClassifier(nn.Module):
    """Two-layer feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of classes (length of the returned logit vector).
        hidden_dim: Width of the hidden layer. Defaults to 128, the value that
            used to be hard-coded, so existing two-argument calls are unchanged.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=128):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return raw class logits; no softmax is applied here."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

In [None]:
# Seed torch's global RNG so the weight initialisation (and anything else that
# later draws from the global generator) is repeatable across kernel restarts.
torch.manual_seed(0)
input_dim = concatenated_data_.shape[1]  # one input unit per feature column
output_dim = 10                          # one logit per K-Means cluster
model = SimpleClassifier(input_dim, output_dim)

In [None]:
# Convert the NumPy splits to PyTorch tensors: float32 features for the linear
# layers, int64 (long) class indices as required by CrossEntropyLoss targets.
train_features_tensor = torch.tensor(train_features, dtype=torch.float32)
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)
test_features_tensor = torch.tensor(test_features, dtype=torch.float32)

In [None]:
batch_size = 32
# Pair each training feature row with its label and serve them in shuffled
# mini-batches of `batch_size`.
train_dataset = TensorDataset(train_features_tensor, train_labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
from torch.optim.lr_scheduler import StepLR

# Optimisation setup: Adam on a cross-entropy objective, with the learning
# rate multiplied by 0.1 every 10 epochs.
learning_rate = 0.01
num_epochs = 30
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

# Train the classifier on the K-Means pseudo-labels.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch_features, batch_labels in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(batch_features), batch_labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample average even if the last batch is short.
        total_loss += loss.item() * batch_features.size(0)
    epoch_loss = total_loss / len(train_loader.dataset)
    scheduler.step()  # advance the LR schedule once per epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.8f}')


Epoch [1/30], Loss: 1.10758981
Epoch [2/30], Loss: 1.35780994
Epoch [3/30], Loss: 1.17409483
Epoch [4/30], Loss: 1.35681690
Epoch [5/30], Loss: 1.31119144
Epoch [6/30], Loss: 1.52881944
Epoch [7/30], Loss: 0.97885101
Epoch [8/30], Loss: 0.95910504
Epoch [9/30], Loss: 0.69274573
Epoch [10/30], Loss: 0.98406268
Epoch [11/30], Loss: 0.62581756
Epoch [12/30], Loss: 0.07045279
Epoch [13/30], Loss: 0.01234375
Epoch [14/30], Loss: 0.00158620
Epoch [15/30], Loss: 0.00000000
Epoch [16/30], Loss: 0.00000000
Epoch [17/30], Loss: 0.00000000
Epoch [18/30], Loss: 0.00000000
Epoch [19/30], Loss: 0.00000000
Epoch [20/30], Loss: 0.00000000
Epoch [21/30], Loss: 0.00000000
Epoch [22/30], Loss: 0.00000000
Epoch [23/30], Loss: 0.00000000
Epoch [24/30], Loss: 0.00000000
Epoch [25/30], Loss: 0.00000000
Epoch [26/30], Loss: 0.00000000
Epoch [27/30], Loss: 0.00000000
Epoch [28/30], Loss: 0.00000000
Epoch [29/30], Loss: 0.00000000
Epoch [30/30], Loss: 0.00000000


In [None]:
# Predict labels for the test features
model.eval()
with torch.no_grad():  # inference only: no gradients needed
    test_outputs = model(test_features_tensor)
    _, predicted_labels = torch.max(test_outputs, 1)  # index of the largest logit per row
    predicted_labels = predicted_labels.numpy()

# Combine KMeans cluster labels for the first 10952 nodes and predicted labels for the last 1000 nodes.
# `train_labels` (= cluster_labels[:10952]) is used rather than the full
# `cluster_labels`; concatenating the full vector produced 12952 labels for
# 11952 nodes.
all_labels = np.concatenate((train_labels, predicted_labels), axis=0)
assert all_labels.shape[0] == concatenated_data.shape[0], "expected exactly one label per node"

In [None]:
# Inspect the combined label vector.
all_labels

array([1, 7, 3, ..., 1, 9, 1])

In [None]:
# Largest label value present (sanity check on the label range).
all_labels.max()

9

In [None]:
import pandas as pd

# Build the submission table: one row per node id with its assigned label.
# NOTE(review): this exports `cluster_labels` (the raw K-Means output), not
# `all_labels`, even though the file is named all_labels.csv — confirm which
# vector is meant to be submitted.
df = pd.DataFrame({'ID': range(11952)}).assign(LABEL=cluster_labels)

# Write the table as CSV, leaving out the DataFrame index column.
df.to_csv('all_labels.csv', index=False)

In [None]:
# Print how many nodes fall into each of the 10 clusters.
for label in range(10):
    print(int((cluster_labels == label).sum()))

1165
1195
1181
1196
1189
1208
791
2393
1201
433


In [None]:
# NOTE(review): `normalized_embeddings` is not assigned in any cell visible in
# this notebook; on a fresh kernel this cell is expected to raise NameError.
# Restore the cell that defines it or remove this one.
normalized_embeddings

array([[ 0.00913448, -0.00923027],
       [ 0.0091351 , -0.00923698],
       [ 0.00913986, -0.00924953],
       ...,
       [ 0.00914806, -0.00926099],
       [ 0.00913844, -0.00923829],
       [ 0.00913854, -0.0092379 ]], dtype=float32)

In [None]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl.metadata (743 bytes)
Collecting networkx<3.0,>=2.5 (from node2vec)
  Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.2.1
    Uninstalling networkx-3.2.1:
      Successfully uninstalled networkx-3.2.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
momepy 0.7.0 requires shapely>=2, but you have shapely 1.8.5.post1 which is incompatible.
osmnx 1.9.2 requires shapely>=2.0, but you have shapely 1.8.5.post1 which is 

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from node2vec import Node2Vec
import networkx as nx

In [None]:
# Build the graph from the raw adjacency values. networkx's NumPy constructors
# expect an ndarray; passing the DataFrame itself raised
# "'DataFrame' object has no attribute 'dtype'". from_numpy_array is used
# because from_numpy_matrix is deprecated and absent from networkx >= 3.0.
# Nodes are labelled 0..n-1 by row position.
G = nx.from_numpy_array(adjacency.to_numpy())

AttributeError: 'DataFrame' object has no attribute 'dtype'

In [None]:
# Initialize node2vec on G: 64-dimensional embeddings, walk_length=30,
# num_walks=200, run with a single worker.
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=1)

In [None]:
# Generate embeddings by fitting the model on the sampled walks.
# NOTE(review): this rebinds `model`, which earlier held the SimpleClassifier;
# re-running the prediction cell after this one would use the wrong object.
model = node2vec.fit(window=10, min_count=1, batch_words=4)

In [None]:
# Step 3: Collect the learned vector of every node, in node-id order.
# The fitted model is looked up by the string form of each node id.
node_keys = [str(node) for node in range(len(adjacency))]
embeddings = np.array([model.wv[key] for key in node_keys])

In [None]:
# Place the node2vec embedding columns side by side with the raw attribute
# columns, giving one wide feature row per node.
combined_features = np.hstack((embeddings, attributes.values))

In [None]:
# Step 4: Apply K-Means clustering
# One cluster per seed group (previously set to 2 and then ignored).
num_clusters = len(seeds)

# The initial centroids must live in the same feature space as the data being
# clustered. `centroids` was built from the 103-column fused features, while
# combined_features has 64 embedding + 103 attribute columns, so passing it as
# `init` would fail at fit time. Recompute the seed-group means here instead.
combined_centroids = np.array(
    [combined_features[seed_group].mean(axis=0) for seed_group in seeds]
)

# Initialize K-Means
kmeans = KMeans(n_clusters=num_clusters, init=combined_centroids, n_init=1)

NameError: name 'KMeans' is not defined

In [None]:
# Fit K-Means to the combined features and keep the per-node cluster index.
# NOTE(review): this rebinds `cluster_labels`, overwriting the labels from the
# spectral-embedding run above.
cluster_labels = kmeans.fit_predict(combined_features)

In [None]:
# Display the node view of the graph.
G.nodes()

In [None]:
# Restrict the drawing to the first 15 nodes so the picture stays readable.
subset_nodes = list(G.nodes())[:15]
subgraph = G.subgraph(subset_nodes)

# Draw the induced subgraph on a large canvas with labelled nodes.
draw_options = dict(with_labels=True, node_size=50, node_color='skyblue', font_size=8)
plt.figure(figsize=(20, 20))
nx.draw(subgraph, **draw_options)
plt.title("Subgraph Visualization")
plt.show()

In [None]:
# Save the full graph G as a GraphML file in the current working directory.
nx.write_graphml(G, "graph.graphml")
