In [85]:
# conda install pyg -c pyg

done
Solving environment: \ ^C

Note: you may need to restart the kernel to use updated packages.


In [5]:
# !pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.2+cu121.html

Looking in links: https://data.pyg.org/whl/torch-2.1.2+cu121.html


In [1]:
!pip install torch_geometric


Collecting torch_geometric
  Downloading torch_geometric-2.5.2-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.5.2-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.5.2


In [38]:
import torch
from torch_geometric.data import Data
import pandas as pd
import numpy as np
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
# from torch_scatter import scatter
from torch.optim.optimizer import Optimizer
from sklearn.model_selection import train_test_split


In [5]:
class GCN(torch.nn.Module):
    def __init__(self, dataset, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(42)

        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.out = torch.nn.Linear(hidden_channels, dataset.y.max()+1)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.conv2(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        x = F.softmax(self.out(x), dim=1)
        return x


In [117]:
base = "/kaggle/input/aist4010-spring2024-a3/"

edges = pd.read_csv(base + 'edges.txt', sep='\t', header=None).values
features = pd.read_csv(base + 'features.txt', sep=' ', header=None)
train_labels = pd.read_csv(base + 'train_labels.csv')
val_labels = pd.read_csv(base + 'val_labels.csv')
test_idx = pd.read_csv(base + 'test_idx.csv')['id'].values

# mapping from node IDs to indices
node_ids = features.iloc[:, 0].astype(int).values  # Node IDs
node_id_to_idx = {node_id: idx for idx, node_id in enumerate(node_ids)}

# remap edge indices
remapped_edges = [
    [node_id_to_idx[edge[0]], node_id_to_idx[edge[1]]]
    for edge in edges if edge[0] in node_id_to_idx and edge[1] in node_id_to_idx
]
edge_index = torch.tensor(remapped_edges, dtype=torch.long).t().contiguous()

# extract node features
x = torch.tensor(features.iloc[:, 1:].values, dtype=torch.float)

# labels tensor
y = torch.full((len(node_ids),), -1, dtype=torch.long)

train_mask = torch.zeros(len(node_ids), dtype=torch.bool)
val_mask = torch.zeros(len(node_ids), dtype=torch.bool)
test_mask = torch.zeros(len(node_ids), dtype=torch.bool)


val_labels_shuffled = val_labels.sample(frac=1).reset_index(drop=True)
split_index = int(len(val_labels_shuffled) * 0.8)  # 80% of the validation set

val_to_train = val_labels_shuffled.iloc[:split_index]
remaining_val = val_labels_shuffled.iloc[split_index:]

for _, row in train_labels.iterrows():
    node_id = int(row['id'])
    if node_id in node_id_to_idx:
        idx = node_id_to_idx[node_id]
        y[idx] = int(row['label'].split('_')[1])
        train_mask[idx] = True

for _, row in val_to_train.iterrows():
    node_id = int(row['id'])
    if node_id in node_id_to_idx:
        idx = node_id_to_idx[node_id]
        y[idx] = int(row['label'].split('_')[1])
        train_mask[idx] = True

val_mask[:] = False
for _, row in remaining_val.iterrows():
    node_id = int(row['id'])
    if node_id in node_id_to_idx:
        idx = node_id_to_idx[node_id]
        y[idx] = int(row['label'].split('_')[1])
        val_mask[idx] = True

# for _, row in val_labels.iterrows():
#     node_id = int(row['id'])
#     if node_id in node_id_to_idx:
#         idx = node_id_to_idx[node_id]
#         y[idx] = int(row['label'].split('_')[1])
#         val_mask[idx] = True

for node_id in test_idx:
    if int(node_id) in node_id_to_idx:
        test_mask[node_id_to_idx[int(node_id)]] = True

data = Data(x=x, edge_index=edge_index, y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
data

Data(x=[2707, 1433], edge_index=[2, 5427], y=[2707], train_mask=[2707], val_mask=[2707], test_mask=[2707])

In [123]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

lr = 0.005
epochs = 200
weight_decay = 0.0005
runs = 10

update_freq=50
eps=0.01
model = GCN(data, hidden_channels=48)

optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
criterion = torch.nn.CrossEntropyLoss()


def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss
    
def val():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    train_correct = pred[data.train_mask] == data.y[data.train_mask]
    train_acc = int(train_correct.sum()) / int(data.train_mask.sum())
    val_correct = pred[data.val_mask] == data.y[data.val_mask]
    val_acc = int(val_correct.sum()) / int(data.val_mask.sum())
#     print(f'Train Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}')
    return train_acc, val_acc
    

best_val_acc = 0
best_model = None
losses = []
for epoch in range(5001):
    loss = train()
    train_acc, val_acc = val()
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model = model.state_dict()
    losses.append(loss)
    if epoch % 100 == 0:
        print(f'Epoch: {epoch:3d}, Train Loss: {loss:.4f} Train Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}')

torch.save(best_model, 'model.pth')
model.load_state_dict(torch.load('model.pth'))


Epoch:   0, Train Loss: 1.9469 Train Accuracy: 0.4427, Validation Accuracy: 0.4077
Epoch: 100, Train Loss: 1.2480 Train Accuracy: 0.9581, Validation Accuracy: 0.8577
Epoch: 200, Train Loss: 1.2288 Train Accuracy: 0.9702, Validation Accuracy: 0.8423
Epoch: 300, Train Loss: 1.2144 Train Accuracy: 0.9758, Validation Accuracy: 0.8462
Epoch: 400, Train Loss: 1.2201 Train Accuracy: 0.9782, Validation Accuracy: 0.8538
Epoch: 500, Train Loss: 1.2162 Train Accuracy: 0.9750, Validation Accuracy: 0.8385
Epoch: 600, Train Loss: 1.2176 Train Accuracy: 0.9782, Validation Accuracy: 0.8346
Epoch: 700, Train Loss: 1.2163 Train Accuracy: 0.9766, Validation Accuracy: 0.8423
Epoch: 800, Train Loss: 1.2161 Train Accuracy: 0.9774, Validation Accuracy: 0.8385
Epoch: 900, Train Loss: 1.2132 Train Accuracy: 0.9782, Validation Accuracy: 0.8423
Epoch: 1000, Train Loss: 1.2119 Train Accuracy: 0.9782, Validation Accuracy: 0.8423
Epoch: 1100, Train Loss: 1.2180 Train Accuracy: 0.9774, Validation Accuracy: 0.8385
Ep

<All keys matched successfully>

In [124]:
def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    predicted_classes = pred[data.test_mask]
    return predicted_classes

result = test()

df = pd.DataFrame({
    'id': test_idx,
    'label': result.tolist()
})
df['label'] = df['label'].apply(lambda label: f'Class_{label}')
df.to_csv('submission.csv', index=False)
df

Unnamed: 0,id,label
0,463825,Class_5
1,1121569,Class_2
2,1105531,Class_6
3,14428,Class_6
4,14429,Class_6
...,...,...
1202,1128975,Class_1
1203,1128977,Class_1
1204,1128978,Class_1
1205,117328,Class_3
