<a href="https://colab.research.google.com/github/lingyixu/CS523-Deep-Learning/blob/main/Non_Graph_NN/MLP_Multiclass_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load packages

In [1]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.0.0+cu118
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [2]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
# import torch_geometric
# from torch_geometric.data import Dataset, Data
# from torch_geometric.nn import GCNConv, GATConv
# from torch_geometric.transforms import RandomNodeSplit

In [3]:
import pandas as pd
import numpy as np
import json
import glob
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [4]:
from google.colab import drive
drive.mount('/content/drive')
drive_path = '/content/drive/Shareddrives/CS523/lastFM-data/'

Mounted at /content/drive


### Load dataset: LastFM

In [5]:
df_feat = pd.read_json(drive_path + 'processed_feature.json')   # load features
arr_feat = df_feat.T.values
df_edge = pd.read_csv(drive_path + 'lastfm_asia_edges.csv')   # load edge indices
arr_edge = df_edge.T.values
df_tar = pd.read_csv(drive_path + 'lastfm_asia_target.csv', index_col=0)   # load targets
arr_tar = df_tar.values

num_nodes = arr_feat.shape[0]
num_features = arr_feat.shape[1]
num_classes = len(df_tar.target.unique())
classes = df_tar.target.sort_values().unique()

print('Dataset:')
print('====================')
print(f'Number of users: {num_nodes}')
print(f'Number of features: {num_features}')
# print(f'Number of edges: {arr_edge.shape[1]}')
print(f'Number of distinct regions: {num_classes}')
print(f'All region classes: {classes}')

Dataset:
Number of users: 7624
Number of features: 7842
Number of distinct regions: 18
All region classes: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]


### Create Dataset

In [6]:
class LastFM(Dataset):
    def __init__(self, x, y=None, transform=None):
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        self.transform = transform
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, index):
        data = self.x[index,]
        if self.transform is not None:
            data = self.transform(data)
        if self.y is not None:
            return data, self.y[index]
        else:
            return data

In [7]:
# fm_dataset = LastFM(x=arr_feat, y=arr_tar)
x_train, x_test, y_train, y_test = train_test_split(arr_feat, arr_tar, test_size=0.2, random_state=42)
train_set = LastFM(x=x_train, y=y_train)
test_set = LastFM(x=x_test, y=y_test)

In [8]:
print(f'Number of training data: {len(train_set)}')
print(f'Number of testing data: {len(test_set)}')

Number of training data: 6099
Number of testing data: 1525


### Build Model

In [11]:
# ref: https://www.kaggle.com/code/pinocookie/pytorch-simple-mlp/notebook
class MLP(torch.nn.Module):
    def __init__(self, hidden_channels1, hidden_channels2, hidden_channels3):
        super().__init__()
        # torch.manual_seed(42)
        self.lin1 = nn.Linear(num_features, hidden_channels1)
        self.lin2 = nn.Linear(hidden_channels1, hidden_channels2)
        self.lin3 = nn.Linear(hidden_channels2, hidden_channels3)
        self.lin4 = nn.Linear(hidden_channels3, num_classes)

    def forward(self, x):
        x = self.lin1(x)
        x = x.relu()
        x = F.dropout(x, p=0.1)
        x = self.lin2(x)
        x = x.relu()
        x = F.dropout(x, p=0.1)
        x = self.lin3(x)
        x = x.relu()
        x = F.dropout(x, p=0.1)
        x = self.lin4(x)
        return x

In [12]:
model = MLP(512, 256, 128)
print(model)

MLP(
  (lin1): Linear(in_features=7842, out_features=512, bias=True)
  (lin2): Linear(in_features=512, out_features=256, bias=True)
  (lin3): Linear(in_features=256, out_features=128, bias=True)
  (lin4): Linear(in_features=128, out_features=18, bias=True)
)


In [13]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=5e-4)

In [16]:
def train():
      model.train()
      optimizer.zero_grad()
      out = model(train_set.x)
      loss = criterion(out, train_set.y.flatten())
      loss.backward()
      optimizer.step()
      return loss

def test():
      model.eval()
      out = model(test_set.x)
      pred = out.argmax(dim=1)
      test_correct = pred == test_set.y.flatten()
      test_acc = int(test_correct.sum()) / len(test_set)
      return test_acc

for epoch in range(50):
    loss = train()
    print(f'Epoch: {epoch+1:02d}, Loss: {loss:.4f}')

Epoch: 01, Loss: 0.1220
Epoch: 02, Loss: 0.1196
Epoch: 03, Loss: 0.1186
Epoch: 04, Loss: 0.1175
Epoch: 05, Loss: 0.1161
Epoch: 06, Loss: 0.1143
Epoch: 07, Loss: 0.1132
Epoch: 08, Loss: 0.1118
Epoch: 09, Loss: 0.1105
Epoch: 10, Loss: 0.1097
Epoch: 11, Loss: 0.1088
Epoch: 12, Loss: 0.1073
Epoch: 13, Loss: 0.1045
Epoch: 14, Loss: 0.1050
Epoch: 15, Loss: 0.1045
Epoch: 16, Loss: 0.1040
Epoch: 17, Loss: 0.1024
Epoch: 18, Loss: 0.1013
Epoch: 19, Loss: 0.1004
Epoch: 20, Loss: 0.0998
Epoch: 21, Loss: 0.1001
Epoch: 22, Loss: 0.0986
Epoch: 23, Loss: 0.0972
Epoch: 24, Loss: 0.0961
Epoch: 25, Loss: 0.0965
Epoch: 26, Loss: 0.0953
Epoch: 27, Loss: 0.0949
Epoch: 28, Loss: 0.0940
Epoch: 29, Loss: 0.0923
Epoch: 30, Loss: 0.0917
Epoch: 31, Loss: 0.0917
Epoch: 32, Loss: 0.0913
Epoch: 33, Loss: 0.0897
Epoch: 34, Loss: 0.0894
Epoch: 35, Loss: 0.0887
Epoch: 36, Loss: 0.0891
Epoch: 37, Loss: 0.0885
Epoch: 38, Loss: 0.0870
Epoch: 39, Loss: 0.0862
Epoch: 40, Loss: 0.0865
Epoch: 41, Loss: 0.0854
Epoch: 42, Loss:

In [18]:
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.7370
