In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install torch-geometric \
  torch-sparse==latest+cu101 \
  torch-scatter==latest+cu101 \
  torch-cluster==latest+cu101 \
  -f https://pytorch-geometric.com/whl/torch-1.5.0.html

Looking in links: https://pytorch-geometric.com/whl/torch-1.5.0.html
Collecting torch-sparse==latest+cu101
  Using cached https://pytorch-geometric.com/whl/torch-1.5.0/torch_sparse-latest%2Bcu101-cp36-cp36m-linux_x86_64.whl
Collecting torch-scatter==latest+cu101
  Using cached https://pytorch-geometric.com/whl/torch-1.5.0/torch_scatter-latest%2Bcu101-cp36-cp36m-linux_x86_64.whl
Collecting torch-cluster==latest+cu101
  Using cached https://pytorch-geometric.com/whl/torch-1.5.0/torch_cluster-latest%2Bcu101-cp36-cp36m-linux_x86_64.whl
Installing collected packages: torch-sparse, torch-scatter, torch-cluster
  Found existing installation: torch-sparse 0.6.5
    Uninstalling torch-sparse-0.6.5:
      Successfully uninstalled torch-sparse-0.6.5
  Found existing installation: torch-scatter 2.0.4
    Uninstalling torch-scatter-2.0.4:
      Successfully uninstalled torch-scatter-2.0.4
  Found existing installation: torch-cluster 1.5.4
    Uninstalling torch-cluster-1.5.4:
      Successfully uni

In [3]:
import numpy as np
import pandas as pd
import pickle
import csv
import os
import torch
from torch_geometric.data import Data

In [4]:
np.random.seed(42)

In [5]:
import os
os.chdir('/content/drive/My Drive/RecSys')

In [6]:
edge = pd.read_excel('/content/drive/My Drive/RecSys/node_edges.xlsx', 'edge')
edge.columns=['id','label','From','To']
edge

Unnamed: 0,id,label,From,To
0,4eb7eb9d-3ad9-82ad-ed4e-26c0536bb12e,purchased by,49286,Cu5380
1,c2b7eb9d-3ad9-d402-83d8-4f664e132d09,purchased by,49286,Cu5381
2,0ab7ec08-2377-863c-98f1-0aec8069c4d2,purchased by,49340,Cu4476
3,0eb7ec08-2377-cbc8-6477-4fa8b5011fef,purchased by,49340,Cu4477
4,6eb7ec08-2377-ad6a-8403-b7769afb7172,purchased by,49340,Cu4478
...,...,...,...,...
29571,0cb7ebb4-2518-684d-4e63-b1279ba4b968,purchased by,63840,Cu1211
29572,6ab7ec08-d8ce-deba-3355-e403a18d9f5d,purchased by,63864,Cu2829
29573,52b7ec08-d8ce-d9e8-3a6b-23b24a4286ff,purchased by,63864,Cu2830
29574,b0b7ebf3-7ab4-5609-4521-8300d452bb64,purchased by,64008,Cu4470


In [7]:
from sklearn.preprocessing import LabelEncoder

edge_encoder = LabelEncoder()
edge['From'] = edge_encoder.fit_transform(edge.From)
edge['To'] = edge_encoder.fit_transform(edge.To)
edge.head()

Unnamed: 0,id,label,From,To
0,4eb7eb9d-3ad9-82ad-ed4e-26c0536bb12e,purchased by,1508,757
1,c2b7eb9d-3ad9-d402-83d8-4f664e132d09,purchased by,1508,758
2,0ab7ec08-2377-863c-98f1-0aec8069c4d2,purchased by,1511,669
3,0eb7ec08-2377-cbc8-6477-4fa8b5011fef,purchased by,1511,670
4,6eb7ec08-2377-ad6a-8403-b7769afb7172,purchased by,1511,671


In [8]:
nodes = pd.read_excel('/content/drive/My Drive/RecSys/node_edges.xlsx', 'nodes')
nodes.columns=['id','label']
nodes

Unnamed: 0,id,label
0,49286,product
1,49340,product
2,49368,product
3,49372,product
4,49374,product
...,...,...
4021,Cu4145,Customer
4022,Cu6272,Customer
4023,Cu3322,Customer
4024,Cu3717,Customer


In [9]:
edge.nunique()

id       29576
label        1
From      2826
To        1199
dtype: int64

In [10]:
nodes.nunique()

id       4025
label       4
dtype: int64

In [11]:
edge['From'].min(),edge['From'].max()

(0, 2825)

In [12]:
edge['To'].min(),edge['To'].max()

(0, 1198)

In [13]:
edge.isna().sum()

id       0
label    0
From     0
To       0
dtype: int64

In [14]:
# average length of session 
edge.groupby('id')['From'].size().mean()

1.0

In [15]:
transaction_dict = dict(edge.groupby('id')['From'].apply(list))
transaction_dict

{'00b7d72a-949b-ca4d-ceb3-d982817ee686': [175],
 '00b7d72c-0646-bb70-6c7a-35c82c35fcbc': [4],
 '00b7d72c-0af6-1304-44ef-d94ae2179bdd': [3],
 '00b7d72c-ba98-cc80-d348-f229734c3bbe': [20],
 '00b7d72f-449c-a4c8-3d73-ba6316b9f304': [204],
 '00b7d72f-7d86-38d4-a82e-9ca9267c385f': [130],
 '00b7d72f-7e2c-f7c5-1072-bdf98e6888f2': [123],
 '00b7d730-2c32-06da-d22b-c63ff4760e0c': [163],
 '00b7d737-b22d-436e-cfbf-5878967e9f6e': [396],
 '00b7d737-b22d-5fe1-6952-7151431fdbab': [396],
 '00b7d738-39fd-5e81-a802-9b3713b7fe82': [213],
 '00b7d739-1b1e-8db6-d478-7faeac77fa7d': [144],
 '00b7d739-ea0c-7185-3397-ba0ca04afcb8': [246],
 '00b7d73a-1e6d-d398-a618-39d0b2671483': [250],
 '00b7d73a-229f-5564-a612-18db9630446c': [252],
 '00b7d73a-e664-9db2-ee4a-d355a53d09b8': [68],
 '00b7e11e-b00d-f124-1539-ab12682d8a6e': [988],
 '00b7e120-0ffc-452b-73a1-41e3cdd7a19f': [875],
 '00b7e120-c56f-9c7c-b318-2e17098ac2f7': [663],
 '00b7e120-c899-8998-d51b-99f16487e68e': [780],
 '00b7e120-c8eb-5cbb-6ba2-cb4c93848b55': [779]

In [16]:
import torch
from torch_geometric.data import InMemoryDataset
from tqdm import tqdm

class YooChooseDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(YooChooseDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return []
    @property
    def processed_file_names(self):
        return ['/content/drive/My Drive/RecSys/transaction_binary.dataset']

    def download(self):
        pass
    
    def process(self):
        
        data_list = []

        # process by session_id
        grouped = edge.groupby('id')
        for id, group in tqdm(grouped):
            le = LabelEncoder()
            item_id = le.fit_transform(group.From)
            group = group.reset_index(drop=True)
            group['item_id'] = item_id
            node_features = group.loc[group.id==id,['item_id','From']].sort_values('item_id')['From'].drop_duplicates().values

            node_features = torch.LongTensor(node_features).unsqueeze(1)
            # target_nodes = group.sess_item_id.values[1:]
            # source_nodes = group.sess_item_id.values[:-1]

            edge_index = torch.tensor([group['From'],
                                   group['To']], dtype=torch.long)
            x = node_features

            if id in transaction_dict:
                positive_indices = le.transform(transaction_dict[id])
                # print(positive_indices)
                label = np.zeros(len(node_features))
                label[positive_indices] = 1
            else:
                # print("Hello")
                label = [0] * len(node_features)


            y = torch.FloatTensor(label)

            data = Data(x=x, edge_index=edge_index, y=y)

            data_list.append(data)
        
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [17]:
# import torch
# from torch_geometric.data import InMemoryDataset
# from tqdm import tqdm

# class YooChooseBinaryDataset(InMemoryDataset):
#     def __init__(self, root, transform=None, pre_transform=None):
#         super(YooChooseBinaryDataset, self).__init__(root, transform, pre_transform)
#         self.data, self.slices = torch.load(self.processed_paths[0])

#     @property
#     def raw_file_names(self):
#         return []
#     @property
#     def processed_file_names(self):
#         return ['../input/yoochoose_click_binary_1M_sess.dataset']

#     def download(self):
#         pass
    
#     def process(self):
        
#         data_list = []

#         # process by session_id
#         grouped = df.groupby('session_id')
#         for session_id, group in tqdm(grouped):
#             sess_item_id = LabelEncoder().fit_transform(group.item_id)
#             group = group.reset_index(drop=True)
#             group['sess_item_id'] = sess_item_id
#             node_features = group.loc[group.session_id==session_id,['sess_item_id','item_id']].sort_values('sess_item_id').item_id.drop_duplicates().values

#             node_features = torch.LongTensor(node_features).unsqueeze(1)
#             target_nodes = group.sess_item_id.values[1:]
#             source_nodes = group.sess_item_id.values[:-1]

#             edge_index = torch.tensor([source_nodes,
#                                    target_nodes], dtype=torch.long)
#             x = node_features

#             y = torch.FloatTensor([group.label.values[0]])

#             data = Data(x=x, edge_index=edge_index, y=y)
#             data_list.append(data)
        
#         data, slices = self.collate(data_list)
#         torch.save((data, slices), self.processed_paths[0])
        

In [18]:
dataset = YooChooseDataset(root='../')

In [19]:
dataset = dataset.shuffle()
one_tenth_length = int(len(dataset) * 0.1)
train_dataset = dataset[:one_tenth_length * 8]
val_dataset = dataset[one_tenth_length*8:one_tenth_length * 9]
test_dataset = dataset[one_tenth_length*9:]
len(train_dataset), len(val_dataset), len(test_dataset)

(23656, 2957, 2963)

In [20]:
from torch_geometric.data import DataLoader
batch_size= 100
train_loader = DataLoader(train_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [21]:
num_items = edge.From.max() +1 
num_items 

2826

In [22]:
import torch
from torch.nn import Sequential as Seq, Linear, ReLU
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import remove_self_loops, add_self_loops
class SAGEConv(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super(SAGEConv, self).__init__(aggr='max') #  "Max" aggregation.
        self.lin = torch.nn.Linear(in_channels, out_channels)
        self.act = torch.nn.ReLU()
        self.update_lin = torch.nn.Linear(in_channels + out_channels, in_channels, bias=False)
        self.update_act = torch.nn.ReLU()
        
    def forward(self, x, edge_index):
        # x has shape [N, in_channels]
        # edge_index has shape [2, E]
        
        
        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
        
        
        return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x)

    def message(self, x_j):
        # x_j has shape [E, in_channels]

        x_j = self.lin(x_j)
        x_j = self.act(x_j)
        
        return x_j

    def update(self, aggr_out, x):
        # aggr_out has shape [N, out_channels]


        new_embedding = torch.cat([aggr_out, x], dim=1)
        
        new_embedding = self.update_lin(new_embedding)
        new_embedding = self.update_act(new_embedding)
        
        return new_embedding

In [23]:
embed_dim = 128
from torch_geometric.nn import GraphConv, TopKPooling, GatedGraphConv
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
import torch.nn.functional as F
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = SAGEConv(embed_dim, 128)
        self.pool1 = TopKPooling(128, ratio=0.8)
        self.conv2 = SAGEConv(128, 128)
        self.pool2 = TopKPooling(128, ratio=0.8)
        self.conv3 = SAGEConv(128, 128)
        self.pool3 = TopKPooling(128, ratio=0.8)
        self.item_embedding = torch.nn.Embedding(num_embeddings=edge.From.max() +1, embedding_dim=embed_dim)
        self.lin1 = torch.nn.Linear(256, 128)
        self.lin2 = torch.nn.Linear(128, 64)
        self.lin3 = torch.nn.Linear(64, 1)
        self.bn1 = torch.nn.BatchNorm1d(128)
        self.bn2 = torch.nn.BatchNorm1d(64)
        self.act1 = torch.nn.ReLU()
        self.act2 = torch.nn.ReLU()        
  
    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.item_embedding(x)
        x = x.squeeze(1)        

        x = F.relu(self.conv1(x, edge_index))

        x, edge_index, _, batch, _ = self.pool1(x, edge_index, None, batch)
        x1 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv2(x, edge_index))
     
        x, edge_index, _, batch, _ = self.pool2(x, edge_index, None, batch)
        x2 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv3(x, edge_index))

        x, edge_index, _, batch, _ = self.pool3(x, edge_index, None, batch)
        x3 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = x1 + x2 + x3

        x = self.lin1(x)
        x = self.act1(x)
        x = self.lin2(x)
        x = self.act2(x)      
        x = F.dropout(x, p=0.5, training=self.training)

        x = torch.sigmoid(self.lin3(x)).squeeze(1)

        return x

In [24]:
device = torch.device('cuda')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
crit = torch.nn.BCELoss()

RuntimeError: ignored

In [None]:
def train():
    model.train()

    loss_all = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        label = data.y.to(device)
        loss = crit(output, label)
        loss.backward()
        loss_all += data.num_graphs * loss.item()
        optimizer.step()
    return loss_all / len(train_dataset)

In [None]:
from sklearn.metrics import roc_auc_score
def evaluate(loader):
    model.eval()

    predictions = []
    labels = []

    with torch.no_grad():
        for data in loader:

            data = data.to(device)
            pred = model(data).detach().cpu().numpy()

            label = data.y.detach().cpu().numpy()
            predictions.append(pred)
            labels.append(label)

    predictions = np.hstack(predictions)
    labels = np.hstack(labels)
    
    return roc_auc_score(labels, predictions)

In [None]:
for epoch in range(1):
    loss = train()
    train_acc = evaluate(train_loader)
    val_acc = evaluate(val_loader)    
    test_acc = evaluate(test_loader)
    print('Epoch: {:03d}, Loss: {:.5f}, Train Auc: {:.5f}, Val Auc: {:.5f}, Test Auc: {:.5f}'.
          format(epoch, loss, train_acc, val_acc, test_acc))

Epoch: 000, Loss: 0.28378, Train Auc: 0.76702, Val Auc: 0.73039, Test Auc: 0.72918
