In [23]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch_geometric 
from torch_geometric.data import Dataset, Data, DataLoader
from torch_geometric.utils import to_dense_batch

from sklearn.preprocessing import LabelEncoder


In [8]:
df = pd.read_csv('./InnoTech-Data/Trainset-Evaluation/Train_Set.csv')
ts = pd.read_csv('./InnoTech-Data/Transaction/InnoTech_Trans.csv')
# For prototype phase we remove the nan values
ts=ts.dropna()
ts = ts.reset_index().drop(columns='index')

# TEMP LINE CODE
df = df.drop_duplicates(subset=['CARD'])

# Data Engineering

## Label encoder for IDs


In [9]:
le = LabelEncoder()  
le.fit(np.unique(np.concatenate([ts['Primary_ID'].values,ts['Second_ID'].values,df['CARD'].values])).tolist())

ts['Primary_ID'] = le.transform(ts['Primary_ID'])
ts['Second_ID'] = le.transform(ts['Second_ID'])
df['CARD'] = le.transform(df['CARD'])

## Graph Creation

We need to convert the data into graph. this function will use in future for inference too.

In [11]:
import torch
from torch_geometric.data import Data

def construct_graph(card_id,y):
    """
    This function will create a graph of transaction for each card base on the transaction dataset. The created graph
    is directed which means each CARD_ID can be source or destination of the transaction.

    params:
        card_id 
        ts: transaction dataset

    return:

        graph 
    """

    card_to_index = ts[ts['Primary_ID']==card_id].index
    card_in_index = ts[ts['Second_ID']==card_id].index

    edge_index_to = ts.loc[card_to_index,['Primary_ID','Second_ID']].values
    edge_index_in = ts.loc[card_in_index,['Primary_ID','Second_ID']].values

    edge_indexs = np.concatenate([edge_index_in,edge_index_to])

    nodes = np.concatenate([np.unique((ts.loc[card_to_index,['Second_ID']])),
                            np.unique((ts.loc[card_in_index,['Primary_ID']]))])
    
    nodes = torch.tensor(nodes)
    edge_indexs = torch.tensor(edge_indexs)
    
    data = Data(x=nodes, edge_index=edge_indexs,y=y)

    return data

In [17]:
data = []
for (i,y) in zip(df['CARD'],df['LABEL']):
    data.append(construct_graph(i,y))



In [16]:
# Save the prepared data
torch.save(data,'graph_data.pt')

# Data Loader

In [188]:
import os
import torch
from torch_geometric.data import Dataset, Data
from torch_geometric.utils import to_dense_batch

class CustomGraphDataset(Dataset):
    def __init__(self, root, transform=None):
        self.root = root
        self.transform = transform
        self.graph_files =root
        self.data = torch.load(self.graph_files)
        self.max_num_nodes = self.find_max_num_nodes()
    
    def find_max_num_nodes(self):
        max_num_nodes = max(data.num_nodes for data in self.data)
        return max_num_nodes
    
    def indices(self):
        return list(range(self.len()))

    def len(self):
        return len(self.data)

    def get(self, idx):
        
        num_nodes_to_pad = self.max_num_nodes - self.data[idx].num_nodes
        num_edge_to_pad = self.max_num_nodes - self.data[idx].edge_index.shape[0]
        
        # Pad num_nodes with zeros
        self.data[idx].x = torch.cat([self.data[idx].x, torch.zeros(num_nodes_to_pad)])
        
        # Pad edge_index with zeros
        self.data[idx].edge_index = torch.cat([self.data[idx].edge_index, torch.zeros(num_edge_to_pad,2)])
        
        # Update the number of nodes in the data object
        self.data[idx].num_nodes = self.max_num_nodes

        return self.data[idx]

In [189]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import to_dense_batch

def collate_fn(batch):
    batch = to_dense_batch(batch, fill_value=0)
    return Data(x=batch.x, edge_index=batch.edge_index, y=batch.y)

In [190]:
root = './graph_data.pt'
train_dataset = CustomGraphDataset(root)
train_dataset.max_num_nodes

24650

In [191]:
len(train_dataset)

3235

In [192]:
for i in train_dataset:
    print(i)

Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes=24650)
Data(x=[24650], edge_index=[24650, 2], y=1.0, num_nodes

RuntimeError: Trying to create tensor with negative dimension -2960: [-2960, 2]

In [193]:
loader = DataLoader(train_dataset, batch_size=4)




In [195]:
for i in loader:
    print(i)

DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600

DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600], ptr=[5])
DataBatch(x=[98600], edge_index=[24650, 8], y=[4], num_nodes=98600, batch=[98600

RuntimeError: Trying to create tensor with negative dimension -2960: [-2960, 2]