In [1]:
import sys
sys.path.append("..")

import networkx as nx
import pandas as pd
import torch

from src.features import embed_graph
from src.models import train

%load_ext autoreload
%autoreload 2

seed = 12345

### Load dataset

In [3]:
edge_data = pd.read_csv("../data/raw/large_twitch_edges.csv")
edge_data.head()

Unnamed: 0,numeric_id_1,numeric_id_2
0,98343,141493
1,98343,58736
2,98343,140703
3,98343,151401
4,98343,157118


In [None]:
feat_data = pd.read_csv("../data/raw/large_twitch_features.csv").drop(columns=["created_at", "updated_at"]).set_index("numeric_id")
feat_data.head()

Unnamed: 0_level_0,views,mature,life_time,dead_account,language,affiliate
numeric_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,7879,1,969,0,EN,1
1,500,0,2699,0,EN,0
2,382502,1,3149,0,EN,1
3,386,0,1344,0,EN,0
4,2486,0,1784,0,EN,0


In [1]:
from torch_geometric.data import InMemoryDataset

In [None]:
class YooChooseBinaryDataset(InMemoryDataset):
    def __init__(self, root, , transform=None, pre_transform=None):
        super(YooChooseBinaryDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return []
    @property
    def processed_file_names(self):
        return ['../dataset/twitch.dataset']

    def download(self):
        pass
    
    def process(self):
        
        data_list = []

        # process by session_id
        grouped = df.groupby('session_id')
        for session_id, group in tqdm(grouped):
            sess_item_id = LabelEncoder().fit_transform(group.item_id)
            group = group.reset_index(drop=True)
            group['sess_item_id'] = sess_item_id
            node_features = group.loc[group.session_id==session_id,['sess_item_id','item_id']].sort_values('sess_item_id').item_id.drop_duplicates().values

            node_features = torch.LongTensor(node_features).unsqueeze(1)
            target_nodes = group.sess_item_id.values[1:]
            source_nodes = group.sess_item_id.values[:-1]

            edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
            x = node_features

            y = torch.FloatTensor([group.label.values[0]])

            data = Data(x=x, edge_index=edge_index, y=y)
            data_list.append(data)
        
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

### Encode data & construct graph

In [4]:
other_feats = ["views", "mature", "life_time", "dead_account", "affiliate"]
onehot_feats = []
label = "language"

In [5]:
feature_matrix, label_vector = embed_graph.encode_data(feat_data=feat_data, onehot_feats=onehot_feats, other_feats=other_feats, label=label)
data = embed_graph.construct_graph(edge_data=edge_data, feature_matrix=feature_matrix, label_vector=label_vector)

print(data)
print("validate data:", data.validate(raise_on_error=True))

Data(x=[168114, 5], edge_index=[2, 13595114], y=[168114])
validate data: True


In [15]:
dataset = embed_graph.TwitchDataset(data_list=[data])
len(dataset)

1

In [6]:
data = train.split_graph_data(data)

Data(x=[168114, 5], edge_index=[2, 13595114], y=[168114], train_mask=[168114], val_mask=[168114], test_mask=[168114]) 

training samples 117680
validation samples 33623
test samples 16811


In [7]:
data

Data(x=[168114, 5], edge_index=[2, 13595114], y=[168114], train_mask=[168114], val_mask=[168114], test_mask=[168114])

### Train

In [8]:
#Setup GPU settings
print("GPU is availiable:", torch.cuda.is_available())
print("number of GPU:", torch.cuda.device_count())

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Can run on GPU") 
else:
    device = torch.device("cpu")
    print("can only run on CPU")
    
print("Run on:", device)

GPU is availiable: True
number of GPU: 1
Can run on GPU
Run on: cuda:0


In [9]:
from src.models import baseline

In [12]:
net = baseline.baseline_net(num_feat=data.num_node_features, f=200)
train.train(net, data, max_epochs=20, lr=0.001, device=device)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_mm)