In [1]:
import torch #The torch package contains data structures for multi-dimensional tensors and mathematical operations over these are defined.
import torchvision #The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision.
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv
import pandas as pd

In [115]:
edge_dataset=pd.read_csv('large_twitch_edges.csv')
features_dataset=pd.read_csv('large_twitch_features.csv')

In [116]:
edge_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6797557 entries, 0 to 6797556
Data columns (total 2 columns):
 #   Column        Dtype
---  ------        -----
 0   numeric_id_1  int64
 1   numeric_id_2  int64
dtypes: int64(2)
memory usage: 103.7 MB


In [117]:
edge_dataset.head()

Unnamed: 0,numeric_id_1,numeric_id_2
0,98343,141493
1,98343,58736
2,98343,140703
3,98343,151401
4,98343,157118


In [118]:
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   views         168114 non-null  int64 
 1   mature        168114 non-null  int64 
 2   life_time     168114 non-null  int64 
 3   created_at    168114 non-null  object
 4   updated_at    168114 non-null  object
 5   numeric_id    168114 non-null  int64 
 6   dead_account  168114 non-null  int64 
 7   language      168114 non-null  object
 8   affiliate     168114 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 11.5+ MB


In [6]:
features_dataset.head()

Unnamed: 0,views,mature,life_time,created_at,updated_at,numeric_id,dead_account,language,affiliate
0,7879,1,969,2016-02-16,2018-10-12,0,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,1,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,2,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,3,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,4,0,EN,0


In [7]:
features_dataset = features_dataset.set_index('numeric_id')

In [8]:
features_dataset.head()

Unnamed: 0_level_0,views,mature,life_time,created_at,updated_at,dead_account,language,affiliate
numeric_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,7879,1,969,2016-02-16,2018-10-12,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,0,EN,0


In [9]:
edge_dataset.describe()

Unnamed: 0,numeric_id_1,numeric_id_2
count,6797557.0,6797557.0
mean,83828.01,84015.23
std,48205.13,48527.19
min,0.0,0.0
25%,42217.0,42045.0
50%,83546.0,83851.0
75%,125642.0,125957.0
max,168112.0,168113.0


In [10]:
merged_dataset = pd.merge(edge_dataset, features_dataset,left_on='numeric_id_1',right_on='numeric_id')

In [11]:
merged_dataset = merged_dataset.set_index('numeric_id_1')
merged_dataset.head()

Unnamed: 0_level_0,numeric_id_2,views,mature,life_time,created_at,updated_at,dead_account,language,affiliate
numeric_id_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
98343,141493,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,58736,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,140703,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,151401,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,157118,282,0,2086,2012-12-27,2018-09-13,0,EN,0


In [12]:
merged_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6797557 entries, 98343 to 27819
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   numeric_id_2  int64 
 1   views         int64 
 2   mature        int64 
 3   life_time     int64 
 4   created_at    object
 5   updated_at    object
 6   dead_account  int64 
 7   language      object
 8   affiliate     int64 
dtypes: int64(6), object(3)
memory usage: 518.6+ MB


In [13]:
merged_dataset.describe()

Unnamed: 0,numeric_id_2,views,mature,life_time,dead_account,affiliate
count,6797557.0,6797557.0,6797557.0,6797557.0,6797557.0,6797557.0
mean,84015.23,16717070.0,0.4857826,1981.736,0.002065448,0.3226697
std,48527.19,50465520.0,0.4997979,756.7212,0.04540024,0.4674976
min,0.0,0.0,0.0,34.0,0.0,0.0
25%,42045.0,20964.0,0.0,1447.0,0.0,0.0
50%,83851.0,437055.0,0.0,1976.0,0.0,0.0
75%,125957.0,6237401.0,1.0,2502.0,0.0,1.0
max,168113.0,384396600.0,1.0,4161.0,1.0,1.0


**Converting Dataset to Pytorch Geometric Data**

In [119]:
features_dataset['created_at'] = pd.to_datetime(features_dataset['created_at'])
features_dataset['updated_at'] = pd.to_datetime(features_dataset['updated_at'])

features_dataset.head()

Unnamed: 0,views,mature,life_time,created_at,updated_at,numeric_id,dead_account,language,affiliate
0,7879,1,969,2016-02-16,2018-10-12,0,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,1,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,2,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,3,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,4,0,EN,0


In [120]:
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   views         168114 non-null  int64         
 1   mature        168114 non-null  int64         
 2   life_time     168114 non-null  int64         
 3   created_at    168114 non-null  datetime64[ns]
 4   updated_at    168114 non-null  datetime64[ns]
 5   numeric_id    168114 non-null  int64         
 6   dead_account  168114 non-null  int64         
 7   language      168114 non-null  object        
 8   affiliate     168114 non-null  int64         
dtypes: datetime64[ns](2), int64(6), object(1)
memory usage: 11.5+ MB


In [121]:
features_dataset['language'].unique()

array(['EN', 'FR', 'KO', 'JA', 'RU', 'PL', 'DE', 'ES', 'IT', 'PT',
       'OTHER', 'TR', 'ZH', 'SV', 'NL', 'TH', 'CS', 'DA', 'HU', 'FI',
       'NO'], dtype=object)

In [122]:
languages = ['EN', 'FR', 'KO', 'JA', 'RU', 'PL', 'DE', 'ES', 'IT', 'PT',
       'OTHER', 'TR', 'ZH', 'SV', 'NL', 'TH', 'CS', 'DA', 'HU', 'FI',
       'NO']

from sklearn.preprocessing import LabelEncoder

def encode_df(dataframe):
    le = LabelEncoder()
    features_dataset['language'] = le.fit_transform(features_dataset['language'])
    return dataframe

#encode the dataframe
features_dataset = encode_df(features_dataset)
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   views         168114 non-null  int64         
 1   mature        168114 non-null  int64         
 2   life_time     168114 non-null  int64         
 3   created_at    168114 non-null  datetime64[ns]
 4   updated_at    168114 non-null  datetime64[ns]
 5   numeric_id    168114 non-null  int64         
 6   dead_account  168114 non-null  int64         
 7   language      168114 non-null  int32         
 8   affiliate     168114 non-null  int64         
dtypes: datetime64[ns](2), int32(1), int64(6)
memory usage: 10.9 MB


In [126]:
node_features = features_dataset[["views","mature","life_time","created_at","updated_at"]]

In [127]:
node_features['created_at'] = pd.to_datetime(node_features['created_at']).astype('int64')/ 10**9
node_features['updated_at'] = pd.to_datetime(node_features['updated_at']).astype('int64')/ 10**9
node_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  node_features['created_at'] = pd.to_datetime(node_features['created_at']).astype('int64')/ 10**9
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  node_features['updated_at'] = pd.to_datetime(node_features['updated_at']).astype('int64')/ 10**9


Unnamed: 0,views,mature,life_time,created_at,updated_at
0,7879,1,969,1455581000.0,1539302000.0
1,500,0,2699,1305763000.0,1538957000.0
2,382502,1,3149,1267229000.0,1539302000.0
3,386,0,1344,1422230000.0,1538352000.0
4,2486,0,1784,1385078000.0,1539216000.0


In [128]:
x = node_features.to_numpy()
x.shape # [num_nodes x num_features]

(168114, 5)

In [129]:
# Select node features
labels = features_dataset[["dead_account","language","affiliate"]]
labels.head()
# Convert to numpy
y = labels.to_numpy()
y.shape # [num_nodes, 1] --> node regression

(168114, 3)

In [130]:
edge_index = edge_dataset.transpose()
all_edges = edge_index.to_numpy() # [2, num_edges]
print(all_edges.shape)

(2, 6797557)


In [131]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=edge_index, y=y)

In [132]:
data.num_classes =3

In [133]:
data

Data(x=[168114, 5], edge_index=              0        1        2        3        4        5        6        \
numeric_id_1    98343    98343    98343    98343    98343    98343    98343   
numeric_id_2   141493    58736   140703   151401   157118   125430     3635   

              7        8        9        ...  6797547  6797548  6797549  \
numeric_id_1    98343    98343    98343  ...   115807    82648     3793   
numeric_id_2      495   116648     1679  ...    64396   157597    14483   

              6797550  6797551  6797552  6797553  6797554  6797555  6797556  
numeric_id_1    91486    91181    97507    71175   151702   118034    27819  
numeric_id_2   140484   137240    29359    12020   128281    38021   153993  

[2 rows x 6797557 columns], y=[168114, 3], num_classes=3)