# HIVE DATASET EXPLORATION

## 0. Set up environment

In [7]:
import os
import shutil
import json
import pandas as pd
import numpy as np

## 1. Data Playground

#### Load data

In [13]:
# Name of the dataset release folder that this notebook reads from.
DATA_VERSION = 'final_v1'
# Paths are resolved against the directory the kernel is running in.
CWD = os.getcwd()
DATA_DIR = os.path.join(os.path.join(CWD, 'dataset', 'hive'), DATA_VERSION)

In [18]:
# Read the labelled node and edge tables (CSV files) found under DATA_DIR.
node_df, edge_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('nodes_labelled.csv', 'edges_labelled.csv')
)

#### Data exploration

In [24]:
# Preview the first five rows of the node table.
node_df.head(n=5)

Unnamed: 0,ID,Type,Value,HP,Total vote,No. Follower,No. Following,Created,Post Count,Reputation,Abnormally
0,245921,User,davedickeyyall,15418.963626,,2864.0,848.0,2017-07-08 11:32:27,28993.0,77.123604,False
1,1412702,User,rohansuares,7.598047,,31.0,27.0,2020-09-21 12:30:57,40.0,54.853018,False
2,113695968,Comment,wyosfwue,,317.0,,,2022-06-01 11:53:06,,,False
3,103635,User,tangmo,6115.797315,,1898.0,985.0,2016-10-08 00:34:09,29567.0,72.521919,True
4,1305145,User,idig,762.926607,,59.0,50.0,2019-08-08 12:49:51,651.0,57.602628,False


In [42]:
# Distinct values present in the node label column.
np.unique(node_df['Abnormally'].to_numpy())

array([False,  True])

In [23]:
# Preview the first five rows of the edge table.
edge_df.head(n=5)

Unnamed: 0,Source,Target,Type,Timestamp,Weight,Abnormally
0,219,113693978,Upvote,2022-06-01 10:47:33,5000,False
1,219,113695893,Upvote,2022-06-01 12:26:45,5000,False
2,219,113697078,Upvote,2022-06-01 13:29:15,4600,False
3,219,113701528,Upvote,2022-06-01 16:44:54,5000,False
4,219,113701891,Upvote,2022-06-01 17:11:24,5000,False


In [25]:
# Print the dtype and non-null count of every column in the node table.
node_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157296 entries, 0 to 157295
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ID             157296 non-null  int64  
 1   Type           157296 non-null  object 
 2   Value          157296 non-null  object 
 3   HP             18645 non-null   float64
 4   Total vote     138651 non-null  float64
 5   No. Follower   18645 non-null   float64
 6   No. Following  18645 non-null   float64
 7   Created        157296 non-null  object 
 8   Post Count     18645 non-null   float64
 9   Reputation     18645 non-null   float64
 10  Abnormally     157296 non-null  bool   
dtypes: bool(1), float64(6), int64(1), object(3)
memory usage: 12.2+ MB


In [28]:
# Print the dtype and non-null count of every column in the edge table.
edge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157925 entries, 0 to 1157924
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Source      1157925 non-null  int64 
 1   Target      1157925 non-null  int64 
 2   Type        1157925 non-null  object
 3   Timestamp   1157925 non-null  object
 4   Weight      1157925 non-null  int64 
 5   Abnormally  1157925 non-null  bool  
dtypes: bool(1), int64(3), object(2)
memory usage: 45.3+ MB


In [29]:
# Sorted distinct edge types.
np.unique(edge_df['Type'].to_numpy())

array(['Belong_to', 'Downvote', 'Upvote', 'Write'], dtype=object)

In [30]:
# Sorted distinct node types.
np.unique(node_df['Type'].to_numpy())

array(['Comment', 'Post', 'User'], dtype=object)

In [31]:
# Sorted distinct edge weights.
np.unique(edge_df['Weight'].to_numpy())

array([    0,     1,     2, ..., 11322, 19372, 21966], dtype=int64)

In [38]:
# Keep only the rows whose Type is 'User', then summarise their columns.
user_node = node_df[node_df['Type'].eq('User')]
user_node.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18645 entries, 0 to 157054
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             18645 non-null  int64  
 1   Type           18645 non-null  object 
 2   Value          18645 non-null  object 
 3   HP             18645 non-null  float64
 4   Total vote     0 non-null      float64
 5   No. Follower   18645 non-null  float64
 6   No. Following  18645 non-null  float64
 7   Created        18645 non-null  object 
 8   Post Count     18645 non-null  float64
 9   Reputation     18645 non-null  float64
 10  Abnormally     18645 non-null  bool   
dtypes: bool(1), float64(6), int64(1), object(3)
memory usage: 1.6+ MB


In [39]:
# Keep only the rows whose Type is 'Post', then summarise their columns.
post_node = node_df[node_df['Type'].eq('Post')]
post_node.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13540 entries, 28 to 157229
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             13540 non-null  int64  
 1   Type           13540 non-null  object 
 2   Value          13540 non-null  object 
 3   HP             0 non-null      float64
 4   Total vote     13540 non-null  float64
 5   No. Follower   0 non-null      float64
 6   No. Following  0 non-null      float64
 7   Created        13540 non-null  object 
 8   Post Count     0 non-null      float64
 9   Reputation     0 non-null      float64
 10  Abnormally     13540 non-null  bool   
dtypes: bool(1), float64(6), int64(1), object(3)
memory usage: 1.1+ MB


In [40]:
# Keep only the rows whose Type is 'Comment', then summarise their columns.
cmt_node = node_df[node_df['Type'].eq('Comment')]
cmt_node.info()

<class 'pandas.core.frame.DataFrame'>
Index: 125111 entries, 2 to 157295
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ID             125111 non-null  int64  
 1   Type           125111 non-null  object 
 2   Value          125111 non-null  object 
 3   HP             0 non-null       float64
 4   Total vote     125111 non-null  float64
 5   No. Follower   0 non-null       float64
 6   No. Following  0 non-null       float64
 7   Created        125111 non-null  object 
 8   Post Count     0 non-null       float64
 9   Reputation     0 non-null       float64
 10  Abnormally     125111 non-null  bool   
dtypes: bool(1), float64(6), int64(1), object(3)
memory usage: 10.6+ MB


In [52]:
# 'Belong_to' edges whose Target is the ID of a comment node.
# Both conditions are combined into a single boolean mask over edge_df, so the
# selection does not depend on a mask built from the full frame being
# re-aligned to an already-filtered frame (as a chained .loc[...].loc[...] does).
edge_df.loc[
    (edge_df['Type'] == 'Belong_to')
    & edge_df['Target'].isin(cmt_node['ID'])
]

Unnamed: 0,Source,Target,Type,Timestamp,Weight,Abnormally
1079176,14130740,14128206,Belong_to,2022-06-07 19:55:03,1,False
1079177,14133737,14132553,Belong_to,2022-06-07 19:54:06,1,False
1079178,14197893,14147125,Belong_to,2022-06-07 19:53:21,1,False
1079179,107931797,107901340,Belong_to,2022-06-13 14:26:39,214,False
1079180,107934402,107901340,Belong_to,2022-06-09 17:22:42,212,False
...,...,...,...,...,...,...
1157920,114421729,114267130,Belong_to,2022-06-29 11:48:39,93,False
1157921,114421730,114421701,Belong_to,2022-06-29 11:48:54,53,False
1157922,114421731,114356799,Belong_to,2022-06-29 11:49:00,43,False
1157923,114421733,114410742,Belong_to,2022-06-29 11:49:12,51,False


In [58]:
# Shape of the edge weights once reshaped into a single-column 2-D array.
edge_df['Weight'].to_numpy().reshape(-1, 1).shape

(1157925, 1)