In [1]:
import pandas as pd
import numpy as np

In [2]:
# Render floats with 3 decimal places in displayed frames.
# The fully qualified key is used on purpose: newer pandas releases also define
# 'styler.format.precision', so the short pattern 'precision' matches more than
# one option and raises OptionError.
pd.set_option('display.precision', 3)

In [3]:
# Load the space-separated edge list. The file is read without a header row;
# the three columns are named here.
df = pd.read_csv(
    'trimmed_dataset/tx.edgelist',
    sep=" ",
    names=['prev_txID', 'txID', 'value'],
)

In [4]:
# Peek at the first rows to confirm the three columns were parsed as expected.
df.head()

Unnamed: 0,prev_txID,txID,value
0,9,171,50.0
1,171,183,40.0
2,183,185,30.0
3,185,187,29.0
4,187,192,1.0


In [5]:
# Total number of rows (one per edge-list line) that were loaded.
len(df)

65715613

In [7]:
# Check the dtypes pandas inferred for the ID and value columns.
df.dtypes

prev_txID      int64
txID           int64
value        float64
dtype: object

In [6]:
# Memory footprint in bytes of the index and of each column.
df.memory_usage()

Index               80
prev_txID    525724904
txID         525724904
value        525724904
dtype: int64

In [8]:
# Per-source statistics of `value`, keyed by `prev_txID`:
# count, mean, median and population standard deviation (ddof=0).
#
# Built-in aggregations (string names and GroupBy.std) are used instead of
# numpy callables and a Python lambda: they run in pandas' vectorised group-by
# path rather than calling Python once per group, and they avoid the
# deprecation of passing np.mean / np.median to `agg`.
# ddof=0 is passed explicitly because GroupBy.std defaults to the sample
# standard deviation (ddof=1), while the previous np.std-based lambda produced
# the population value (0.0 for single-edge groups).
sent_values = df.groupby('prev_txID')['value']
out_df = (
    sent_values
    .agg(['count', 'mean', 'median'])
    .assign(stdev=sent_values.std(ddof=0))  # aligned on the group key index
    .reset_index()
)

In [9]:
# Per-destination statistics of `value`, keyed by `txID`:
# count, mean, median and population standard deviation (ddof=0).
#
# Built-in aggregations (string names and GroupBy.std) are used instead of
# numpy callables and a Python lambda: they run in pandas' vectorised group-by
# path rather than calling Python once per group, and they avoid the
# deprecation of passing np.mean / np.median to `agg`.
# ddof=0 is passed explicitly because GroupBy.std defaults to the sample
# standard deviation (ddof=1), while the previous np.std-based lambda produced
# the population value (0.0 for single-edge groups).
recv_values = df.groupby('txID')['value']
in_df = (
    recv_values
    .agg(['count', 'mean', 'median'])
    .assign(stdev=recv_values.std(ddof=0))  # aligned on the group key index
    .reset_index()
)

In [10]:
# Replace the aggregated column labels with flat, descriptive feature names.
# Both frames use 'txID' as the key column name so they can be joined on it.
# The assignment is positional: key, count, mean, median, standard deviation.
out_df.columns = [
    'txID',
    'out_degree',
    'avg_sent',
    'median_sent',
    'stdev_sent',
]
in_df.columns = [
    'txID',
    'in_degree',
    'avg_recv',
    'median_recv',
    'stdev_recv',
]

In [11]:
# Number of distinct keys on the sending side and on the receiving side.
len(out_df), len(in_df)

(29854993, 29771540)

In [12]:
# Spot-check the sent-side statistics.
out_df.head()

Unnamed: 0,txID,out_degree,avg_sent,median_sent,stdev_sent
0,9,1,50.0,50.0,0.0
1,78,1,50.0,50.0,0.0
2,171,2,25.0,25.0,15.0
3,183,1,30.0,30.0,0.0
4,185,2,15.0,15.0,14.0


In [13]:
# Spot-check the received-side statistics.
in_df.head()

Unnamed: 0,txID,in_degree,avg_recv,median_recv,stdev_recv
0,171,1,50.0,50.0,0.0
1,183,1,40.0,40.0,0.0
2,185,1,30.0,30.0,0.0
3,187,1,29.0,29.0,0.0
4,192,1,1.0,1.0,0.0


In [16]:
# Outer-join the received-side and sent-side statistics on 'txID' so IDs that
# appear on only one side are kept; the statistics missing for the other side
# are filled with 0.
new_df = in_df.merge(out_df, on='txID', how='outer').fillna(0)

In [17]:
# Number of distinct IDs after the outer join.
len(new_df)

30010551

In [30]:
# Inspect the column dtypes of the merged feature table.
new_df.dtypes

txID             int64
in_degree        int64
out_degree       int64
avg_recv       float64
avg_sent       float64
median_recv    float64
median_sent    float64
stdev_recv     float64
stdev_sent     float64
dtype: object

In [22]:
# Degrees are counts: store them as integers. The outer merge leaves gaps for
# IDs missing on one side, which can turn these columns into floats even after
# the gaps are filled with 0.
for degree_col in ('in_degree', 'out_degree'):
    new_df[degree_col] = new_df[degree_col].astype('int')

In [26]:
# List the current column order of the merged table.
new_df.columns

Index(['txID', 'in_degree', 'avg_recv', 'median_recv', 'stdev_recv',
       'out_degree', 'avg_sent', 'median_sent', 'stdev_sent'],
      dtype='object')

In [28]:
# Reorder the columns so each received statistic sits next to its sent
# counterpart.
new_df = new_df[[
    'txID',
    'in_degree', 'out_degree',
    'avg_recv', 'avg_sent',
    'median_recv', 'median_sent',
    'stdev_recv', 'stdev_sent',
]]

In [36]:
# Order rows by 'txID'. The sorted frame is rebound rather than sorted in place:
# `inplace=True` gives no memory or speed benefit, and mutating a frame that was
# produced by a column selection can trigger SettingWithCopyWarning.
# The original row labels are kept (no reset_index).
new_df = new_df.sort_values(by=['txID'])

In [37]:
# Spot-check the final feature table after sorting.
new_df.head()

Unnamed: 0,txID,in_degree,out_degree,avg_recv,avg_sent,median_recv,median_sent,stdev_recv,stdev_sent
29771540,9,0,1,0.0,50.0,0.0,50.0,0.0,0.0
29771541,78,0,1,0.0,50.0,0.0,50.0,0.0,0.0
0,171,1,2,50.0,25.0,50.0,25.0,0.0,15.0
1,183,1,1,40.0,30.0,40.0,30.0,0.0,0.0
2,185,1,2,30.0,15.0,30.0,15.0,0.0,14.0


In [38]:
# Write the feature table to CSV with a header row and without the row index.
# NOTE(review): '%g' formats floats with at most 6 significant digits, so
# values needing more precision are rounded in the file. Confirm this is
# acceptable for downstream consumers.
new_df.to_csv(
    'trimmed_dataset/tx_features.csv',
    index=False,
    header=True,
    float_format='%g',
)