# Data processing

In [1]:
import pandas as pd 
import glob 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import os 
import sklearn
import scipy.sparse
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
os.getcwd()

In [2]:
path = "/Users/julesbaudet/Documents/0. Cours/ENS/Deep Learning DIY/Projet final/ens_data"
# path = "/Users/linusbleistein/Documents/Cours ENS/Cours mathématiques/Deep learning 2020-2021/data_project"

In [3]:
df = pd.read_csv(os.path.join(path,"clean_brave_data.csv"))
df.head()

In [4]:
df = df.drop(columns = 'Unnamed: 0')

## Transaction History Embedding

In [12]:
# import the known exchange list 
exch_labels = pd.read_csv(os.path.join(path, "exchanges_encoded.csv"), 
                          delimiter=';', names  = ['address','label']).set_index('address')

labels = exch_labels.to_dict()['label']

labels2int = dict(zip(labels.values(),[i for i in range(len(labels))])) # keys = addresses, values = name of the exchange
address2int = {k:labels2int[labels[k]] for k in labels.keys()}

known_addresses = list(labels.keys())

In [13]:
# drop transactions of addresses who only made 1 transaction 
a = (df.groupby('from_address').count() + df.groupby('to_address').count())['value']
idx = a[a<3][~a[a<3].index.isin(known_addresses)].index
df = df.drop(df.loc[df['from_address'].isin(idx)|df['from_address'].isin(idx)].index).reset_index()

In [None]:
# sum([len(v)!=42 for v in df['from_address'].values]) # check that all addresses have same format

In [None]:
# df['transaction_id'] = df['from_address']+df['to_address']

In [None]:
# df['transaction_id'].nunique()

In [None]:
# df = df.sort_values('unix_block_timestamp')

In [None]:
# vocab_size = df['transaction_id'].nunique()
# word2vec = Word2Vec(vocab_size=vocab_size, embedding_size=300)
# sgns = SGNS(embedding=word2vec, vocab_size=vocab_size, n_negs=20)
# optim = Adam(sgns.parameters())
# for batch, (iword, owords) in enumerate(dataloader):
#     loss = sgns(iword, owords)
#     optim.zero_grad()
#     loss.backward()
#     optim.step()

In [20]:
# first we create an one-hot encoding of all transactions ranked by chronological order
hot_encoding =  csr_matrix((np.ones(df.shape[0]),(df.index.values, df.index.values)), shape=(df.shape[0], df.shape[0]))

In [30]:
a = '0x88e2efac3d2ef957fcd82ec201a506871ad06204'

In [102]:
#np.unique(np.hstack((df.from_address.values, df['to_address'].values))).shape

In [49]:
# now we create a dictionnary address: [transaction nbrs] where the transaction nbrs involve the address
all_addresses  = np.hstack((df.from_address.unique(),
                            df.loc[~df['to_address'].isin(df['from_address'])].to_address.unique()))

In [146]:
to_add = df.loc[~df['to_address'].isin(df['from_address'])].copy()
to_add = to_add.rename(columns={'to_address':'from_address','from_address':'to_address'})
to_add['value'] = 0
df2 = pd.concat((df,to_add))

categories = CategoricalDtype(sorted(df2.from_address.unique()), ordered=True) # will have all nodes! use it for all 

In [147]:
from_c = df.from_address.astype(categories).cat.codes # .unique().shape
to_c = df.loc[df.from_address != df.to_address].to_address.astype(categories).cat.codes

vocab = pd.DataFrame(np.hstack((from_c,to_c)), columns=['address'])
vocab['transactions'] = np.hstack((from_c.index, to_c.index))

In [150]:
%%time
vocab = vocab.groupby('address')['transactions'].apply(list).reset_index(name='transactions')

CPU times: user 17.6 s, sys: 182 ms, total: 17.8 s
Wall time: 18 s


In [151]:
# ARE THEY IN ORDER? CHECK! 
vocab.head()

Unnamed: 0,address,transactions
0,0,"[[30449, 34453, 956587, 1348259, 1713041, 1926..."
1,1,"[[485438, 1486175]]"
2,2,[[457529]]
3,3,"[[1063476, 1234990]]"
4,4,"[[513642, 513654]]"


## Transaction stats

## Creating a data  embedding (OLD)

To be able to learn a classification of nodes on our graph, we need to transform the data on nodes to be able to express some of the information contained in graph form.

In [None]:
# matrix of adjacency with line data relative to sent transactions only 
# %%time
# from_c = CategoricalDtype(sorted(df.from_address.unique()), ordered=True)
# to_c = CategoricalDtype(sorted(df.to_address.unique()), ordered=True)
# 
# row = df.from_address.astype(from_c).cat.codes
# col = df.to_address.astype(to_c).cat.codes
# sparse = csr_matrix((df["value"], (row, col)), \
#                            shape=(from_c.categories.size, to_c.categories.size))
# print(sparse.shape)

In [68]:
# matrix of adjacency with line data relative to sent and received transactions 
%%time
# first we add addresses that only received but not sent to first column 
to_add = df.loc[~df['to_address'].isin(df['from_address'])].copy()
to_add = to_add.rename(columns={'to_address':'from_address','from_address':'to_address'})
to_add['value'] = 0

df = pd.concat((df,to_add))

categories = CategoricalDtype(sorted(df.from_address.unique()), ordered=True) # will have all nodes! use it for all 
row = df.to_address.astype(categories).cat.codes
col = df.from_address.astype(categories).cat.codes

# sparse matrix for values sent 
sp = csr_matrix((df['value'],(row, col)), shape=(categories.categories.size, categories.categories.size)
                
# sparse matrix for values received 
sp2 = csr_matrix((-df['value'],(col, row)), shape=(categories.categories.size, categories.categories.size))
                
# our data 
X = scipy.sparse.hstack((sp,sp2))

SyntaxError: invalid syntax (<ipython-input-68-4069e6302457>, line 18)

In [None]:
X.shape

##  Dimension Reduction   

Our data has particularly high dimension. We use truncated SVD to reduce our data set. 

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
%%time
tsvd = TruncatedSVD(n_components=65, random_state=42)
out = tsvd.fit_transform(X)
print(f'our data has shape {out.shape}')

In [None]:
tsvd_var_ratios = tsvd.explained_variance_ratio_
print(tsvd_var_ratios.sum())

In [None]:
# taken on https://chrisalbon.com/machine_learning/feature_engineering/select_best_number_of_components_in_tsvd/
def select_n_components(var_ratio, goal_var: float) -> int:
    total_variance = 0.0
    n_components = 0
    for explained_variance in var_ratio:
        total_variance += explained_variance
        n_components += 1
        if total_variance >= goal_var:
            break
    return n_components

var_goal =  0.99
print(f'to keep {var_goal*100}% of variance, one should keep {select_n_components(tsvd_var_ratios,var_goal)} components')

##  Adding the labels

In [None]:
labels = np.vectorize(address2int.get)(np.array(categories.categories))
labels = np.nan_to_num(labels.astype(float))

In [None]:
labels.shape

In [None]:
data = pd.DataFrame(out)
# data['label'] = np.isin(np.array(sorted(df.from_address.unique())),known_addresses).astype(int)
data['label'] = labels

In [None]:
data.to_csv(os.path.join(path,"processed_data.csv"))

In [None]:
# # Debugging example
# a = pd.DataFrame({"col": [15, 32, 3, 8], "col2":[26,3,17,20], "value":[1,1,1,1]})
# 
# # add this to the sparse matrix 
# add = a.loc[~a['col2'].isin(a['col'])].copy()
# add = add.rename(columns={'col':'col2','col2':'col'})
# #add['value'] = 0 # -to_add['value']
# 
# a = pd.concat((a,add))
# 
# categories = CategoricalDtype(sorted(a.col.unique()), ordered=True) # will have all nodes! use it for all 
# row = a.col.astype(categories).cat.codes
# col = a.col2.astype(categories).cat.codes
# sp = csr_matrix((a['value'],(row, col)), shape=(categories.categories.size, categories.categories.size))

## Handling time

In [5]:
# from datetime import datetime

In [6]:
# df['day'] = df['unix_block_timestamp']//(60*60*24)  # euclidean division to get the day number 

In [7]:
# # tests
# df['unix_block_timestamp'].max()//(60*60*24)
# datetime.utcfromtimestamp(df['unix_block_timestamp'].max())
# datetime.utcfromtimestamp(df['unix_block_timestamp'].min())

In [8]:
# df.groupby(by='day').count().plot(y='value')

In [9]:
#  plt.subplots(figsize = (20,7))
#  test = df.groupby(['day']).nunique()['from_address']
#  test.hist(bins = 100)

In [10]:
# test.mean()

In [11]:
# plt.subplots(figsize = (20,7))
# df.groupby(by='day').count()['value'].hist(bins = 100)