# Data processing

In [None]:
import pandas as pd 
import glob 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import os 
import sklearn
import scipy.sparse
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
# Display the current working directory of the kernel.
os.getcwd()

In [None]:
# Directory holding the project's CSV files.
# Set the ENS_DATA_DIR environment variable to point at your own copy of the
# data; when it is not set, the original author's local folder is used.
path = os.environ.get(
    "ENS_DATA_DIR",
    "/Users/julesbaudet/Documents/0. Cours/ENS/Deep Learning DIY/Projet final/ens_data",
)
# Alternative location used by the second author:
# path = "/Users/linusbleistein/Documents/Cours ENS/Cours mathématiques/Deep learning 2020-2021/data_project"

In [None]:
# Load clean_brave_data.csv from the data directory and preview the first rows.
# The cells below rely on its from_address, to_address and value columns.
df = pd.read_csv(os.path.join(path,"clean_brave_data.csv"))
df.head()

## Creating a data  embedding

To learn a classification of the nodes of our graph, we first need to turn the transaction data into per-node features that capture some of the information contained in the graph structure.

In [None]:
# Adjacency matrix built from sent transactions only (kept for reference; the next cell builds the version covering both sent and received transactions)
# %%time
# from_c = CategoricalDtype(sorted(df.from_address.unique()), ordered=True)
# to_c = CategoricalDtype(sorted(df.to_address.unique()), ordered=True)
# 
# row = df.from_address.astype(from_c).cat.codes
# col = df.to_address.astype(to_c).cat.codes
# sparse = csr_matrix((df["value"], (row, col)), \
#                            shape=(from_c.categories.size, to_c.categories.size))
# print(sparse.shape)

In [None]:
# matrix of adjacency with line data relative to sent and received transactions 
%%time
# first we add addresses that only received but not sent to first column 
to_add = df.loc[~df['to_address'].isin(df['from_address'])].copy()
to_add = to_add.rename(columns={'to_address':'from_address','from_address':'to_address'})
to_add['value'] = 0

df = pd.concat((df,to_add))

categories = CategoricalDtype(sorted(df.from_address.unique()), ordered=True) # will have all nodes! use it for all 
row = df.to_address.astype(categories).cat.codes
col = df.from_address.astype(categories).cat.codes

# sparse matrix for values sent 
sp = csr_matrix((df['value'],(row, col)), shape=(categories.categories.size, categories.categories.size)
                
# sparse matrix for values received 
sp2 = csr_matrix((-df['value'],(col, row)), shape=(categories.categories.size, categories.categories.size))
                
# our data 
X = scipy.sparse.hstack((sp,sp2))

In [None]:
# Display the dimensions of the stacked feature matrix X.
X.shape

##  Dimension Reduction   

Our data is very high-dimensional, so we use truncated SVD to reduce its dimensionality.

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
%%time
# Project X onto 65 components with truncated SVD.
# random_state fixes the seed handed to the solver.
tsvd = TruncatedSVD(n_components=65, random_state=42)
out = tsvd.fit_transform(X)
print(f'our data has shape {out.shape}')

In [None]:
# Per-component explained-variance ratios of the fitted SVD, and their total.
tsvd_var_ratios = tsvd.explained_variance_ratio_
print(tsvd_var_ratios.sum())

In [None]:
# taken on https://chrisalbon.com/machine_learning/feature_engineering/select_best_number_of_components_in_tsvd/
def select_n_components(var_ratio, goal_var: float) -> int:
    """Count the leading components needed to reach a cumulative variance goal.

    Args:
        var_ratio: iterable of explained-variance ratios, in component order.
        goal_var: cumulative explained variance to reach.

    Returns:
        The number of leading entries of ``var_ratio`` whose running sum first
        reaches ``goal_var``. If the goal is never reached, the total number
        of entries is returned (0 for an empty input).
    """
    cumulative = 0.0
    used = 0
    for used, ratio in enumerate(var_ratio, start=1):
        cumulative += ratio
        if cumulative >= goal_var:
            return used
    # Goal not reached: every component was consumed.
    return used

# Target share of variance to retain.
var_goal = 0.99

n_needed = select_n_components(tsvd_var_ratios, var_goal)
explained = float(np.cumsum(tsvd_var_ratios)[-1])

# select_n_components returns the total number of components when the goal is
# never reached, so check the cumulative variance before reporting a count.
if explained >= var_goal:
    print(f'to keep {var_goal*100}% of variance, one should keep {n_needed} components')
else:
    print(f'the {len(tsvd_var_ratios)} computed components only explain {explained*100:.2f}% of variance; '
          f'more than {len(tsvd_var_ratios)} components are needed to keep {var_goal*100}%')

##  Adding the labels

In [None]:
# Import the list of known exchange addresses. The file is semicolon-separated
# and the column names are supplied here.
exch_labels = pd.read_csv(os.path.join(path, "exchanges_encoded.csv"),
                          delimiter=';', names=['address', 'label']).set_index('address')

# keys = addresses, values = name of the exchange
labels = exch_labels.to_dict()['label']

# keys = name of the exchange, values = integer class id.
# Ids are contiguous and start at 1, in order of first appearance; 0 is left
# free because it is the value given to unlabeled nodes further down.
labels2int = {name: idx for idx, name in enumerate(dict.fromkeys(labels.values()), start=1)}

# keys = addresses, values = integer class id of the address's exchange
address2int = {address: labels2int[name] for address, name in labels.items()}

known_addresses = list(labels.keys())

In [None]:
# Integer class id of every node, in the order of `categories`; addresses that
# are absent from `address2int` get 0.
# A plain lookup with a default is used instead of np.vectorize(address2int.get):
# np.vectorize infers its output dtype from the first result, so it raises when
# the first address is known (integer result) and a later one is not (None).
labels = np.array([address2int.get(address, 0) for address in categories.categories], dtype=float)

In [None]:
# Display the shape of the label array.
labels.shape

In [None]:
# Output table: the reduced features, one column per component, followed by a
# 'label' column holding the class id of each node.
# Earlier binary variant (known exchange address or not), kept for reference:
# data['label'] = np.isin(np.array(sorted(df.from_address.unique())),known_addresses).astype(int)
data = pd.DataFrame(out).assign(label=labels)

In [None]:
# Write the reduced features and labels to processed_data.csv in the data directory.
# The DataFrame index is written as well (pandas default).
data.to_csv(os.path.join(path,"processed_data.csv"))

In [None]:
# # Debugging example
# a = pd.DataFrame({"col": [15, 32, 3, 8], "col2":[26,3,17,20], "value":[1,1,1,1]})
# 
# # add this to the sparse matrix 
# add = a.loc[~a['col2'].isin(a['col'])].copy()
# add = add.rename(columns={'col':'col2','col2':'col'})
# #add['value'] = 0 # -to_add['value']
# 
# a = pd.concat((a,add))
# 
# categories = CategoricalDtype(sorted(a.col.unique()), ordered=True) # will have all nodes! use it for all 
# row = a.col.astype(categories).cat.codes
# col = a.col2.astype(categories).cat.codes
# sp = csr_matrix((a['value'],(row, col)), shape=(categories.categories.size, categories.categories.size))