In [173]:
import os
import torch
import pandas as pd
from typing import Dict, List
import itertools
from scipy import stats
import datetime
import networkx as nx

## DEALING WITH DATA

In [56]:
dataset_path = '/Users/nvu/WorkSpace/DumexGitHub/gilgamesh/datasets/coin'
datasets_files = os.listdir(dataset_path)
_datasets = {x.replace("coin_","").replace(".csv", ""): pd.read_csv(f'{dataset_path}/{x}') for x in datasets_files if x.lower() != ".ds_store"}
datasets = {x:y.set_index('Date') for x, y in _datasets.items()}

In [191]:
SOL = datasets['Solana']
COLUMNS = ['High', 'Low', 'Open', 'Close', 'Volume', 'Marketcap']
coins = datasets.keys()

def corr(df1, df2):
    """ Returns the pearson correlation between two series.
    """
    index_intersection = df1.index.intersection(df2.index)
    _df1 = df1[index_intersection]
    _df2 = df2[index_intersection]
    return stats.pearsonr(_df1, _df2)[0]

def create_correlation_graph(dataset: Dict, columns: List[str]):
    """ Returns a correlation graph given a list of dataframes.

    Args
    ------
    dataset: dictionary
        where key is the name of the data and value is a dataframe
    columns: list of string
        where each column will be an edge

    Returns
    --------
    nx.Graph: a networkx undirected graph
        where the vertices are the dataset keys and each edge is a correlation
        between a pair of vertices on a single column
    """
    # Construct the vertices
    V = dataset.keys()
    all_coin_pairs = [x for x in itertools.combinations(V, 2)]
    all_possible_pairs = [x for x in itertools.product(all_coin_pairs, columns)]
    # create correlation matrix 
    all_corelation = pd.DataFrame(
        [[edge,*vertices,corr(datasets[vertices[0]][edge],datasets[vertices[1]][edge])] for vertices,edge in all_possible_pairs],
        columns=['edge_label','from','to','pearson']
    )
    # create the graph
    G = nx.Graph()
    _ = [G.add_edge(x['from'], x['to'], weight=x['pearson'],label=x['edge_label']) for x in all_corelation.to_dict('records')]
    return G

def get_edge_subgraph(G, threshold):
    F = G.copy()
    F.remove_edges_from([(n1, n2) for n1, n2, w in F.edges(data="weight") if w < threshold])
    return F

In [194]:
# data preprocessing
def get_union_index(indices):
    union = pd.Index([])
    for index in indices:
        union = union.union(index)
    return union.sort()

def get_time_limit_datasets(datasets: Dict, upper_bound: datetime):
    """ Returns a datasets up and until the give upper_bound.

    Args
    ------
    datasets: Dict
    """

IndentationError: expected an indented block (2103218449.py, line 8)

In [192]:
G = create_correlation_graph(dataset=datasets, columns=COLUMNS)

In [157]:
a = union.to_series()
test = pd.period_range(earliest, latest, freq='Q')

Unnamed: 0_level_0,SNo,Name,Symbol,High,Low,Open,Close,Volume,Marketcap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-07-02 23:59:59,1,EOS,EOS,2.877510,0.822648,0.996521,2.710050,3.204520e+08,0.000000e+00
2017-07-03 23:59:59,2,EOS,EOS,5.395970,2.632310,2.717390,4.086640,4.149500e+08,6.549307e+08
2017-07-04 23:59:59,3,EOS,EOS,4.191240,2.933450,4.098010,3.372000,2.185590e+08,5.501552e+08
2017-07-05 23:59:59,4,EOS,EOS,3.523500,2.730130,3.356000,3.003230,1.243390e+08,5.006264e+08
2017-07-06 23:59:59,5,EOS,EOS,3.867470,3.007830,3.014260,3.361330,1.774630e+08,5.683615e+08
...,...,...,...,...,...,...,...,...,...
2019-03-27 23:59:59,634,EOS,EOS,4.350354,3.748847,3.763280,4.319756,3.426748e+09,3.914757e+09
2019-03-28 23:59:59,635,EOS,EOS,4.358344,4.217540,4.306860,4.274330,2.090073e+09,3.873590e+09
2019-03-29 23:59:59,636,EOS,EOS,4.362083,4.195792,4.272928,4.283937,2.376111e+09,3.882297e+09
2019-03-30 23:59:59,637,EOS,EOS,4.432984,4.057098,4.280243,4.136014,1.980146e+09,3.748243e+09


In [171]:
def create_subdataset(dfs, last_date):
    return [df[pd.to_datetime(df.index) < last_date.to_timestamp()] for df in dfs]

dfs = datasets.values()
_dfs = create_subdataset(dfs, test[-1])

[                      SNo Name Symbol      High       Low      Open     Close  \
 Date                                                                            
 2015-04-02 23:59:59     1  NEM    XEM  0.000323  0.000227  0.000242  0.000314   
 2015-04-03 23:59:59     2  NEM    XEM  0.000330  0.000291  0.000309  0.000310   
 2015-04-04 23:59:59     3  NEM    XEM  0.000318  0.000251  0.000310  0.000277   
 2015-04-05 23:59:59     4  NEM    XEM  0.000283  0.000218  0.000272  0.000232   
 2015-04-06 23:59:59     5  NEM    XEM  0.000299  0.000229  0.000232  0.000289   
 ...                   ...  ...    ...       ...       ...       ...       ...   
 2021-06-26 23:59:59  2278  NEM    XEM  0.109938  0.101675  0.105897  0.107710   
 2021-06-27 23:59:59  2279  NEM    XEM  0.112772  0.105629  0.108171  0.112606   
 2021-06-28 23:59:59  2280  NEM    XEM  0.117987  0.110549  0.112790  0.116804   
 2021-06-29 23:59:59  2281  NEM    XEM  0.134806  0.116468  0.116791  0.127813   
 2021-06-30 23:5