In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
import numpy as np

In [None]:
# Why do I need the whole pipeline again?
# - Did we normalize the target values? Otherwise the model may learn differently for 0.1 and 0.3.
# - Did we separate 0.1 and 0.3? If almost all of the input data are the same, does it make sense to split them?
#

In [3]:
# Base directory of the exported project data. The path is relative, so it
# presumably relies on the working directory being the Colab default
# (/content) once Drive is mounted -- TODO confirm.
root_folder = 'drive/MyDrive/Research/Projects/Graph_Sampling_Prediction/notebooks-export/'

# CSV location of every data partition, keyed by the names generate_dataset reads.
# NOTE(review): 'test_world_medium' is the only test file without the '_v3' suffix.
sources = {
    partition_name: root_folder + relative_path
    for partition_name, relative_path in (
        ('train', 'data/generated_graphs/aggr_data/samplings/set_1/set_1_3_6_8_9_10_with_features.csv'),
        ('test_synth_medium', 'data/generated_graphs/set_medium/all_graphs_sampling_results_with_features_v3.csv'),
        ('test_synth_large', 'data/generated_graphs/set_large/all_graphs_sampling_results_with_features_v3.csv'),
        ('test_world_medium', 'data/real_graphs/set_medium/all_graphs_sampling_results_with_features.csv'),
        ('test_world_large', 'data/real_graphs/set_large/all_graphs_sampling_results_with_features_v3.csv'),
    )
}

In [2]:
# Mount Google Drive at /content/drive (Colab only). The CSV paths in `sources`
# start with 'drive/MyDrive/...', so they presumably resolve against this mount
# when the working directory is /content -- TODO confirm.
# NOTE(review): the saved execution counts show this cell ran before the cell
# that defines `sources`, although it appears after it in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#df[['Real value']].hist(bins=20)

In [4]:
def generate_dataset(file_paths, metric):
    """Load every partition CSV, label its rows and stack them into one frame.

    Parameters
    ----------
    file_paths : mapping
        Must provide the keys 'train', 'test_synth_medium', 'test_synth_large',
        'test_world_medium' and 'test_world_large'; each value is handed to
        ``pd.read_csv``.
    metric : any
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        All rows of the five files in the order listed above, with two extra
        columns: 'partition' ('train' or 'test') and 'synthetic' (origin label).
        The per-file row indices are kept as-is, so index values repeat.
    """
    # (key in file_paths, 'partition' label, 'synthetic' label)
    layout = (
        ('train', 'train', 'synthetic'),
        ('test_synth_medium', 'test', 'synthetic_medium'),
        ('test_synth_large', 'test', 'synthetic_large'),
        ('test_world_medium', 'test', 'realworld_medium'),
        ('test_world_large', 'test', 'realworld_large'),
    )

    frames = []
    for key, partition_label, origin_label in layout:
        frame = pd.read_csv(file_paths[key])
        frame['partition'] = partition_label
        frame['synthetic'] = origin_label
        frames.append(frame)

    return pd.concat(frames)

In [5]:
def max_possible_edges_not_directed(n):
    """Return n*(n-1)/2, the number of unordered node pairs among ``n`` nodes."""
    ordered_pairs = n * (n - 1)
    return ordered_pairs / 2

# https://en.wikipedia.org/wiki/Betweenness_centrality
def scaling_factor_node_betweenness_centrality(n):
    """Return (n-1)*(n-2)/2, the node-betweenness scaling factor for undirected graphs."""
    numerator = (n - 1) * (n - 2)
    return numerator / 2

def scaling_factor_edge_betweenness_centrality(n):
    """Return n*(n-1)/2, the edge-betweenness scaling factor for undirected graphs."""
    numerator = n * (n - 1)
    return numerator / 2

In [6]:
def is_node_based(sampler_type):
    """Return 1 when ``sampler_type`` names a node-based sampler, otherwise 0."""
    node_samplers = {'random degree node', 'random node', 'random node edge'}
    return int(sampler_type in node_samplers)

In [7]:
def is_edge_based(sampler_type):
    """Return 1 when ``sampler_type`` names an edge-based sampler, otherwise 0."""
    edge_samplers = {'random edge', 'random node edge', 'induced random edge'}
    return int(sampler_type in edge_samplers)

In [8]:
def is_traversal_based(sampler_type):
    """Return 1 when ``sampler_type`` names a traversal-based sampler, otherwise 0."""
    traversal_samplers = {
        'random jump',
        'snowball',
        'forest fire',
        'metropolis hastings random walk',
        'expansion',
        'frontier',
        'rank degree',
    }
    return int(sampler_type in traversal_samplers)

In [9]:
def add_sampler_type(df):
    """Add a 'sampler_type' column to ``df`` in place; returns None.

    Each row's 'sampling algorithm' value is passed to ``get_sampler_type``.
    NOTE(review): ``get_sampler_type`` is not defined in the visible part of
    this notebook, so it must be provided elsewhere before this is called.
    """
    def _row_sampler_type(row):
        return get_sampler_type(row['sampling algorithm'])

    df['sampler_type'] = df.apply(_row_sampler_type, axis=1)

In [10]:
def get_graph_param(graph_id):
    """Extract the generator parameter embedded in a synthetic graph id.

    For ids containing '_Syn_', returns the text between the first '_param:'
    that follows 'range_size' and the next underscore. Any other id yields ''.
    Raises IndexError when a '_Syn_' id lacks 'range_size' or '_param:'.
    """
    if '_Syn_' not in graph_id:
        return ''

    after_size = graph_id.split('range_size')[1]
    pieces = after_size.split('_param:')
    return pieces[1].split('_')[0]

In [11]:
def _inverse_shift(values):
    """Return exp(-log(values + 1)), i.e. 1 / (values + 1); non-negative input lands in (0, 1]."""
    return np.exp(-np.log(values + 1))


def generate_features(df):
    """Derive the model features from the raw graph-statistics frame.

    The input frame is left untouched: all work happens on a copy, so calling
    this twice on the same frame gives the same result (previously the raw
    'node_nums', 'edge_nums', '*_calc_time' and connected-component columns
    were overwritten in the caller's frame and a second call transformed them
    again).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the raw columns read below: 'node_nums', 'edge_nums',
        'sampling_algorithm', 'num_connected_components', the six
        '*_calc_time' columns and the min/max/mean/var/median statistics of
        each graph property.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived feature columns appended, several raw
        columns replaced by their rescaled version, 'sampling_algorithm'
        copied to 'sampling algorithm' and then one-hot encoded (one 0/1
        column per algorithm name, no prefix).
    """
    df = df.copy()

    df['node_count/edge_count'] = _inverse_shift(df['node_nums'] / df['edge_nums'])
    df['edge_count/node_count'] = _inverse_shift(df['edge_nums'] / df['node_nums'])

    df['clust_coeff_max'] = _inverse_shift(df['max_clust_coeff'])
    df['clust_coeff_min'] = df['min_clust_coeff'] / df['max_clust_coeff']
    df['clust_coeff_avg'] = df['mean_clust_coeff'] / df['max_clust_coeff']
    df['clust_coeff_var'] = _inverse_shift(df['var_clust_coeff'])
    df['clust_coeff_median'] = df['median_clust_coeff'] / df['max_clust_coeff']

    # NOTE(review): this uses n*(n-1)/2 although a helper named
    # scaling_factor_node_betweenness_centrality ((n-1)*(n-2)/2) exists in this
    # notebook and is never called. Left unchanged so the generated features
    # stay comparable with earlier exports -- confirm which one is intended.
    df['scaling_factor_node_betweenness_centrality'] = max_possible_edges_not_directed(df['node_nums'])

    df['degree_min'] = df['min_degree'] / df['max_degree']
    df['degree_avg'] = df['mean_degree'] / df['max_degree']
    df['degree_max'] = _inverse_shift(df['max_degree'])
    # see: https://math.stackexchange.com/questions/2833062/a-measure-similar-to-variance-thats-always-between-0-and-1
    # the log alleviates how quickly the value approaches zero
    df['degree_var'] = _inverse_shift(df['var_degree'])
    df['degree_median'] = df['median_degree'] / df['max_degree']

    node_bc_scale = df['scaling_factor_node_betweenness_centrality']
    # The max uses a plain exp(-x) on the scaled value, not the inverse shift.
    df['node_betweenness_centrality_max'] = np.exp(-(df['max_node_betweenness_centrality'] / node_bc_scale))
    df['node_betweenness_centrality_avg'] = df['mean_node_betweenness_centrality'] / node_bc_scale
    df['node_betweenness_centrality_var'] = _inverse_shift(df['var_node_betweenness_centrality'])
    df['node_betweenness_centrality_median'] = df['median_node_betweenness_centrality'] / node_bc_scale
    df['node_betweenness_centrality_min'] = df['min_node_betweenness_centrality'] / node_bc_scale

    # Edge-betweenness, eccentricity and LCC shortest-path features are not
    # generated: they were already disabled (wrapped in string literals) in the
    # previous version of this function.

    df['eigenvector_centrality_min'] = df['min_eigenvector_centrality'] / df['max_eigenvector_centrality']
    df['eigenvector_centrality_avg'] = df['mean_eigenvector_centrality'] / df['max_eigenvector_centrality']
    df['eigenvector_centrality_median'] = df['median_eigenvector_centrality'] / df['max_eigenvector_centrality']
    df['eigenvector_centrality_var'] = _inverse_shift(df['var_eigenvector_centrality'])
    df['eigenvector_centrality_max'] = _inverse_shift(df['max_eigenvector_centrality'])

    df['degrees_spanning_tree_min'] = df['min_degrees_max_spanning_tree'] / df['max_degrees_max_spanning_tree']
    df['degrees_spanning_tree_avg'] = df['mean_degrees_max_spanning_tree'] / df['max_degrees_max_spanning_tree']
    # NOTE(review): unlike every other *_var feature this is var / max rather
    # than the inverse shift, so it is not bounded by 1 -- confirm intent.
    df['degrees_spanning_tree_var'] = df['var_degrees_max_spanning_tree'] / df['max_degrees_max_spanning_tree']
    df['degrees_spanning_tree_median'] = df['median_degrees_max_spanning_tree'] / df['max_degrees_max_spanning_tree']
    df['degrees_spanning_tree_max'] = _inverse_shift(df['max_degrees_max_spanning_tree'])

    # Connected-component columns are rescaled under their raw names. The max
    # column must be overwritten last because the ratios above it still need
    # the raw maximum.
    df['min_connected_components_size'] = df['min_connected_components_size'] / df['max_connected_components_size']
    df['mean_connected_components_size'] = df['mean_connected_components_size'] / df['max_connected_components_size']
    df['median_connected_components_size'] = df['median_connected_components_size'] / df['max_connected_components_size']
    df['var_connected_components_size'] = _inverse_shift(df['var_connected_components_size'])
    df['num_connected_components'] = _inverse_shift(df['num_connected_components'])
    df['max_connected_components_size'] = _inverse_shift(df['max_connected_components_size'])

    df['pagerank_centrality_min'] = df['min_pagerank_centrality'] / df['max_pagerank_centrality']
    df['pagerank_centrality_avg'] = df['mean_pagerank_centrality'] / df['max_pagerank_centrality']
    df['pagerank_centrality_median'] = df['median_pagerank_centrality'] / df['max_pagerank_centrality']
    df['pagerank_centrality_var'] = _inverse_shift(df['var_pagerank_centrality'])
    df['pagerank_centrality_max'] = _inverse_shift(df['max_pagerank_centrality'])

    df['shortest_path_length_min'] = df['min_shortest_path_length'] / df['max_shortest_path_length']
    df['shortest_path_length_avg'] = df['mean_shortest_path_length'] / df['max_shortest_path_length']
    df['shortest_path_length_var'] = _inverse_shift(df['var_shortest_path_length'])
    df['shortest_path_length_max'] = _inverse_shift(df['max_shortest_path_length'])

    # Sampler family flags (0/1), computed per value instead of per row.
    df['sampler_type_node_based'] = df['sampling_algorithm'].map(is_node_based)
    df['sampler_type_edge_based'] = df['sampling_algorithm'].map(is_edge_based)
    df['sampler_type_traversal_based'] = df['sampling_algorithm'].map(is_traversal_based)

    # Calculation-time features, rescaled under their raw names.
    for calc_time_column in (
        'clust_coeff_calc_time',
        'connected_components_calc_time',
        'degree_assortativity_calc_time',
        'eigenvector_centrality_calc_time',
        'max_spanning_tree_calc_time',
        'pagerank_centrality_calc_time',
    ):
        df[calc_time_column] = _inverse_shift(df[calc_time_column])

    # Size features ("model 2" scaling). These overwrite the raw counts, so
    # they must come after every feature that reads node_nums / edge_nums.
    df['node_nums'] = _inverse_shift(df['node_nums'])
    df['edge_nums'] = _inverse_shift(df['edge_nums'])

    # Keep the algorithm name under 'sampling algorithm'; get_dummies drops
    # the 'sampling_algorithm' column it encodes.
    df['sampling algorithm'] = df['sampling_algorithm']
    df = pd.get_dummies(df, columns=['sampling_algorithm'], prefix='', prefix_sep='').replace({False: 0, True: 1})
    return df

In [12]:
def data_quality_check(df):
    """Print the columns of ``df`` that contain missing values.

    When at least one such column exists, also prints the rows whose
    'graph_ID' is null (an empty frame if there are none). Returns None.
    """
    has_missing = df.isna().any()
    nalist = df.columns[has_missing].tolist()
    print('nalist ', nalist)
    if not nalist:
        return
    print(df[df['graph_ID'].isnull()])

In [14]:
def now_vs_after(df):
    """Print the row count of ``df`` before and after dropping duplicate rows."""
    total_rows = len(df.index)
    unique_rows = len(df.drop_duplicates().index)
    print('Now: {}, after: {}'.format(total_rows, unique_rows))

In [None]:
# Build the full feature table once for interactive inspection.
target = 'D3'
dataset = generate_features(generate_dataset(sources, target))
# Drop exact duplicate rows first, then replace any remaining NaNs with 0.
dataset = dataset.drop_duplicates().fillna(0)
print('dataset ', dataset)
print(dataset.columns)

NameError: name 'pd' is not defined

In [None]:
# Quality check: every listed feature is expected to lie between zero and one.
# Features that are absent from `dataset` are reported and skipped instead of
# aborting the cell with a KeyError; the list names edge-betweenness and
# eccentricity features, which generate_features currently does not create.
# Both bounds are checked, not only the upper one.
for feature in ['node_count/edge_count', 'edge_count/node_count', 'clust_coeff_max', 'clust_coeff_min', 'clust_coeff_avg',
            'clust_coeff_var', 'clust_coeff_median', 'degree_min',
            'degree_avg', 'degree_var', 'degree_median', 'node_betweenness_centrality_max', 'node_betweenness_centrality_avg',
            'node_betweenness_centrality_var', 'node_betweenness_centrality_median',
            'edge_betweenness_centrality_max', 'edge_betweenness_centrality_avg',
            'edge_betweenness_centrality_var', 'edge_betweenness_centrality_median', 'eccentricity_centrality_min',
            'eccentricity_centrality_avg', 'eccentricity_centrality_var', 'eccentricity_centrality_median',
            'eigenvector_centrality_min', 'eigenvector_centrality_var', 'eigenvector_centrality_avg',
            'pagerank_centrality_var', 'degrees_spanning_tree_min', 'degrees_spanning_tree_avg', 'degrees_spanning_tree_var',
            'min_pagerank_centrality', 'max_pagerank_centrality', 'mean_pagerank_centrality', 'median_pagerank_centrality',
            'graph_density']:
    if feature not in dataset.columns:
        print('Feature: {} not found in dataset'.format(feature))
        continue
    feature_max = dataset[feature].max()
    feature_min = dataset[feature].min()
    if feature_max > 1:
        print('Feature: {}, value: {}'.format(feature, feature_max))
    if feature_min < 0:
        print('Feature: {}, value: {}'.format(feature, feature_min))

In [15]:
import pandas as pd
import numpy as np
# Sub-folder suffix of the export directory ('data/model_<model_num>/').
model_num = '2'

# Every sampler gets a column in the export, even when it never occurs in the data.
all_algorithms = [
    'forest fire', 'random degree node', 'random edge', 'random jump',
    'random node', 'random node edge', 'snowball', 'frontier', 'rank degree',
    'induced random edge', 'metropolis hastings random walk', 'expansion',
]

# NOTE(review): generate_dataset ignores its second argument, so every
# iteration rebuilds the same frame and the five exported files differ only
# in name (the saved output shows identical counts for each target). Confirm
# whether per-target filtering was intended.
for target in ['D3', 'C2D2', 'HPD2', 'HPD2_LCC', 'run_time']:
    dataset = generate_dataset(sources, target)

    # Data quality report on the raw, concatenated frame.
    print('Target: {}'.format(target))
    data_quality_check(dataset)
    now_vs_after(dataset)

    # Feature engineering, then drop exact duplicates and replace NaNs with 0.
    dfx = generate_features(dataset)
    dfx = dfx.drop_duplicates().fillna(0)
    now_vs_after(dfx)

    # Add an all-zero column for each sampler that got no one-hot column.
    for alg in all_algorithms:
        if alg not in dfx.columns:
            dfx[alg] = 0

    # Short names for the target columns.
    dfx = dfx.rename(columns={
        "KS Degree Distr": "D3",
        "KS Clustering Coefficient Distr": "C2D2",
        'KS hop plots Distr': 'HPD2',
        'KS hop plots LCC Distr': 'HPD2_LCC',
    })
    dfx.to_csv(root_folder + 'data/model_' + model_num + '/{}_v4.csv'.format(target), index=False)

Target: D3
nalist  ['Unnamed: 0', 'entropy_clust_coeff', 'global_clust_coeff', 'global_clust_coeff_calc_time', 'node_betweenness_centrality_calc_time', 'min_shortest_path_length_LCC', 'max_shortest_path_length_LCC', 'var_shortest_path_length_LCC', 'mean_shortest_path_length_LCC', 'node_edge_betweenness_centrality_calc_time', 'min_edge_betweenness_centrality', 'max_edge_betweenness_centrality', 'mean_edge_betweenness_centrality', 'var_edge_betweenness_centrality', 'median_edge_betweenness_centrality', 'min_eccentricity_centrality', 'max_eccentricity_centrality', 'mean_eccentricity_centrality', 'median_eccentricity_centrality', 'var_eccentricity_centrality', 'entropy_degrees', 'diameter', 'diameter_calc_time', 'min_farness_centrality', 'max_farness_centrality', 'var_farness_centrality', 'mean_farness_centrality', 'median_farness_centrality', 'farness_centrality_calc_time', 'median_shortest_path_length_LCC', 'shortest_path_length_LCC_calc_time']
Empty DataFrame
Columns: [Unnamed: 0, graph

  df['eigenvector_centrality_max'] = np.exp(-np.log(df['max_eigenvector_centrality']+1))
  df['degrees_spanning_tree_min']=df['min_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_avg']=df['mean_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_var']=df['var_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_median']=df['median_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_max']=np.exp(-np.log(df['max_degrees_max_spanning_tree']+1))
  df['pagerank_centrality_min'] = df['min_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_avg'] = df['mean_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_median'] = df['median_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_var'] = np.exp(-np.log(df['var_pagerank_centrality']+1))
  df['pagerank_cent

Now: 14143, after: 14143
Target: C2D2
nalist  ['Unnamed: 0', 'entropy_clust_coeff', 'global_clust_coeff', 'global_clust_coeff_calc_time', 'node_betweenness_centrality_calc_time', 'min_shortest_path_length_LCC', 'max_shortest_path_length_LCC', 'var_shortest_path_length_LCC', 'mean_shortest_path_length_LCC', 'node_edge_betweenness_centrality_calc_time', 'min_edge_betweenness_centrality', 'max_edge_betweenness_centrality', 'mean_edge_betweenness_centrality', 'var_edge_betweenness_centrality', 'median_edge_betweenness_centrality', 'min_eccentricity_centrality', 'max_eccentricity_centrality', 'mean_eccentricity_centrality', 'median_eccentricity_centrality', 'var_eccentricity_centrality', 'entropy_degrees', 'diameter', 'diameter_calc_time', 'min_farness_centrality', 'max_farness_centrality', 'var_farness_centrality', 'mean_farness_centrality', 'median_farness_centrality', 'farness_centrality_calc_time', 'median_shortest_path_length_LCC', 'shortest_path_length_LCC_calc_time']
Empty DataFrame


  df['eigenvector_centrality_max'] = np.exp(-np.log(df['max_eigenvector_centrality']+1))
  df['degrees_spanning_tree_min']=df['min_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_avg']=df['mean_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_var']=df['var_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_median']=df['median_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_max']=np.exp(-np.log(df['max_degrees_max_spanning_tree']+1))
  df['pagerank_centrality_min'] = df['min_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_avg'] = df['mean_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_median'] = df['median_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_var'] = np.exp(-np.log(df['var_pagerank_centrality']+1))
  df['pagerank_cent

Now: 14143, after: 14143
Target: HPD2
nalist  ['Unnamed: 0', 'entropy_clust_coeff', 'global_clust_coeff', 'global_clust_coeff_calc_time', 'node_betweenness_centrality_calc_time', 'min_shortest_path_length_LCC', 'max_shortest_path_length_LCC', 'var_shortest_path_length_LCC', 'mean_shortest_path_length_LCC', 'node_edge_betweenness_centrality_calc_time', 'min_edge_betweenness_centrality', 'max_edge_betweenness_centrality', 'mean_edge_betweenness_centrality', 'var_edge_betweenness_centrality', 'median_edge_betweenness_centrality', 'min_eccentricity_centrality', 'max_eccentricity_centrality', 'mean_eccentricity_centrality', 'median_eccentricity_centrality', 'var_eccentricity_centrality', 'entropy_degrees', 'diameter', 'diameter_calc_time', 'min_farness_centrality', 'max_farness_centrality', 'var_farness_centrality', 'mean_farness_centrality', 'median_farness_centrality', 'farness_centrality_calc_time', 'median_shortest_path_length_LCC', 'shortest_path_length_LCC_calc_time']
Empty DataFrame


  df['eigenvector_centrality_max'] = np.exp(-np.log(df['max_eigenvector_centrality']+1))
  df['degrees_spanning_tree_min']=df['min_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_avg']=df['mean_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_var']=df['var_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_median']=df['median_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_max']=np.exp(-np.log(df['max_degrees_max_spanning_tree']+1))
  df['pagerank_centrality_min'] = df['min_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_avg'] = df['mean_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_median'] = df['median_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_var'] = np.exp(-np.log(df['var_pagerank_centrality']+1))
  df['pagerank_cent

Now: 14143, after: 14143
Target: HPD2_LCC
nalist  ['Unnamed: 0', 'entropy_clust_coeff', 'global_clust_coeff', 'global_clust_coeff_calc_time', 'node_betweenness_centrality_calc_time', 'min_shortest_path_length_LCC', 'max_shortest_path_length_LCC', 'var_shortest_path_length_LCC', 'mean_shortest_path_length_LCC', 'node_edge_betweenness_centrality_calc_time', 'min_edge_betweenness_centrality', 'max_edge_betweenness_centrality', 'mean_edge_betweenness_centrality', 'var_edge_betweenness_centrality', 'median_edge_betweenness_centrality', 'min_eccentricity_centrality', 'max_eccentricity_centrality', 'mean_eccentricity_centrality', 'median_eccentricity_centrality', 'var_eccentricity_centrality', 'entropy_degrees', 'diameter', 'diameter_calc_time', 'min_farness_centrality', 'max_farness_centrality', 'var_farness_centrality', 'mean_farness_centrality', 'median_farness_centrality', 'farness_centrality_calc_time', 'median_shortest_path_length_LCC', 'shortest_path_length_LCC_calc_time']
Empty DataFr

  df['eigenvector_centrality_max'] = np.exp(-np.log(df['max_eigenvector_centrality']+1))
  df['degrees_spanning_tree_min']=df['min_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_avg']=df['mean_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_var']=df['var_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_median']=df['median_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_max']=np.exp(-np.log(df['max_degrees_max_spanning_tree']+1))
  df['pagerank_centrality_min'] = df['min_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_avg'] = df['mean_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_median'] = df['median_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_var'] = np.exp(-np.log(df['var_pagerank_centrality']+1))
  df['pagerank_cent

Now: 14143, after: 14143
Target: run_time
nalist  ['Unnamed: 0', 'entropy_clust_coeff', 'global_clust_coeff', 'global_clust_coeff_calc_time', 'node_betweenness_centrality_calc_time', 'min_shortest_path_length_LCC', 'max_shortest_path_length_LCC', 'var_shortest_path_length_LCC', 'mean_shortest_path_length_LCC', 'node_edge_betweenness_centrality_calc_time', 'min_edge_betweenness_centrality', 'max_edge_betweenness_centrality', 'mean_edge_betweenness_centrality', 'var_edge_betweenness_centrality', 'median_edge_betweenness_centrality', 'min_eccentricity_centrality', 'max_eccentricity_centrality', 'mean_eccentricity_centrality', 'median_eccentricity_centrality', 'var_eccentricity_centrality', 'entropy_degrees', 'diameter', 'diameter_calc_time', 'min_farness_centrality', 'max_farness_centrality', 'var_farness_centrality', 'mean_farness_centrality', 'median_farness_centrality', 'farness_centrality_calc_time', 'median_shortest_path_length_LCC', 'shortest_path_length_LCC_calc_time']
Empty DataFr

  df['eigenvector_centrality_max'] = np.exp(-np.log(df['max_eigenvector_centrality']+1))
  df['degrees_spanning_tree_min']=df['min_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_avg']=df['mean_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_var']=df['var_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_median']=df['median_degrees_max_spanning_tree']/df['max_degrees_max_spanning_tree']
  df['degrees_spanning_tree_max']=np.exp(-np.log(df['max_degrees_max_spanning_tree']+1))
  df['pagerank_centrality_min'] = df['min_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_avg'] = df['mean_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_median'] = df['median_pagerank_centrality']/df['max_pagerank_centrality']
  df['pagerank_centrality_var'] = np.exp(-np.log(df['var_pagerank_centrality']+1))
  df['pagerank_cent

Now: 14143, after: 14143


In [None]:
# Show four of the calc-time columns of `dfx`, i.e. the frame left over from
# the last iteration of the export loop.
# NOTE(review): the saved output below contains values above 1, which
# exp(-log(x + 1)) in generate_features cannot produce for non-negative
# inputs, so that output is probably from an earlier run -- re-run to confirm.
dfx[['clust_coeff_calc_time',
       'connected_components_calc_time', 'pagerank_centrality_calc_time',
       'max_spanning_tree_calc_time']]

Unnamed: 0,clust_coeff_calc_time,connected_components_calc_time,pagerank_centrality_calc_time,max_spanning_tree_calc_time
0,0.155451,0.440589,0.454012,1.744806
0,2.067737,0.920141,0.321629,9.598539
1,2.018222,0.765105,0.528590,6.645228
1,1.130755,0.269564,0.256353,4.912762
2,0.205690,0.193303,0.760978,3.227988
...,...,...,...,...
475,9.447464,0.664761,0.824831,11.493466
476,7.822001,3.362185,0.783072,17.071564
477,4.993967,0.087790,0.752546,6.690350
478,5.315836,0.321391,0.650829,11.097322
