In [None]:
import pandas as pd
import numpy as np

# Calculate Distance and Produce Adj. Matrix

## 1. Demo

In [None]:
# Path of the demo dataset; input files are named "<dataset>-P.csv".
demo_dataset = "GBM"
ca_fi = "./{}-P.csv".format(demo_dataset)

In [None]:
# Load the raw table: tab-separated values, no header row.
df_ca = pd.read_csv(
    ca_fi,
    sep='\t',
    header=None,
)

In [None]:
# Feature table: every column of the raw table except the last one,
# which holds the label and is dropped up front.
label_col = df_ca.columns[-1]
df = df_ca.drop(columns=label_col)
df.head()

In [None]:
# Number of nodes: one node per row of the feature table.
n_nodes = len(df.index)
n_nodes

In [None]:
# Dense n_nodes x n_nodes adjacency matrix, all zeros (no edges) to start with.
adj = np.zeros((n_nodes, n_nodes))

In [None]:
# Feature matrix: convert the pandas DataFrame into a NumPy array
# (one row per node) so that rows can be subtracted directly.
ftr = df.to_numpy()
ftr

In [None]:
# Peek at the feature vector of the first node.
ftr[0]

In [None]:
# Try it out: Euclidean distance between two arbitrary nodes - method A
# (vector norm of the difference).
np.linalg.norm(ftr[0] - ftr[1])

In [None]:
# Try it out: Euclidean distance between two arbitrary nodes - method B
# (square root of the sum of squared differences, written out by hand).
np.sqrt(np.sum(np.square(ftr[0] - ftr[1])))

In [None]:
# Pairwise Euclidean distances between all nodes, plus each node's average distance.
dist = np.zeros((n_nodes, n_nodes))  # dist[a, b]: distance between nodes a and b
dist_avg = np.zeros(n_nodes)
for row in range(n_nodes):
    vec_row = ftr[row]
    for col in range(row + 1, n_nodes):
        gap = np.linalg.norm(vec_row - ftr[col])
        # the matrix is symmetric, so mirror every entry
        dist[row, col] = gap
        dist[col, row] = gap
    # Row `row` is complete here (earlier iterations already filled the columns
    # before it). The average runs over the whole row, so it includes the zero
    # self-distance on the diagonal.
    dist_avg[row] = np.average(dist[row])

In [None]:
# produce adj. matrix
# Connect nodes i < j when their distance is strictly below the average
# distance of the lower-indexed node i (dist_avg[j] is not consulted).
# The comparison is done for the whole matrix at once and restricted to the
# strict upper triangle, which visits the same (i, j) pairs as a nested loop.
below_avg = np.triu(dist < dist_avg[:, np.newaxis], k=1)
# mirror every edge so the adjacency matrix stays symmetric
adj[below_avg | below_avg.T] = 1

In [None]:
# Preview only: wrap the adjacency matrix in a DataFrame to display its
# first rows.
adj_df = pd.DataFrame(adj)
adj_df.head()

In [None]:
# NOTE(review): this import sits in the middle of the notebook rather than in
# the imports cell at the top; any later cell that uses coo_matrix needs this
# cell to have run first.
from scipy.sparse import coo_matrix
# Convert the dense adjacency matrix to COO (coordinate) sparse format.
adj_coo = coo_matrix(adj)

In [None]:
# Shape of the COO row-index array.
adj_coo.row.shape

In [None]:
# Shape of the COO column-index array.
adj_coo.col.shape

## 2. Produce adj. matrix for every dataset

In [None]:
def produce_adj(data_name):
    """Build the adjacency list of one dataset and save it under ``./ADJ/``.

    Reads ``./<data_name>-P.csv`` (tab-separated, no header row). The last
    column is dropped as the label and every remaining column is used as a
    feature; each row is one node. The Euclidean distance between every pair
    of nodes is computed, and nodes ``i < j`` are connected when their
    distance is strictly below the mean of row ``i`` of the distance matrix.
    That mean is taken over the whole row, so it includes the zero
    self-distance, and only the lower-indexed node's mean is consulted.
    Edges are stored symmetrically.

    The result is written to ``./ADJ/<data_name>-A.txt`` with one
    ``row<TAB>col`` line per nonzero entry of the adjacency matrix in
    row-major order, so every undirected edge appears twice (once per
    direction). The output directory is created if it does not exist.

    Parameters
    ----------
    data_name : str
        Dataset name used to build both the input and the output file name.

    Returns
    -------
    None
        Progress is reported with ``print``; the result goes to disk.
    """
    import os  # function-scope import keeps this cell self-contained

    print('Reading data: {}'.format(data_name))
    ca_fi = "./{}-P.csv".format(data_name)
    df_ca = pd.read_csv(ca_fi, sep='\t', header=None)
    # the last column is the label; everything before it is a feature
    df = df_ca.drop(df_ca.columns[len(df_ca.columns) - 1], axis=1)
    ftr = df.to_numpy()
    n_nodes = df.shape[0]

    print('Making adj. matrix: {}'.format(data_name))
    # calc distances: fill the symmetric matrix one row at a time, computing
    # the distances from node ii to all later nodes in a single call
    dist = np.zeros([n_nodes, n_nodes])
    for ii in range(n_nodes - 1):
        to_later = np.linalg.norm(ftr[ii + 1:] - ftr[ii], axis=1)
        dist[ii, ii + 1:] = to_later
        dist[ii + 1:, ii] = to_later
    # average distance of every node (whole row, diagonal zero included)
    dist_avg = dist.mean(axis=1)

    # calc adj matrix: a pair (i, j) with i < j qualifies when its distance is
    # below node i's average; restrict to the strict upper triangle, then
    # mirror so the matrix is symmetric
    below_avg = np.triu(dist < dist_avg[:, np.newaxis], k=1)
    adj = below_avg | below_avg.T

    # save adj matrix to file as (row, col) pairs of the nonzero entries;
    # np.nonzero returns them in row-major order
    rows, cols = np.nonzero(adj)
    print('Saving adj. matrix to file: {}'.format(data_name))
    out_path = './ADJ/{}-A.txt'.format(data_name)
    # make sure the target directory exists instead of failing on first use
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w') as f:
        for row, col in zip(rows, cols):
            f.write('{}\t{}\n'.format(row, col))

In [None]:
# Build and save the adjacency list for every dataset name in `cas`.
# NOTE(review): `cas` is not defined in any cell visible in this notebook.
# It has to be an iterable of dataset names (each is passed to produce_adj)
# defined before this cell runs; otherwise a fresh "Restart & Run All" stops
# here with a NameError. TODO: confirm where it is meant to come from.
for name in cas:
    produce_adj(name)