In [None]:
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import warnings
from math import log2
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.neighbors import kneighbors_graph
from scipy import spatial
from scipy import sparse as sp
from tqdm.notebook import tqdm
from scipy.spatial.distance import cdist
from scipy.spatial.distance import pdist
from scipy import sparse
from scipy.linalg import fractional_matrix_power
import time
%matplotlib inline
from sklearn.model_selection import train_test_split

<font size = "5"> **Performance metrics** </font>

In [None]:
def _generalized_average(U, V, average_method):
    """Return a particular mean of two numbers."""
    if average_method == "min":
        return min(U, V)
    elif average_method == "geometric":
        return np.sqrt(U * V)
    elif average_method == "arithmetic":
        return np.mean([U, V])
    elif average_method == "max":
        return max(U, V)
    else:
        raise ValueError(
            "'average_method' must be 'min', 'geometric', 'arithmetic', or 'max'"
        )


def contingency_matrix(
    labels_true, labels_pred, *, eps=None, sparse=False, dtype=np.int64
):
    """Count co-occurrences of true classes (rows) and predicted clusters (columns).

    Parameters
    ----------
    labels_true, labels_pred : label sequences of equal length.
    eps : optional value added to every cell of the dense table.
    sparse : when True, return a CSR matrix instead of a dense array.
    dtype : dtype of the count table.

    Raises
    ------
    ValueError
        If ``eps`` is given together with ``sparse=True``.
    """
    if sparse and eps is not None:
        raise ValueError("Cannot set 'eps' when sparse=True")

    row_labels, row_pos = np.unique(labels_true, return_inverse=True)
    col_labels, col_pos = np.unique(labels_pred, return_inverse=True)

    # A COO matrix built from (row, col) pairs acts as a 2-D histogram:
    # duplicate coordinates are summed when it is converted.
    counts = sp.coo_matrix(
        (np.ones(row_pos.shape[0]), (row_pos, col_pos)),
        shape=(row_labels.shape[0], col_labels.shape[0]),
        dtype=dtype,
    )

    if sparse:
        table = counts.tocsr()
        table.sum_duplicates()
        return table

    table = counts.toarray()
    if eps is None:
        return table
    # Plain addition (not +=) so an integer table can be promoted by a float eps.
    return table + eps


# clustering measures


def pair_confusion_matrix(labels_true, labels_pred):
    """Return the 2x2 pair confusion matrix between two labellings.

    The matrix is computed from the sparse contingency table of the two
    labellings. It is unpacked by ``adjusted_rand_score`` in this file as
    ``(tn, fp), (fn, tp)``.

    Parameters
    ----------
    labels_true, labels_pred : label sequences of equal length (arrays or lists).

    Returns
    -------
    numpy.ndarray of shape (2, 2) and dtype int64.
    """
    # len() instead of .shape[0] so that plain Python lists of labels (as
    # produced by some dataset cells in this notebook) are accepted as well.
    n_samples = np.int64(len(labels_true))

    # Computation using the contingency data
    contingency = contingency_matrix(
        labels_true, labels_pred, sparse=True, dtype=np.int64
    )
    n_c = np.ravel(contingency.sum(axis=1))  # per-class sample counts
    n_k = np.ravel(contingency.sum(axis=0))  # per-cluster sample counts
    sum_squares = (contingency.data**2).sum()
    C = np.empty((2, 2), dtype=np.int64)
    C[1, 1] = sum_squares - n_samples
    C[0, 1] = contingency.dot(n_k).sum() - sum_squares
    C[1, 0] = contingency.transpose().dot(n_c).sum() - sum_squares
    C[0, 0] = n_samples**2 - C[0, 1] - C[1, 0] - sum_squares
    return C


def rand_score(labels_true, labels_pred):
    """Rand index: fraction of the pair confusion matrix that lies on its diagonal.

    Returns 1.0 when the pair confusion matrix is empty or entirely on its
    diagonal (no split at all, or every sample in its own cluster).
    """
    pair_counts = pair_confusion_matrix(labels_true, labels_pred)
    agreeing = pair_counts.diagonal().sum()
    total = pair_counts.sum()

    # Degenerate clusterings count as perfect matches.
    if total == 0 or agreeing == total:
        return 1.0

    return agreeing / total


def adjusted_rand_score(labels_true, labels_pred):
    """Adjusted Rand index computed from the pair confusion matrix.

    Returns 1.0 when both off-diagonal pair counts are zero (empty data or
    full agreement).
    """
    pair_counts = pair_confusion_matrix(labels_true, labels_pred)
    # Python ints have arbitrary precision, so the products below cannot overflow.
    tn, fp, fn, tp = (int(v) for v in np.ravel(pair_counts))

    if not (fn or fp):
        return 1.0

    numerator = 2.0 * (tp * tn - fn * fp)
    denominator = (tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)
    return numerator / denominator


def mutual_info_score(labels_true, labels_pred, *, contingency=None):
    """Mutual information (in bits, base-2 logarithms) between two labellings.

    Parameters
    ----------
    labels_true, labels_pred : label sequences; only used when ``contingency``
        is None, in which case the contingency table is built from them.
    contingency : optional precomputed contingency table, either a dense
        ``numpy.ndarray`` or a SciPy sparse matrix.

    Returns
    -------
    float, clipped below at 0.0.

    Raises
    ------
    ValueError
        If ``contingency`` is neither None, an ndarray, nor a sparse matrix.
    """
    if contingency is None:
        # The signature advertises None as the default; previously that default
        # fell straight through to the "Unsupported type" error below.
        contingency = contingency_matrix(labels_true, labels_pred, sparse=True)

    if isinstance(contingency, np.ndarray):
        # For an array
        nzx, nzy = np.nonzero(contingency)
        nz_val = contingency[nzx, nzy]
    elif sp.issparse(contingency):
        # For a sparse matrix
        nzx, nzy, nz_val = sp.find(contingency)
    else:
        raise ValueError("Unsupported type for 'contingency': %s" % type(contingency))

    contingency_sum = contingency.sum()
    pi = np.ravel(contingency.sum(axis=1))
    pj = np.ravel(contingency.sum(axis=0))

    # Since MI <= min(H(X), H(Y)), any labelling with zero entropy, i.e. containing a
    # single cluster, implies MI = 0
    if pi.size == 1 or pj.size == 1:
        return 0.0

    log2_contingency_nm = np.log2(nz_val)
    contingency_nm = nz_val / contingency_sum
    # Don't need to calculate the full outer product, just for non-zeroes
    outer = pi.take(nzx).astype(np.int64, copy=False) * pj.take(nzy).astype(
        np.int64, copy=False
    )
    log2_outer = -np.log2(outer) + log2(pi.sum()) + log2(pj.sum())
    mi = (
        contingency_nm * (log2_contingency_nm - log2(contingency_sum))
        + contingency_nm * log2_outer
    )
    # Flush terms below machine epsilon to exactly zero before summing.
    mi = np.where(np.abs(mi) < np.finfo(mi.dtype).eps, 0.0, mi)
    return np.clip(mi.sum(), 0.0, None)


def adjusted_mutual_info_score(
    labels_true, labels_pred, *, average_method="arithmetic"
):
    """Adjusted mutual information: (MI - E[MI]) / (mean entropy - E[MI]).

    Parameters
    ----------
    labels_true, labels_pred : label sequences of equal length (arrays or lists).
    average_method : how the two entropies are combined into the normaliser;
        passed to ``_generalized_average``.

    Returns
    -------
    float; 1.0 when both labellings have exactly one (or zero) distinct label.
    """
    # len() instead of .shape[0] so plain Python lists of labels are accepted.
    n_samples = len(labels_true)
    classes = np.unique(labels_true)
    clusters = np.unique(labels_pred)

    # Special limit cases: no clustering since the data is not split.
    # It corresponds to both labellings having zero entropy.
    # This is a perfect match hence return 1.0.
    if (
        classes.shape[0] == clusters.shape[0] == 1
        or classes.shape[0] == clusters.shape[0] == 0
    ):
        return 1.0

    contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
    contingency = contingency.astype(np.float64, copy=False)
    # Calculate the MI for the two clusterings
    mi = mutual_info_score(labels_true, labels_pred, contingency=contingency)
    # Calculate the expected value for the mutual information
    # NOTE(review): `expected_mutual_information` is not defined or imported in
    # the visible part of this notebook - confirm it is provided elsewhere,
    # otherwise this call raises NameError.
    emi = expected_mutual_information(contingency, n_samples)
    # Calculate entropy for each labeling
    h_true, h_pred = entropy(labels_true), entropy(labels_pred)
    normalizer = _generalized_average(h_true, h_pred, average_method)
    denominator = normalizer - emi
    # Avoid 0.0 / 0.0 when expectation equals maximum, i.e a perfect match.
    # normalizer should always be >= emi, but because of floating-point
    # representation, sometimes emi is slightly larger. Correct this
    # by preserving the sign.
    if denominator < 0:
        denominator = min(denominator, -np.finfo("float64").eps)
    else:
        denominator = max(denominator, np.finfo("float64").eps)
    ami = (mi - emi) / denominator
    return ami


def normalized_mutual_info_score(
    labels_true, labels_pred, *, average_method="arithmetic"
):
    """Mutual information divided by a generalized mean of the two entropies.

    Parameters
    ----------
    labels_true, labels_pred : label sequences of equal length.
    average_method : mean used for the normaliser; passed to
        ``_generalized_average``.

    Returns
    -------
    float; 1.0 when both labellings have exactly one (or zero) distinct label,
    0.0 when the mutual information is zero.
    """
    n_true = np.unique(labels_true).shape[0]
    n_pred = np.unique(labels_pred).shape[0]

    # Neither labelling splits the data: both entropies are zero, which is
    # treated as a perfect match.
    if n_true == n_pred and n_true in (0, 1):
        return 1.0

    table = contingency_matrix(labels_true, labels_pred, sparse=True)
    table = table.astype(np.float64, copy=False)
    mi = mutual_info_score(labels_true, labels_pred, contingency=table)

    # The single-cluster case was handled above, so zero MI cannot be a
    # perfect match here; the score is 0 whatever the normaliser.
    if mi == 0:
        return 0.0

    h_true = entropy(labels_true)
    h_pred = entropy(labels_pred)
    return mi / _generalized_average(h_true, h_pred, average_method)





def entropy(labels):
    """Shannon entropy of a labelling, using base-2 logarithms.

    Returns 1.0 for an empty input and 0.0 when only one distinct label occurs.
    """
    if len(labels) == 0:
        return 1.0

    _, codes = np.unique(labels, return_inverse=True)
    counts = np.bincount(codes).astype(np.float64)
    counts = counts[counts > 0]

    # A single cluster carries no information.
    if counts.size == 1:
        return 0.0

    total = np.sum(counts)
    probabilities = counts / total
    # log2(a / b) is evaluated as log2(a) - log2(b) to limit precision loss.
    log_probabilities = np.log2(counts) - log2(total)
    return -np.sum(probabilities * log_probabilities)


<font size = "5"> **Privacy Analysis using DP** </font>

In [None]:
# Plot epsilon = 2*delta/lambda against lambda for three values of delta.
lambdas = [0.005, 0.01, 0.015, 0.02, 0.025, 0.030]

delta1, delta2, delta3 = 0.0005, 0.001, 0.005
epsilon_del1, epsilon_del2, epsilon_del3 = [
    [(2*delta)/lambd for lambd in lambdas] for delta in (delta1, delta2, delta3)
]

fig = plt.figure(figsize=(6,6))
default_x_ticks = lambdas
# One dashed curve per delta, each with its own marker shape.
for series, line_style in zip(
    (epsilon_del1, epsilon_del2, epsilon_del3), ('o--', 'd--', 's--')
):
    plt.plot(default_x_ticks, series, line_style, linewidth=2)

plt.xlabel('λ', fontsize = 14)
plt.ylabel('ε', fontsize = 14)
plt.legend(title='Datasets', title_fontsize = 13, labels=['δ = 0.0005', 'δ = 0.001', 'δ = 0.005'])
plt.show()

<font size = "5"> **Privacy Preserving Framework** </font>

In [None]:
#Function for constructing Incomplete Distance Matrix for PPDA

def dist_approx(X_na,X_a):
  """Build the incomplete distance matrix and the matrices derived from its weights.

  Only anchor-anchor and non-anchor-anchor Euclidean distances are computed;
  the non-anchor/non-anchor block of ``D`` is left at zero.

  Parameters
  ----------
  X_na : (n_na, d) array of non-anchor points.
  X_a : (n_a, d) array of anchor points.

  Returns
  -------
  D : (n_na + n_a) square matrix [[0, DNA], [DNA.T, DA]].
  V : diag(W_1 @ 1) - W_1.
  V1 : top-left (n_na x n_na) block of V.
  V2 : top-right (n_na x n_a) block of V.
  W_1 : weight matrix: identity on the non-anchor block, ones elsewhere.
  DA : anchor-to-anchor distances.
  DNA : non-anchor-to-anchor distances.
  """
  n_na, n_a = X_na.shape[0], X_a.shape[0]

  DA = cdist(X_a, X_a, metric='euclidean')      # anchor to anchor
  DNA = cdist(X_na, X_a, metric='euclidean')    # non-anchor to anchor

  # Incomplete distance matrix: the non-anchor/non-anchor block stays zero.
  D = np.block([
      [np.zeros((n_na, n_na)), DNA],
      [DNA.T, DA],
  ])

  # Weights: identity for the non-anchor block, ones for every block that
  # involves an anchor.
  W_1 = np.block([
      [np.eye(n_na), np.ones((n_na, n_a))],
      [np.ones((n_a, n_na)), np.ones((n_a, n_a))],
  ])

  # V = diag(row sums of W_1) - W_1.
  row_sums = np.matmul(W_1, np.ones(W_1.shape[0]))
  V = np.diag(row_sums) - W_1

  V1 = V[:n_na, :n_na]
  V2 = V[:n_na, n_na:]

  return D, V, V1, V2, W_1, DA, DNA

#Function for learning embeddings through MDS

def classical_MDS_X(D, V, W_1, n,d):
  """Learn embeddings for all n nodes with iterative SMACOF-style updates.

  Parameters
  ----------
  D : (n, n) target distance matrix.
  V : matrix whose pseudo-inverse is applied in every update.
  W_1 : (n, n) weight matrix, multiplied element-wise with D.
  n : number of nodes to embed.
  d : accepted for signature parity; not used here (the embedding width is
      fixed at 500 columns).

  Returns
  -------
  X_final : (n, 500) embedding from the last iteration.
  loss : list with the stress value of every iteration.

  Notes
  -----
  Reseeds NumPy's global RNG with 10 and prints the weighted distance matrix
  plus the stress every 10th iteration. Stops after 2000 iterations or as
  soon as the stress changes by less than 1e-3 between iterations.
  """
  tol = 1e-3
  max_epochs = 2000

  np.random.seed(10)
  Zu = np.random.normal(0,1,(n, 500))               # initial embedding
  loss = []
  V_pinv = np.linalg.pinv(V)

  W_new = np.multiply(W_1,D)
  print(W_new)

  D_new = cdist(Zu,Zu, metric='euclidean')

  for t in tqdm(range(max_epochs)):
    # Ratio of weighted target distance to current distance; entries where
    # the current distance is zero stay at zero.
    ratio = np.divide(W_new, D_new, out = np.zeros_like(W_new), where=(D_new!=0))
    B_Z = np.diag(np.matmul(ratio, np.ones(ratio.shape[0]))) - ratio
    X_final = np.matmul(np.matmul(V_pinv, B_Z), Zu)

    D_new = cdist(X_final, X_final, metric='euclidean')
    inv_dist = np.reciprocal(D_new, out = np.zeros_like(D_new), where=(D_new!=0))
    sq_err = np.square(D - D_new)
    # Stress: squared distance errors over the strict upper triangle, each
    # weighted by the inverse of the current distance.
    stress = np.dot(
        inv_dist[np.triu_indices(inv_dist.shape[0], k = 1)],
        sq_err[np.triu_indices(sq_err.shape[0], k = 1)],
    )
    loss.append(stress)
    Zu = X_final

    if t % 10 == 0:
       print(stress)

    if t != 0 and abs(loss[t]-loss[t-1]) < tol:
      break

  return X_final, loss

#Function for learning embeddings through Anchored-MDS 

def MDS_X(D, V1, V2, W_1, DA,X_na, X_a, n,d):
  """Learn non-anchor embeddings with anchored MDS; anchor coordinates stay fixed.

  Parameters
  ----------
  D : incomplete distance matrix over non-anchors followed by anchors.
  V1, V2 : non-anchor/non-anchor and non-anchor/anchor blocks of V.
  W_1 : weight matrix, multiplied element-wise with D.
  DA : anchor-to-anchor distances.
  X_na : non-anchor data; only its row count is used.
  X_a : anchor coordinates, used unchanged in every update.
  n : number of rows kept from the random initialisation.
  d : number of columns of the random initial embedding.

  Returns
  -------
  X_final : embedding of the non-anchor nodes from the last iteration.
  loss : list with the stress value of every iteration.

  Notes
  -----
  Prints D, reseeds NumPy's global RNG with 32 and prints the stress every
  10th iteration. Stops after 2000 iterations or as soon as the stress
  changes by less than 1e-3 between iterations.
  """
  n_na = X_na.shape[0]

  def assemble(cross):
    # Full distance matrix: zero non-anchor block, `cross` off-diagonal, DA bottom-right.
    top = np.hstack((np.zeros((n_na, n_na)), cross))
    bottom = np.hstack((np.transpose(cross), DA))
    return np.array(np.vstack((top, bottom)))

  print(D)
  D_inv = np.reciprocal(D,  out = np.zeros_like(D), where=(D!=0))
  L_D_inv = np.diag(np.matmul(D_inv,np.ones(D_inv.shape[0]))) - D_inv

  # Initial embedding: Gaussian samples whose covariance is the pseudo-inverse
  # of the Laplacian of the inverse distances; only the first n rows are kept.
  np.random.seed(32)
  samples = np.random.multivariate_normal(
      np.zeros(n_na + X_a.shape[0]), np.linalg.pinv(L_D_inv), d
  ).T
  Zu = samples[:n,:]

  tol = 1e-3
  max_epochs = 2000
  loss = []
  V1_pinv = np.linalg.pinv(V1)

  W_new = np.multiply(W_1,D)

  D_new = assemble(cdist(Zu, X_a, metric='euclidean'))
  for t in tqdm(range(max_epochs)):
    ratio = np.divide(W_new, D_new, out = np.zeros_like(W_new), where=(D_new!=0))
    B_Z = np.diag(np.matmul(ratio, np.ones(ratio.shape[0]))) - ratio

    BZ1 = B_Z[:n_na, :n_na]
    BZ2 = B_Z[:n_na, n_na:]

    # Update: V1^+ (BZ1 @ Zu + (BZ2 - V2) @ X_a).
    from_non_anchors = np.matmul(BZ1, Zu)
    from_anchors = np.matmul(BZ2 - V2, X_a)
    X_final = np.matmul(V1_pinv, (from_non_anchors + from_anchors))

    D_new = assemble(cdist(X_final, X_a, metric='euclidean'))
    inv_dist = np.reciprocal(D_new, out = np.zeros_like(D_new), where=(D_new!=0))
    sq_err = np.square(D - D_new)
    # Stress: squared distance errors over the strict upper triangle, each
    # weighted by the inverse of the current distance.
    stress = np.dot(
        inv_dist[np.triu_indices(D_inv.shape[0], k = 1)],
        sq_err[np.triu_indices(sq_err.shape[0], k = 1)],
    )

    loss.append(stress)
    Zu = X_final
    if t % 10 == 0:
       print(stress)

    if t != 0 and abs(loss[t]-loss[t-1]) < tol:
      break

  return X_final, loss

#Function for computing error in estimation of distances

def dist_error(X_na, X_final):
  """Relative Frobenius error between true and estimated pairwise distances.

  Returns
  -------
  Error : ||D_true - D_esti||_F / ||D_true||_F.
  D_true, D_esti : square Euclidean distance matrices of X_na and X_final.
  z_true, z_esti : the same distances in condensed (vector) form.
  """
  D_true = cdist(X_na, X_na, metric='euclidean')
  z_true = spatial.distance.squareform(D_true)

  D_esti = cdist(X_final, X_final, metric='euclidean')
  z_esti = spatial.distance.squareform(D_esti)

  residual_norm = np.linalg.norm((D_true - D_esti), 'fro')
  reference_norm = np.linalg.norm((D_true), 'fro')
  Error = residual_norm / reference_norm

  return Error, D_true, D_esti, z_true, z_esti

#Function for checking the F-score of neighborhood-structure preservation, where k is the number of nearest neighbors.
#Pass (k+1) to score the k nearest neighbors, because the first entry of each sorted row (the node's distance to itself) is skipped.

def check_score(D_true, D_approx,k):
      """Average per-node score of how well the nearest neighbours are preserved.

      For every row, the indices at sorted positions 1..k-1 of ``D_true`` and
      ``D_approx`` are compared (position 0 is skipped; it is assumed to be the
      node itself at distance zero).

      Parameters
      ----------
      D_true, D_approx : square distance matrices with matching row order.
      k : number of neighbours plus one; must be at least 2.

      Returns
      -------
      float : mean over rows of 2*count / (2*count + (k - 1 - count)), where
      ``count`` is the number of shared neighbour indices.

      Raises
      ------
      ValueError
          If ``k`` is below 2 or a row has fewer than ``k - 1`` other entries.
      """
      n_neighbours = k - 1
      if n_neighbours < 1:
        raise ValueError("k must be at least 2: pass (k+1) to score the k nearest neighbours")

      f_scores = []
      for i in range(D_true.shape[0]):
        true_nn = np.argsort(D_true[i])[1:k]
        approx_nn = np.argsort(D_approx[i])[1:k]
        if len(true_nn) < n_neighbours or len(approx_nn) < n_neighbours:
          raise ValueError("k - 1 exceeds the number of available neighbours")

        # Indices within one argsort are unique, so the overlap size equals
        # what the pairwise match loop counted.
        count = len(set(true_nn.tolist()) & set(approx_nn.tolist()))

        # NOTE(review): the denominator adds the (k-1-count) misses once; the
        # F-score in check_F_score uses FP + FN separately - confirm intended.
        f_score = 2*count/(2*count + (k- 1 - count))
        f_scores.append(f_score)
      avg_f_score = sum(f_scores)/len(f_scores)

      return avg_f_score
    
#Function for checking similarity between graph structures obtained in non-private and private manner

def check_F_score(A_esti, A_org):
      """F-score of the estimated graph's edges against the original graph's edges.

      Both adjacency matrices are converted to condensed form; a pair counts
      as an edge when its weight is strictly positive.

      Returns
      -------
      float : 2*TP / (2*TP + FP + FN).

      Notes
      -----
      Prints both condensed vectors and the TP/FP/FN counts.
      """
      esti_edges = spatial.distance.squareform(A_esti)
      org_edges = spatial.distance.squareform(A_org)
      print(org_edges)
      print(esti_edges)

      esti_present = esti_edges > 0
      org_present = org_edges > 0
      # int() keeps the counts as Python ints, as the element-wise loop produced.
      TP = int(np.count_nonzero(org_present & esti_present))
      FP = int(np.count_nonzero((org_edges == 0) & esti_present))
      FN = int(np.count_nonzero(org_present & (esti_edges == 0)))

      print("TP: {}, FP: {}, FN: {}".format(TP, FP, FN))
      F_score = (2*TP)/(2*TP + FP + FN)

      return F_score

<font size = "5"> **Dataset Loaders** </font>

**Human Activity Recognition (Large)**

In [None]:
# Activity names and the integer class ids (1-6) they are replaced with.
activity_names = ['STANDING', 'SITTING', 'LAYING', 'WALKING', 'WALKING_DOWNSTAIRS',
       'WALKING_UPSTAIRS']
activity_ids = [1, 2,3,4,5,6]

df = pd.read_csv("/train.csv", sep = ',')
# Assign the result back to the column: `df[col].replace(..., inplace=True)`
# operates on an intermediate Series and does not update `df` once pandas
# copy-on-write is in effect.
df['Activity'] = df['Activity'].replace(activity_names, activity_ids)
X_na = df.to_numpy()
Y_na = X_na[:,-1]          # last column holds the encoded activity
X_na = X_na[:,:561]        # first 561 columns are kept as features
df_test = pd.read_csv("/test.csv", sep = ',')
df_test['Activity'] = df_test['Activity'].replace(activity_names, activity_ids)
X_a = df_test.to_numpy()
# NOTE(review): the number of anchor rows kept is derived from the column
# count (columns - 1) - confirm this is the intended anchor count.
X_a = X_a[:X_a.shape[1] - 1, :561]


n = X_na.shape[0]
d = X_na.shape[1]

**Human Activity Recognition (Moderate)**

In [None]:
# Features: train rows become the non-anchor data, test rows the anchors.
df = pd.read_csv("/final_X_train.txt", sep = ',', header = None)
df_test = pd.read_csv("/final_X_test.txt", sep = ',', header = None)
X_na = df.to_numpy()
X_a = df_test.to_numpy()
# Keep the first (number of columns - 1) test rows as anchors.
X_a = X_a[:X_a.shape[1] - 1]

# Labels: the training labels are flattened into a plain Python list.
df_y_train = pd.read_csv("/final_y_train.txt", sep = ',', header = None)
df_y_test = pd.read_csv("/final_y_test.txt", sep = ',', header = None)
Y_na = df_y_train.to_numpy()
Y_na = [item for sublist in Y_na for item in sublist]
Y_a = df_y_test.to_numpy()

n, d = X_na.shape[0], X_na.shape[1]

**PANCAN**

In [None]:
from sklearn import preprocessing

# Features: every column after the first one of data.csv.
df = pd.read_csv("/data.csv", sep = ",")
print(df.head())
X = df.to_numpy()
X_na = X[:,1:]

# Labels: every column after the first one of labels.csv, encoded to 1..K.
df_label = pd.read_csv("/labels.csv", sep = ",")
Y = df_label.to_numpy()
Y_temp = Y[:,1:]
le = preprocessing.LabelEncoder()
le.fit(Y_temp)
Y_na = le.transform(Y_temp) + 1   # shift the encoder's 0-based codes to start at 1
print(Y_na)

print(X_na.shape)
n_classes = 5

<font size = "3"> **Iris** </font>

In [None]:
# Iris: four feature columns; labels are assigned by position in blocks of
# 50 rows (1, 2, 3).
df = pd.read_csv("/bezdekIris.data.txt", sep = ",", header = None)
print(df.head())
X = df.to_numpy()
X_na = X[:,:4]
Y_na = np.repeat([1.0, 2.0, 3.0], 50)
print(X_na.shape)
n_classes = 3

**Glass**

In [None]:
from scipy import stats

# Glass: columns 1-9 are features, column 10 is the class label.
df = pd.read_csv("/glass.data.txt", sep = ",", header = None)
print(df.head())
X = df.to_numpy()
X_na = X[:,1:10]
Y_na = X[:, 10]
# Relabel class 7 as 4. Y_na is a view of X, so this also edits X in place.
is_class_seven = (Y_na == 7)
Y_na[is_class_seven] = 4
print(X_na.shape)
print(X_na)
n_classes = 6

**Wine**

In [None]:
from scipy import stats

# Wine: column 0 is the class label, columns 1-13 are features.
df = pd.read_csv("/wine.data.txt", sep = ",", header = None)
print(df.head())
X = df.to_numpy()
Y_na = X[:, 0]
X_na = X[:,1:14]
print(X_na.shape)
n_classes = 3

**WDBC**

In [None]:
from sklearn.preprocessing import LabelEncoder
from scipy import stats

# WDBC: column 1 is the class label, columns 2-31 are features.
df = pd.read_csv("/wdbc.data.txt", sep = ",", header = None)
print(df.head())
X = df.to_numpy()
Y = X[:, 1]
X_na = X[:,2:32]

# Encode the labels and shift the 0-based codes so classes start at 1.
la = LabelEncoder()
labels = la.fit_transform(Y)
Y_na = labels + 1
print(X_na.shape)
n_classes = 2

**Control Chart**

In [None]:
# Control chart: whitespace-separated series, one per row; every column is a feature.
# sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
df = pd.read_csv("/synthetic_control.data.txt", sep = r"\s+", header = None)
print(df.head())
X = df.to_numpy()
X_na = X[:,:]
# Labels by position: 100 consecutive rows for each of the classes 1..6.
c = np.array([1,2,3,4,5,6])
Y_na = np.tile(c,(100,1))
Y_na = Y_na.flatten('F')
print(X_na.shape)
from scipy import stats
#X_na = stats.zscore(X_na.astype('float'), axis=1, ddof=1)
n_classes = 6

**Parkinsons**

In [None]:
from scipy import stats

# Parkinsons: column 17 is the label; columns 1-16 and 18-23 are features.
df = pd.read_csv("/parkinsons.data.txt", sep = ",")
print(df.head())
X = df.to_numpy()
cols = np.r_[1:17, 18:24]
X_na = X[:, cols]
Y_na = X[:, 17] + 1        # shift the labels up by one
print(X_na.shape)
n_classes = 2

**Vertebral**

In [None]:
# Vertebral column: whitespace-separated; columns 0-5 are features, column 6 the label.
# sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
df = pd.read_csv("/column_3C.dat",  sep = r"\s+", header = None)
print(df.head())
X = df.to_numpy()
X_na = X[:,:6]
Y = X[:,6]
from sklearn.preprocessing import LabelEncoder
# Encode the labels and shift the 0-based codes so classes start at 1.
la = LabelEncoder()
labels = la.fit_transform(Y)
Y_na = labels + 1
print(Y_na)
print(X_na.shape)
from scipy import stats
#X_na = stats.zscore(X_na.astype('float'), axis=1, ddof=1)
n_classes = 3

**Breast tissue**

In [None]:
from sklearn.preprocessing import LabelEncoder
from scipy import stats

# Breast tissue: column 1 is the class label, columns 2-10 are features.
df = pd.read_csv("/breast-tissue.txt", sep = ",", header = None)
print(df.head())
X = df.to_numpy()
Y = X[:, 1]
X_na = X[:,2:11]

# Encode the labels and shift the 0-based codes so classes start at 1.
la = LabelEncoder()
labels = la.fit_transform(Y)
Y_na = labels + 1
print(X_na.shape)
n_classes = 6

**Seeds**

In [None]:
# Seeds: tab-separated; columns 0-6 are features, column 7 is the class label.
df = pd.read_csv("/seeds_dataset.txt", sep = "\t", header = None)
print(df.head())
X = df.to_numpy()
print(X.shape)
Y_na = X[:, 7]
X_na = X[:,:7]
print(Y_na)
print(X_na.shape)

n_classes = 3

**Segmentation**

In [None]:
from sklearn.preprocessing import LabelEncoder
# Segmentation: the data and test files are stacked into one table;
# column 0 is the class label and the remaining columns are features.
df1 = pd.read_csv("/segmentation.data.txt", sep = ",", header = None)
df2 = pd.read_csv("/segmentation.test.txt", sep = ",", header = None)
# Preview the frame loaded in this cell. The previous `df` is not defined
# here and would be a leftover from another dataset cell (or a NameError).
print(df1.head())
X1 = df1.to_numpy()
X2 = df2.to_numpy()
X = np.vstack((X1,X2))
X_na = X[:,1:]
Y = X[:, 0]

# Encode the labels and shift the 0-based codes so classes start at 1.
la = LabelEncoder()
labels = la.fit_transform(Y)
Y_na = labels + 1
from scipy import stats
# Standardise each sample (row) across its features, using ddof=1.
X_na = stats.zscore(X_na.astype(float), axis=1, ddof=1)
n_classes = 7

**Yeast**

In [None]:
from sklearn.preprocessing import LabelEncoder
# Yeast: whitespace-separated; columns 1-8 are features, column 9 is the class label.
# sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
df = pd.read_csv("/yeast.data.txt", sep = r"\s+" , header = None)
print(df.head())
X = df.to_numpy()
X_na = X[:,1:9]
Y = X[:, 9]

# Encode the labels and shift the 0-based codes so classes start at 1.
la = LabelEncoder()
labels = la.fit_transform(Y)
Y_na = labels + 1
print(X_na.shape)
from scipy import stats
#X_na = stats.zscore(X_na.astype(float), axis=1, ddof=1)
n_classes = 10

In [None]:
# Hold out 10% of the samples as candidate anchors (reference nodes).
# This overwrites X_na / Y_na, so re-running the cell splits the already
# reduced data again.
X_na, X_a, Y_na, Y_a = train_test_split(X_na, Y_na, test_size=0.10, random_state=42)

# n: number of non-anchor nodes (clients); d: dimensionality of the data.
n, d = X_na.shape[0], X_na.shape[1]

# Keep the first d-1 held-out rows as anchors.
X_a = X_a[:d - 1]
print("Size of reference nodes(anchors)", X_a.shape)

# Non-anchors first, anchors last.
X = np.vstack((X_na, X_a))
nodes = len(X)


In [None]:
# Replace the anchors with 81 points drawn uniformly from [0, 1) per
# coordinate (the global NumPy RNG is not seeded here).
X_a = np.random.uniform(low=0, high=1, size=(81, d))
X = np.vstack((X_na, X_a))

nodes = len(X)

In [None]:
# Scale every sample to unit Euclidean norm, in place.
for i, sample in enumerate(X_na):
    X_na[i] = sample/np.linalg.norm(sample)

In [None]:
import random
# Pick half of the non-anchor nodes as attackers, sampled without
# replacement (Python's `random` module is not seeded here).
indices = list(range(X_na.shape[0]))
num_attackers = int(0.50 * X_na.shape[0])
attackers_list = random.sample(indices, num_attackers)
print(attackers_list)

In [None]:
# Perturb each attacker's sample in place with uniform noise scaled by 0.3.
# Re-running this cell adds noise again on top of the previous run.
for k in attackers_list:
    perturbation = 0.3*np.random.uniform(0,1,X_na[k].shape)
    X_na[k] += perturbation

In [None]:
# Noisy copy of the data: each sample is rescaled to norm `delta`, then
# zero-centred Laplace noise (scale 0.005) is added.
delta = 0.2
X_na_noisy = np.zeros((X_na.shape[0], X_na.shape[1]))
# Iterate over the rows of X_na itself. The loop bound previously came from
# `DNA`, which is only created by a later cell.
for i in range(X_na.shape[0]):
    X_na_noisy[i] = X_na[i] * delta / (np.linalg.norm(X_na[i]))
    X_na_noisy[i] += np.random.laplace(loc = 0, scale = 0.005, size = X_na_noisy[i].shape)

In [None]:
#1) Using Anchored MDS:

#D, V, V1, V2, W_1, DA, DNA = dist_approx(X_na.astype('float'), X_a.astype('float'), attackers_list)
# Convert once and reuse: none of the three functions below writes to its inputs.
X_na_float, X_a_float = X_na.astype('float'), X_a.astype('float')

D, V, V1, V2, W_1, DA, DNA = dist_approx(X_na_float, X_a_float)
X_final, loss = MDS_X(D, V1, V2, W_1, DA, X_na_float, X_a_float, n, d)
error, D_true, D_esti, z_true, z_esti = dist_error(X_na_float, X_final)
print("Error in distance approximation: ", error)
# 11 = 10 nearest neighbours plus the node itself.
fscore = check_score(D_true, D_esti, k=11)
print("F-score: ", fscore)

In [None]:
# delta = 0.0005
# for i in range(DNA.shape[0]):
#     DNA[i] = DNA[i] * delta / (np.linalg.norm(DNA[i]))
#     DNA[i] += np.random.laplace(loc = 0, scale = 0.005, size = DNA[i].shape)
# for i in range(DNA.shape[0]):
#     print(np.linalg.norm(DNA[i]))
    

In [None]:
# All nodes in one array: non-anchors first, anchors last.
X = np.vstack([X_na, X_a])

nodes = len(X)

In [None]:
#2) Using classical MDS

# Convert once and reuse: none of the functions below writes to its inputs.
X_na_float, X_a_float = X_na.astype('float'), X_a.astype('float')

D, V, V1, V2, W_1, DA, DNA = dist_approx(X_na_float, X_a_float)
X_final, loss = classical_MDS_X(D, V, W_1, nodes,d)
# Only the first X_na.shape[0] rows (the non-anchors) are scored.
error, D_true, D_esti, z_true, z_esti = dist_error(X_na_float, X_final[:X_na.shape[0], :])
print("Error in distance approximation: ", error)
# 11 = 10 nearest neighbours plus the node itself.
fscore = check_score(D_true, D_esti, k=11)
print("F-score: ", fscore)

In [None]:
#Validating Results 

print(X_final.shape)
# Rows of the embedding that belong to non-anchor nodes.
embed_non_anc = X_final[:X_na.shape[0], :]

# Print each quantity under its heading, in a fixed order.
for heading, value in (
    ("Embeddings:", embed_non_anc),
    ("Original data:", X_na),
    ("Estimated distance:", cdist(embed_non_anc,embed_non_anc, metric='euclidean')),
    ("True distance:", D_true),
    ("Anchor to Anchor Distance: ", DA),
    ("Anchor to Non-anchor distance: ", DNA),
):
    print(heading, value)

<font size = "5"> **Distance plot** </font>

In [None]:
# True euclidean distance (vertical axis) vs. predicted euclidean distance (horizontal axis) plot for PPDA
fig = plt.figure(figsize=(6,6))
# Predicted distance on the horizontal axis, real distance on the vertical axis.
plt.scatter(x = z_esti, y = z_true, s = 5, c = 'red', alpha = 0.6)
# Reference diagonal y = x, drawn from 2 to 4.5 on both axes.
diagonal = [2,4.5]
plt.plot(diagonal, diagonal, color='black', linestyle = '-')
plt.xlabel("Predicted distance", fontsize = 12)
plt.ylabel("Real distance",fontsize = 12)
plt.show()

In [None]:
#Plot of Loss vs epochs:
fig = plt.figure(figsize=(10,10))
# Skip the first 10 iterations so the early, larger stress values do not
# dominate the vertical scale.
loss_tail = loss[10:]
plt.plot(np.arange(len(loss_tail)), loss_tail)
plt.show()

In [None]:
# Noisy copy of the data: each sample is rescaled to norm `delta`, then
# zero-centred Laplace noise (scale 0.01) is added.
delta = 0.1
X_na_noisy = np.zeros(X_na.shape[:2])
for i, sample in enumerate(X_na):
    X_na_noisy[i] = sample * delta / (np.linalg.norm(sample))
    X_na_noisy[i] += np.random.laplace(loc = 0, scale = 0.01,size = X_na_noisy[i].shape)

<font size = "5"> **T-SNE visualization for graph structure positioning** </font>

In [None]:
from sklearn.manifold import TSNE
def visualize(h, color):
    """Embed ``h`` in 2-D with t-SNE, scatter-plot it and return the embedding.

    Parameters
    ----------
    h : data to embed, one sample per row.
    color : per-sample values mapped to colours through the "Set2" colormap.

    Returns
    -------
    The (n_samples, 2) array produced by ``TSNE.fit_transform``.
    """
    embedding = TSNE(n_components=2).fit_transform(h)

    plt.figure(figsize=(10,10))
    # Hide the axis ticks.
    plt.xticks([])
    plt.yticks([])

    xs, ys = embedding[:, 0], embedding[:, 1]
    plt.scatter(xs, ys, s=70, c=color, cmap="Set2")
    plt.show()

    return embedding

In [None]:
#z = visualize(X_na_noisy,Y_na)
# t-SNE of the original data and of the learnt embedding (non-anchor rows only).
z = visualize(X_na,Y_na)
z_private = visualize(X_final[:X_na.shape[0]], Y_na)                    #For Classical MDS
print(type(z))

# Node index -> 2-D position taken from the t-SNE of the original data.
pos = {}
for i, point in enumerate(z[:X_na.shape[0]]):
    pos[i] = point

In [None]:
from sklearn.neighbors import kneighbors_graph
# Dense 10-nearest-neighbour distance graphs of the original data (AS)
# and of the learnt embedding (AS_1).
AS = kneighbors_graph(X_na,10,mode='distance').toarray()
AS_1 = kneighbors_graph(X_final,10,mode='distance').toarray()

<font size = "5"> **Graph Clustering** </font>

<font size = "5"> **Constrained Laplacian Rank (CLR) Algorithm** </font>

In [None]:
from sklearn.neighbors import kneighbors_graph
from scipy.sparse import csr_matrix
from networkx import *
from scipy.linalg import fractional_matrix_power
from sklearn.cluster import KMeans
from quadprog import solve_qp

#Function for clustering graph into k-components

def cluster_k_component_graph(Y, k = 1, m = 5, lmd = 100, eigtol = 1e-9, edgetol = 1e-2, maxiter = 1000):
  """Learn a graph whose Laplacian has k near-zero eigenvalues.

  Starting from the m-nearest-neighbour graph of ``Y``, each iteration solves
  one quadratic program per row to update the similarity matrix ``S``, then
  recomputes the Laplacian and its first ``k`` eigenvectors. ``lmd`` is halved
  when the Laplacian has more than ``k`` eigenvalues below ``eigtol`` in
  absolute value and doubled when it has fewer; the loop stops when the count
  equals ``k`` or after ``maxiter`` iterations.

  Parameters
  ----------
  Y : (n, d) data matrix, one sample per row.
  k : target number of near-zero Laplacian eigenvalues.
  m : number of neighbours used for the initial graph.
  lmd : initial value of the trade-off parameter.
  eigtol : threshold below which an eigenvalue counts as zero.
  edgetol : Laplacian entries smaller than this in magnitude are zeroed.
  maxiter : maximum number of iterations.

  Returns
  -------
  LS : thresholded Laplacian of the learnt graph.
  AS : adjacency matrix, diag(diag(LS)) - LS.
  eig_vals : sorted eigenvalues of the last Laplacian, before thresholding.
  lmd_seq : list of the ``lmd`` values used, starting with the initial one.
  """
  A = build_initial_graph(Y, m)
  n = A.shape[0]
  S = np.full((n, n), 1/n)
  DS = np.diag(0.5 * np.sum((S + S.T), axis = 0))
  LS =  DS - .5 * (S + S.T)
  DA = np.diag(0.5 * np.sum((A + A.T), axis = 0))
  LA = DA - .5 * (A + A.T)
  # First k eigenvectors of the initial Laplacian (the same for every k,
  # so no special case is needed for k == 1).
  F = eigvec_sym(LA)
  F = F[:, 0:k]
  # Constraints handed to the QP solver: the first column of Amat is all
  # ones (bound 1), the remaining columns are the identity (bound 0).
  # NOTE(review): `meq` is left at its default, so the sum constraint is
  # passed as an inequality rather than an equality - confirm intended.
  x = np.array([1, 0])
  bvec = np.repeat(x, [1, n], axis=0)
  Amat = np.hstack((np.repeat(1,n, axis = 0).reshape(n,1), np.eye(n)))
  lmd_seq = [lmd]
  eig_vals = None   # stays None only when the loop body never runs

  for t in tqdm(range(maxiter)):

    V =  cdist(F,F, metric='sqeuclidean')
    for i in range(n):
      p = A[i,:] - .5 * lmd * V[i,:]
      qp,_,_,_,_,_ = solve_qp(G = np.eye(n).astype('double'), a = p.astype('double'), C = Amat.astype('double'), b = bvec.astype('double'))
      S[i] = qp

    DS = np.diag(0.5 * np.sum((S + S.T), axis = 0))
    LS = DS - .5 * (S + S.T)
    F = eigvec_sym(LS)
    F = F[:, 0:k]
    eig_vals = eigval_sym(LS)
    n_zero_eigenvalues = np.sum(np.abs(eig_vals) < eigtol)

    if k < n_zero_eigenvalues:
      lmd = .5 * lmd
    elif k > n_zero_eigenvalues:
      lmd = 2 * lmd
    else:
      break
    lmd_seq.append(lmd)

  if eig_vals is None:
    # maxiter == 0: report the spectrum of the initial uniform Laplacian
    # instead of failing on an unbound name at the return statement.
    eig_vals = eigval_sym(LS)

  LS[np.abs(LS) < edgetol] = 0
  AS = np.diag(np.diag(LS)) - LS

  return LS, AS, eig_vals,lmd_seq

def eigvec_sym(L):
    """Eigenvectors of the symmetric matrix ``L``, ordered by ascending eigenvalue.

    Uses ``numpy.linalg.eigh``, the solver for symmetric/Hermitian input: its
    eigenvalues come back in ascending order and its eigenvectors are
    orthonormal, including within repeated eigenvalues (for example the
    multiple zero eigenvalues of a Laplacian with several components).

    Parameters
    ----------
    L : (n, n) symmetric matrix.

    Returns
    -------
    (n, n) real array whose column j is the eigenvector of the j-th smallest
    eigenvalue.
    """
    _, V = np.linalg.eigh(L)
    return np.real(V)
def eigval_sym(L):
    """Eigenvalues of the symmetric matrix ``L`` in ascending order.

    Uses ``numpy.linalg.eigvalsh``, which returns real eigenvalues already
    sorted in ascending order for symmetric/Hermitian input.
    """
    return np.linalg.eigvalsh(L)


#Function for building the initial graph structure

def build_initial_graph(Y,m):
      """Build the initial m-nearest-neighbour similarity matrix of ``Y``.

      For each row i the m closest other samples (by squared Euclidean
      distance) receive the weight (e - E[i, j]) / den, where e is the
      distance to the (m+1)-th closest sample and
      den = m * e - sum of the m closest distances. Every other entry is zero.

      Parameters
      ----------
      Y : (n, d) data matrix, one sample per row; needs at least m + 2 rows.
      m : number of neighbours per row.

      Returns
      -------
      (n, n) array; generally not symmetric.
      """
      n = Y.shape[0]
      A = np.zeros((n,n))
      E = cdist(Y,Y, metric='sqeuclidean')
      for i, row in enumerate(E):
        order = np.argsort(row)
        # Position 0 of the ordering is skipped; the next m are the neighbours.
        neighbours = order[1:(m+1)]
        cutoff = row[order[m+1]]
        den = m * cutoff - np.sum(row[neighbours])
        A[i, neighbours] = (cutoff - row[neighbours]) / den

      return A

#Function for checking quality metrics of clustering

def metrics(Y_na, assignment):
    """Return (NMI, ARI) of a cluster assignment against the true labels.

    The NMI is normalised by the larger of the two entropies
    (``average_method='max'``).
    """
    return (
        normalized_mutual_info_score(Y_na, assignment, average_method='max'),
        adjusted_rand_score(Y_na, assignment),
    )

In [None]:
#LS, AS, eig_vals,lmd_seq = cluster_k_component_graph(X_na_noisy.astype('float'),6,5)
#LS, AS, eig_vals,lmd_seq = cluster_k_component_graph(X_na.astype('float'),6,5)
# CLR on the learnt embedding (anchored MDS): k = 3 components, m = 7 neighbours.
LS_1, AS_1, eig_vals_1,lmd_seq_1 = cluster_k_component_graph(X_final.astype('float'), k = 3, m = 7)
#LS_1, AS_1, eig_vals_1,lmd_seq_1 = cluster_k_component_graph(X_final[:X_na.shape[0]].astype('float'),1,5)              #For classical MDS

In [None]:
# `lmd_seq` and `eig_vals` only exist when one of the commented-out
# non-private CLR calls has been run; otherwise show the private results alone
# instead of failing with a NameError.
if "lmd_seq" in globals() and "eig_vals" in globals():
    print(lmd_seq,lmd_seq_1)
    print(eig_vals, eig_vals_1)
else:
    print(lmd_seq_1)
    print(eig_vals_1)

In [None]:
# Heat map of the adjacency matrix AS, with a colour scale.
plt.figure(figsize=(8, 8))
heatmap = plt.imshow(AS)
plt.colorbar(heatmap)
print(AS[0])

In [None]:
#Assigning colours to clusters for different numbers of classes

print(Y_na)

# Three-class palette: label 1 -> first colour, label 2 -> second colour,
# any other label -> third colour.
# For two classes use colors = ["blue", "red"] with label 1 -> first, else second.
colors = ["lightcoral", "blue", "red"]
C = []
for label in Y_na:
    if label == 1:
        C.append(colors[0])
    elif label == 2:
        C.append(colors[1])
    else:
        C.append(colors[2])

#5 classes:
#colors = ["lightcoral", "gray", "blue", "red", "chocolate"]
#C = [colors[0] if i == 1 else colors[1] if i == 2 else colors[2] if i == 3 else colors[3] if i == 4 else colors[4] for i in Y_na]

#6 classes:
#colors = ["lightcoral", "gray", "blue", "red", "chocolate", "darkorange"]
#C = [colors[0] if i == 1 else colors[1] if i == 2 else colors[2] if i == 3 else colors[3] if i == 4 else colors[4] if i == 5 else colors[5] for i in Y_na]

#7 classes:
#C = [colors[0] if i == 1 else colors[1] if i == 2 else colors[2] if i == 3 else colors[3] if i == 4 else colors[4] if i == 5 else colors[5] if i == 6 else colors[6] for i in Y_na]

#10 classes:
#colors = [ "gray", "blue", "red", "chocolate", "darkorange","yellow","darkolivegreen", "chartreuse", "cadetblue","magenta"]
#C = [colors[0] if i == 1 else colors[1] if i == 2 else colors[2] if i == 3 else colors[3] if i == 4 else colors[4] if i == 5 else colors[5] if i == 6 else colors[6] if i == 7 else colors[7] if i == 8 else colors[8] if i == 9 else colors[9] for i in Y_na]


In [None]:
# Persist the learned adjacency matrices, the embedding, the raw features and
# the labels as .npy files in the current working directory.
for npy_stem, array_to_save in (
    ('Adjacency_private_ActivityLarge', AS_1),
    ('Adjacency_non_private_ActivityLarge', AS),
    ('X_final_ActivityLarge', X_final),
    ('X_na_ActivityLarge', X_na),
    ('Y_na_ActivityLarge', Y_na),
):
    np.save(npy_stem, array_to_save)


In [None]:
# Export the embedding, both adjacency matrices and the labels as
# comma-separated text files in the current working directory.
for csv_name, array_to_save in (
    ("X_final_Iris.csv", X_final),
    ("Adjacency_Iris_private.csv", AS_1),
    ("Adjacency_Iris_non_private.csv", AS),
    ("Y_na_Iris.csv", Y_na),
):
    np.savetxt(csv_name, array_to_save, delimiter=",")

<font size = "5"> **Plotting graph structures for non-private and private cases** </font>

In [None]:
# Build a graph from the adjacency matrix AS_1 and draw it with a spring layout,
# colouring each node with the per-node colour list `C`.
#
# nx.from_numpy_matrix was removed in NetworkX 3.0; nx.from_numpy_array is the
# supported constructor and builds the same weighted graph from a 2-D ndarray.
# The commented lines are the equivalent steps for the non-private matrix AS.
#G_org = nx.from_numpy_array(AS)
G_private = nx.from_numpy_array(AS_1)
#print('Graph statistics(shared):')
#print('Nodes: ', G_org.number_of_nodes(), 'Edges: ', G_org.number_of_edges())

#print('Graph statistics(private):')
#print('Nodes: ', G_private.number_of_nodes(), 'Edges: ', G_private.number_of_edges())
#pos1 = nx.spring_layout(G_org)
pos2 = nx.spring_layout(G_private)

fig = plt.figure(figsize=(12,12))
#nx.draw_networkx(G_org,pos1,node_color = C, node_size = 50, with_labels = False, width = 0.2)

plt.box(False)
nx.draw_networkx(G_private,pos2,node_color = C, node_size = 50, with_labels = False, width = 0.2)
#plt.savefig("C:/Users/nimes/OneDrive/Desktop/PANCAN_non_private.eps")

plt.show()


<font size = "5"> **Assigning clusters** </font>

<font size = "5"> **Non-private graph clustering** </font>

In [None]:
# Collect the connected component of G_org around each hand-picked seed node and,
# after every component, print the node ids that are still unassigned (use the
# printed leftovers to choose the next seed).
# NOTE(review): the only visible construction of G_org is a commented-out line
# in the plotting cell - confirm G_org exists in the kernel before running.
z = list(range(X_na.shape[0]))

unassigned = set(z)
components = []
for seed_node in (0, 66, 260, 107, 204, 334):
    members = list(nx.node_connected_component(G_org, seed_node))
    components.append(members)
    unassigned = unassigned - set(members)
    print(unassigned)

cluster1, cluster2, cluster3, cluster4, cluster5, cluster6 = components
# For data sets with more classes, append further seed nodes to the tuple above
# and extend the unpacking (cluster7, cluster8, ...).

In [None]:
# Turn the connected components into one cluster label per node and score the
# labelling against the ground truth Y_na (NMI with geometric averaging, and ARI).
#
# Nodes in cluster1..cluster5 get labels 1..5; every remaining node gets label 6.
# A node -> label dict replaces the repeated `i in <list>` tests, which scanned
# each cluster list for every node.
nodes = X_na.shape[0]
#ordered_clusters = (cluster1,)                                              # 2 labels
#ordered_clusters = (cluster1, cluster2)                                     # 3 labels
#ordered_clusters = (cluster1, cluster2, cluster3, cluster4)                 # 5 labels
ordered_clusters = (cluster1, cluster2, cluster3, cluster4, cluster5)        # 6 labels

label_of_node = {}
for cluster_label, cluster_nodes in enumerate(ordered_clusters, start=1):
    for node in cluster_nodes:
        # setdefault keeps the first (lowest) label, like the original if/else chain.
        label_of_node.setdefault(node, cluster_label)

fallback_label = len(ordered_clusters) + 1
assignment = [label_of_node.get(i, fallback_label) for i in range(nodes)]
print(assignment)
print(normalized_mutual_info_score(Y_na, assignment, average_method = 'geometric'))
print(adjusted_rand_score(Y_na, assignment))


<font size = "5"> **Private graph clustering** </font>

In [None]:
# Collect the connected component of G_private around each hand-picked seed node
# and, after every component, print the node ids that are still unassigned (use
# the printed leftovers to choose the next seed).
z = list(range(X_na.shape[0]))

unassigned = set(z)
components = []
for seed_node in (0, 1, 2):
    members = list(nx.node_connected_component(G_private, seed_node))
    components.append(members)
    unassigned = unassigned - set(members)
    print(unassigned)

cluster1, cluster2, cluster3 = components
# For data sets with more classes, append further seed nodes to the tuple above
# and extend the unpacking (cluster4, cluster5, ...).

In [None]:
# Turn the connected components into one cluster label per node and score the
# labelling against the ground truth Y_na (NMI with geometric averaging, and ARI).
#
# Nodes in cluster1 and cluster2 get labels 1 and 2; every remaining node gets
# label 3. A node -> label dict replaces the repeated `i in <list>` tests, which
# scanned each cluster list for every node.
nodes = X_na.shape[0]
#ordered_clusters = (cluster1,)                                              # 2 labels
ordered_clusters = (cluster1, cluster2)                                      # 3 labels
#ordered_clusters = (cluster1, cluster2, cluster3, cluster4)                 # 5 labels
#ordered_clusters = (cluster1, cluster2, cluster3, cluster4, cluster5)       # 6 labels

label_of_node = {}
for cluster_label, cluster_nodes in enumerate(ordered_clusters, start=1):
    for node in cluster_nodes:
        # setdefault keeps the first (lowest) label, like the original if/else chain.
        label_of_node.setdefault(node, cluster_label)

fallback_label = len(ordered_clusters) + 1
assignment = [label_of_node.get(i, fallback_label) for i in range(nodes)]
print(assignment)
print(normalized_mutual_info_score(Y_na, assignment, average_method = 'geometric'))
print(adjusted_rand_score(Y_na, assignment))

<font size = "5"> **Animals** </font>

In [None]:
# Load the animals data set: an integer matrix parsed from comma-separated rows,
# plus the comma-separated entries of the first line of the names file.
# NOTE(review): both paths are absolute and machine-specific.
with open('C:/Users/nimes/Downloads/Project_codes/animal_data.txt', 'r') as f:
    data = np.asarray([[int(token) for token in row.split(',')] for row in f])
n = data.shape[0]
d = data.shape[1]
with open('C:/Users/nimes/Downloads/Project_codes/animal_name.txt', 'r') as f:
    names = [row.split(',') for row in f][0]
print(data.shape)

# Random binary anchor points: d-1 anchors of dimension d (drawn without a seed).
anchors = np.random.randint(2, size=(d-1,d))
#anchors = np.random.rand(d-1,d)
#anchors = np.random.randn(d-1, d)
print(anchors.shape)

# Names used by the pipeline cells: non-anchor data and anchor data.
X_na = data
X_a = anchors

In [None]:
# Display the animals feature matrix (X_na is bound to `data` in the loading cell).
X_na

In [None]:

# Anchored-MDS pipeline on the animals data.
# Helper semantics are inferred from their names and printed labels - TODO confirm:
# dist_approx derives distance quantities from X_na and the anchors X_a, MDS_X
# produces the embedding X_final, and dist_error / check_score compare X_final
# with the original X_na.
# Every call receives its own float copy of X_na / X_a via .astype('float').
D, V, V1, V2, W_1, DA, DNA = dist_approx(X_na.astype('float'), X_a.astype('float'))
X_final, loss = MDS_X(D, V1, V2, W_1, DA, X_na.astype('float'), X_a.astype('float'), n, d)
error, D_true, D_esti, z_true, z_esti = dist_error(X_na.astype('float'), X_final)
print("Error in distance approximation: ", error)
# NOTE(review): the meaning of the constant 11 passed to check_score is not visible here.
fscore = check_score(D_true, D_esti, 11)
print("F-score: ", fscore)

In [None]:
# Classical-MDS variant on the animals data. This cell rebinds D, X_final, loss,
# error and fscore, replacing the values from the anchored-MDS cell.
# Stack non-anchor rows first, then anchor rows.
X = np.vstack((X_na, X_a))

# Total number of points (non-anchors + anchors).
nodes = X.shape[0]

#2) Using classical MDS

D, V, V1, V2, W_1, DA, DNA = dist_approx(X_na.astype('float'), X_a.astype('float'))
X_final, loss = classical_MDS_X(D, V, W_1, nodes,d)
# Only the first X_na.shape[0] rows of X_final are compared with X_na
# (presumably the non-anchor rows, matching the stacking order above - TODO confirm).
error, D_true, D_esti, z_true, z_esti = dist_error(X_na.astype('float'), X_final[:X_na.shape[0], :])
print("Error in distance approximation: ", error)
# NOTE(review): the meaning of the constant 11 passed to check_score is not visible here.
fscore = check_score(D_true, D_esti, 11)
print("F-score: ", fscore)

In [None]:
# Export the animals feature matrix and its embedding as comma-separated text
# files in the current working directory.
for csv_name, array_to_save in (
    ("X_na_animal.csv", X_na),
    ("X_final_animal_50.csv", X_final),
):
    np.savetxt(csv_name, array_to_save, delimiter=",")

<font size = "5"> **Synthetic Data (SGL with low dimension embedding)** </font>

In [None]:
#Toy example: synthetic Gaussian-blob dataset (two-moon / circles variants are commented out)

from sklearn.datasets import make_moons, make_blobs
from sklearn.datasets import make_circles

# Fix the NumPy seed for the rest of the cell; make_blobs is additionally pinned
# through random_state=0.
np.random.seed(0)
n = 100  # number of nodes per cluster
k = 5   # number of components
#X, y = make_moons(n_samples=n*k, noise=.05, shuffle=True)
#X, y = make_circles(n_samples=n*k, factor=0.3, noise=0.05, random_state=0)
X, y = make_blobs(n_samples=n*k, centers=k, n_features=500, random_state=0)
# Shift the labels by one so the classes are numbered from 1, as the colour and
# clustering cells expect.
y = y + 1
# dict to store position of nodes (first two features only)
pos = {i: X[i, :2] for i in range(n*k)}
# Visualization of original data: only the first two of the 500 features are shown.
fig = plt.figure()
plt.scatter(X[:,0], X[:,1], c=y)
# The title used to say "Two moon dataset" although the data come from make_blobs.
plt.title("Synthetic blobs dataset (first two features)")
plt.xlabel('x-coordinate')
plt.ylabel('y-coordinate')

print(X.shape)

In [None]:
# Split the synthetic data: 90% become non-anchor points (X_na, Y_na) and the
# held-out 10% are used as anchors (X_a, Y_a). random_state=42 fixes the split.
X_na, X_a, Y_na, Y_a = train_test_split(X, y, test_size=0.10, random_state=42)


n = X_na.shape[0]               #number of non-anchor nodes (clients)

d = X_na.shape[1]               #dimensionality of data

print("Size of reference nodes(anchors)", X_a.shape)

# Re-stack with non-anchor rows first, then anchor rows; this rebinds X.
X = np.vstack((X_na, X_a))

# Total number of points (non-anchors + anchors).
nodes = X.shape[0]

#2) Using classical MDS

# Helper semantics are inferred from their names and printed labels - TODO confirm.
D, V, V1, V2, W_1, DA, DNA = dist_approx(X_na.astype('float'), X_a.astype('float'))
X_final, loss = classical_MDS_X(D, V, W_1, nodes,d)
# Only the first X_na.shape[0] rows of X_final are compared with X_na
# (presumably the non-anchor rows, matching the stacking order above - TODO confirm).
error, D_true, D_esti, z_true, z_esti = dist_error(X_na.astype('float'), X_final[:X_na.shape[0], :])
print("Error in distance approximation: ", error)
# NOTE(review): the meaning of the constant 11 passed to check_score is not visible here.
fscore = check_score(D_true, D_esti, 11)
print("F-score: ", fscore)

In [None]:
# Reload a previously saved embedding, both adjacency matrices and the labels
# from .npy files in the current working directory.
X_final = np.load('X_final_CC.npy')
# NOTE(review): X_na is NOT reloaded (the line below is commented out and names a
# different file prefix), so later cells use whatever X_na is already in the
# kernel - confirm it matches the "CC" arrays loaded here.
#X_na = np.load('X_na_Iris.npy')
AS_1 = np.load('Adjacency_Private_CC.npy')
AS = np.load('Adjacency_non_private_CC.npy')
Y_na = np.load('Y_na_CC.npy')

In [None]:
# Display the shape of the loaded embedding.
X_final.shape

In [None]:
# Convert the adjacency matrices to SciPy CSR form and the features / labels to
# torch tensors for the GNN cells.
#
# torch is imported here because this cell uses it before the cell that does
# the notebook's `import torch`; without it a fresh Restart & Run All raises
# NameError at this point.
import torch

AS_1_sparse = sparse.csr_matrix(AS_1)
AS_sparse = sparse.csr_matrix(AS)
X_na_tensor = torch.from_numpy(X_na.astype('float'))
X_final_tensor = torch.from_numpy(X_final)                                 #For anchored MDS
#X_final_tensor = torch.from_numpy(X_final[:X_na.shape[0]])                 #For classical MDS
# Labels are shifted by -1 so they start at 0, as CrossEntropyLoss expects class
# indices counted from 0.
Y_na_tensor = torch.from_numpy(Y_na - 1).type(torch.LongTensor)

In [None]:
# Draw one random 80/20 train/test split over the nodes and express it as two
# boolean torch masks of length X_na.shape[0].
#
# torch is imported here because this cell uses it before the cell that does
# the notebook's `import torch`; without it a fresh Restart & Run All raises
# NameError at this point.
import torch
from sklearn.model_selection import ShuffleSplit # or StratifiedShuffleSplit

# No random_state is given, so the split differs on every execution.
sss = ShuffleSplit(n_splits=10, test_size=0.20)
#train_index, test_index = next(sss.split(X_final[:X_na.shape[0]], Y_na))         #For classical MDS
train_index, test_index = next(sss.split(X_final, Y_na))                         #For anchored MDS

# True at the positions of the training nodes.
train_mask = np.zeros((X_na.shape[0],),dtype=bool)
train_mask[train_index] = True
train_mask = torch.from_numpy(train_mask)

# True at the positions of the test nodes.
test_mask = np.zeros((X_na.shape[0],),dtype=bool)
test_mask[test_index] = True
test_mask = torch.from_numpy(test_mask)

<font size = "5"> **GNN with private graph** </font>

In [None]:
# Import torch and expose its version as the TORCH environment variable; the
# commented-out pip commands below interpolate ${TORCH} into the wheel index URL.
# Nothing is installed unless those lines are uncommented.
import os
import torch

os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# !pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
# !pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# !pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# Imports needed by the visualize() helper defined in this cell.
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def visualize(h, color):
    """Scatter-plot a 2-D t-SNE projection of the node representations `h`.

    h     : torch tensor of node outputs; it is detached and moved to the CPU
            before being handed to t-SNE.
    color : per-point colour values passed to the scatter plot (mapped through
            the "Set2" colormap).

    TSNE is created without a random_state, so the layout varies between calls.
    """
    node_outputs = h.detach().cpu().numpy()
    projection = TSNE(n_components=2).fit_transform(node_outputs)

    # Square figure without axis ticks: only the relative layout is meaningful.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_xticks([])
    ax.set_yticks([])

    ax.scatter(projection[:, 0], projection[:, 1], s=70, c=color, cmap="Set2")
    plt.show()
    #plt.savefig("C:/Users/nimes/OneDrive/Desktop/ActivityLarge_after_GCN.eps")

In [None]:
from torch_geometric.nn import GCNConv

from torch.nn import Linear
import torch.nn.functional as F
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Architecture: GCNConv(in_channels -> hidden_channels), ReLU, dropout
    (p=0.5, active only in training mode), GCNConv(hidden_channels ->
    num_classes). The forward pass returns unnormalised class scores.

    Parameters
    ----------
    hidden_channels : int
        Width of the hidden layer.
    in_channels : int, optional
        Number of input features per node. Defaults to ``X_final.shape[1]``,
        read from the notebook global when the model is constructed (the
        previous hard-wired behaviour).
    num_classes : int, optional
        Number of output classes. Defaults to 6 (the previous hard-coded value).
    """

    def __init__(self, hidden_channels, in_channels=None, num_classes=6):
        super().__init__()
        # Re-seed torch's global RNG so every instance starts from the same
        # initial weights. This also resets the RNG state for all later torch
        # sampling (e.g. dropout).
        torch.manual_seed(1234567)
        if in_channels is None:
            in_channels = X_final.shape[1]
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index, edge_weight):
        """Return per-node class scores for features `x` on the weighted graph."""
        x = self.conv1(x, edge_index, edge_weight)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        return x

# Build a model with 16 hidden channels and print its layer summary.
model = GCN(hidden_channels=16)
print(model)

In [None]:
# Convert the sparse adjacency matrix into the form the model consumes. Later
# cells pass edge_info[0] as `edge_index` and edge_info[1] as `edge_weight`.
# NOTE(review): AS_sparse is built from AS (loaded from the "non_private" file)
# although this section is headed "GNN with private graph"; AS_1_sparse is the
# private counterpart - confirm which graph is intended.
from torch_geometric.utils.convert import from_scipy_sparse_matrix
edge_info = from_scipy_sparse_matrix(AS_sparse)

In [None]:
# Forward pass through a freshly initialised (untrained) model and t-SNE plot of
# its outputs, coloured by the ground-truth labels.
model = GCN(hidden_channels=16)
model.double()   # float64 parameters, matching the float64 input tensor
model.eval()     # evaluation mode: dropout disabled

edge_index, edge_weight = edge_info[0], edge_info[1]
out = model(X_na_tensor, edge_index, edge_weight)
visualize(out, color=Y_na)

In [None]:
# Module-level training state: the train() and test() functions in this cell
# read `model`, `optimizer` and `criterion` as globals.
model = GCN(hidden_channels=16)
# Cast parameters to float64; X_na_tensor is built from X_na.astype('float').
model.double()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one full-batch optimisation step and return the training loss.

    Reads the notebook globals `model`, `optimizer`, `criterion`, `X_na_tensor`,
    `edge_info`, `train_mask` and `Y_na_tensor`. The loss is computed on the
    nodes selected by `train_mask` only.
    """
    model.train()
    optimizer.zero_grad()  # Reset gradients accumulated by the previous step.

    edge_index, edge_weight = edge_info[0], edge_info[1]
    logits = model(X_na_tensor, edge_index, edge_weight)  # Forward pass over all nodes.

    # Only the training nodes contribute to the loss.
    train_loss = criterion(logits[train_mask], Y_na_tensor[train_mask])
    train_loss.backward()  # Back-propagate.
    optimizer.step()  # Apply the parameter update.
    return train_loss

def test():
    """Return the classification accuracy of `model` on the test nodes.

    Reads the notebook globals `model`, `X_na_tensor`, `edge_info`, `test_mask`
    and `Y_na_tensor`. The model is switched to evaluation mode (dropout off),
    and the forward pass runs under torch.no_grad() so no autograd graph is
    built for a pure evaluation.

    Returns
    -------
    float
        Fraction of nodes selected by `test_mask` whose arg-max prediction
        equals the ground-truth label.
    """
    model.eval()
    with torch.no_grad():
        out = model(X_na_tensor, edge_info[0], edge_info[1])
    pred = out.argmax(dim=1)  # Class with the highest score per node.
    test_correct = pred[test_mask] == Y_na_tensor[test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(test_mask.sum())  # Ratio of correct predictions.
    return test_acc

# Repeat training/evaluation on 10 random 80/20 splits and report the mean and
# standard deviation of the test accuracy.
acc = []
for t in range(10):
    # Start every run from a fresh model and optimiser. Previously one model
    # was trained across all runs, so later runs were evaluated on nodes the
    # model had already been trained on in earlier splits, and the "10 runs"
    # were not independent. train() and test() read these two globals.
    # GCN.__init__ re-seeds torch, so each run starts from the same initial
    # weights; the runs differ through the random split.
    model = GCN(hidden_channels=16)
    model.double()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    sss = ShuffleSplit(n_splits=10, test_size=0.20)
    #train_index, test_index = next(sss.split(X_final[:X_na.shape[0]], Y_na))         #For classical MDS
    train_index, test_index = next(sss.split(X_final, Y_na))                         #For anchored MDS

    # Boolean node masks for this split (read as globals by train() / test()).
    train_mask = np.zeros((X_na.shape[0],),dtype=bool)
    train_mask[train_index] = True
    train_mask = torch.from_numpy(train_mask)

    test_mask = np.zeros((X_na.shape[0],),dtype=bool)
    test_mask[test_index] = True
    test_mask = torch.from_numpy(test_mask)

    for epoch in range(1, 201):
        loss = train()
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    test_acc = test()
    acc.append(test_acc)
    print(f'Test Accuracy: {test_acc:.4f}')

print("Average Accuracy over 10 runs:", np.average(acc), " ± ", np.std(acc))

In [None]:
# Re-evaluate the current global `model` on the current global `test_mask`
# (i.e. whatever the most recently executed cell left in the kernel).
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

In [None]:
# t-SNE plot of the current model's outputs for all nodes, coloured by the
# zero-based label tensor. eval() disables dropout for this forward pass.
model.eval()

out = model(X_na_tensor, edge_info[0], edge_info[1])
visualize(out, color=Y_na_tensor)