# Setup

In [1]:
# Import libraries
import pandas as pd
import numpy as np

# Agglomerative clustering library
from sklearn.cluster import AgglomerativeClustering

In [2]:
# Read the tab-separated edges file into a DataFrame.
df = pd.read_csv('small_GNPS_edges.tsv', delimiter='\t')

# Display the loaded table.
df

Unnamed: 0,CLUSTERID1,CLUSTERID2,DeltaMZ,MEH,Cosine,OtherScore,ComponentIndex
0,1,2,-3.03,0,0.55142,0.480834,1
1,2,3,-0.018005,0,0.55746,0.555385,1
2,2,4,-0.065002,0,0.864792,0.515213,1
3,3,4,-57.061,0,0.650067,0.5386,1
4,5,6,0.001007,0,0.837411,0.606836,2
5,5,7,-18.008,0,0.583397,0.531694,2
6,6,7,-46.019,0,0.613145,0.547435,2
7,7,9,0.007004,0,0.818767,0.455516,2
8,8,9,-2.01801,0,0.578485,0.44188,2
9,9,10,0.0,0,0.855883,0.742791,2


# Agglomerative Clustering

In [3]:
# Build the feature array that the later cells turn into a precomputed matrix
# (and that the Ward model is fitted on directly).

# Take the 'Cosine' column as float64 and shape it into a column vector
# (one row per row of `df`, a single feature).
# NOTE(review): the column is treated as a distance here; if 'Cosine' is a
# similarity score it may need converting (e.g. 1 - cosine) - confirm.
distance = (
    df['Cosine']
    .to_numpy()
    .astype(np.float64)
    .reshape(-1, 1)
)
distance

array([[0.55142 ],
       [0.55746 ],
       [0.864792],
       [0.650067],
       [0.837411],
       [0.583397],
       [0.613145],
       [0.818767],
       [0.578485],
       [0.855883]])

In [4]:
# Transform the 'distance' column vector into a square pairwise distance matrix.
# The SciPy function is imported under an alias so that the result variable
# `distance_matrix` does not shadow it; otherwise re-running this cell fails
# with "'DataFrame' object is not callable".
from scipy.spatial import distance_matrix as scipy_distance_matrix

# Entry (i, j) is the distance between row i and row j of `distance`; because
# `distance` has a single column this is the absolute difference of the values.
distance_matrix = pd.DataFrame(
    scipy_distance_matrix(distance, distance),
    index=df.index,
    columns=df.index,
)
distance_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.00604,0.313372,0.098647,0.285991,0.031977,0.061725,0.267347,0.027065,0.304463
1,0.00604,0.0,0.307332,0.092607,0.279951,0.025937,0.055685,0.261307,0.021025,0.298423
2,0.313372,0.307332,0.0,0.214725,0.027381,0.281395,0.251647,0.046025,0.286307,0.008909
3,0.098647,0.092607,0.214725,0.0,0.187344,0.06667,0.036922,0.1687,0.071582,0.205816
4,0.285991,0.279951,0.027381,0.187344,0.0,0.254014,0.224266,0.018644,0.258926,0.018472
5,0.031977,0.025937,0.281395,0.06667,0.254014,0.0,0.029748,0.23537,0.004912,0.272486
6,0.061725,0.055685,0.251647,0.036922,0.224266,0.029748,0.0,0.205622,0.03466,0.242738
7,0.267347,0.261307,0.046025,0.1687,0.018644,0.23537,0.205622,0.0,0.240282,0.037116
8,0.027065,0.021025,0.286307,0.071582,0.258926,0.004912,0.03466,0.240282,0.0,0.277398
9,0.304463,0.298423,0.008909,0.205816,0.018472,0.272486,0.242738,0.037116,0.277398,0.0


In [5]:
# Average-linkage agglomerative model fitted on the precomputed matrix.
# With affinity='precomputed' the usable linkages are 'complete', 'average' and 'single'.
# Note from sklearn: with affinity='precomputed', fit() needs a distance matrix
# (not a similarity matrix) as input.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# keep `affinity` only while the environment pins an older version.
cluster = AgglomerativeClustering(
    linkage='average',
    affinity='precomputed',
).fit(distance_matrix)

# fit() returns the estimator itself, so this shows the fitted model.
cluster

AgglomerativeClustering(affinity='precomputed', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='average', memory=None, n_clusters=2,
                        pooling_func='deprecated')

In [6]:
# Cluster label assigned to each row of `distance_matrix` (one label per row of `df`)
# by the average-linkage model fitted above.
cluster.labels_

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1])

# Ward Hierarchical Clustering

In [7]:
# Ward-linkage agglomerative model.
# Note from sklearn: linkage='ward' only accepts affinity='euclidean', so the
# model is fitted on the raw 'Cosine' column vector rather than on the
# precomputed distance matrix.
ward_cluster = AgglomerativeClustering(
    linkage='ward',
    affinity='euclidean',
).fit(distance)

# fit() returns the estimator itself, so this shows the fitted model.
ward_cluster

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='ward', memory=None, n_clusters=2,
                        pooling_func='deprecated')

In [8]:
# Output how the data points have been clustered by the Ward model.
# (Previously this displayed `cluster.labels_`, i.e. the average-linkage
# result, so the Ward labels were never shown.)
ward_cluster.labels_

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1])

# Partitioning Around Medoids

In [9]:
# rpy2 bridges Python and R; it is used in this section to reach R packages from Python.
import rpy2
import rpy2.robjects as robjects               # We utilize a 'high-level' interface over a 'low-level' interface. 
from rpy2.robjects.packages import importr     # Import R packages. 
import rpy2.robjects.packages as rpackages

# Pandas conversion of rpy2
from rpy2.robjects import pandas2ri
# Turn on the pandas <-> R conversion support provided by pandas2ri.
pandas2ri.activate()

In [10]:
# Import R's 'base' and 'utils' packages.
base = importr('base')
utils = importr('utils')

# Select a mirror for R packages (the first mirror in the list).
utils.chooseCRANmirror(ind=1)

# pam() (Partitioning Around Medoids) is a function inside R's 'cluster'
# package; there is no CRAN package called 'pam'. Installing 'pam' therefore
# left 'cluster' missing, and importr('cluster') failed with
# "there is no package called 'cluster'".
# Install 'cluster' only when it is absent so re-running the cell stays cheap.
if not rpackages.isinstalled('cluster'):
    utils.install_packages('cluster')

# Expose the R 'cluster' package; pam is then reachable as r_cluster.pam.
r_cluster = importr('cluster')

rpy2.rinterface.NULL

In [11]:
# Convert the pandas DataFrame into its R counterpart.
# NOTE(review): `py2ri` is the rpy2 2.x name; later rpy2 releases appear to
# rename it (py2rpy) - confirm against the installed version.
r_df = pandas2ri.py2ri(df)

# Show the converted object's type, then its contents, one per line.
print(type(r_df), r_df, sep='\n')

<class 'rpy2.robjects.vectors.DataFrame'>
  CLUSTERID1 CLUSTERID2      DeltaMZ MEH   Cosine OtherScore ComponentIndex
0          1          2  -3.03000000   0 0.551420   0.480834              1
1          2          3  -0.01800540   0 0.557460   0.555385              1
2          2          4  -0.06500240   0 0.864792   0.515213              1
3          3          4 -57.06100000   0 0.650067   0.538600              1
4          5          6   0.00100708   0 0.837411   0.606836              2
5          5          7 -18.00800000   0 0.583397   0.531694              2
6          6          7 -46.01900000   0 0.613145   0.547435              2
7          7          9   0.00700378   0 0.818767   0.455516              2
8          8          9  -2.01801000   0 0.578485   0.441880              2
9          9         10   0.00000000   0 0.855883   0.742791              2

