In [17]:
import pandas as pd
import numpy as np

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Column labels are supplied explicitly, so pandas treats every line of the
# file as data: four measurements followed by the class label.
column_names = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
    'target',
]
df = pd.read_csv(url, names=column_names)
df

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [None]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# The four numeric measurement columns used as model inputs.
features = ['sepal length', 'sepal width', 'petal length', 'petal width']

# Class labels, kept as a single-column 2-D array.
y = df[['target']].to_numpy()

# Feature matrix after StandardScaler rescales each column
# (zero mean, unit variance).
x = StandardScaler().fit_transform(df[features].to_numpy())


from sklearn.decomposition import PCA

# Reduce the standardized features to their first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)

pc_columns = ['principal component 1', 'principal component 2']
principalDf = pd.DataFrame(principalComponents, columns=pc_columns)

# Projection with the species label attached (what the plot below consumes).
finalDf = pd.concat([principalDf, df[['target']]], axis=1)

# Component columns only, with the label removed again.
filter_pca = finalDf.drop(columns='target')
filter_pca

#plot
# fig = plt.figure(figsize = (8,8))
# ax = fig.add_subplot(1,1,1) 
# ax.set_xlabel('Principal Component 1', fontsize = 15)
# ax.set_ylabel('Principal Component 2', fontsize = 15)
# ax.set_title('2 component PCA', fontsize = 20)

# targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
# colors = ['r', 'g', 'b']
# for target, color in zip(targets,colors):
#     indicesToKeep = finalDf['target'] == target
#     ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
#                , finalDf.loc[indicesToKeep, 'principal component 2']
#                , c = color
#                , s = 50)
# ax.legend(targets)
# ax.grid()


#check performance: [variance in PC 1, variance in PC 2]
# pca.explained_variance_ratio_


In [None]:
from sklearn.manifold import TSNE

# Embed the standardized features in two dimensions with t-SNE.
# random_state is fixed so the embedding is the same on every run.
tsne = TSNE(n_components=2, random_state=42)
tsneComponents = tsne.fit_transform(x)

tsne_columns = ['t-SNE component 1', 't-SNE component 2']
tsneDf = pd.DataFrame(data=tsneComponents, columns=tsne_columns)

# Embedding with the species label attached (what the plot below consumes).
tsnefDf = pd.concat([tsneDf, df[['target']]], axis=1)

# Component columns only, with the label removed again.
# Only the last expression of a cell is rendered, so this is the one
# frame displayed.
filter_tsne = tsnefDf.drop(['target'], axis=1)
filter_tsne


# Plot
# fig = plt.figure(figsize=(8, 8))
# ax = fig.add_subplot(1, 1, 1)
# ax.set_xlabel('t-SNE Component 1', fontsize=15)
# ax.set_ylabel('t-SNE Component 2', fontsize=15)
# ax.set_title('2D t-SNE', fontsize=20)

# targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
# colors = ['r', 'g', 'b']
# for target, color in zip(targets, colors):
#     indicesToKeep = tsnefDf['target'] == target
#     ax.scatter(tsnefDf.loc[indicesToKeep, 't-SNE component 1'],
#                tsnefDf.loc[indicesToKeep, 't-SNE component 2'],
#                c=color,
#                s=50)
# ax.legend(targets)
# ax.grid()
# plt.show()


In [16]:
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import jaccard_score
from itertools import combinations

# Pairwise Euclidean distances between samples in each 2-D embedding.
# pdist returns the condensed (upper-triangle) vector; squareform expands it
# into a square symmetric matrix with a zero diagonal.
pca_dist_matrix = squareform(pdist(filter_pca, metric='euclidean'))
tsne_dist_matrix = squareform(pdist(filter_tsne, metric='euclidean'))

# Only the last expression of a cell is rendered, so the PCA matrix is the
# one shown here.
pca_dist_matrix


array([[0.        , 1.17469087, 0.83064315, ..., 3.79291851, 3.67656962,
        3.26679009],
       [1.17469087, 0.        , 0.43906307, ..., 3.72328517, 3.84524423,
        3.11083222],
       [0.83064315, 0.43906307, 0.        , ..., 3.93259122, 3.97515913,
        3.34040665],
       ...,
       [3.79291851, 3.72328517, 3.93259122, ..., 0.        , 0.76335977,
        0.63158598],
       [3.67656962, 3.84524423, 3.97515913, ..., 0.76335977, 0.        ,
        1.11926382],
       [3.26679009, 3.11083222, 3.34040665, ..., 0.63158598, 1.11926382,
        0.        ]])

In [18]:
#Jaccard - code from gpt
from sklearn.metrics import jaccard_score

# Example presence/absence data (replace with your actual data).
# A seeded generator keeps this example identical on every run, so anything
# computed from it is reproducible.
rng = np.random.default_rng(42)
presence_absence_data = rng.integers(0, 2, size=(10, 5))  # 10 samples, 5 species

# Function to compute Jaccard distance
def jaccard_distance(u, v):
    """Return the Jaccard distance between two presence/absence vectors.

    The distance is ``1 - |u AND v| / |u OR v|``, where non-zero entries
    count as "present".

    Parameters
    ----------
    u, v : array-like
        Vectors of equal shape; entries are interpreted as booleans.

    Returns
    -------
    float
        A value in ``[0, 1]``. Two vectors with no presences at all are
        identical, so their distance is defined as ``0.0`` (this keeps the
        result consistent with the zero diagonal of a distance matrix
        instead of hitting a 0/0 division).

    Raises
    ------
    ValueError
        If ``u`` and ``v`` do not have the same shape.
    """
    u = np.asarray(u, dtype=bool)
    v = np.asarray(v, dtype=bool)
    if u.shape != v.shape:
        raise ValueError(
            f"u and v must have the same shape, got {u.shape} and {v.shape}"
        )

    union = np.count_nonzero(u | v)
    if union == 0:
        # Both samples are empty: identical sets, not maximally distant.
        return 0.0

    intersection = np.count_nonzero(u & v)
    return float(1.0 - intersection / union)

# Build the symmetric Jaccard distance matrix over all samples.
num_samples = len(presence_absence_data)
jaccard_dist_matrix = np.zeros((num_samples, num_samples))

# Visit each unordered pair (i < j) once and mirror the value across the
# diagonal; the diagonal itself keeps its initial 0.
upper_rows, upper_cols = np.triu_indices(num_samples, k=1)
for i, j in zip(upper_rows, upper_cols):
    dist = jaccard_distance(presence_absence_data[i], presence_absence_data[j])
    jaccard_dist_matrix[i, j] = jaccard_dist_matrix[j, i] = dist

print("Jaccard Distance Matrix:\n", jaccard_dist_matrix)


Jaccard Distance Matrix:
 [[0.         0.75       0.75       0.75       0.6        1.
  0.66666667 1.         0.66666667 0.        ]
 [0.75       0.         0.5        0.5        0.4        0.66666667
  0.33333333 1.         0.33333333 0.75      ]
 [0.75       0.5        0.         0.8        0.4        0.66666667
  0.75       1.         0.75       0.75      ]
 [0.75       0.5        0.8        0.         0.4        0.66666667
  0.75       1.         0.75       0.75      ]
 [0.6        0.4        0.4        0.4        0.         0.8
  0.6        1.         0.6        0.6       ]
 [1.         0.66666667 0.66666667 0.66666667 0.8        0.
  1.         1.         1.         1.        ]
 [0.66666667 0.33333333 0.75       0.75       0.6        1.
  0.         1.         0.         0.66666667]
 [1.         1.         1.         1.         1.         1.
  1.         0.         1.         1.        ]
 [0.66666667 0.33333333 0.75       0.75       0.6        1.
  0.         1.         0.       