In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Data from: https://archive.ics.uci.edu/dataset/236/seeds

In [None]:
# Column names for the seeds data file; read_csv assigns them positionally via `names`.
cols = ["area", "perimeter", "compactness", "length", "width", "asymmetry", "groove", "class"]
# Raw string so the regex `\s+` is passed through untouched instead of being parsed as an
# (invalid) string escape; it makes any run of whitespace count as a single delimiter.
df = pd.read_csv("../data_files/seeds_dataset.txt", names = cols, sep = r"\s+")

In [None]:
# Preview the first five rows to check that the columns were parsed as expected
df.head(n = 5)

In [None]:
# One scatter plot per unordered pair of feature columns, coloured by class.
# The last entry of `cols` is the class label, so it is excluded from the axes.
feature_names = cols[:-1]
for start, x_label in enumerate(feature_names):
    # Only pair with later columns so each pair is drawn exactly once
    for y_label in feature_names[start + 1:]:
        sns.scatterplot(x = x_label, y = y_label, data = df, hue = 'class')
        plt.show()

# Clustering
K-means clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Two features to cluster on; `x` and `y` are reused below as the plot axis names.
x, y = "perimeter", "asymmetry"
# NumPy array with one column per selected feature, in (x, y) order
X = df.loc[:, [x, y]].values

In [None]:
# K-means starts from randomly chosen centroids, so pin the seed to get the same
# cluster assignment on every run. n_init is set explicitly so the number of
# restarts does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters = 3, n_init = 10, random_state = 42).fit(X)

In [None]:
# Cluster index assigned to each row of X by the fitted model
clusters = kmeans.labels_

In [None]:
# Build the frame from the features and attach the labels as their own column.
# Stacking labels onto X with np.hstack would upcast the integer cluster ids to float.
cluster_df = pd.DataFrame(X, columns = [x, y])
cluster_df["class"] = clusters

In [None]:
# K means
sns.scatterplot(x = x, y = y, hue = 'class', data = cluster_df)
plt.title("K-means clusters")
# plt.show() renders the figure; a bare plt.plot() draws nothing and leaves `[]` as output
plt.show()

In [None]:
# Original
sns.scatterplot(x = x, y = y, hue = 'class', data = df)
plt.title("Original classes")
# plt.show() renders the figure; a bare plt.plot() draws nothing and leaves `[]` as output
plt.show()

# Higher dimensions

In [None]:
# Feature matrix with every column except the trailing class label
X = df.loc[:, cols[:-1]].values

In [None]:
# Refit K-means on the current X. Seed and n_init are pinned so the assignment is
# reproducible across runs and scikit-learn versions.
kmeans = KMeans(n_clusters = 3, n_init = 10, random_state = 42).fit(X)
# Same column layout as df: feature columns first, then the label column.
# Labels are attached separately so they stay integers (np.hstack would upcast them to float).
cluster_df = pd.DataFrame(X, columns = df.columns[:-1])
cluster_df[df.columns[-1]] = kmeans.labels_

In [None]:
# K means (this cell plots cluster_df, i.e. the predicted clusters, not the original classes)
sns.scatterplot(x = x, y = y, hue = 'class', data = cluster_df)
plt.title("K-means clusters")
# plt.show() renders the figure; a bare plt.plot() draws nothing and leaves `[]` as output
plt.show()

In [None]:
# Original
sns.scatterplot(x = x, y = y, hue = 'class', data = df)
plt.title("Original classes")
# plt.show() renders the figure; a bare plt.plot() draws nothing and leaves `[]` as output
plt.show()

# PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Reduce the feature matrix to its first two principal components.
# NOTE(review): relies on X being the array assigned most recently above — confirm cell order.
pca = PCA(n_components = 2)
# Fit the projection on X and return X expressed in the 2-component space
transformed_x = pca.fit_transform(X)

In [None]:
# Show both shapes at once: a notebook cell only displays its last expression,
# so two separate lines would silently drop the first one.
X.shape, transformed_x.shape # ((210, 7), (210, 2))

In [None]:
# 2d representation of 7 dimensional thing
plt.scatter(transformed_x[:,0], transformed_x[:,1])
# Label the axes with the same names used for the PCA columns in the frames below
plt.xlabel("pca1")
plt.ylabel("pca2")
# Render explicitly so the cell does not end on the PathCollection repr
plt.show()

In [None]:
# PCA coordinates paired with the K-means cluster of each row.
# Labels are attached as their own column so they stay integers (np.hstack would upcast them to float).
kmeans_pca_df = pd.DataFrame(transformed_x, columns = ["pca1", "pca2"])
kmeans_pca_df["class"] = kmeans.labels_

In [None]:
# PCA coordinates paired with the class column from df.
# .values attaches the labels by position, and keeping them out of np.hstack preserves their dtype.
truth_pca_df = pd.DataFrame(transformed_x, columns = ["pca1", "pca2"])
truth_pca_df["class"] = df["class"].values

In [None]:
# Predicted
sns.scatterplot(x = "pca1", y = "pca2", hue = 'class', data = kmeans_pca_df)
plt.title("K-means clusters in PCA space")
# plt.show() renders the figure; a bare plt.plot() draws nothing and leaves `[]` as output
plt.show()

In [None]:
# Truth classes
sns.scatterplot(x = "pca1", y = "pca2", hue = 'class', data = truth_pca_df)
plt.title("True classes in PCA space")
# plt.show() renders the figure; a bare plt.plot() draws nothing and leaves `[]` as output
plt.show()