### Import Libraries

In [164]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

### Import Seeds Dataset

In [141]:
# Load the tab-separated seeds measurements and preview every 40th row.
dfs = pd.read_csv("seeds_dataset.txt", sep="\t")
dfs.iloc[::40]

Unnamed: 0,Area,Perimeter,Compactness,Kernel Length,Kernel Width,Coefficient,Kernel Groove Length,Wheat Kernels
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
40,13.54,13.85,0.8871,5.348,3.156,2.587,5.178,1
80,16.53,15.34,0.8823,5.875,3.467,5.532,5.88,2
120,20.24,16.91,0.8897,6.315,3.962,5.901,6.188,2
160,12.54,13.67,0.8425,5.451,2.879,3.082,5.491,3
200,12.38,13.44,0.8609,5.219,2.989,5.472,5.045,3


In [142]:
# Shift the variety codes from 1-based to 0-based (1 -> 0, 2 -> 1, 3 -> 2).
# NOTE: the column is overwritten, so running this cell a second time would
# re-map the already shifted codes.
dfs["Wheat Kernels"] = dfs["Wheat Kernels"].replace({1: 0, 2: 1, 3: 2})

In [143]:
def Wheat_Variety(x):
    """Return the wheat variety name for a 0-based variety code.

    0 maps to "Kama Wheat", 1 to "Rosa Wheat", and every other value
    falls through to "Canadian Wheat".
    """
    if x == 0:
        return "Kama Wheat"
    if x == 1:
        return "Rosa Wheat"
    # Anything that is neither 0 nor 1 is treated as the third variety.
    return "Canadian Wheat"

In [144]:
# Attach a readable variety name for each numeric code, then preview every 40th row.
dfs["Wheat Name"] = dfs["Wheat Kernels"].map(Wheat_Variety)
dfs.iloc[::40]

Unnamed: 0,Area,Perimeter,Compactness,Kernel Length,Kernel Width,Coefficient,Kernel Groove Length,Wheat Kernels,Wheat Name
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,0,Kama Wheat
40,13.54,13.85,0.8871,5.348,3.156,2.587,5.178,0,Kama Wheat
80,16.53,15.34,0.8823,5.875,3.467,5.532,5.88,1,Rosa Wheat
120,20.24,16.91,0.8897,6.315,3.962,5.901,6.188,1,Rosa Wheat
160,12.54,13.67,0.8425,5.451,2.879,3.082,5.491,2,Canadian Wheat
200,12.38,13.44,0.8609,5.219,2.989,5.472,5.045,2,Canadian Wheat


In [148]:
# Features: every column except the numeric variety code and its readable name.
samples = dfs.drop(columns=["Wheat Kernels", "Wheat Name"])
# True variety names, kept aside to compare against the cluster labels.
varieties = dfs["Wheat Name"]

### Cross Tabulation

In [149]:
# Create a KMeans model with 3 clusters: model
# random_state pins the centroid initialisation so the labels (and the
# crosstab built from them) are identical on every run; n_init is spelled out
# because its default value differs between scikit-learn releases.
model = KMeans(n_clusters = 3, n_init = 10, random_state = 42)

# Use fit_predict to fit model and obtain cluster labels: labels
labels = model.fit_predict(samples)

In [150]:
# Create a DataFrame with labels and varieties as columns: df
data_s = pd.DataFrame({"Predicted Labels": labels, "Actual Varieties": varieties})
data_s[::30]

Unnamed: 0,Predicted Labels,Actual Varieties
0,2,Kama Wheat
30,2,Kama Wheat
60,1,Kama Wheat
90,0,Rosa Wheat
120,0,Rosa Wheat
150,1,Canadian Wheat
180,1,Canadian Wheat


In [151]:
# Create crosstab: ct
ct = pd.crosstab(data_s["Predicted Labels"], data_s["Actual Varieties"])
ct

Actual Varieties,Canadian Wheat,Kama Wheat,Rosa Wheat
Predicted Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,1,60
1,68,9,0
2,2,60,10


### Import Juice Dataset

In [249]:
# Load the juice data set and preview every 50th row.
dfj = pd.read_csv("juice.csv")
dfj.iloc[::50]

Unnamed: 0,class_label,class_name,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
0,1,Barolo,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
50,1,Barolo,13.05,1.73,2.04,12.4,92,2.72,3.27,0.17,2.91,7.2,1.12,2.91,1150
100,2,Grignolino,12.08,2.08,1.7,17.5,97,2.23,2.17,0.26,1.4,3.3,1.27,2.96,710
150,3,Barbera,13.5,3.12,2.62,24.0,123,1.4,1.57,0.22,1.25,8.6,0.59,1.3,500


In [250]:
# Shift the class labels from 1-based to 0-based (1 -> 0, 2 -> 1, 3 -> 2).
# NOTE: the column is overwritten, so running this cell a second time would
# re-map the already shifted labels.
dfj["class_label"] = dfj["class_label"].replace({1: 0, 2: 1, 3: 2})

In [251]:
def Wheat_Variety(x):
    """Return the juice class name for a 0-based class label.

    0 maps to "Barolo", 1 to "Grignolino", and every other value falls
    through to "Barbera".

    NOTE(review): this definition reuses the name of the wheat mapper defined
    earlier in the notebook and replaces it in the kernel namespace, so
    re-running the earlier wheat cell afterwards would produce juice names.
    It is not called anywhere in the visible notebook; consider renaming or
    removing it.
    """
    if x == 0:
        return "Barolo"
    return "Grignolino" if x == 1 else "Barbera"

In [252]:
# Features: every column except the numeric class label and the class name.
samples = dfj.drop(columns=["class_label", "class_name"])
# True class names, kept aside to compare against the cluster labels.
varieties = dfj["class_name"]

**Clustering the juice**

In [253]:
# Create a KMeans model with 3 clusters: model
model = KMeans(n_clusters = 3)

# Use fit_predict to fit model and obtain cluster labels: labels
labels = model.fit_predict(samples)

In [254]:
# Put each sample's cluster label next to its true class name: data_j
data_j = pd.DataFrame(
    {
        "Predicted labels": labels,
        "Actual Varieties": varieties,
    }
)
# Preview every 50th row.
data_j.iloc[::50]

Unnamed: 0,Predicted labels,Actual Varieties
0,1,Barolo
50,1,Barolo
100,0,Grignolino
150,2,Barbera


In [255]:
# Cross Tabulation
ct = pd.crosstab(data_j["Predicted labels"], data_j["Actual Varieties"])
ct

Actual Varieties,Barbera,Barolo,Grignolino
Predicted labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,29,13,20
1,0,46,1
2,19,0,50


### Pipeline

In [239]:
# Chain a StandardScaler in front of KMeans so the features are standardised
# before clustering.
# random_state pins the centroid initialisation so repeated runs give the same
# labels; n_init is spelled out because its default value differs between
# scikit-learn releases.
steps = [("scaler", StandardScaler()),
         ("kmeans", KMeans(n_clusters = 3, n_init = 10, random_state = 42))]

pipeline = Pipeline(steps)

# Fit the scaler and the clusterer on the samples.
pipeline.fit(samples)

# Assign every sample to its nearest cluster centre: labels
labels = pipeline.predict(samples)

In [240]:
# Put each sample's pipeline cluster label next to its true class name: data_1
data_1 = pd.DataFrame(
    {
        "Predicted Labels": labels,
        "Actual Varieties": varieties,
    }
)
# Preview every 40th row.
data_1.iloc[::40]

Unnamed: 0,Predicted Labels,Actual Varieties
0,0,Barolo
40,0,Barolo
80,2,Grignolino
120,2,Grignolino
160,1,Barbera


In [241]:
# Count the samples for every (cluster label, true class) pair: ct1
ct1 = pd.crosstab(
    index=data_1["Predicted Labels"],
    columns=data_1["Actual Varieties"],
)
ct1

Actual Varieties,Barbera,Barolo,Grignolino
Predicted Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,59,3
1,48,0,3
2,0,0,65


### Import Fish Dataset

In [304]:
# Load the fish measurements and preview every 10th row.
Fish = pd.read_csv("fish.csv")
Fish.iloc[::10]

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
10,Bream,475.0,28.4,31.0,36.2,14.2628,5.1042
20,Bream,575.0,31.3,34.0,39.5,15.1285,5.5695
30,Bream,920.0,35.0,38.5,44.1,18.0369,6.3063
40,Roach,0.0,19.0,20.5,22.8,6.4752,3.3516
50,Roach,200.0,22.1,23.5,26.8,7.3968,4.1272
60,Whitefish,1000.0,37.3,40.0,43.5,12.354,6.525
70,Parkki,273.0,23.0,25.0,28.0,11.088,4.144
80,Perch,85.0,17.8,19.6,20.8,5.1376,3.0368
90,Perch,110.0,20.0,22.0,23.5,5.5225,3.995


In [305]:
# Features: every column except the species name.
samples = Fish.drop(columns="Species")
# True species names, kept aside to compare against the cluster labels.
varieties = Fish["Species"]

### Pipeline

In [306]:
# Chain a StandardScaler in front of KMeans so the features are standardised
# before clustering into 7 groups.
# random_state pins the centroid initialisation so repeated runs give the same
# labels; n_init is spelled out because its default value differs between
# scikit-learn releases.
steps = [("scaler", StandardScaler()),
         ("kmeans", KMeans(n_clusters = 7, n_init = 10, random_state = 42))]

pipeline = Pipeline(steps)

# Fit the scaler and the clusterer on the samples.
pipeline.fit(samples)

# Assign every sample to its nearest cluster centre: labels
labels = pipeline.predict(samples)

# Put each sample's cluster label next to its true species: fd
fd = pd.DataFrame({"Predicted Labels": labels, "Actual Variety": varieties})
fd[::10]

Unnamed: 0,Predicted Labels,Actual Variety
0,1,Bream
10,0,Bream
20,0,Bream
30,4,Bream
40,5,Roach
50,1,Roach
60,4,Whitefish
70,1,Parkki
80,5,Perch
90,5,Perch


In [307]:
# Count the samples for every (cluster label, true species) pair: ctf
ctf = pd.crosstab(
    index=fd["Predicted Labels"],
    columns=fd["Actual Variety"],
)
ctf

Actual Variety,Bream,Parkki,Perch,Pike,Roach,Smelt,Whitefish
Predicted Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,25,0,2,0,0,0,1
1,3,3,12,1,6,0,3
2,0,0,3,0,1,14,0
3,0,0,0,5,0,0,0
4,7,0,17,0,0,0,2
5,0,8,22,0,13,0,0
6,0,0,0,11,0,0,0
