Implementation of K-means clustering algorithm using yellowbrik python library in other to make beautifull plot with few lines of codes

In [3]:
####################### IMPORT STATEMENT #########################

# sklearn imports
from sklearn.datasets import load_wine
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# pandas and numpy imports
import pandas as pd
import numpy as np

# plotting imports
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

# yellowbrick libs
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer

from wine_consts_info import WINE_FEATURES

# set sns theme and set pandas to display all rows and columns
sns.set_theme()

### Load wine dataset

The Wine dataset is a classic multivariate dataset used for classification tasks in machine learning. 
- It consists of 178 samples of wine from three different cultivars (classes) in the same region in Italy. 
- Each sample has 11 continuous attributes (features) that are the result of a chemical analysis of the wines. 
- The goal of using this dataset is usually to build a classifier that can predict the cultivar of a wine based on its chemical composition.

The dataset contains following features:

- **Alcohol**: The alcohol content in the wine, measured in percentage.
- **Malic Acid**: The amount of malic acid in the wine.
- **Ash**: The measure of the non-aqueous residue remaining after heating.
- **Alcalinity of Ash**: A measure of the alkalinity of the ash formed post-combustion.
- **Magnesium**: The amount of magnesium in the wine.
- **Total Phenols**: The total amount of phenolic compounds.
- **Flavanoids**: Indicates the flavanoid phenolic content.
- **Nonflavanoid Phenols**: Measures the non-flavanoid phenolic content.
- **Proanthocyanins**: Indicates the proanthocyanin content.
- **Color Intensity**: The intensity of the wine's color, measured optically.
- **Hue**: The color attribute that describes a pure color, usually measured via spectrophotometry.
- **OD280/OD315 of diluted wines**: Measures the antioxidant content using absorbance ratio.
- **Proline**: The amount of the amino acid proline.

Citation : Aeberhard,Stefan and Forina,M.. (1991). Wine. UCI Machine Learning Repository. https://doi.org/10.24432/C5PC7J.

In [5]:
# Load the wine dataset
wine_data = load_wine()

wine_df_numeric = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)



In [6]:
wine_df_numeric

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [7]:
wine_df = wine_df_numeric.copy()
wine_df['label'] = wine_data.target

In [8]:
wine_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,label
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [9]:

# Scale the data
standard_scaler = StandardScaler()
standard_scaler.fit(wine_df_numeric)
wine_df_numeric_scaled = standard_scaler.transform(wine_df_numeric)

In [10]:


# Create data df
wine_df_numeric = pd.DataFrame(
    wine_df_numeric_scaled, 
    columns = wine_df_numeric.columns
)

In [11]:
wine_df_numeric

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,1.518613,-0.562250,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.847920,1.013009
1,0.246290,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242
2,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.269020,0.318304,0.788587,1.395148
3,1.691550,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574
4,0.295700,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,0.876275,2.974543,0.305159,0.301803,-0.332922,-0.985614,-1.424900,1.274310,-0.930179,1.142811,-1.392758,-1.231206,-0.021952
174,0.493343,1.412609,0.414820,1.052516,0.158572,-0.793334,-1.284344,0.549108,-0.316950,0.969783,-1.129518,-1.485445,0.009893
175,0.332758,1.744744,-0.389355,0.151661,1.422412,-1.129824,-1.344582,0.549108,-0.422075,2.224236,-1.612125,-1.485445,0.280575
176,0.209232,0.227694,0.012732,0.151661,1.422412,-1.033684,-1.354622,1.354888,-0.229346,1.834923,-1.568252,-1.400699,0.296498


In [15]:
wine_df_numeric.columns

Index(['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
       'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
       'proanthocyanins', 'color_intensity', 'hue',
       'od280/od315_of_diluted_wines', 'proline'],
      dtype='object')

In [22]:
# Plot all the feature pairs
_ = wine_df_numeric.hist(figsize=(9,9))
plt.tight_layout()


Error in callback <function _draw_all_if_interactive at 0x31062b760> (for post_execute), with arguments args (),kwargs {}:


ValueError: object __array__ method not producing an array

ValueError: object __array__ method not producing an array

<Figure size 900x900 with 16 Axes>

In [12]:
g = sns.boxplot(wine_df_numeric)
_ = g.set_xticklabels(g.get_xticklabels(), rotation=90)

Error in callback <function _draw_all_if_interactive at 0x31062b760> (for post_execute), with arguments args (),kwargs {}:


  _ = g.set_xticklabels(g.get_xticklabels(), rotation=90)


ValueError: object __array__ method not producing an array

ValueError: object __array__ method not producing an array

<Figure size 800x550 with 1 Axes>

In [25]:
# import pandas as pd
# import seaborn as sns
# from sklearn.datasets import load_wine
# from sklearn.preprocessing import StandardScaler
# import matplotlib.pyplot as plt

# # Load the wine dataset
# wine_data = load_wine()

# # Create DataFrame
# wine_df_numeric = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)

# # Scale the data
# standard_scaler = StandardScaler()
# wine_df_numeric_scaled = standard_scaler.fit_transform(wine_df_numeric)

# # Convert scaled data back to a DataFrame
# wine_df_scaled = pd.DataFrame(
#     wine_df_numeric_scaled,
#     columns=wine_df_numeric.columns
# )

# # Melt the DataFrame for Seaborn boxplot
# wine_df_melted = wine_df_scaled.melt(var_name="Features", value_name="Values")

# # Create the boxplot
# plt.figure(figsize=(12, 6))
# g = sns.boxplot(x="Features", y="Values", data=wine_df_melted)

# # Rotate x-axis labels
# g.set_xticklabels(wine_df_scaled.columns, rotation=90)
# plt.title("Boxplot of Scaled Wine Dataset Features")
# plt.tight_layout()
# plt.show()
