# k-MEANS CLUSTERING

# IMPORT LIBRARIES

In [None]:
import pandas as pd                               # For dataframes
import matplotlib.pyplot as plt                   # For plotting data
import seaborn as sns                             # For plotting data
from sklearn.cluster import KMeans                # For k-Means
from sklearn.preprocessing import StandardScaler  # For standardizing data

# LOAD AND PREPARE DATA
Read the `penguins.csv` file from the `data` directory into variable `df`. Store the class variable in `y` and keep all remaining features in variable `X`. Also, standardize the features so each variable has a mean of zero and unit variance (i.e., variance and standard deviation of one).

In [None]:
# Reads the .csv file from the data directory into variable df.
# The path is relative to the notebook's working directory.
df = pd.read_csv('data/penguins.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'penguins.csv'

In [None]:
# Displays the first 5 rows of df
df.head(n=5)

In [None]:
# Displays the dimensions of df as (number of rows, number of columns)
df.shape

In [None]:
# Plots how many rows fall into each value of the class column y
sns.countplot(data=df, x="y")

In [None]:
# Stores the class column in y
y = df['y']

# Builds the feature table X from every column except y (df itself is unchanged)
X = df.drop(columns=['y'])

In [None]:
# Displays the first 5 rows of the feature table X
X.head(n=5)

In [None]:
# Displays the first 5 class labels in y
y.head(n=5)

In [None]:
# Standardizes every feature in X and rebuilds the DataFrame so the
# original column names are kept
X = pd.DataFrame(
    data=StandardScaler().fit_transform(X),
    columns=X.columns,
)

# Displays the first 5 rows of the standardized X
X.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Holds out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# RUNNING k-MEANS

## k-Means: Train the Model
We'll set up a `KMeans` object with the following parameters:

- `n_clusters`: Total number of clusters to make.
- `random_state`: Set to a fixed integer to reproduce these results (we use 42, the answer to life, the universe, and everything).
- `n_init`: Number of times k-means is run with different initial centroids; the best of these runs is kept.

In [None]:
# Configures k-Means for 3 clusters (n_init=10, fixed seed for
# reproducibility) and fits it to the training features in one step;
# fit() returns the fitted estimator itself
km = KMeans(
    n_clusters=3,
    n_init=10,
    random_state=42,
).fit(X_train)

# Displays the parameters the model was configured with
km.get_params()

## k-Means: Visualize the Clusters
The code below creates a scatterplot of the first two features. Each point is colored according to its actual label. For comparison, each instance is drawn with a marker according to the label found by the clustering algorithm.

In [None]:
# Features shown on the x and y axes of the scatter plot
x_feature = 'bill_length_mm'
y_feature = 'bill_depth_mm'

# Creates a scatter plot of the training data: color encodes the actual
# class (y_train), marker style encodes the cluster found by k-Means
sns.scatterplot(
    x=x_feature,
    y=y_feature,
    data=X_train,
    hue=y_train,
    style=km.labels_,
    palette=["orange", "green", "blue"])

# Adds cluster centers to the same plot. The center coordinates are looked
# up by column name so they always match the plotted axes, instead of
# assuming the two features are the first two columns of X_train.
plt.scatter(
    km.cluster_centers_[:, X_train.columns.get_loc(x_feature)],
    km.cluster_centers_[:, X_train.columns.get_loc(y_feature)],
    marker='x',
    s=200,
    c='red')

# k-MEANS: TEST

In [None]:
# Uses the predict method of the fitted KMeans model to assign each instance
# of the held-out test data (X_test) to one of the 3 learned clusters

labels = km.predict(X_test)

In [None]:
# Displays the cluster labels predicted for the test instances
labels

In [None]:
# Features shown on the x and y axes of the scatter plot
x_feature = 'bill_length_mm'
y_feature = 'bill_depth_mm'

# Creates a scatter plot of the test data: color encodes the actual class
# (y_test), marker style encodes the cluster predicted by k-Means (labels)
sns.scatterplot(
    x=x_feature,
    y=y_feature,
    data=X_test,
    hue=y_test,
    style=labels,
    palette=["orange", "green", "blue"])

# Adds cluster centers to the same plot. The center coordinates are looked
# up by column name so they always match the plotted axes, instead of
# assuming the two features are the first two columns of X_test.
plt.scatter(
    km.cluster_centers_[:, X_test.columns.get_loc(x_feature)],
    km.cluster_centers_[:, X_test.columns.get_loc(y_feature)],
    marker='x',
    s=200,
    c='red')