In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, get_scorer
from sklearn.linear_model import Lasso, Ridge, LassoCV,LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold, RepeatedKFold, GridSearchCV, cross_validate, train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
data = pd.read_csv('ml-09-k-means-animals-dataset.csv')

In [3]:
data.drop(columns=['Region', 'Stage', 'Comments', 'Individual ID'], inplace = True)

In [4]:
data.dropna(axis = 0, thresh = len(data.columns)/2, inplace=True)
data.dropna(axis = 1, thresh = len(data.columns)/2, inplace=True)

data.reset_index(drop = True, inplace = True)

In [5]:
continuous_columns = ['Delta 15 N (o/oo)', 'Delta 13 C (o/oo)']
continuous_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
continuous_imputed_df = pd.DataFrame(continuous_imputer.fit_transform(data[continuous_columns]), columns=continuous_columns)


for col in continuous_columns:
    data[col] = continuous_imputed_df[col]
    
data.replace('.' , None ,inplace = True)
categorical_columns = ['Sex']
categorical_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
categorical_imputed_df = pd.DataFrame(categorical_imputer.fit_transform(data[categorical_columns]), columns=categorical_columns)

for col in categorical_columns:
    data[col] = categorical_imputed_df[col]
    

In [6]:
breakout = pd.get_dummies(data['Clutch Completion'])
breakout
breakout.drop(columns='No', inplace = True)
breakout.columns=['Clutch Completion - Yes']

breakout = pd.get_dummies(data['Island'])
breakout
breakout.drop(columns='Torgersen', inplace = True)
data = data.merge(breakout, how = 'inner', left_index = True, right_index = True)

breakout = pd.get_dummies(data['Sex'])
breakout
breakout.drop(columns='MALE', inplace = True)
data = data.merge(breakout, how = 'inner', left_index = True, right_index = True)


In [7]:
data.drop(columns = ['Island', 'Clutch Completion', 'Sex'], inplace = True, axis = 1)
data

Unnamed: 0,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo),Biscoe,Dream,FEMALE
0,2011-11-07,39.1,18.7,181.0,3750.0,8.733382,-25.686292,0,0,0
1,2011-11-07,39.5,17.4,186.0,3800.0,8.949560,-24.694540,0,0,1
2,11/16/07,40.3,18.0,195.0,3250.0,8.368210,-25.333020,0,0,1
3,11/16/07,36.7,19.3,193.0,3450.0,8.766510,-25.324260,0,0,1
4,11/16/07,39.3,20.6,190.0,3650.0,8.664960,-25.298050,0,0,0
...,...,...,...,...,...,...,...,...,...,...
337,2012-01-09,47.2,13.7,214.0,4925.0,7.991840,-26.205380,1,0,1
338,11/22/09,46.8,14.3,215.0,4850.0,8.411510,-26.138320,1,0,1
339,11/22/09,50.4,15.7,222.0,5750.0,8.301660,-26.041170,1,0,0
340,11/22/09,45.2,14.8,212.0,5200.0,8.242460,-26.119690,1,0,1


In [8]:
data.Age.describe()

AttributeError: 'DataFrame' object has no attribute 'Age'

In [None]:
data['Datetime'] = pd.to_datetime(data['Date Egg'], infer_datetime_format=True)
data['Age'] = (pd.Timestamp.today() - data['Datetime']).dt.days/365
data.drop(columns = ['Date Egg', 'Datetime'], inplace=True)

In [None]:
sc = StandardScaler()
data_scaled = sc.fit_transform(data)
data_scaled = pd.DataFrame(data = data_scaled, columns = data.columns)
data_scaled

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(data, test_size=0.3, random_state=0)



In [None]:
X_test.reset_index(drop='True', inplace=True)

In [None]:
from sklearn.cluster import KMeans
import numpy as np

kmeans = KMeans(n_clusters=2, random_state=0).fit(X_train)

kmeans.labels_
kmeans.cluster_centers_
label = kmeans.predict(X_test)

predSeries = pd.Series(label)
df2 = X_test.merge(predSeries.rename('Target'), how='inner', left_index=True, right_index=True)

In [None]:
kmeans.feature_names_in_

In [None]:
from sklearn.decomposition import PCA
import pylab as pl
pca = PCA(n_components=2).fit(X_test)
pca_2d = pca.transform(X_test)

clusters = pca.transform(kmeans.cluster_centers_)



for i in range(0, len(clusters)):
        c0 = pl.scatter(clusters[i,0],pca_2d[i,1],c='#1f77b4', marker='X')  
        
for i in range(0, pca_2d.shape[0]):
    if df2.Target.iloc[i] == 0:
        c1 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#ff7f0e', marker='+')
    elif df2.Target.iloc[i] == 1:
        c2 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#d62728',marker='o')
 
pl.legend([c1, c2], ['1', '2'])
pl.title(' Data with 2 clusters')
pl.show()

In [None]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_train)

label= kmeans.predict(X_test)

predSeries = pd.Series(label)
df2 = X_test.merge(predSeries.rename('Target'), how='inner', left_index=True, right_index=True)


In [None]:
clusters = pca.transform(kmeans.cluster_centers_)

for i in range(0, len(clusters)):
        c0 = pl.scatter(clusters[i,0],pca_2d[i,1],c='#1f77b4', marker='X')  
        
for i in range(0, pca_2d.shape[0]):
    if df2.Target.iloc[i] == 0:
        c1 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#ff7f0e', marker='+')
    elif df2.Target.iloc[i] == 1:
        c2 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#d62728',marker='o')
    elif df2.Target.iloc[i] == 2:
        c3 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#8c564b',marker='1')
 
pl.legend([c1, c2, c3], ['1', '2', '3'])
pl.title(' Data with 3 clusters')
pl.show()

In [None]:
kmeans = KMeans(n_clusters=4, random_state=0).fit(X_train)

label= kmeans.predict(X_test)

predSeries = pd.Series(label)
df2 = X_test.merge(predSeries.rename('Target'), how='inner', left_index=True, right_index=True)


In [None]:
clusters = pca.transform(kmeans.cluster_centers_)

for i in range(0, len(clusters)):
        c0 = pl.scatter(clusters[i,0],pca_2d[i,1],c='#1f77b4', marker='X')  
        
for i in range(0, pca_2d.shape[0]):
    if df2.Target.iloc[i] == 0:
        c1 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#ff7f0e', marker='+')
    elif df2.Target.iloc[i] == 1:
        c2 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#d62728',marker='o')
    elif df2.Target.iloc[i] == 2:
        c3 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#8c564b',marker='1')
    elif df2.Target.iloc[i] == 3:
        c4 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#bcbd22',marker='2')
 
pl.legend([c1, c2, c3, c4], ['1', '2', '3', '4'])
pl.title(' Data with 4 clusters')
pl.show()

In [None]:
kmeans = KMeans(n_clusters=5, random_state=0).fit(X_train)

label= kmeans.predict(X_test)

predSeries = pd.Series(label)
df2 = X_test.merge(predSeries.rename('Target'), how='inner', left_index=True, right_index=True)


In [None]:
clusters = pca.transform(kmeans.cluster_centers_)

for i in range(0, len(clusters)):
        c0 = pl.scatter(clusters[i,0],pca_2d[i,1],c='#1f77b4', marker='X')  
        
for i in range(0, pca_2d.shape[0]):
    if df2.Target.iloc[i] == 0:
        c1 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#ff7f0e', marker='+')
    elif df2.Target.iloc[i] == 1:
        c2 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#d62728',marker='o')
    elif df2.Target.iloc[i] == 2:
        c3 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#8c564b',marker='1')
    elif df2.Target.iloc[i] == 3:
        c4 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#bcbd22',marker='2')
    elif df2.Target.iloc[i] == 4:
        c5 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='#7f7f7f',marker='3')
 
 
pl.legend([c1, c2, c3, c4, c5], ['1', '2', '3', '4', '5'])
pl.title(' Data with 5 clusters')
pl.show()

In [None]:
kmeans = KMeans(n_clusters=6, random_state=0).fit(X_train)

kmeans.labels_
kmeans.cluster_centers_
kmeans.predict(X_test)



In [None]:
kmeans = KMeans(n_clusters=7, random_state=0).fit(X_train)

kmeans.labels_
kmeans.cluster_centers_
kmeans.predict(X_test)



In [None]:
kmeans = KMeans(n_clusters=8, random_state=0).fit(X_train)

kmeans.labels_
kmeans.cluster_centers_
kmeans.predict(X_test)



In [None]:
kmeans = KMeans(n_clusters=9, random_state=0).fit(X_train)

kmeans.labels_
kmeans.cluster_centers_
kmeans.predict(X_test)



In [None]:
Sum_of_squared_distances = []
K = range(1,10)
for k in K:
    km = KMeans(n_clusters=k, random_state=0)
    km = km.fit(X_train)
    Sum_of_squared_distances.append(km.inertia_)
    
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()