In [30]:
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import  linkage,dendrogram

import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.linear_model import ElasticNet, LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector,make_column_transformer
from sklearn.metrics import classification_report, mean_absolute_error, r2_score, f1_score, accuracy_score,log_loss
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor, BaggingClassifier, BaggingRegressor, \
    RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from tqdm import tqdm
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.metrics import silhouette_score


In [2]:
milk=pd.read_csv("milk.csv",index_col=0)
ss=StandardScaler().set_output(transform='pandas')
milk_scaled=ss.fit_transform(milk)

In [5]:
clust=DBSCAN(eps=0.6,min_samples=2)
clust.fit(milk_scaled)
clust.labels_

array([ 0,  0,  0,  0, -1,  1,  1,  2, -1, -1,  2,  1,  0, -1,  1,  2, -1,
       -1, -1, -1,  3,  3, -1, -1, -1])

In [6]:
df_copy=milk.copy()
df_copy['clust']=clust.labels_
df_copy

Unnamed: 0_level_0,water,protein,fat,lactose,ash,clust
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HORSE,90.1,2.6,1.0,6.9,0.35,0
ORANGUTAN,88.5,1.4,3.5,6.0,0.24,0
MONKEY,88.4,2.2,2.7,6.4,0.18,0
DONKEY,90.3,1.7,1.4,6.2,0.4,0
HIPPO,90.4,0.6,4.5,4.4,0.1,-1
CAMEL,87.7,3.5,3.4,4.8,0.71,1
BISON,86.9,4.8,1.7,5.7,0.9,1
BUFFALO,82.1,5.9,7.9,4.7,0.78,2
GUINEA PIG,81.9,7.4,7.2,2.7,0.85,-1
CAT,81.6,10.1,6.3,4.4,0.75,-1


In [10]:
inliers=milk_scaled.copy()
inliers['label']=clust.labels_
inliers=inliers[clust.labels_!=-1]
inliers

Unnamed: 0_level_0,water,protein,fat,lactose,ash,label
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HORSE,0.948806,-1.009291,-0.903208,1.542217,-1.037554,0
ORANGUTAN,0.821407,-1.344603,-0.660619,1.040773,-1.259945,0
MONKEY,0.813445,-1.121062,-0.738247,1.263637,-1.381249,0
DONKEY,0.964731,-1.260775,-0.864394,1.152205,-0.936467,0
CAMEL,0.757707,-0.757806,-0.670322,0.372182,-0.30973,1
BISON,0.694008,-0.394551,-0.835283,0.873626,0.0744,1
BUFFALO,0.31181,-0.087181,-0.233662,0.316466,-0.168208,2
FOX,0.271998,0.108418,-0.427733,0.427898,0.135052,2
LLAMA,0.662158,-0.646036,-0.689729,0.81791,-0.127774,1
MULE,0.940844,-1.176947,-0.825579,0.762194,-0.794946,0


In [11]:
silhouette_score(inliers.iloc[:,:-1],inliers['label'])

0.5934459505692155

In [24]:
eps=[0.1,0.2,0.3,0.4,0.5,1]
min_p=[2,3,4,5,6,7]
for e in eps:
    for m in min_p:
        clust=DBSCAN(eps=e,min_samples=m)
        clust.fit(milk_scaled)
        df_copy=milk_scaled.copy()
        df_copy['clust']=clust.labels_
        inliers = milk_scaled.copy()
        inliers['label'] = clust.labels_
        inliers = inliers[clust.labels_ != -1]
        inlier_mask = clust.labels_ != -1
        X_inliers = milk_scaled[inlier_mask]


        y_inliers = clust.labels_[inlier_mask]

        num_samples = len(X_inliers)
        num_unique_clusters = len(np.unique(y_inliers))

        if num_samples >= 2 and num_unique_clusters >= 2:

            print(silhouette_score(inliers.iloc[:,:-1],inliers['label']))





0.6518937593821538
0.5385180352469559
0.5934459505692155
0.5344431042454363
0.4344818095328392
0.6473871775367226


Usage with Supervised Learning

In [25]:
kyph=pd.read_csv('Kyphosis.csv')
y=kyph['Kyphosis']
X=kyph.drop('Kyphosis',axis=1)
train,test=train_test_split(kyph,test_size=0.3,random_state=25,stratify=y)
y_train=train['Kyphosis']
X_train=train.drop('Kyphosis',axis=1)
y_test=test['Kyphosis']
X_test=test.drop('Kyphosis',axis=1)


In [33]:
scaler=StandardScaler().set_output(transform='pandas')
X_train_scaled=scaler.fit_transform(X_train)
scores=[]
for i in range(2,11):
    clust=KMeans(random_state=25,n_clusters=i)
    clust.fit(X_train_scaled)
    scores.append([i,silhouette_score(X_train_scaled,clust.labels_)])
df_score=pd.DataFrame(scores,columns=['clusters','scores'])
df_score.sort_values('scores',ascending=False)

Unnamed: 0,clusters,scores
2,4,0.396206
8,10,0.39334
7,9,0.366559
3,5,0.351667
0,2,0.348402
1,3,0.330521
6,8,0.323228
5,7,0.287655
4,6,0.285068


In [34]:
clust=KMeans(random_state=25,n_clusters=4)
clust.fit(X_train_scaled)
train['cluster']=clust.labels_
train

Unnamed: 0,Kyphosis,Age,Number,Start,cluster
63,absent,118,3,16,3
45,present,139,3,10,3
66,absent,195,2,17,3
75,absent,178,4,15,3
60,present,130,4,1,1
25,absent,9,5,13,2
21,present,105,6,5,0
17,absent,175,5,13,3
58,absent,51,7,9,0
0,absent,71,3,5,1
