In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
# NOTE(review): this silences every warning category for the rest of the notebook,
# including deprecation warnings raised by pandas and scikit-learn.
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

# D1

In [None]:
# Load the D1 dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("D1.csv")

In [None]:
# Number of missing values in each column of the loaded frame.
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [None]:
# Ground-truth labels come from 'target'; every other column becomes a feature.
y = df['target'].to_numpy()
X = df.drop('target', axis=1).to_numpy()

In [None]:
# Fit five scikit-learn clusterers on the feature matrix X and score each labelling
# against the ground-truth labels y (ARI, AMI, homogeneity, completeness, V-measure)
# together with the silhouette coefficient computed from X alone.
# The scores end up in `results1`, one row per algorithm.
from sklearn import metrics
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering, MiniBatchKMeans, Birch
from sklearn.cluster import DBSCAN

# Number of clusters requested from every estimator that accepts `n_clusters`.
number_of_clusters = 2

# Each estimator is paired with its row label so the result index cannot drift
# out of step with the estimator list.
named_algorithms = [
    ('K-means', KMeans(n_clusters=number_of_clusters, random_state=1)),
    ('MiniBatchKMeans', MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1)),
    ('AgglomerativeClustering', AgglomerativeClustering(n_clusters=number_of_clusters)),
    ('SpectralClustering', SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                                              affinity='nearest_neighbors')),
    ('DBSCAN', DBSCAN(min_samples=2)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    labels = algo.labels_

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when DBSCAN leaves a single label or one label per sample),
    # so record NaN for that estimator instead of aborting the whole comparison.
    n_labels = len(set(labels))
    if 2 <= n_labels <= len(labels) - 1:
        silhouette = metrics.silhouette_score(X, labels)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        'Silhouette': silhouette})

results1 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,0.020501,0.011402,0.013501,0.014202,0.013843,0.389411
MiniBatchKMeans,0.020501,0.011402,0.013501,0.014202,0.013843,0.389411
AgglomerativeClustering,0.011664,0.008685,0.011095,0.011032,0.011064,0.343187
SpectralClustering,0.027166,0.016351,0.018456,0.01907,0.018758,0.382121
DBSCAN,-0.002073,0.004824,0.005848,0.101469,0.011058,0.090555


In [None]:

# Fit two Gaussian mixture models on X, take the most likely component of every
# sample as its cluster label, and score that labelling against y with the same
# metrics as the clustering comparison. The scores end up in `results2`.
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture

# Number of mixture components requested from both models.
number_of_clusters = 2

# Each model is paired with its row label so the result index cannot drift out
# of step with the model list.
named_algorithms = [
    ('GaussianMixture', GaussianMixture(n_components=number_of_clusters, random_state=1)),
    ('BayesianGaussianMixture', BayesianGaussianMixture(n_components=number_of_clusters, random_state=1)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    predicted_label = algo.predict(X)

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when every sample is assigned to the same component), so
    # record NaN for that model instead of aborting the whole comparison.
    n_labels = len(set(predicted_label))
    if 2 <= n_labels <= len(predicted_label) - 1:
        silhouette = metrics.silhouette_score(X, predicted_label)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        'Silhouette': silhouette})

results2 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.001229,-0.00231,0.000569,0.000934,0.000707,0.002203
BayesianGaussianMixture,0.001229,-0.00231,0.000569,0.000934,0.000707,0.002203


In [None]:
# Stack the clustering and mixture-model score tables into one table for display.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pd.concat([results1, results2])

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,0.020501,0.011402,0.013501,0.014202,0.013843,0.389411
MiniBatchKMeans,0.020501,0.011402,0.013501,0.014202,0.013843,0.389411
AgglomerativeClustering,0.011664,0.008685,0.011095,0.011032,0.011064,0.343187
SpectralClustering,0.027166,0.016351,0.018456,0.01907,0.018758,0.382121
DBSCAN,-0.002073,0.004824,0.005848,0.101469,0.011058,0.090555
GaussianMixture,0.001229,-0.00231,0.000569,0.000934,0.000707,0.002203
BayesianGaussianMixture,0.001229,-0.00231,0.000569,0.000934,0.000707,0.002203


# D2

In [None]:
# Load the D2 dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("D2.csv")

In [None]:
# Preview the first rows of the frame loaded in the previous cell.
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [None]:
# Number of missing values in each column of the loaded frame.
df.isna().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [None]:
# Ground-truth labels come from 'DEATH_EVENT'; every other column becomes a feature.
y = df['DEATH_EVENT'].to_numpy()
X = df.drop('DEATH_EVENT', axis=1).to_numpy()

In [None]:
# Fit five scikit-learn clusterers on the feature matrix X and score each labelling
# against the ground-truth labels y (ARI, AMI, homogeneity, completeness, V-measure)
# together with the silhouette coefficient computed from X alone.
# The scores end up in `results1`, one row per algorithm.
from sklearn import metrics
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering, MiniBatchKMeans, Birch
from sklearn.cluster import DBSCAN

# Number of clusters requested from every estimator that accepts `n_clusters`.
number_of_clusters = 2

# Each estimator is paired with its row label so the result index cannot drift
# out of step with the estimator list.
named_algorithms = [
    ('K-means', KMeans(n_clusters=number_of_clusters, random_state=1)),
    ('MiniBatchKMeans', MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1)),
    ('AgglomerativeClustering', AgglomerativeClustering(n_clusters=number_of_clusters)),
    ('SpectralClustering', SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                                              affinity='nearest_neighbors')),
    ('DBSCAN', DBSCAN(eps=100, min_samples=1)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    labels = algo.labels_

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when DBSCAN leaves a single label or one label per sample),
    # so record NaN for that estimator instead of aborting the whole comparison.
    n_labels = len(set(labels))
    if 2 <= n_labels <= len(labels) - 1:
        silhouette = metrics.silhouette_score(X, labels)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        'Silhouette': silhouette})

results1 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,0.017453,-0.000411,0.002315,0.002775,0.002524,0.582889
MiniBatchKMeans,-0.002689,-0.002454,0.000105,9.5e-05,0.0001,0.456097
AgglomerativeClustering,0.004368,-0.003978,0.00023,0.000762,0.000353,0.678929
SpectralClustering,-0.00214,-0.002229,0.000342,0.000309,0.000325,0.457618
DBSCAN,-0.0002,0.004151,0.894754,0.102648,0.184168,0.160405


In [None]:

# Fit two Gaussian mixture models on X, take the most likely component of every
# sample as its cluster label, and score that labelling against y with the same
# metrics as the clustering comparison. The scores end up in `results2`.
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture

# Number of mixture components requested from both models.
number_of_clusters = 2

# Each model is paired with its row label so the result index cannot drift out
# of step with the model list.
named_algorithms = [
    ('GaussianMixture', GaussianMixture(n_components=number_of_clusters, random_state=42, n_init=10)),
    ('BayesianGaussianMixture', BayesianGaussianMixture(n_components=number_of_clusters, random_state=42)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    predicted_label = algo.predict(X)

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when every sample is assigned to the same component), so
    # record NaN for that model instead of aborting the whole comparison.
    n_labels = len(set(predicted_label))
    if 2 <= n_labels <= len(predicted_label) - 1:
        silhouette = metrics.silhouette_score(X, predicted_label)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        'Silhouette': silhouette})

results2 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.047103,0.009436,0.011154,0.013965,0.012402,0.174072
BayesianGaussianMixture,0.04964,0.010392,0.011774,0.015647,0.013437,0.19024


# D3

In [None]:
# Load the D3 dataset (path relative to the working directory) and preview its first rows.
df = pd.read_csv("D3.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Pedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Number of missing values in each column of the loaded frame.
df.isna().sum()

Pregnancies      0
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
Pedigree         0
Age              0
Outcome          0
dtype: int64

In [None]:
# Ground-truth labels come from 'Outcome'; every other column becomes a feature.
y = df['Outcome'].to_numpy()
X = df.drop('Outcome', axis=1).to_numpy()

In [None]:
# Fit five scikit-learn clusterers on the feature matrix X and score each labelling
# against the ground-truth labels y (ARI, AMI, homogeneity, completeness, V-measure)
# together with the silhouette coefficient computed from X alone.
# The scores end up in `results1`, one row per algorithm.
from sklearn import metrics
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering, MiniBatchKMeans, Birch
from sklearn.cluster import DBSCAN

# Number of clusters requested from every estimator that accepts `n_clusters`.
number_of_clusters = 2

# Each estimator is paired with its row label so the result index cannot drift
# out of step with the estimator list.
named_algorithms = [
    ('K-means', KMeans(n_clusters=number_of_clusters, random_state=1)),
    ('MiniBatchKMeans', MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1)),
    ('AgglomerativeClustering', AgglomerativeClustering(n_clusters=number_of_clusters)),
    ('SpectralClustering', SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                                              affinity='nearest_neighbors')),
    ('DBSCAN', DBSCAN(eps=5, min_samples=1)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    labels = algo.labels_

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when DBSCAN leaves a single label or one label per sample),
    # so record NaN for that estimator instead of aborting the whole comparison.
    n_labels = len(set(labels))
    if 2 <= n_labels <= len(labels) - 1:
        silhouette = metrics.silhouette_score(X, labels)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        'Silhouette': silhouette})

results1 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,0.074387,0.028461,0.026659,0.033141,0.029549,0.56879
MiniBatchKMeans,0.069677,0.0258,0.024744,0.029393,0.026869,0.56205
AgglomerativeClustering,0.100328,0.044746,0.042331,0.049863,0.045789,0.553268
SpectralClustering,-0.000208,-0.000172,0.000831,0.000775,0.000802,0.407966
DBSCAN,6e-05,0.002905,0.994418,0.097207,0.177101,0.017635


In [None]:

# Fit two Gaussian mixture models on X, take the most likely component of every
# sample as its cluster label, and score that labelling against y with the same
# metrics as the clustering comparison. The scores end up in `results2`.
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture

# Number of mixture components requested from both models.
number_of_clusters = 2

# Each model is paired with its row label so the result index cannot drift out
# of step with the model list.
named_algorithms = [
    ('GaussianMixture', GaussianMixture(n_components=number_of_clusters, random_state=42, n_init=10)),
    ('BayesianGaussianMixture', BayesianGaussianMixture(n_components=number_of_clusters, random_state=42)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    predicted_label = algo.predict(X)

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when every sample is assigned to the same component), so
    # record NaN for that model instead of aborting the whole comparison.
    n_labels = len(set(predicted_label))
    if 2 <= n_labels <= len(predicted_label) - 1:
        silhouette = metrics.silhouette_score(X, predicted_label)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        'Silhouette': silhouette})

results2 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.000953,0.000277,0.001295,0.001209,0.001251,0.39195
BayesianGaussianMixture,0.0021,0.000178,0.001192,0.001118,0.001154,0.345188


# D4

In [None]:
# Load the D4 dataset (path relative to the working directory) and preview its first rows.
df = pd.read_csv("D4.csv")
df.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


In [None]:
# Number of missing values in each column of the loaded frame.
df.isna().sum()

Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64

In [None]:
# Ground-truth labels come from 'Heart Disease'; every other column becomes a feature.
y = df['Heart Disease'].to_numpy()
X = df.drop('Heart Disease', axis=1).to_numpy()

In [None]:
# Fit five scikit-learn clusterers on the feature matrix X and score each labelling
# against the ground-truth labels y (ARI, AMI, homogeneity, completeness, V-measure)
# together with the silhouette coefficient computed from X alone.
# The scores end up in `results1`, one row per algorithm.
from sklearn import metrics
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering, MiniBatchKMeans, Birch
from sklearn.cluster import DBSCAN

# Number of clusters requested from every estimator that accepts `n_clusters`.
number_of_clusters = 2

# Each estimator is paired with its row label so the result index cannot drift
# out of step with the estimator list.
named_algorithms = [
    ('K-means', KMeans(n_clusters=number_of_clusters, random_state=1)),
    ('MiniBatchKMeans', MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1)),
    ('AgglomerativeClustering', AgglomerativeClustering(n_clusters=number_of_clusters)),
    ('SpectralClustering', SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                                              affinity='nearest_neighbors')),
    ('DBSCAN', DBSCAN(eps=5, min_samples=1)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    labels = algo.labels_

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when DBSCAN leaves a single label or one label per sample),
    # so record NaN for that estimator instead of aborting the whole comparison.
    n_labels = len(set(labels))
    if 2 <= n_labels <= len(labels) - 1:
        silhouette = metrics.silhouette_score(X, labels)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        'Silhouette': silhouette})

results1 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,0.03023,0.017199,0.019565,0.020273,0.019912,0.380418
MiniBatchKMeans,0.02481,0.013207,0.015577,0.016335,0.015947,0.384696
AgglomerativeClustering,0.01026,0.010761,0.013422,0.013475,0.013448,0.319338
SpectralClustering,0.049039,0.031695,0.034183,0.034483,0.034332,0.366991
DBSCAN,5.4e-05,0.001036,1.0,0.122819,0.218769,0.002352


In [None]:

# Fit two Gaussian mixture models on X, take the most likely component of every
# sample as its cluster label, and score that labelling against y with the same
# metrics as the clustering comparison. The scores end up in `results2`.
# NOTE(review): the saved output of this cell is identical to the D2 mixture-model
# table; re-run on a fresh kernel to confirm the numbers really belong to D4.
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture

# Number of mixture components requested from both models.
number_of_clusters = 2

# Each model is paired with its row label so the result index cannot drift out
# of step with the model list.
named_algorithms = [
    ('GaussianMixture', GaussianMixture(n_components=number_of_clusters, random_state=42, n_init=10)),
    ('BayesianGaussianMixture', BayesianGaussianMixture(n_components=number_of_clusters, random_state=42)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    predicted_label = algo.predict(X)

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when every sample is assigned to the same component), so
    # record NaN for that model instead of aborting the whole comparison.
    n_labels = len(set(predicted_label))
    if 2 <= n_labels <= len(predicted_label) - 1:
        silhouette = metrics.silhouette_score(X, predicted_label)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        'Silhouette': silhouette})

results2 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.047103,0.009436,0.011154,0.013965,0.012402,0.174072
BayesianGaussianMixture,0.04964,0.010392,0.011774,0.015647,0.013437,0.19024


# D5

In [None]:
# Load the D5 dataset (path relative to the working directory) and preview its first rows.
df = pd.read_csv("D5.csv")
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0


In [None]:
# Number of missing values in each column of the loaded frame.
df.isna().sum()

mean_radius        0
mean_texture       0
mean_perimeter     0
mean_area          0
mean_smoothness    0
diagnosis          0
dtype: int64

In [None]:
# Ground-truth labels come from 'diagnosis'; every other column becomes a feature.
y = df['diagnosis'].to_numpy()
X = df.drop('diagnosis', axis=1).to_numpy()

In [None]:
# Fit five scikit-learn clusterers on the feature matrix X and score each labelling
# against the ground-truth labels y (ARI, AMI, homogeneity, completeness, V-measure)
# together with the silhouette coefficient computed from X alone.
# The scores end up in `results1`, one row per algorithm.
from sklearn import metrics
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering, MiniBatchKMeans, Birch
from sklearn.cluster import DBSCAN

# Number of clusters requested from every estimator that accepts `n_clusters`.
number_of_clusters = 2

# Each estimator is paired with its row label so the result index cannot drift
# out of step with the estimator list.
named_algorithms = [
    ('K-means', KMeans(n_clusters=number_of_clusters, random_state=1)),
    ('MiniBatchKMeans', MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1)),
    ('AgglomerativeClustering', AgglomerativeClustering(n_clusters=number_of_clusters)),
    ('SpectralClustering', SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                                              affinity='nearest_neighbors')),
    ('DBSCAN', DBSCAN(eps=5, min_samples=1)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    labels = algo.labels_

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when DBSCAN leaves a single label or one label per sample),
    # so record NaN for that estimator instead of aborting the whole comparison.
    n_labels = len(set(labels))
    if 2 <= n_labels <= len(labels) - 1:
        silhouette = metrics.silhouette_score(X, labels)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        'Silhouette': silhouette})

results1 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,0.445745,0.418769,0.376407,0.474083,0.419636,0.699136
MiniBatchKMeans,0.435779,0.410993,0.368033,0.467574,0.411875,0.700069
AgglomerativeClustering,0.274513,0.307588,0.251491,0.399693,0.308727,0.682661
SpectralClustering,0.466065,0.375928,0.384245,0.369531,0.376745,0.510216
DBSCAN,0.075158,0.119081,0.725317,0.116563,0.200848,-0.051103


In [None]:

# Fit two Gaussian mixture models on X, take the most likely component of every
# sample as its cluster label, and score that labelling against y with the same
# metrics as the clustering comparison. The scores end up in `results2`.
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture

# Number of mixture components requested from both models.
number_of_clusters = 2

# Each model is paired with its row label so the result index cannot drift out
# of step with the model list.
named_algorithms = [
    ('GaussianMixture', GaussianMixture(n_components=number_of_clusters, random_state=42, n_init=10)),
    ('BayesianGaussianMixture', BayesianGaussianMixture(n_components=number_of_clusters, random_state=42)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    predicted_label = algo.predict(X)

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when every sample is assigned to the same component), so
    # record NaN for that model instead of aborting the whole comparison.
    n_labels = len(set(predicted_label))
    if 2 <= n_labels <= len(predicted_label) - 1:
        silhouette = metrics.silhouette_score(X, predicted_label)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        'Silhouette': silhouette})

results2 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.635905,0.521522,0.509432,0.535576,0.522177,0.610674
BayesianGaussianMixture,0.6413,0.533724,0.518597,0.551123,0.534366,0.616994


# D6

In [None]:
# Load the D6 dataset (path relative to the working directory) and preview its first rows.
df = pd.read_csv("D6.csv")
df.head()

Unnamed: 0,behavior_sexualRisk,behavior_eating,behavior_personalHygine,intention_aggregation,intention_commitment,attitude_consistency,attitude_spontaneity,norm_significantPerson,norm_fulfillment,perception_vulnerability,perception_severity,motivation_strength,motivation_willingness,socialSupport_emotionality,socialSupport_appreciation,socialSupport_instrumental,empowerment_knowledge,empowerment_abilities,empowerment_desires,ca_cervix
0,10,13,12,4,7,9,10,1,8,7,3,14,8,5,7,12,12,11,8,1
1,10,11,11,10,14,7,7,5,5,4,2,15,13,7,6,5,5,4,4,1
2,10,15,3,2,14,8,10,1,4,7,2,7,3,3,6,11,3,3,15,1
3,10,11,10,10,15,7,7,1,5,4,2,15,13,7,4,4,4,4,4,1
4,8,11,7,8,10,7,8,1,5,3,2,15,5,3,6,12,5,4,7,1


In [None]:
# Ground-truth labels come from 'ca_cervix'; every other column becomes a feature.
y = df['ca_cervix'].to_numpy()
X = df.drop('ca_cervix', axis=1).to_numpy()

In [None]:
# Fit five scikit-learn clusterers on the feature matrix X and score each labelling
# against the ground-truth labels y (ARI, AMI, homogeneity, completeness, V-measure)
# together with the silhouette coefficient computed from X alone.
# The scores end up in `results1`, one row per algorithm.
from sklearn import metrics
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering, MiniBatchKMeans, Birch
from sklearn.cluster import DBSCAN

# Number of clusters requested from every estimator that accepts `n_clusters`.
number_of_clusters = 2

# Each estimator is paired with its row label so the result index cannot drift
# out of step with the estimator list.
named_algorithms = [
    ('K-means', KMeans(n_clusters=number_of_clusters, random_state=1)),
    ('MiniBatchKMeans', MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1)),
    ('AgglomerativeClustering', AgglomerativeClustering(n_clusters=number_of_clusters)),
    ('SpectralClustering', SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                                              affinity='nearest_neighbors')),
    ('DBSCAN', DBSCAN(eps=5, min_samples=1)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    labels = algo.labels_

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when DBSCAN leaves a single label or one label per sample),
    # so record NaN for that estimator instead of aborting the whole comparison.
    n_labels = len(set(labels))
    if 2 <= n_labels <= len(labels) - 1:
        silhouette = metrics.silhouette_score(X, labels)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        'Silhouette': silhouette})

results1 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,0.296139,0.219228,0.241275,0.216048,0.227966,0.280125
MiniBatchKMeans,0.240303,0.209232,0.233496,0.204369,0.217964,0.267026
AgglomerativeClustering,0.429168,0.305751,0.324652,0.303499,0.313719,0.27041
SpectralClustering,0.138516,0.220922,0.244752,0.216163,0.229571,0.211825
DBSCAN,0.003388,0.022388,1.0,0.144649,0.252739,0.056575


In [None]:

# Fit two Gaussian mixture models on X, take the most likely component of every
# sample as its cluster label, and score that labelling against y with the same
# metrics as the clustering comparison. The scores end up in `results2`.
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture

# Number of mixture components requested from both models.
number_of_clusters = 2

# Each model is paired with its row label so the result index cannot drift out
# of step with the model list.
named_algorithms = [
    ('GaussianMixture', GaussianMixture(n_components=number_of_clusters, random_state=42, n_init=10)),
    ('BayesianGaussianMixture', BayesianGaussianMixture(n_components=number_of_clusters, random_state=42)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    predicted_label = algo.predict(X)

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when every sample is assigned to the same component), so
    # record NaN for that model instead of aborting the whole comparison.
    n_labels = len(set(predicted_label))
    if 2 <= n_labels <= len(predicted_label) - 1:
        silhouette = metrics.silhouette_score(X, predicted_label)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        'Silhouette': silhouette})

results2 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.327063,0.238168,0.259944,0.23481,0.246739,0.282129
BayesianGaussianMixture,0.327063,0.238168,0.259944,0.23481,0.246739,0.282129


# D7

In [None]:
# Load the D7 dataset (path relative to the working directory) and preview its first rows.
df = pd.read_csv("D7.csv")
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [None]:
# Encode Gender numerically (Female -> 0, Male -> 1) so the column can be clustered.
# The result is assigned back to the column instead of calling replace(inplace=True)
# on the selected Series: under pandas Copy-on-Write that chained in-place call
# modifies a temporary object and leaves `df` untouched.
# astype('int64') makes the dtype explicit and fails loudly if any other value is present.
df['Gender'] = df['Gender'].replace({'Female': 0, 'Male': 1}).astype('int64')

In [None]:
# Display the frame to check the Gender encoding applied in the previous cell.
df

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,1,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,1,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,1,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,1,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [None]:
# Number of missing values in each column of the loaded frame.
df.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [None]:
# Ground-truth labels come from 'Dataset'. 'Albumin_and_Globulin_Ratio' is left out
# of the features as well, presumably because it is the only column with missing
# values in the null count; confirm with the author.
y = df['Dataset'].to_numpy()
X = df.drop(['Dataset', 'Albumin_and_Globulin_Ratio'], axis=1).to_numpy()

In [None]:
# Fit five scikit-learn clusterers on the feature matrix X and score each labelling
# against the ground-truth labels y (ARI, AMI, homogeneity, completeness, V-measure)
# together with the silhouette coefficient computed from X alone.
# The scores end up in `results1`, one row per algorithm.
from sklearn import metrics
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering, MiniBatchKMeans, Birch
from sklearn.cluster import DBSCAN

# Number of clusters requested from every estimator that accepts `n_clusters`.
number_of_clusters = 2

# Each estimator is paired with its row label so the result index cannot drift
# out of step with the estimator list.
named_algorithms = [
    ('K-means', KMeans(n_clusters=number_of_clusters, random_state=1)),
    ('MiniBatchKMeans', MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1)),
    ('AgglomerativeClustering', AgglomerativeClustering(n_clusters=number_of_clusters)),
    ('SpectralClustering', SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                                              affinity='nearest_neighbors')),
    ('DBSCAN', DBSCAN(eps=5, min_samples=1)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    labels = algo.labels_

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when DBSCAN leaves a single label or one label per sample),
    # so record NaN for that estimator instead of aborting the whole comparison.
    n_labels = len(set(labels))
    if 2 <= n_labels <= len(labels) - 1:
        silhouette = metrics.silhouette_score(X, labels)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        'Silhouette': silhouette})

results1 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,-0.020947,0.015952,0.010753,0.068809,0.0186,0.857255
MiniBatchKMeans,-0.070884,0.070261,0.063822,0.081975,0.071769,0.662823
AgglomerativeClustering,-0.004071,0.000246,0.001937,0.050678,0.003731,0.925598
SpectralClustering,-0.052872,0.054839,0.053741,0.059035,0.056264,0.595373
DBSCAN,8.9e-05,0.004086,0.977152,0.092831,0.169554,0.062416


In [None]:

# Fit two Gaussian mixture models on X, take the most likely component of every
# sample as its cluster label, and score that labelling against y with the same
# metrics as the clustering comparison. The scores end up in `results2`.
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture

# Number of mixture components requested from both models.
number_of_clusters = 2

# Each model is paired with its row label so the result index cannot drift out
# of step with the model list.
named_algorithms = [
    ('GaussianMixture', GaussianMixture(n_components=number_of_clusters, random_state=42, n_init=10)),
    ('BayesianGaussianMixture', BayesianGaussianMixture(n_components=number_of_clusters, random_state=42)),
]
# Plain estimator list, kept because the original cell exposed it under this name.
algorithms = [estimator for _, estimator in named_algorithms]

data = []
for name, algo in named_algorithms:
    algo.fit(X)
    predicted_label = algo.predict(X)

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
    # (for example when every sample is assigned to the same component), so
    # record NaN for that model instead of aborting the whole comparison.
    n_labels = len(set(predicted_label))
    if 2 <= n_labels <= len(predicted_label) - 1:
        silhouette = metrics.silhouette_score(X, predicted_label)
    else:
        silhouette = float('nan')

    # The 'Homogenity' key is kept exactly as spelled so the column name is unchanged.
    data.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        'Silhouette': silhouette})

results2 = pd.DataFrame(data=data,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure', 'Silhouette'],
                        index=[name for name, _ in named_algorithms])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,-0.069814,0.051734,0.042942,0.070731,0.05344,0.575227
BayesianGaussianMixture,-0.023899,0.09557,0.097717,0.096017,0.09686,0.472295


# D8

In [None]:
# Load the D8 dataset from disk and preview its first rows.
df = pd.read_csv("D8.csv")
df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [None]:
# Encode the two string-valued columns as integers:
#   GENDER:      'M' -> 0, 'F' -> 1
#   LUNG_CANCER: 'NO' -> 0, 'YES' -> 1
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['col']: that chained in-place form acts on
# a temporary Series under pandas Copy-on-Write and leaves df unchanged.
df['GENDER'] = df['GENDER'].replace({'M': 0, 'F': 1})
df['LUNG_CANCER'] = df['LUNG_CANCER'].replace({'YES': 1, 'NO': 0})
df

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,0,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,0,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,1,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,0,63,2,2,2,1,1,1,1,1,2,1,1,2,2,0
4,1,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,1,56,1,1,1,2,2,2,1,1,2,2,2,2,1,1
305,0,70,2,1,1,1,1,2,2,2,2,2,2,1,2,1
306,0,58,2,1,1,1,1,1,2,2,2,2,1,1,2,1
307,0,67,2,1,2,1,1,2,2,1,2,2,2,1,2,1


In [None]:
# Count missing values in each column.
df.isnull().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

In [None]:
# Feature matrix: every column except the LUNG_CANCER label, as a NumPy array.
X = df.drop(columns=['LUNG_CANCER']).to_numpy()
# Label column, kept separately from the features.
y = df['LUNG_CANCER'].to_numpy()

In [None]:
from sklearn import metrics
import pandas as pd
from sklearn.cluster import (AffinityPropagation, AgglomerativeClustering, Birch,
                             DBSCAN, KMeans, MiniBatchKMeans, SpectralClustering)

number_of_clusters = 2

# Estimators to compare; the order must match the index labels of results1.
algorithms = [
    KMeans(n_clusters=number_of_clusters, random_state=1),
    MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1),
    AgglomerativeClustering(n_clusters=number_of_clusters),
    SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                       affinity='nearest_neighbors'),
    DBSCAN(eps=5, min_samples=1),
]

# Column order of the score table.
metric_names = ['ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette']

data = []
for algo in algorithms:
    # Fit on X, then score the fitted labels_ against y (and against X for silhouette).
    algo.fit(X)
    scores = (
        metrics.adjusted_rand_score(y, algo.labels_),
        metrics.adjusted_mutual_info_score(y, algo.labels_,
                                           average_method='arithmetic'),
        metrics.homogeneity_score(y, algo.labels_),
        metrics.completeness_score(y, algo.labels_),
        metrics.v_measure_score(y, algo.labels_),
        metrics.silhouette_score(X, algo.labels_),
    )
    data.append(dict(zip(metric_names, scores)))

# One row per algorithm, one column per score.
results1 = pd.DataFrame(data, columns=metric_names,
                        index=['K-means', 'MiniBatchKMeans', 'AgglomerativeClustering',
                               'SpectralClustering', 'DBSCAN'])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,-0.021331,0.004365,0.010276,0.005933,0.007523,0.508842
MiniBatchKMeans,-0.001318,-0.001367,0.002398,0.001312,0.001696,0.476274
AgglomerativeClustering,-0.059457,0.016574,0.024764,0.01683,0.020041,0.472297
SpectralClustering,-0.021095,0.001696,0.006613,0.003899,0.004906,0.51389
DBSCAN,0.061167,0.046018,0.037926,0.174092,0.062283,0.436432


In [None]:

from sklearn.mixture import BayesianGaussianMixture, GaussianMixture

number_of_clusters = 2

# Two mixture models with the same number of components and a fixed seed.
algorithms = [
    GaussianMixture(n_components=number_of_clusters, random_state=42, n_init=10),
    BayesianGaussianMixture(n_components=number_of_clusters, random_state=42),
]

# Column order of the score table.
metric_names = ['ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette']

data = []
for algo in algorithms:
    # Hard cluster assignments are taken from predict() after fitting on X.
    algo.fit(X)
    predicted_label = algo.predict(X)
    scores = (
        metrics.adjusted_rand_score(y, predicted_label),
        metrics.adjusted_mutual_info_score(y, predicted_label,
                                           average_method='arithmetic'),
        metrics.homogeneity_score(y, predicted_label),
        metrics.completeness_score(y, predicted_label),
        metrics.v_measure_score(y, predicted_label),
        metrics.silhouette_score(X, predicted_label),
    )
    data.append(dict(zip(metric_names, scores)))

# One row per mixture model, one column per score.
results2 = pd.DataFrame(data, columns=metric_names,
                        index=['GaussianMixture', 'BayesianGaussianMixture'])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.06088,0.007319,0.013081,0.00941,0.010946,0.021026
BayesianGaussianMixture,0.013617,0.001034,0.005759,0.003242,0.004149,0.014532


# D9

In [None]:
# Load the thyroid dataset (D9) and preview its first rows.
thyroidDF = pd.read_csv("thyroidDF(D9).csv")
thyroidDF.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
0,29,F,f,f,f,f,f,f,f,t,...,,f,,f,,f,,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,...,128.0,f,,f,,f,,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,11.0,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,26.0,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,36.0,other,S,840803047


In [None]:
# Columns removed before analysis: the *_measured flags, the patient
# identifier and the referral source.
redundant_columns = ['TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
                     'FTI_measured', 'TBG_measured', 'patient_id', 'referral_source']
thyroidDF.drop(columns=redundant_columns, inplace=True)

# Collapse the raw single-letter target codes into three diagnostic groups.
diagnoses = {'-': 'negative'}
diagnoses.update(dict.fromkeys('ABCD', 'hyperthyroid'))
diagnoses.update(dict.fromkeys('EFGH', 'hypothyroid'))

thyroidDF['target'] = thyroidDF['target'].map(diagnoses)
# Codes absent from the mapping become NaN under map(); drop those rows.
thyroidDF.dropna(subset=['target'], inplace=True)

# Column names, non-null counts and dtypes of what is left.
thyroidDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7546 entries, 0 to 9171
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  7546 non-null   int64  
 1   sex                  7296 non-null   object 
 2   on_thyroxine         7546 non-null   object 
 3   query_on_thyroxine   7546 non-null   object 
 4   on_antithyroid_meds  7546 non-null   object 
 5   sick                 7546 non-null   object 
 6   pregnant             7546 non-null   object 
 7   thyroid_surgery      7546 non-null   object 
 8   I131_treatment       7546 non-null   object 
 9   query_hypothyroid    7546 non-null   object 
 10  query_hyperthyroid   7546 non-null   object 
 11  lithium              7546 non-null   object 
 12  goitre               7546 non-null   object 
 13  tumor                7546 non-null   object 
 14  hypopituitary        7546 non-null   object 
 15  psych                7546 non-null   o

In [None]:
# Ages above 100 are replaced by NaN; every other age is kept unchanged.
age_above_100 = thyroidDF['age'] > 100
thyroidDF['age'] = np.where(age_above_100, np.nan, thyroidDF['age'])

In [None]:
# Encode string flags as integers across the whole frame, one value at a time:
# boolean strings 'f'/'t' become 0/1, then sex 'M'/'F' becomes 0/1.
for token, code in (('f', 0), ('t', 1), ('M', 0), ('F', 1)):
    thyroidDF.replace(token, code, inplace=True)

# Remaining missing values are filled with 0 in a separate frame.
xgbDF = thyroidDF.replace(np.nan, 0)

# Integer codes for the three diagnostic groups.
# Original author's note: ordering these classes differently changed the final results.
diagnoses = dict(negative=0, hypothyroid=1, hyperthyroid=2)

xgbDF['target'] = xgbDF['target'].map(diagnoses)

# Separate the features from the encoded target (no train/test split is done here).
X = xgbDF.drop(columns='target').copy()
y = xgbDF.loc[:, 'target'].copy()

In [None]:
from sklearn import metrics
import pandas as pd
from sklearn.cluster import (AffinityPropagation, AgglomerativeClustering, Birch,
                             DBSCAN, KMeans, MiniBatchKMeans, SpectralClustering)

number_of_clusters = 2

# Estimators to compare; the order must match the index labels of results1.
algorithms = [
    KMeans(n_clusters=number_of_clusters, random_state=1),
    MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1),
    AgglomerativeClustering(n_clusters=number_of_clusters),
    SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                       affinity='nearest_neighbors'),
    DBSCAN(eps=5, min_samples=1),
]

# Column order of the score table.
metric_names = ['ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette']

data = []
for algo in algorithms:
    # Fit on X, then score the fitted labels_ against y (and against X for silhouette).
    algo.fit(X)
    scores = (
        metrics.adjusted_rand_score(y, algo.labels_),
        metrics.adjusted_mutual_info_score(y, algo.labels_,
                                           average_method='arithmetic'),
        metrics.homogeneity_score(y, algo.labels_),
        metrics.completeness_score(y, algo.labels_),
        metrics.v_measure_score(y, algo.labels_),
        metrics.silhouette_score(X, algo.labels_),
    )
    data.append(dict(zip(metric_names, scores)))

# One row per algorithm, one column per score.
results1 = pd.DataFrame(data, columns=metric_names,
                        index=['K-means', 'MiniBatchKMeans', 'AgglomerativeClustering',
                               'SpectralClustering', 'DBSCAN'])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,0.186766,0.083954,0.087774,0.081011,0.084257,0.550874
MiniBatchKMeans,0.051425,0.06736,0.091663,0.053538,0.067596,0.33926
AgglomerativeClustering,0.11343,0.033853,0.03211,0.036598,0.034207,0.600233
SpectralClustering,0.026573,0.021355,0.011369,0.346839,0.022017,0.842634
DBSCAN,0.245913,0.145192,0.791271,0.108727,0.191183,-0.501864


In [None]:

from sklearn.mixture import BayesianGaussianMixture, GaussianMixture

number_of_clusters = 2

# Two mixture models with the same number of components and a fixed seed.
algorithms = [
    GaussianMixture(n_components=number_of_clusters, random_state=42, n_init=10),
    BayesianGaussianMixture(n_components=number_of_clusters, random_state=42),
]

# Column order of the score table.
metric_names = ['ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette']

data = []
for algo in algorithms:
    # Hard cluster assignments are taken from predict() after fitting on X.
    algo.fit(X)
    predicted_label = algo.predict(X)
    scores = (
        metrics.adjusted_rand_score(y, predicted_label),
        metrics.adjusted_mutual_info_score(y, predicted_label,
                                           average_method='arithmetic'),
        metrics.homogeneity_score(y, predicted_label),
        metrics.completeness_score(y, predicted_label),
        metrics.v_measure_score(y, predicted_label),
        metrics.silhouette_score(X, predicted_label),
    )
    data.append(dict(zip(metric_names, scores)))

# One row per mixture model, one column per score.
results2 = pd.DataFrame(data, columns=metric_names,
                        index=['GaussianMixture', 'BayesianGaussianMixture'])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,-0.046495,0.00584,0.006532,0.005833,0.006163,-0.097171
BayesianGaussianMixture,0.138256,0.067597,0.086844,0.05567,0.067847,0.278173


In [None]:
# Stack the clustering and mixture-model score tables row-wise.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
D9r = pd.concat([results1, results2])

In [None]:
# Write the combined D9 score table to disk.
D9r.to_csv("D9r.csv")

# D10

In [None]:
# Load the kidney-disease dataset and preview its first rows.
kidney = pd.read_csv("kidney_disease(D13).csv")
kidney.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [None]:
# Mapping the text to 1/0 and cleaning the dataset
# Each replace below runs on a subset of columns and the result is written back
# to those same columns. Values not listed in a mapping are left untouched.
kidney[['htn','dm','cad','pe','ane']] = kidney[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})
kidney[['rbc','pc']] = kidney[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})
kidney[['pcc','ba']] = kidney[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})
# In 'appet' the value 'no' is turned into a missing value rather than a code.
kidney[['appet']] = kidney[['appet']].replace(to_replace={'good':1,'poor':0,'no':np.nan})
# The label mapping also covers 'ckd' followed by a tab character, and 'no'.
kidney['classification'] = kidney['classification'].replace(to_replace={'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0})
kidney.rename(columns={'classification':'class'},inplace=True)

kidney['pe'] = kidney['pe'].replace(to_replace='good',value=0) # Not having pedal edema is good
# NOTE(review): 'no' in 'appet' was already mapped to NaN above, so this looks like a no-op; confirm.
kidney['appet'] = kidney['appet'].replace(to_replace='no',value=0)
# Tab- or space-prefixed variants of yes/no that the first mapping did not match.
kidney['cad'] = kidney['cad'].replace(to_replace='\tno',value=0)
kidney['dm'] = kidney['dm'].replace(to_replace={'\tno':0,'\tyes':1,' yes':1, '':np.nan})
# The id column is removed in place; re-running this cell would fail once it is gone.
kidney.drop('id',axis=1,inplace=True)

In [None]:
# Keep only the rows that have no missing value in any column.
kidney = kidney.dropna()

In [None]:
# Features: every column except the last one.
# NOTE(review): this presumes 'class' is the last column; verify.
X = kidney.iloc[:,:-1]
# Label column, kept separately from the features.
y = kidney['class']

In [None]:
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.cluster import (AffinityPropagation, AgglomerativeClustering, Birch,
                             DBSCAN, KMeans, MiniBatchKMeans, SpectralClustering)

number_of_clusters = 2

# Estimators to compare; the order must match the index labels of results1.
algorithms = [
    KMeans(n_clusters=number_of_clusters, random_state=1),
    MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1),
    AgglomerativeClustering(n_clusters=number_of_clusters),
    SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                       affinity='nearest_neighbors'),
    DBSCAN(eps=5, min_samples=1),
]

# Column order of the score table (same columns as the other dataset sections).
metric_names = ['ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette']

data = []
for algo in algorithms:
    algo.fit(X)
    labels = algo.labels_

    # silhouette_score is only defined for 2 .. n_samples - 1 distinct labels
    # and raises otherwise. Record NaN for a degenerate labelling (a single
    # cluster, or one cluster per sample) instead of dropping the column for
    # every algorithm.
    n_labels = np.unique(labels).size
    if 1 < n_labels < len(labels):
        silhouette = metrics.silhouette_score(X, labels)
    else:
        silhouette = np.nan

    data.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        'Silhouette': silhouette,
    })

# One row per algorithm, one column per score.
results1 = pd.DataFrame(data, columns=metric_names,
                        index=['K-means', 'MiniBatchKMeans', 'AgglomerativeClustering',
                               'SpectralClustering', 'DBSCAN'])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure
K-means,0.32337,0.3013005,0.23541,0.439134,0.306508
MiniBatchKMeans,0.085916,0.04232042,0.050519,0.044341,0.047229
AgglomerativeClustering,0.213517,0.2083579,0.150887,0.374382,0.215088
SpectralClustering,0.04558,0.03028592,0.038283,0.032516,0.035165
DBSCAN,0.0,2.479722e-15,1.0,0.11563,0.207291


In [None]:

from sklearn.mixture import BayesianGaussianMixture, GaussianMixture

number_of_clusters = 2

# Two mixture models with the same number of components and a fixed seed.
algorithms = [
    GaussianMixture(n_components=number_of_clusters, random_state=42, n_init=10),
    BayesianGaussianMixture(n_components=number_of_clusters, random_state=42),
]

# Column order of the score table.
metric_names = ['ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette']

data = []
for algo in algorithms:
    # Hard cluster assignments are taken from predict() after fitting on X.
    algo.fit(X)
    predicted_label = algo.predict(X)
    scores = (
        metrics.adjusted_rand_score(y, predicted_label),
        metrics.adjusted_mutual_info_score(y, predicted_label,
                                           average_method='arithmetic'),
        metrics.homogeneity_score(y, predicted_label),
        metrics.completeness_score(y, predicted_label),
        metrics.v_measure_score(y, predicted_label),
        metrics.silhouette_score(X, predicted_label),
    )
    data.append(dict(zip(metric_names, scores)))

# One row per mixture model, one column per score.
results2 = pd.DataFrame(data, columns=metric_names,
                        index=['GaussianMixture', 'BayesianGaussianMixture'])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.32337,0.3013,0.23541,0.439134,0.306508,0.682043
BayesianGaussianMixture,0.32337,0.3013,0.23541,0.439134,0.306508,0.682043


In [None]:
# Stack the clustering and mixture-model score tables row-wise.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
D10r = pd.concat([results1, results2])

In [None]:
# Write the combined D10 score table to disk.
D10r.to_csv("D10r.csv")

# D11

In [None]:
# Load the Indian liver patient dataset and preview its first rows.
data = pd.read_csv("indian_liver_patient(D14).csv")
data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [None]:
# Impute missing albumin/globulin ratios with the column mean.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on data['col'] acts on a temporary Series under
# pandas Copy-on-Write and would leave `data` unchanged.
mean_ratio = data['Albumin_and_Globulin_Ratio'].mean()
data['Albumin_and_Globulin_Ratio'] = data['Albumin_and_Globulin_Ratio'].fillna(mean_ratio)

# Encode gender as 0 (Male) / 1 (Female).
data['Gender'] = data['Gender'].replace({'Male': 0, 'Female': 1})

In [None]:
# Features: every column except 'Dataset'.
X = data.drop('Dataset', axis=1)
# 'Dataset' is the label column, kept separately from the features.
y = data['Dataset']

In [None]:
from sklearn import metrics
import pandas as pd
from sklearn.cluster import (AffinityPropagation, AgglomerativeClustering, Birch,
                             DBSCAN, KMeans, MiniBatchKMeans, SpectralClustering)

number_of_clusters = 2

# Estimators to compare; the order must match the index labels of results1.
algorithms = [
    KMeans(n_clusters=number_of_clusters, random_state=1),
    MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1),
    AgglomerativeClustering(n_clusters=number_of_clusters),
    SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                       affinity='nearest_neighbors'),
    DBSCAN(eps=5, min_samples=1),
]

# Column order of the score table.
metric_names = ['ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette']

# Score rows are collected in `records`, not `data`: in this section `data`
# is the loaded DataFrame, and overwriting it with a list breaks any re-run
# of the earlier cells that still read from it.
records = []
for algo in algorithms:
    # Fit on X, then score the fitted labels_ against y (and against X for silhouette).
    algo.fit(X)
    records.append({
        'ARI': metrics.adjusted_rand_score(y, algo.labels_),
        'AMI': metrics.adjusted_mutual_info_score(y, algo.labels_,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, algo.labels_),
        'Completeness': metrics.completeness_score(y, algo.labels_),
        'V-measure': metrics.v_measure_score(y, algo.labels_),
        'Silhouette': metrics.silhouette_score(X, algo.labels_),
    })

# One row per algorithm, one column per score.
results1 = pd.DataFrame(records, columns=metric_names,
                        index=['K-means', 'MiniBatchKMeans', 'AgglomerativeClustering',
                               'SpectralClustering', 'DBSCAN'])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,-0.020947,0.015952,0.010753,0.068809,0.0186,0.857255
MiniBatchKMeans,-0.070884,0.070261,0.063822,0.081975,0.071769,0.662822
AgglomerativeClustering,-0.004071,0.000246,0.001937,0.050678,0.003731,0.925598
SpectralClustering,-0.052872,0.054839,0.053741,0.059035,0.056264,0.595371
DBSCAN,8.9e-05,0.004086,0.977152,0.092831,0.169554,0.062389


In [None]:

from sklearn.mixture import BayesianGaussianMixture, GaussianMixture

number_of_clusters = 2

# Two mixture models with the same number of components and a fixed seed.
algorithms = [
    GaussianMixture(n_components=number_of_clusters, random_state=42, n_init=10),
    BayesianGaussianMixture(n_components=number_of_clusters, random_state=42),
]

# Column order of the score table.
metric_names = ['ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette']

# Score rows are collected in `records`, not `data`, so this cell does not
# rebind the name used for the loaded DataFrame in this section.
records = []
for algo in algorithms:
    # Hard cluster assignments are taken from predict() after fitting on X.
    algo.fit(X)
    predicted_label = algo.predict(X)
    records.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        'Silhouette': metrics.silhouette_score(X, predicted_label),
    })

# One row per mixture model, one column per score.
results2 = pd.DataFrame(records, columns=metric_names,
                        index=['GaussianMixture', 'BayesianGaussianMixture'])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,-0.004144,-0.001293,0.000111,0.000106,0.000109,0.065568
BayesianGaussianMixture,-0.008085,0.000655,0.002128,0.001911,0.002014,0.092262


In [None]:
# Stack the clustering and mixture-model score tables row-wise.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
D11r = pd.concat([results1, results2])

In [None]:
# Write the combined D11 score table to disk.
D11r.to_csv("D11r.csv")

# D12

In [None]:
# Load the autism screening dataset.
# NOTE(review): the file has an .arff extension but is parsed with read_csv;
# presumably its contents are plain CSV. Confirm.
data = pd.read_csv('Autism_Data(D18).arff')

In [None]:
# Turn every "?" entry into NaN so it is handled as a missing value.
data.replace("?",np.nan,inplace=True)

In [None]:
# Remove the 'used_app_before' column.
data=data.drop('used_app_before',axis=1)

In [None]:
# Convert each age entry to float, element by element.
data['age'] = data['age'].map(float)

# Drop, in place, every row that still has a missing value in any column.
data.dropna(inplace=True)

In [None]:
# An age of exactly 383 is overwritten with 30.
data.loc[data.age == 383, 'age'] = 30

In [None]:
# Fill missing ages with 30.
# NOTE(review): an earlier cell calls data.dropna(inplace=True), so no missing
# ages should remain and this looks like a no-op; confirm.
data['age']=data['age'].fillna(30)
# Remove the 'ethnicity' column.
data=data.drop('ethnicity',axis=1)


In [None]:
# Remove three more columns in place; 'contry_of_res' is spelled as written here.
data.drop(['contry_of_res','age_desc','relation'],axis=1,inplace=True)
data.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,jundice,austim,result,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,26.0,f,no,no,6,NO
1,1,1,0,1,0,0,0,1,0,1,24.0,m,no,yes,5,NO
2,1,1,0,1,1,0,1,1,1,1,27.0,m,yes,yes,8,YES
3,1,1,0,1,0,0,1,1,0,1,35.0,f,no,yes,6,NO
5,1,1,1,1,1,0,1,1,1,1,36.0,m,yes,no,9,YES


In [None]:
# One-hot encode the four categorical columns. drop_first=True drops the first
# level of each, so a two-level column yields a single indicator column.
# 'gender' is encoded without a prefix; the others get the prefix given here.
sex=pd.get_dummies(data['gender'],drop_first=True)
jaund=pd.get_dummies(data['jundice'],drop_first=True,prefix="Had_jaundice")
rel_autism=pd.get_dummies(data['austim'],drop_first=True,prefix="Rel_had")
detected=pd.get_dummies(data['Class/ASD'],drop_first=True,prefix="Detected")

In [None]:
# Swap the original categorical columns for their indicator columns:
# drop the originals, then append the dummy frames column-wise.
data=data.drop(['gender','jundice','austim','Class/ASD'],axis=1)
data_featured=pd.concat([data,sex,jaund,rel_autism,detected],axis=1)
data_featured.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,result,m,Had_jaundice_yes,Rel_had_yes,Detected_YES
0,1,1,1,1,0,0,1,1,0,0,26.0,6,0,0,0,0
1,1,1,0,1,0,0,0,1,0,1,24.0,5,1,0,1,0
2,1,1,0,1,1,0,1,1,1,1,27.0,8,1,1,1,1
3,1,1,0,1,0,0,1,1,0,1,35.0,6,0,0,1,0
5,1,1,1,1,1,0,1,1,1,1,36.0,9,1,1,0,1


In [None]:
# Feature columns, in the order they are passed on.
feature_columns = [f'A{i}_Score' for i in range(1, 11)]
feature_columns += ['age', 'result', 'm', 'Had_jaundice_yes', 'Rel_had_yes']

X = data_featured.loc[:, feature_columns]
# 'Detected_YES' is the label column, kept separately from the features.
y = data_featured.loc[:, 'Detected_YES']

In [None]:
# MinMaxScaler is imported explicitly here: the `preprocessing` module name
# used before is not imported in any visible cell, so the cell would raise
# NameError on a fresh kernel unless it is imported elsewhere.
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to the [0, 1] range.
x = X.values  # plain NumPy array of the features
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Wrap the result in a new DataFrame; columns become 0..n-1 and the index is reset.
X = pd.DataFrame(x_scaled)

In [None]:
# Display the scaled feature frame.
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.191489,0.6,0.0,0.0,0.0
1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.148936,0.5,1.0,0.0,1.0
2,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.212766,0.8,1.0,1.0,1.0
3,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.382979,0.6,0.0,0.0,1.0
4,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.404255,0.9,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.212766,1.0,0.0,0.0,0.0
605,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.170213,0.7,0.0,0.0,0.0
606,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.361702,0.3,1.0,0.0,0.0
607,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.382979,0.6,1.0,0.0,0.0


In [None]:
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.cluster import (AffinityPropagation, AgglomerativeClustering, Birch,
                             DBSCAN, KMeans, MiniBatchKMeans, SpectralClustering)

number_of_clusters = 2

# Estimators to compare; the order must match the index labels of results1.
# NOTE(review): if X is min-max scaled to [0, 1] with 15 columns, no two rows
# are farther apart than sqrt(15) ~= 3.87, so eps=5 joins every row into one
# DBSCAN cluster. Revisit eps for this dataset.
algorithms = [
    KMeans(n_clusters=number_of_clusters, random_state=1),
    MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1),
    AgglomerativeClustering(n_clusters=number_of_clusters),
    SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                       affinity='nearest_neighbors'),
    DBSCAN(eps=5, min_samples=1),
]

# Column order of the score table (same columns as the other dataset sections).
metric_names = ['ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette']

# Score rows are collected in `records`, not `data`, so this cell does not
# rebind the name used for the loaded DataFrame in this section.
records = []
for algo in algorithms:
    algo.fit(X)
    labels = algo.labels_

    # silhouette_score is only defined for 2 .. n_samples - 1 distinct labels
    # and raises otherwise. Record NaN for a degenerate labelling instead of
    # dropping the column for every algorithm.
    n_labels = np.unique(labels).size
    if 1 < n_labels < len(labels):
        silhouette = metrics.silhouette_score(X, labels)
    else:
        silhouette = np.nan

    records.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        'Silhouette': silhouette,
    })

# One row per algorithm, one column per score.
results1 = pd.DataFrame(records, columns=metric_names,
                        index=['K-means', 'MiniBatchKMeans', 'AgglomerativeClustering',
                               'SpectralClustering', 'DBSCAN'])

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure
K-means,0.717183,0.662051,0.691125,0.636134,0.66249
MiniBatchKMeans,0.684219,0.635516,0.666091,0.608488,0.635988
AgglomerativeClustering,0.374193,0.414437,0.444483,0.389499,0.415179
SpectralClustering,0.851002,0.75035,0.759905,0.741686,0.750685
DBSCAN,0.0,0.0,0.0,1.0,0.0


In [None]:

from sklearn.mixture import BayesianGaussianMixture, GaussianMixture

number_of_clusters = 2

# Two mixture models with the same number of components and a fixed seed.
algorithms = [
    GaussianMixture(n_components=number_of_clusters, random_state=42, n_init=10),
    BayesianGaussianMixture(n_components=number_of_clusters, random_state=42),
]

# Column order of the score table.
metric_names = ['ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette']

# Score rows are collected in `records`, not `data`, so this cell does not
# rebind the name used for the loaded DataFrame in this section.
records = []
for algo in algorithms:
    # Hard cluster assignments are taken from predict() after fitting on X.
    algo.fit(X)
    predicted_label = algo.predict(X)
    records.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        'Silhouette': metrics.silhouette_score(X, predicted_label),
    })

# One row per mixture model, one column per score.
results2 = pd.DataFrame(records, columns=metric_names,
                        index=['GaussianMixture', 'BayesianGaussianMixture'])

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.431831,0.293878,0.297177,0.292518,0.294829,0.15141
BayesianGaussianMixture,0.384907,0.340044,0.362921,0.321372,0.340885,0.151611


In [None]:
# Stack the clustering and mixture-model score tables row-wise.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
D12r = pd.concat([results1, results2])

In [None]:
# Write the combined D12 score table to disk.
D12r.to_csv("D12r.csv")

# D13

In [None]:
# Load the prostate cancer dataset and preview its first rows.
data=pd.read_csv('Prostate_Cancer(D17).csv')
data.head()

Unnamed: 0,id,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,1,M,23,12,151,954,0.143,0.278,0.242,0.079
1,2,B,9,13,133,1326,0.143,0.079,0.181,0.057
2,3,M,21,27,130,1203,0.125,0.16,0.207,0.06
3,4,M,14,16,78,386,0.07,0.284,0.26,0.097
4,5,M,9,19,135,1297,0.141,0.133,0.181,0.059


In [None]:
# Remove fully duplicated rows, then list the columns with their dtypes.
data=data.drop_duplicates()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 100 non-null    int64  
 1   diagnosis_result   100 non-null    object 
 2   radius             100 non-null    int64  
 3   texture            100 non-null    int64  
 4   perimeter          100 non-null    int64  
 5   area               100 non-null    int64  
 6   smoothness         100 non-null    float64
 7   compactness        100 non-null    float64
 8   symmetry           100 non-null    float64
 9   fractal_dimension  100 non-null    float64
dtypes: float64(4), int64(5), object(1)
memory usage: 8.6+ KB


In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the string labels in 'diagnosis_result' with integer codes.
enc=LabelEncoder()
data['diagnosis_result']=enc.fit_transform(data['diagnosis_result'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 100 non-null    int64  
 1   diagnosis_result   100 non-null    int64  
 2   radius             100 non-null    int64  
 3   texture            100 non-null    int64  
 4   perimeter          100 non-null    int64  
 5   area               100 non-null    int64  
 6   smoothness         100 non-null    float64
 7   compactness        100 non-null    float64
 8   symmetry           100 non-null    float64
 9   fractal_dimension  100 non-null    float64
dtypes: float64(4), int64(6)
memory usage: 8.6 KB


In [None]:
# Min-max scale every column of the frame to [0, 1], using each column's own
# minimum and maximum. This covers all columns, including 'id' and the
# encoded 'diagnosis_result'.
column_min = data.min()
column_max = data.max()
data = (data - column_min) / (column_max - column_min)
data.head()

Unnamed: 0,id,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,0.0,1.0,0.875,0.0625,0.825,0.448687,1.0,0.781759,0.633136,0.590909
1,0.010101,0.0,0.0,0.125,0.675,0.670644,1.0,0.13355,0.272189,0.090909
2,0.020202,1.0,0.75,1.0,0.65,0.597255,0.753425,0.397394,0.426036,0.159091
3,0.030303,1.0,0.3125,0.3125,0.216667,0.109785,0.0,0.801303,0.739645,1.0
4,0.040404,1.0,0.0,0.5,0.691667,0.653341,0.972603,0.309446,0.272189,0.136364


In [None]:
# Remove the 'id' column, then list the remaining columns.
data=data.drop(['id'],axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   diagnosis_result   100 non-null    float64
 1   radius             100 non-null    float64
 2   texture            100 non-null    float64
 3   perimeter          100 non-null    float64
 4   area               100 non-null    float64
 5   smoothness         100 non-null    float64
 6   compactness        100 non-null    float64
 7   symmetry           100 non-null    float64
 8   fractal_dimension  100 non-null    float64
dtypes: float64(9)
memory usage: 7.8 KB


In [None]:
# Split the rows by label value: cls_0 holds label 0, cls_1 holds label 1.
cls_0, cls_1 = (data[data['diagnosis_result'] == code] for code in (0, 1))

In [None]:
# Resample each class with replacement to 250 rows and stack the two samples.
# A fixed random_state makes the resample repeatable; without it the sampled
# rows, and every score computed from them, change on each run.
RESAMPLE_SEED = 42
df_class_1_over = cls_1.sample(250, replace=True, random_state=RESAMPLE_SEED)
df_class_0_over = cls_0.sample(250, replace=True, random_state=RESAMPLE_SEED)
df_test_over = pd.concat([df_class_0_over, df_class_1_over], axis=0)
df_test_over.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 37 to 64
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   diagnosis_result   500 non-null    float64
 1   radius             500 non-null    float64
 2   texture            500 non-null    float64
 3   perimeter          500 non-null    float64
 4   area               500 non-null    float64
 5   smoothness         500 non-null    float64
 6   compactness        500 non-null    float64
 7   symmetry           500 non-null    float64
 8   fractal_dimension  500 non-null    float64
dtypes: float64(9)
memory usage: 39.1 KB


In [None]:
# Label column of the resampled frame.
y=df_test_over['diagnosis_result']
# Everything else becomes the feature frame.
df_test_over=df_test_over.drop(['diagnosis_result'],axis=1)
X=df_test_over

In [None]:
# MinMaxScaler is imported explicitly here: the `preprocessing` module name
# used before is not imported in any visible cell, so the cell would raise
# NameError on a fresh kernel unless it is imported elsewhere.
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to the [0, 1] range.
x = X.values  # plain NumPy array of the features
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Wrap the result in a new DataFrame; columns become 0..n-1 and the index is reset.
X = pd.DataFrame(x_scaled)

In [None]:
# Display the scaled feature frame.
X

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.7500,0.0000,0.258333,0.192124,0.273973,0.000000,0.071006,0.136364
1,0.5000,0.0000,0.291667,0.217184,0.383562,0.140065,0.319527,0.113636
2,0.1250,0.0000,0.233333,0.157518,0.246575,0.182410,0.343195,0.250000
3,1.0000,0.6250,0.208333,0.143795,0.369863,0.110749,0.431953,0.159091
4,1.0000,0.0000,0.291667,0.214200,0.246575,0.127036,0.272189,0.090909
...,...,...,...,...,...,...,...,...
495,0.4375,1.0000,0.350000,0.263126,0.383562,0.247557,0.313609,0.250000
496,0.8125,0.5000,0.375000,0.272673,0.602740,0.397394,0.562130,0.409091
497,0.1250,0.6250,0.633333,0.572792,0.328767,0.436482,0.295858,0.227273
498,0.6875,0.1875,0.650000,0.631265,0.383562,0.211726,0.136095,0.022727


In [None]:
from sklearn import metrics
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering, AffinityPropagation, MiniBatchKMeans, Birch
from sklearn.cluster import DBSCAN

number_of_clusters = 2

# Row label -> estimator. Keeping both in one mapping means the index of
# `results1` is derived from the estimators and cannot drift out of step.
clusterers = {
    'K-means': KMeans(n_clusters=number_of_clusters, random_state=1),
    'MiniBatchKMeans': MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1),
    'AgglomerativeClustering': AgglomerativeClustering(n_clusters=number_of_clusters),
    'SpectralClustering': SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                                             affinity='nearest_neighbors'),
    'DBSCAN': DBSCAN(min_samples=2),
}
algorithms = list(clusterers.values())

# Collected in `metric_rows` rather than `data`, so the source DataFrame
# named `data` used earlier in this section is not clobbered.
metric_rows = []
for algo in algorithms:
    labels = algo.fit(X).labels_
    n_found = len(set(labels))
    metric_rows.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        # silhouette_score raises unless 2 <= n_labels <= n_samples - 1
        # (DBSCAN can return a single label); record NaN in that case.
        'Silhouette': (metrics.silhouette_score(X, labels)
                       if 1 < n_found < len(labels) else float('nan')),
    })

results1 = pd.DataFrame(data=metric_rows,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure',
                                 'Silhouette'],
                        index=list(clusterers))

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,0.055708,0.040573,0.041961,0.041963,0.041962,0.216889
MiniBatchKMeans,0.044791,0.033766,0.034851,0.03551,0.035177,0.210874
AgglomerativeClustering,0.074322,0.054363,0.055716,0.055747,0.055732,0.209449
SpectralClustering,0.001764,0.036255,0.022357,0.146577,0.038797,-0.00338
DBSCAN,0.000909,0.05402,0.054561,0.091801,0.068444,0.169227


In [None]:

from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture

number_of_clusters = 2

# Row label -> mixture model; the index of `results2` is derived from it.
mixtures = {
    'GaussianMixture': GaussianMixture(n_components=number_of_clusters,
                                       random_state=42, n_init=10),
    'BayesianGaussianMixture': BayesianGaussianMixture(n_components=number_of_clusters,
                                                       random_state=42),
}
algorithms = list(mixtures.values())

# Collected in `metric_rows` rather than `data`, so the source DataFrame
# named `data` used earlier in this section is not clobbered.
metric_rows = []
for algo in algorithms:
    algo.fit(X)
    predicted_label = algo.predict(X)
    n_found = len(set(predicted_label))
    metric_rows.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        # silhouette_score raises unless 2 <= n_labels <= n_samples - 1
        # (a mixture can assign every sample to one component); record NaN then.
        'Silhouette': (metrics.silhouette_score(X, predicted_label)
                       if 1 < n_found < len(predicted_label) else float('nan')),
    })

results2 = pd.DataFrame(data=metric_rows,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure',
                                 'Silhouette'],
                        index=list(mixtures))

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.052129,0.044563,0.044185,0.047978,0.046003,0.164388
BayesianGaussianMixture,0.008329,0.017766,0.015214,0.027565,0.019607,0.222527


In [None]:
# Stack the clustering and mixture-model score tables and save them.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
D13r = pd.concat([results1, results2])
D13r.to_csv("D13r.csv")

# D14

In [None]:
from sklearn import preprocessing
# Load the D14 table and split it into a feature matrix and a label vector.
# (The former mid-cell `df.head()` was removed: a non-final expression is
# never displayed, so it had no effect.)
df = pd.read_csv('dataR2.csv')
X = df.drop(columns=['Classification']).to_numpy()
y = df['Classification'].to_numpy()

In [None]:
# Rescale every feature column (all columns except 'Classification') with
# MinMaxScaler and wrap the result in a fresh DataFrame.
min_max_scaler = preprocessing.MinMaxScaler()
features_only = df.drop(columns=['Classification'])
x = features_only.values  # plain numpy view of the features
x_scaled = min_max_scaler.fit(x).transform(x)
X = pd.DataFrame(data=x_scaled)

In [None]:
# Show the min-max scaled feature frame built in the previous cell.
X

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.369231,0.253850,0.070922,0.004908,0.000000,0.052299,0.221152,0.060665,0.224659
1,0.907692,0.114826,0.226950,0.012190,0.009742,0.052726,0.103707,0.010826,0.255926
2,0.892308,0.235278,0.219858,0.036874,0.022058,0.158526,0.571021,0.076906,0.307912
3,0.676923,0.148328,0.120567,0.014171,0.005911,0.064811,0.151538,0.121131,0.533934
4,0.953846,0.135640,0.226950,0.019936,0.013748,0.027782,0.086940,0.093375,0.440565
...,...,...,...,...,...,...,...,...,...
111,0.323077,0.419620,0.226950,0.016028,0.011727,0.585897,0.287049,0.098238,0.134568
112,0.584615,0.419125,0.283688,0.037446,0.026441,0.094674,0.543206,0.052098,0.172043
113,0.630769,0.676934,0.262411,0.058863,0.036757,0.664996,0.573988,0.090252,0.162294
114,0.738462,0.357271,0.156028,0.006925,0.004189,0.240191,0.882091,0.000761,0.209741


In [None]:
# Hand plain arrays to the estimators. np.asarray accepts both a DataFrame
# and an ndarray, so re-running this cell no longer fails with
# AttributeError ("ndarray has no attribute to_numpy").
X = np.asarray(X)
y = df['Classification'].to_numpy()

In [None]:
from sklearn import metrics
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering, AffinityPropagation, MiniBatchKMeans, Birch
from sklearn.cluster import DBSCAN

number_of_clusters = 2

# Row label -> estimator. Keeping both in one mapping means the index of
# `results1` is derived from the estimators and cannot drift out of step.
clusterers = {
    'K-means': KMeans(n_clusters=number_of_clusters, random_state=1),
    'MiniBatchKMeans': MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1),
    'AgglomerativeClustering': AgglomerativeClustering(n_clusters=number_of_clusters),
    'SpectralClustering': SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                                             affinity='nearest_neighbors'),
    'DBSCAN': DBSCAN(min_samples=2),
}
algorithms = list(clusterers.values())

# Collected in `metric_rows` rather than the generic name `data`, which
# other cells of this notebook use for a DataFrame.
metric_rows = []
for algo in algorithms:
    labels = algo.fit(X).labels_
    n_found = len(set(labels))
    metric_rows.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        # silhouette_score raises unless 2 <= n_labels <= n_samples - 1
        # (DBSCAN can return a single label); record NaN in that case.
        'Silhouette': (metrics.silhouette_score(X, labels)
                       if 1 < n_found < len(labels) else float('nan')),
    })

results1 = pd.DataFrame(data=metric_rows,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure',
                                 'Silhouette'],
                        index=list(clusterers))

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,-0.008383,-0.006268,0.000104,0.000104,0.000104,0.203267
MiniBatchKMeans,-0.008454,-0.005402,0.001037,0.001057,0.001047,0.217175
AgglomerativeClustering,-0.001515,0.008467,0.014321,0.016086,0.015152,0.214427
SpectralClustering,-0.007683,-0.00285,0.003621,0.003867,0.00374,0.175468
DBSCAN,-0.010267,0.028468,0.02602,0.071313,0.038128,0.40136


In [None]:

from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture

number_of_clusters = 2

# Row label -> mixture model; the index of `results2` is derived from it.
mixtures = {
    'GaussianMixture': GaussianMixture(n_components=number_of_clusters,
                                       random_state=42, n_init=10),
    'BayesianGaussianMixture': BayesianGaussianMixture(n_components=number_of_clusters,
                                                       random_state=42),
}
algorithms = list(mixtures.values())

# Collected in `metric_rows` rather than the generic name `data`, which
# other cells of this notebook use for a DataFrame.
metric_rows = []
for algo in algorithms:
    algo.fit(X)
    predicted_label = algo.predict(X)
    n_found = len(set(predicted_label))
    metric_rows.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        # silhouette_score raises unless 2 <= n_labels <= n_samples - 1
        # (a mixture can assign every sample to one component); record NaN then.
        'Silhouette': (metrics.silhouette_score(X, predicted_label)
                       if 1 < n_found < len(predicted_label) else float('nan')),
    })

results2 = pd.DataFrame(data=metric_rows,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure',
                                 'Silhouette'],
                        index=list(mixtures))

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.067686,0.105755,0.103866,0.121286,0.111902,0.191098
BayesianGaussianMixture,0.033878,0.084378,0.080691,0.104301,0.090989,0.210052


In [None]:
# Stack the clustering and mixture-model score tables and save them.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
D14r = pd.concat([results1, results2])
D14r.to_csv("D14r.csv")

# D15

In [None]:
from sklearn import preprocessing
# Read the raw D15 table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='risk_factors_cervical_cancer(D22).csv')
df.head(n=5)


Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [None]:
# Replace the original headers positionally with identifier-friendly names
# (no spaces, colons or parentheses). Order must match the CSV column order.
df.columns = [
    # demographics / history
    'Age',
    'No_of_sex_partner',
    'First_sexual_intercourse',
    'No_pregnancies',
    # smoking
    'Smokes',
    'Smokes_yrs',
    'Smokes_packs_yr',
    # contraception
    'Hormonal_Contraceptives',
    'Hormonal_Contraceptives_years',
    'IUD',
    'IUD_years',
    # STDs
    'STDs',
    'STDs_number',
    'STDs_condylomatosis',
    'STDs_cervical_condylomatosis',
    'STDs_vaginal_condylomatosis',
    'STDs_vulvo_perineal_condylomatosis',
    'STDs_syphilis',
    'STDs_pelvic_inflammatory_disease',
    'STDs_genital_herpes',
    'STDs_molluscum_contagiosum',
    'STDs_AIDS',
    'STDs_HIV',
    'STDs_Hepatitis_B',
    'STDs_HPV',
    'STDs_No_of_diagnosis',
    'STD_Time_since_first_diagnosis',
    'STDs_Time_since_last_diagnosis',
    # prior diagnoses
    'Dx_Cancer',
    'Dx_CIN',
    'Dx_HPV',
    'Dx',
    # screening tests / outcome
    'Hinselmann',
    'Schiller',
    'Citology',
    'Biopsy',
]

In [None]:
# The CSV marks missing values with '?'; turn them into real NaNs.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace('?', np.nan)

In [None]:
# Remove the two "time since diagnosis" columns (shown as '?' in the preview
# rows above). errors='ignore' keeps the cell re-runnable once they are gone.
df = df.drop(columns=['STD_Time_since_first_diagnosis',
                      'STDs_Time_since_last_diagnosis'],
             errors='ignore')
# Discard every row missing either Smokes or First_sexual_intercourse.
# dropna(subset=...) replaces the `Index | Index` union, which is deprecated
# and no longer performs a set union in pandas 2.0.
df = df.dropna(subset=['Smokes', 'First_sexual_intercourse'])

In [None]:
# Every column except the 'Biopsy' label is a candidate feature.
x_features = df.columns.tolist()
del x_features[x_features.index('Biopsy')]

# Hand-picked list of the columns treated as categorical flags.
x_features_categorical = [
    'Smokes',
    'Hormonal_Contraceptives',
    'IUD',
    'STDs',
    'STDs_condylomatosis',
    'STDs_cervical_condylomatosis',
    'STDs_vaginal_condylomatosis',
    'STDs_vulvo_perineal_condylomatosis',
    'STDs_syphilis',
    'STDs_pelvic_inflammatory_disease',
    'STDs_genital_herpes',
    'STDs_molluscum_contagiosum',
    'STDs_AIDS',
    'STDs_HIV',
    'STDs_Hepatitis_B',
    'STDs_HPV',
    'Dx_Cancer',
    'Dx_CIN',
    'Dx_HPV',
    'Dx',
]
x_features_categorical

['Smokes',
 'Hormonal_Contraceptives',
 'IUD',
 'STDs',
 'STDs_condylomatosis',
 'STDs_cervical_condylomatosis',
 'STDs_vaginal_condylomatosis',
 'STDs_vulvo_perineal_condylomatosis',
 'STDs_syphilis',
 'STDs_pelvic_inflammatory_disease',
 'STDs_genital_herpes',
 'STDs_molluscum_contagiosum',
 'STDs_AIDS',
 'STDs_HIV',
 'STDs_Hepatitis_B',
 'STDs_HPV',
 'Dx_Cancer',
 'Dx_CIN',
 'Dx_HPV',
 'Dx']

In [None]:
# Whatever is not listed as categorical is treated as numerical
# (original feature order is preserved).
x_features_numerical = list(
    filter(lambda name: name not in x_features_categorical, x_features)
)
x_features_numerical

['Age',
 'No_of_sex_partner',
 'First_sexual_intercourse',
 'No_pregnancies',
 'Smokes_yrs',
 'Smokes_packs_yr',
 'Hormonal_Contraceptives_years',
 'IUD_years',
 'STDs_number',
 'STDs_No_of_diagnosis',
 'Hinselmann',
 'Schiller',
 'Citology']

In [None]:
# Drop every row that still contains a missing value. Reassigning instead of
# mutating in place matches the other `df = ...` steps in this section.
df = df.dropna()

In [None]:
# Label vector is the 'Biopsy' column; features are all remaining columns.
y = df['Biopsy']
X = df.drop(columns=['Biopsy'])

In [None]:
# Show the feature frame (every column of df except 'Biopsy').
X

Unnamed: 0,Age,No_of_sex_partner,First_sexual_intercourse,No_pregnancies,Smokes,Smokes_yrs,Smokes_packs_yr,Hormonal_Contraceptives,Hormonal_Contraceptives_years,IUD,...,STDs_Hepatitis_B,STDs_HPV,STDs_No_of_diagnosis,Dx_Cancer,Dx_CIN,Dx_HPV,Dx,Hinselmann,Schiller,Citology
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,0.0,0.0,0,1,0,1,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,0.0,0.0,0,0,0,0,0,0,0,0
5,42,3.0,23.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,34,3.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0,0,0
854,32,2.0,19.0,1.0,0.0,0.0,0.0,1.0,8.0,0.0,...,0.0,0.0,0,0,0,0,0,0,0,0
855,25,2.0,17.0,0.0,0.0,0.0,0.0,1.0,0.08,0.0,...,0.0,0.0,0,0,0,0,0,0,0,1
856,33,2.0,24.0,2.0,0.0,0.0,0.0,1.0,0.08,0.0,...,0.0,0.0,0,0,0,0,0,0,0,0


In [None]:
from sklearn import metrics
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering, AffinityPropagation, MiniBatchKMeans, Birch
from sklearn.cluster import DBSCAN

number_of_clusters = 2

# Row label -> estimator. Keeping both in one mapping means the index of
# `results1` is derived from the estimators and cannot drift out of step.
clusterers = {
    'K-means': KMeans(n_clusters=number_of_clusters, random_state=1),
    'MiniBatchKMeans': MiniBatchKMeans(n_clusters=number_of_clusters, random_state=1),
    'AgglomerativeClustering': AgglomerativeClustering(n_clusters=number_of_clusters),
    'SpectralClustering': SpectralClustering(n_clusters=number_of_clusters, random_state=1,
                                             affinity='nearest_neighbors'),
    'DBSCAN': DBSCAN(min_samples=2),
}
algorithms = list(clusterers.values())

# Collected in `metric_rows` rather than the generic name `data`, which
# other cells of this notebook use for a DataFrame.
metric_rows = []
for algo in algorithms:
    labels = algo.fit(X).labels_
    n_found = len(set(labels))
    metric_rows.append({
        'ARI': metrics.adjusted_rand_score(y, labels),
        'AMI': metrics.adjusted_mutual_info_score(y, labels,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, labels),
        'Completeness': metrics.completeness_score(y, labels),
        'V-measure': metrics.v_measure_score(y, labels),
        # silhouette_score raises unless 2 <= n_labels <= n_samples - 1
        # (DBSCAN can return a single label); record NaN in that case.
        'Silhouette': (metrics.silhouette_score(X, labels)
                       if 1 < n_found < len(labels) else float('nan')),
    })

results1 = pd.DataFrame(data=metric_rows,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure',
                                 'Silhouette'],
                        index=list(clusterers))

results1

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
K-means,0.001554,-0.00157,0.000173,6.4e-05,9.4e-05,0.40503
MiniBatchKMeans,0.00488,-0.001197,0.00089,0.000335,0.000487,0.410755
AgglomerativeClustering,0.000823,-0.001585,0.000111,4e-05,5.9e-05,0.382229
SpectralClustering,-0.00607,-0.000656,0.001857,0.000686,0.001002,0.297742
DBSCAN,-0.00866,0.012373,0.046701,0.032974,0.038655,-0.494901


In [None]:

from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture

number_of_clusters = 2

# Row label -> mixture model; the index of `results2` is derived from it.
mixtures = {
    'GaussianMixture': GaussianMixture(n_components=number_of_clusters,
                                       random_state=42, n_init=10),
    'BayesianGaussianMixture': BayesianGaussianMixture(n_components=number_of_clusters,
                                                       random_state=42),
}
algorithms = list(mixtures.values())

# Collected in `metric_rows` rather than the generic name `data`, which
# other cells of this notebook use for a DataFrame.
metric_rows = []
for algo in algorithms:
    algo.fit(X)
    predicted_label = algo.predict(X)
    n_found = len(set(predicted_label))
    metric_rows.append({
        'ARI': metrics.adjusted_rand_score(y, predicted_label),
        'AMI': metrics.adjusted_mutual_info_score(y, predicted_label,
                                                  average_method='arithmetic'),
        'Homogenity': metrics.homogeneity_score(y, predicted_label),
        'Completeness': metrics.completeness_score(y, predicted_label),
        'V-measure': metrics.v_measure_score(y, predicted_label),
        # silhouette_score raises unless 2 <= n_labels <= n_samples - 1
        # (a mixture can assign every sample to one component); record NaN then.
        'Silhouette': (metrics.silhouette_score(X, predicted_label)
                       if 1 < n_found < len(predicted_label) else float('nan')),
    })

results2 = pd.DataFrame(data=metric_rows,
                        columns=['ARI', 'AMI', 'Homogenity',
                                 'Completeness', 'V-measure',
                                 'Silhouette'],
                        index=list(mixtures))

results2

Unnamed: 0,ARI,AMI,Homogenity,Completeness,V-measure,Silhouette
GaussianMixture,0.066881,0.019714,0.035727,0.015413,0.021536,0.199355
BayesianGaussianMixture,0.080689,0.022966,0.039626,0.018111,0.02486,0.225452


In [None]:
# Stack the clustering and mixture-model score tables and save them.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
D15r = pd.concat([results1, results2])
D15r.to_csv("D15r.csv")