In [4]:
# 1-> (A) removal of decision attribute
import pandas as pd
import numpy as np

def remove_decision_attribute(input_file_path, output_file_path, decision_attribute_name):
    # Load the dataset into a Pandas DataFrame
    df = pd.read_csv(input_file_path)

    # Check if the decision attribute exists in the dataset
    if decision_attribute_name not in df.columns:
        print(f"The decision attribute '{decision_attribute_name}' does not exist in the dataset.")
        return

    # Drop the decision attribute column
    df = df.drop(columns=[decision_attribute_name])

    # Save the modified DataFrame back to a file (optional)
    df.to_csv(output_file_path, index=False)

    print(f"The decision attribute '{decision_attribute_name}' has been removed from the dataset.")

input_file_path = "/content/Iris.csv"
output_file_path = "/content/op1.csv"
decision_attribute_name = "Species"
remove_decision_attribute(input_file_path, output_file_path, decision_attribute_name)

The decision attribute 'Species' has been removed from the dataset.


In [5]:
# 1->(B) Dataset in matrix form

import pandas as pd
import numpy as np

def dataset_to_matrix(input_file_path):
    # Load the dataset into a Pandas DataFrame
    df = pd.read_csv(input_file_path)

    # Convert the DataFrame to a matrix
    matrix = df.to_numpy()

    return matrix

# Usage example:
input_file_path = "/content/op1.csv"
matrix = dataset_to_matrix(input_file_path)

print("Matrix:")
print(matrix)

Matrix:
[[1.00e+00 5.10e+00 3.50e+00 1.40e+00 2.00e-01]
 [2.00e+00 4.90e+00 3.00e+00 1.40e+00 2.00e-01]
 [3.00e+00 4.70e+00 3.20e+00 1.30e+00 2.00e-01]
 [4.00e+00 4.60e+00 3.10e+00 1.50e+00 2.00e-01]
 [5.00e+00 5.00e+00 3.60e+00 1.40e+00 2.00e-01]
 [6.00e+00 5.40e+00 3.90e+00 1.70e+00 4.00e-01]
 [7.00e+00 4.60e+00 3.40e+00 1.40e+00 3.00e-01]
 [8.00e+00 5.00e+00 3.40e+00 1.50e+00 2.00e-01]
 [9.00e+00 4.40e+00 2.90e+00 1.40e+00 2.00e-01]
 [1.00e+01 4.90e+00 3.10e+00 1.50e+00 1.00e-01]
 [1.10e+01 5.40e+00 3.70e+00 1.50e+00 2.00e-01]
 [1.20e+01 4.80e+00 3.40e+00 1.60e+00 2.00e-01]
 [1.30e+01 4.80e+00 3.00e+00 1.40e+00 1.00e-01]
 [1.40e+01 4.30e+00 3.00e+00 1.10e+00 1.00e-01]
 [1.50e+01 5.80e+00 4.00e+00 1.20e+00 2.00e-01]
 [1.60e+01 5.70e+00 4.40e+00 1.50e+00 4.00e-01]
 [1.70e+01 5.40e+00 3.90e+00 1.30e+00 4.00e-01]
 [1.80e+01 5.10e+00 3.50e+00 1.40e+00 3.00e-01]
 [1.90e+01 5.70e+00 3.80e+00 1.70e+00 3.00e-01]
 [2.00e+01 5.10e+00 3.80e+00 1.50e+00 3.00e-01]
 [2.10e+01 5.40e+00 3.40e+00 1.7

In [7]:
# 1->(C)MIN-MAX NORMALIZATION
import pandas as pd
import numpy as np

def min_max_normalization(dataset):
    min_values = np.min(dataset, axis=0)
    max_values = np.max(dataset, axis=0)
    normalized_dataset = (dataset - min_values) / (max_values - min_values)
    return normalized_dataset

dataset_file = '/content/op1.csv'
df = pd.read_csv(dataset_file)

# Assuming the columns in the CSV file are the attributes and rows are instances.

# Extract the dataset as a numpy array from the DataFrame (excluding any non-numeric columns if needed).
dataset = df.select_dtypes(include=[np.number]).values

normalized_dataset =  min_max_normalization(dataset)
print(normalized_dataset)

[[0.         0.22222222 0.625      0.06779661 0.04166667]
 [0.00671141 0.16666667 0.41666667 0.06779661 0.04166667]
 [0.01342282 0.11111111 0.5        0.05084746 0.04166667]
 [0.02013423 0.08333333 0.45833333 0.08474576 0.04166667]
 [0.02684564 0.19444444 0.66666667 0.06779661 0.04166667]
 [0.03355705 0.30555556 0.79166667 0.11864407 0.125     ]
 [0.04026846 0.08333333 0.58333333 0.06779661 0.08333333]
 [0.04697987 0.19444444 0.58333333 0.08474576 0.04166667]
 [0.05369128 0.02777778 0.375      0.06779661 0.04166667]
 [0.06040268 0.16666667 0.45833333 0.08474576 0.        ]
 [0.06711409 0.30555556 0.70833333 0.08474576 0.04166667]
 [0.0738255  0.13888889 0.58333333 0.10169492 0.04166667]
 [0.08053691 0.13888889 0.41666667 0.06779661 0.        ]
 [0.08724832 0.         0.41666667 0.01694915 0.        ]
 [0.09395973 0.41666667 0.83333333 0.03389831 0.04166667]
 [0.10067114 0.38888889 1.         0.08474576 0.125     ]
 [0.10738255 0.30555556 0.79166667 0.05084746 0.125     ]
 [0.11409396 0

In [8]:
# 2->(A) similarity matrix
import pandas as pd
import numpy as np

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

def similarity_matrix(objects):
    m = len(objects)
    n = len(objects[0])  # Assuming all objects have the same number of features

    # Initialize an empty similarity matrix
    S = np.zeros((m, m))

    # Calculate the similarity (Euclidean distance) between each pair of objects
    for i in range(m):
        for j in range(i+1, m):  # Since S is symmetric, we only need to calculate half of it
            distance = euclidean_distance(objects[i], objects[j])
            S[i, j] = distance
            S[j, i] = distance   # Assign the same value to the symmetric position

    return S

# Load the dataset from CSV file
file_path = "/content/op1.csv"
df = pd.read_csv(file_path)

# Assuming the dataset contains only feature columns, no ID or labels.
# Extract the feature columns from the DataFrame
feature_columns = df.columns.tolist()

# Convert the DataFrame to a numpy array for calculations
objects = df[feature_columns].values

# Get the similarity matrix
similarity_matrix  = similarity_matrix(objects)

# Save the similarity matrix to a new CSV file
output_file_path = "/content/op2.csv"
pd.DataFrame(similarity_matrix).to_csv(output_file_path, index=False, header=False)

print("Similarity matrix saved to:", output_file_path)

print(similarity_matrix)

Similarity matrix saved to: /content/op2.csv
[[  0.           1.13578167   2.06397674 ... 147.0676375  148.07305629
  149.05750568]
 [  1.13578167   0.           1.04403065 ... 146.06929862 147.0756948
  148.05826556]
 [  2.06397674   1.04403065   0.         ... 145.07491168 146.08049151
  147.06284371]
 ...
 [147.0676375  146.06929862 145.07491168 ...   0.           1.17473401
    2.1       ]
 [148.07305629 147.0756948  146.08049151 ...   1.17473401   0.
    1.26095202]
 [149.05750568 148.05826556 147.06284371 ...   2.1          1.26095202
    0.        ]]


In [9]:
# 2->(B) AVG DISSIMILARITY OF OBJECTS(ROWS)

import pandas as pd
import numpy as np

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

def similarity_matrix(objects):
    m = len(objects)
    n = len(objects[0])  # Assuming all objects have the same number of features

    # Initialize an empty similarity matrix
    S = np.zeros((m, m))

    # Calculate the similarity (Euclidean distance) between each pair of objects
    for i in range(m):
        for j in range(i+1, m):  # Since S is symmetric, we only need to calculate half of it
            distance = euclidean_distance(objects[i], objects[j])
            S[i, j] = distance
            S[j, i] = distance  # Assign the same value to the symmetric position

    return S

def form_clusters(similarity_matrix):
    m = similarity_matrix.shape[0]
    clusters = []

    for i in range(m):
        avg_dissimilarity = np.mean(similarity_matrix[i])
        cluster_i = [j for j in range(m) if similarity_matrix[i, j] < avg_dissimilarity]
        clusters.append(cluster_i)

    return clusters

# Load the dataset from CSV file
file_path = "/content/op1.csv"
df = pd.read_csv(file_path)

# Assuming the dataset contains only feature columns, no ID or labels.
# Extract the feature columns from the DataFrame
feature_columns = df.columns.tolist()

# Convert the DataFrame to a numpy array for calculations
objects = df[feature_columns].values

# Get the similarity matrix
similarity_matrix = similarity_matrix(objects)

# Form clusters based on average dissimilarity
clusters = form_clusters(similarity_matrix)

# Save the clusters to a new CSV file
output_file_path = "/content/cluster.csv"
df_clusters = pd.DataFrame({"Cluster": clusters})
df_clusters.to_csv(output_file_path, index=False)

print("Clusters saved to:", output_file_path)

# Print the clusters
#for i, cluster in enumerate(clusters):
#print(f"Cluster C{i + 1}: {cluster}")

Clusters saved to: /content/cluster.csv


In [10]:
# 3->(A) subset free cluster
import pandas as pd

def remove_subset_clusters(clusters):
    non_subset_clusters = []
    for cluster in clusters:
        is_subset = False
        for other_cluster in clusters:
            if cluster != other_cluster and set(cluster).issubset(set(other_cluster)):
                is_subset = True
                break
        if not is_subset:
            non_subset_clusters.append(cluster)
    return non_subset_clusters

# Load the clusters from the CSV file
cluster_file_path = "/content/cluster.csv"
df_clusters = pd.read_csv(cluster_file_path)

# Assuming the CSV file has a column named "Cluster" that contains the cluster information as lists of indices
clusters = [eval(cluster_list) for cluster_list in df_clusters["Cluster"].values]

# Remove subset clusters
non_subset_clusters = remove_subset_clusters(clusters)

# Print the non-subset clusters
for i, cluster in enumerate(non_subset_clusters):
    print(f"Cluster C{i + 1}: {cluster}")

# Save the non-subset clusters to a new CSV file
output_file_path = "/content/SFcluster.csv"
df_non_subset_clusters = pd.DataFrame({"Cluster": non_subset_clusters})
df_non_subset_clusters.to_csv(output_file_path, index=False)

print("Non-subset clusters saved to:", output_file_path)

Cluster C1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87]
Cluster C2: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88]
Cluster C3: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 

In [11]:
# 3->(B) similarity matrix of subset free cluster
import pandas as pd

def calculate_similarity(cluster_i, cluster_j):
    intersection = len(set(cluster_i) & set(cluster_j))
    union = len(set(cluster_i) | set(cluster_j))
    similarity = intersection / union
    return similarity

# Load the non-subset clusters from the CSV file
cluster_file_path = "/content/SFcluster.csv"
df_clusters = pd.read_csv(cluster_file_path)

# Assuming the CSV file has a column named "Cluster" that contains the cluster information as lists of indices
clusters = [eval(cluster_list) for cluster_list in df_clusters["Cluster"].values]

# Calculate the similarity matrix
p = len(clusters)
C = [[0.0 for _ in range(p)] for _ in range(p)]

for i in range(p):
    for j in range(i, p):  # The matrix is symmetric, so we only need to calculate half of it
        similarity = calculate_similarity(clusters[i], clusters[j])
        C[i][j] = similarity
        C[j][i] = similarity  # Assign the same value to the symmetric position

# Print the similarity matrix
for row in C:
    print(row)

# Save the similarity matrix to a new CSV file
output_file_path = "/content/Simatcluster.csv"
df_similarity_matrix = pd.DataFrame(C)
df_similarity_matrix.to_csv(output_file_path, index=False, header=False)

print("Similarity matrix C saved to:", output_file_path)

[1.0, 0.9662921348314607, 0.9222222222222223, 0.9010989010989011, 0.8586956521739131, 0.8387096774193549, 0.7978723404255319, 0.7789473684210526, 0.75, 0.7216494845360825, 0.7040816326530612, 0.6868686868686869, 0.66, 0.6336633663366337, 0.6176470588235294, 0.6019417475728155, 0.5865384615384616, 0.5619047619047619, 0.5377358490566038, 0.5233644859813084, 0.5092592592592593, 0.4954128440366973, 0.4818181818181818, 0.46846846846846846, 0.45535714285714285, 0.4424778761061947, 0.4298245614035088, 0.41739130434782606, 0.4051724137931034, 0.39316239316239315, 0.3813559322033898, 0.3697478991596639, 0.35833333333333334, 0.34146341463414637, 0.33064516129032256, 0.32, 0.30952380952380953, 0.2992125984251969, 0.2846153846153846, 0.2748091603053435, 0.26515151515151514, 0.2518518518518518, 0.2426470588235294, 0.23357664233576642, 0.22142857142857142, 0.2127659574468085, 0.2013888888888889, 0.19310344827586207, 0.18243243243243243, 0.17333333333333334]
[0.9662921348314607, 1.0, 0.95454545454545

In [12]:
# 3->(c) merging two  clusters

import pandas as pd
import random

def calculate_similarity(cluster_i, cluster_j):
    intersection = len(set(cluster_i) & set(cluster_j))
    union = len(set(cluster_i) | set(cluster_j))
    similarity = intersection / union
    return similarity

# Load the non-subset clusters from the CSV file
cluster_file_path = "/content/SFcluster.csv"
df_clusters = pd.read_csv(cluster_file_path)

# Assuming the CSV file has a column named "Cluster" that contains the cluster information as lists of indices
clusters = [eval(cluster_list) for cluster_list in df_clusters["Cluster"].values]

# Calculate the similarity matrix
p = len(clusters)
C = [[0.0 for _ in range(p)] for _ in range(p)]

for i in range(p):
    for j in range(i, p):  # The matrix is symmetric, so we only need to calculate half of it
        similarity = calculate_similarity(clusters[i], clusters[j])
        C[i][j] = similarity
        C[j][i] = similarity  # Assign the same value to the symmetric position

# Find the maximum value in matrix C and corresponding indices (k, l)
max_value = max([max(row) for row in C])
k, l = [(i, j) for i, row in enumerate(C) for j, value in enumerate(row) if value == max_value][0]

# Merge clusters Ck and Cl to get a new cluster Ckl
Ck = clusters[k]
Cl = clusters[l]
Ckl = Ck + Cl

# Remove the merged clusters from the list of clusters
new_clusters = [cluster for idx, cluster in enumerate(clusters) if idx not in (k, l)]
new_clusters.append(Ckl)

# Print the merged clusters
print(f"Clusters C{k+1} and C{l+1} are merged to form the new cluster Ckl: {Ckl}")

# Save the updated clusters to a new CSV file
output_file_path = "/content/updcluster.csv"
df_updated_clusters = pd.DataFrame({"Cluster": new_clusters})
df_updated_clusters.to_csv(output_file_path, index=False)

print("Updated clusters saved to:", output_file_path)

Clusters C1 and C1 are merged to form the new cluster Ckl: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87]
Updated clusters saved to: /content/updcluster.csv


In [13]:
#  3->(D)

import pandas as pd

def read_clusters_from_csv(file_path):
    clusters = pd.read_csv(file_path, header=None)
    return [set(row.dropna()) for _, row in clusters.iterrows()]

def print_clusters(clusters):
    for i, cluster in enumerate(clusters):
        print(f"Cluster {i + 1}:")
        for element in cluster:
            print(element)
        print("=" * 40)

def save_clusters_to_csv(clusters, output_file):
    with open(output_file, 'w') as f:
        for cluster in clusters:
            cluster_str = ",".join(cluster)
            f.write(cluster_str + '\n')

def main():
    file_path = "/content/SFcluster.csv"
    output_csv_base = "/content/final.csv"

    clusters = read_clusters_from_csv(file_path)
    original_clusters = clusters.copy()

    print("Original clusters:")
    print_clusters(clusters)

    step = 1
    while len(clusters) > 1:
        merged_clusters = []
        for i in range(0, len(clusters), 2):
            if i + 1 < len(clusters):
                merged_clusters.append(clusters[i].union(clusters[i + 1]))
            else:
                merged_clusters.append(clusters[i])
        clusters = merged_clusters

        output_csv_file = f'{output_csv_base}_{step}.csv'
        save_clusters_to_csv(clusters, output_csv_file)

        print(f"Clusters formed after step {step} saved to {output_csv_file}:")
        print_clusters(clusters)

        # Increment the step counter
        step += 1

        # Write cluster contents to a CSV file after each step
        step_output_csv_file = f'{output_csv_base}_{step}_output.csv'
        save_clusters_to_csv(clusters, step_output_csv_file)
        print(f"Step {step} clusters written to {step_output_csv_file}")

if __name__ == "__main__":
    main()

Original clusters:
Cluster 1:
Cluster
Cluster 2:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87]
Cluster 3:
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88]
Cluster 4:
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 6