In [1]:
import numpy as np
from my_io import read_dataset_to_X_and_y, print_array_with_dataframe
from copy import deepcopy
import matplotlib.pyplot as plt

In [2]:
class UniSet():
    def __init__(self, file, range_feature, range_label,
                 normalization=None, shuffle=False, about_nan='class_mean'):
        np.random.seed(1)
        sample, label = read_dataset_to_X_and_y(
            file, range_feature, range_label, normalization, shuffle=shuffle,
            about_nan=about_nan)
        self.number_of_feature = sample.shape[1]
        self.size_of_universal = sample.shape[0]
        self.universal = sample.astype(float)
        self.label = label
        self.diffrent_label = np.unique(label)
        self.number_of_diffrent_label = self.diffrent_label.shape[0]
        self.relation = None
        self.equivalence_relation = None


uni_total = UniSet(
    'dataset/hcvdat0.csv', (2, 14), (1, 2),
    normalization='z_score', shuffle=True, about_nan='class_mean')


print(f'The whole dataset is {uni_total.universal.shape} matrix')


The whole dataset is (615, 12) matrix


In [3]:
def split_train_test(universe: UniSet, train_size: float) -> list[UniSet]:
    train = deepcopy(universe)
    test = deepcopy(universe)
    train.size_of_universal = \
        int(universe.size_of_universal*train_size)
    train.universal = \
        universe.universal[0:train.size_of_universal]
    train.label = \
        universe.label[0:train.size_of_universal]
    test.size_of_universal = (
        universe.size_of_universal - train.size_of_universal)
    test.universal = \
        universe.universal[train.size_of_universal:]
    test.label = \
        universe.label[train.size_of_universal:]

    return train, test


uni_train, uni_test = split_train_test(uni_total, 0.8)
print(f'The train dataset is {uni_train.universal.shape} matrix')
print(f'The test dataset is {uni_test.universal.shape} matrix')


The train dataset is (492, 12) matrix
The test dataset is (123, 12) matrix


In [4]:
def distance(sample1: np.ndarray, sample2: np.ndarray) -> float:
    return np.linalg.norm(sample1-sample2)


def find_relation(universal: UniSet) -> np.ndarray:
    dis = np.array(
        list(map(lambda x: list(map(
            lambda y: distance(
                universal.universal[x], universal.universal[y]),
            range(universal.size_of_universal))),
            range(universal.size_of_universal))))
    return 1 - dis / np.max(dis)


uni_train.relation = find_relation(uni_train)
print(f'The relation on train dataset is {uni_train.relation.shape} matrix')


uni_test.relation = find_relation(uni_test)
print(f'The relation on test dataset is {uni_test.relation.shape} matrix')


The relation on train dataset is (492, 492) matrix
The relation on test dataset is (123, 123) matrix


In [5]:
def max_min(sample1: np.ndarray, sample2: np.ndarray) -> float:
    both_sample = np.vstack((sample1, sample2))
    return np.max(np.min(both_sample, axis=0))


def composition_RoR(relation: np.ndarray) -> np.ndarray:
    result = np.array(
        list(map(lambda x: list(map(
            lambda y: max_min(relation[x], relation[y]),
            range(relation.shape[0]))),
            range(relation.shape[0]))))
    return result


def union_two_relation(
        relation1: np.ndarray, relation2: np.ndarray) -> np.ndarray:
    both_relation = np.dstack((relation1, relation2))
    return np.max(both_relation, axis=2)


def make_transitive(relation: np.ndarray) -> np.ndarray:
    R = None
    Rp = np.copy(relation)
    iter = 0
    while((Rp != R).any()):
        R = np.copy(Rp)
        RoR = composition_RoR(R)
        Rp = union_two_relation(R, RoR)
        iter += 1
        print(iter)
    return Rp


print('#Iteration to make train relation transitive')
uni_train.equivalence_relation = make_transitive(uni_train.relation)
print('The equivalence relation on train dataset is',
        f'{uni_train.equivalence_relation.shape} matrix')

print('\n#Iteration to make test relation transitive')
uni_test.equivalence_relation = make_transitive(uni_test.relation)
print('The equivalence relation on test dataset is',
        f'{uni_test.equivalence_relation.shape} matrix')


#Iteration to make train relation transitive
1
2
3
4
5
6
The equivalence relation on train dataset is (492, 492) matrix

#Iteration to make test relation transitive
1
2
3
4
5
The equivalence relation on test dataset is (123, 123) matrix


In [6]:
def is_reflexive(relation: np.ndarray) -> bool:
    return (relation.diagonal() != 0).all()


def is_symmetric(relation: np.ndarray) -> bool:
    return (relation == relation.T).all()


def is_transitive(relation: np.ndarray) -> bool:
    RoR = composition_RoR(relation)
    Rp = union_two_relation(relation, RoR)
    return (Rp == relation).all()


def is_equivalece(relation: np.ndarray) -> bool:
    return is_reflexive(relation) & is_symmetric(relation) & \
        is_transitive(relation)


print('Is our train equivalence relation, equivalence?',
    f'{is_equivalece(uni_train.equivalence_relation)}')

print('Is our test equivalence relation, equivalence?',
    f'{is_equivalece(uni_test.equivalence_relation)}')


Is our train equivalence relation, equivalence? True
Is our test equivalence relation, equivalence? True


In [7]:
def find_similarity_class(
        universal: UniSet, target_sample: int, alpha: float) -> np.ndarray:
    size_of_universal = universal.shape[0]
    similarity_class = []
    for sample in range(size_of_universal):
        if(universal[sample, target_sample] >= alpha):
            similarity_class.append(sample)
    return np.array(similarity_class)


def find_cluster(relation: np.ndarray, alpha: float, label=False):
    size_of_universal = relation.shape[0]
    classes = []
    predicted_label = np.full((size_of_universal, 1), -1.0)
    number_of_class = 0.0
    for sample in range(size_of_universal):
        if(predicted_label[sample] == -1):
            new_class = find_similarity_class(relation, sample, alpha)
            predicted_label[new_class] = number_of_class
            number_of_class += 1
            classes.append(new_class)
    number_of_class = int(number_of_class)
    if(label is True):
        return predicted_label, number_of_class
    return classes, number_of_class


cluster_alpha_cut_93 = find_cluster(uni_train.equivalence_relation, 0.93)
print('\nCluster with alpha-cut 0.93 on train equivalence relation is')
print(cluster_alpha_cut_93)


Cluster with alpha-cut 0.93 on train equivalence relation is
([array([  0,   1,   3,   6,   8,  12,  13,  14,  15,  16,  17,  20,  22,
        31,  41,  44,  45,  46,  47,  49,  52,  57,  58,  59,  60,  66,
        67,  76,  78,  79,  80,  82,  85,  89,  98, 101, 105, 107, 112,
       128, 142, 144, 146, 153, 154, 161, 164, 168, 171, 172, 173, 175,
       176, 178, 180, 185, 186, 190, 195, 197, 200, 203, 210, 214, 222,
       223, 224, 226, 227, 228, 229, 231, 232, 234, 236, 238, 242, 243,
       244, 245, 250, 253, 255, 258, 263, 264, 265, 266, 267, 272, 275,
       276, 279, 284, 285, 288, 290, 291, 293, 295, 297, 299, 302, 305,
       312, 313, 314, 316, 317, 322, 324, 326, 327, 328, 330, 332, 333,
       337, 340, 347, 349, 355, 359, 360, 361, 364, 367, 370, 371, 376,
       381, 382, 383, 388, 395, 398, 413, 423, 427, 437, 439, 441, 442,
       444, 445, 446, 448, 450, 452, 462, 463, 466, 469, 470, 480, 481,
       489, 490]), array([  2,   4,   7,   9,  10,  11,  18,  19,  23,  

In [8]:
def evaluate(gold_label: np.ndarray, predict_label: np.ndarray,
             method: str = 'f1-score') -> float:
    diffrent_label_in_gold_label = np.unique(gold_label)
    diffrent_label_in_predict_label = np.unique(predict_label)
    conf_mat = np.array(
        list(map(lambda k: list(map(
            lambda s: sum((predict_label == k)*(gold_label == s))[0],
            diffrent_label_in_gold_label)),
            diffrent_label_in_predict_label)))
    precision = np.sum(np.max(conf_mat, axis=1)) / np.sum(conf_mat)
    recall = np.sum(np.max(conf_mat, axis=0)) / np.sum(conf_mat)
    if(method == 'precision'):
        return precision
    if(method == 'recall'):
        return recall
    if(method == 'f1-score'):
        return 2 * ((precision*recall)/(precision+recall))


def find_best_alpha_cut(universal: UniSet, plotter: bool = False) -> float:
    alpha_cut = []
    accuracy = []
    last_point = -1.0
    for alpha in np.unique(universal.equivalence_relation):
        if(alpha - last_point >= 0.001):
            alpha_cut.append(alpha)
            accuracy.append(evaluate(
                universal.label,
                find_cluster(universal.equivalence_relation, alpha, True)[0]))
            last_point = alpha
    
    if(plotter is True):
        plt.plot(alpha_cut, accuracy)
        plt.show()
    
    return alpha_cut[np.argmax(accuracy)], np.max(accuracy)


best_alpha_cut, best_alpha_cut_accuracy = find_best_alpha_cut(uni_train)
print(f'Best alpha-cut is {best_alpha_cut} on train-dataset',
        f' with f1-score {best_alpha_cut_accuracy}')

Best alpha-cut is 0.8845327547116397 on train-dataset  with f1-score 0.9296903460837888


In [9]:
predicted_label_test = find_cluster(
    uni_test.equivalence_relation, best_alpha_cut, True)[1]
test_accuracy = evaluate(uni_test.label, predicted_label_test)
print('Our f1-score on test-dataset with best alpha-cut on train-set is',
        test_accuracy)

Our f1-score on test-dataset with best alpha-cut on train-set is 0.9396551724137931


In [10]:
print('Hi')

Hi
