In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn.neighbors import KernelDensity
from sklearn.mixture import GaussianMixture
import warnings
import time
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
from typing import List, Tuple
from numpy import arange, argsort, argwhere, empty, full, inf, intersect1d, max, ndarray, sort, sum, zeros
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
def averageDis(distance):
    sum = 0
    for i in range(len(distance)):
        for j in range(len(distance)):
            if j < i:
                sum += distance[i][j]
    return sum / len(distance)

def getCentroid(point):
    random_factor = 1
    point_array = np.array(point) * random_factor
    return np.mean(point_array, 0).tolist()


In [3]:
def GIN_DPC(k: int, nc: int, data: ndarray) -> Tuple[ndarray, ndarray]:
    unassigned = -1

    n, d = data.shape

    # Compute distance
    distance = squareform(pdist(data))

    # Compute neighbor
    indexDistanceAsc: ndarray = argsort(distance)
    indexNeighbor: ndarray = indexDistanceAsc[:, :k]

    # Compute shared neighbor
    indexSharedNeighbor = empty([n, n, k], int)
    numSharedNeighbor = empty([n, n], int)
    for i in range(n):
        numSharedNeighbor[i, i] = 0
        for j in range(i):
            shared: ndarray = intersect1d(indexNeighbor[i], indexNeighbor[j], assume_unique=True)
            numSharedNeighbor[j, i] = numSharedNeighbor[i, j] = shared.size
            indexSharedNeighbor[j, i, :shared.size] = indexSharedNeighbor[i, j, :shared.size] = shared

    # Compute similarity
    similarity = zeros([n, n])
    for i in range(n):
        for j in range(i):
            if i in indexSharedNeighbor[i, j] and j in indexSharedNeighbor[i, j]:
                indexShared = indexSharedNeighbor[i, j, :numSharedNeighbor[i, j]]
                distanceSum = sum(distance[i, indexShared] * distance[j, indexShared])
                similarity[i, j] = similarity[j, i] = numSharedNeighbor[i, j] ** 4 / (distanceSum * (distance[i, j] ** 2))

    rho = sum(sort(similarity)[:, -k:], axis=1)

    distanceNeighborSum = empty(n)
    for i in range(n):
        distanceNeighborSum[i] = sum(distance[i, indexNeighbor[i]])
    indexRhoDesc = argsort(rho)[::-1]
    delta = full(n, inf)
    for i, a in enumerate(indexRhoDesc[1:], 1):
        for b in indexRhoDesc[:i]:
            delta[a] = min(delta[a], distance[a, b] * (distanceNeighborSum[a] + distanceNeighborSum[b]))
    delta[indexRhoDesc[0]] = -inf
    delta[indexRhoDesc[0]] = max(delta)
    gamma = rho * delta
    indexAssignment = full(n, unassigned)
    indexCentroid: ndarray = sort(argsort(gamma)[-nc:])
    indexAssignment[indexCentroid] = arange(nc)

    queue: List[int] = indexCentroid.tolist()
    while queue:
        a = queue.pop(0)
        for b in indexNeighbor[a]:
            if indexAssignment[b] == unassigned and numSharedNeighbor[a, b] >= k / 2:
                indexAssignment[b] = indexAssignment[a]
                queue.append(b)

    indexUnassigned = argwhere(indexAssignment == unassigned).flatten()
    while indexUnassigned.size:
        numNeighborAssignment = zeros([indexUnassigned.size, nc], int)
        for i, a in enumerate(indexUnassigned):
            for b in indexDistanceAsc[a, :k]:
                if indexAssignment[b] != unassigned:
                    numNeighborAssignment[i, indexAssignment[b]] += 1
        if most := max(numNeighborAssignment):
            temp = argwhere(numNeighborAssignment == most)
            indexAssignment[indexUnassigned[temp[:, 0]]] = temp[:, 1]
            indexUnassigned = argwhere(indexAssignment == unassigned).flatten()
        else:
            k += 1

    return indexCentroid, indexAssignment, indexNeighbor, numSharedNeighbor


In [4]:
def TSO_MCN(X_train, y_train, cluster=3, K=6):

    indices_class_1 = np.where(y_train == 1)[0]
    # print(type(X_train))
    X_train_class_1 = X_train.iloc[indices_class_1]

    gmm = GaussianMixture(n_components=cluster, random_state=42)
    gmm.fit(X_train_class_1)
    gmm_labels = gmm.predict(X_train_class_1)
    gmm_centers = gmm.means_

    kmeans = KMeans(n_clusters=cluster, random_state=42)
    kmeans.fit(X_train_class_1)
    kmeans_labels = kmeans.predict(X_train_class_1)
    kmeans_centers = kmeans.cluster_centers_

    mean_shift = MeanShift()
    mean_shift.fit(X_train_class_1)
    mean_shift_labels = mean_shift.labels_
    mean_shift_centers = mean_shift.cluster_centers_

    X_train_with_centers = np.vstack([X_train.values, gmm_centers])
    y_train_with_centers = np.concatenate([y_train, np.ones(gmm_centers.shape[0])])

    X_train_with_centers = np.vstack([X_train_with_centers, kmeans_centers])
    y_train_with_centers = np.concatenate([y_train_with_centers, np.ones(kmeans_centers.shape[0])])

    X_train_with_centers = np.vstack([X_train_with_centers, mean_shift_centers])
    y_train_with_centers = np.concatenate([y_train_with_centers, np.ones(mean_shift_centers.shape[0])])

    minority_class_count = (y_train_with_centers == 1).sum()
    majority_class_count = (y_train_with_centers == 0).sum()

    minority_class_indices = np.where(y_train_with_centers == 1)[0]
    X_train_minority_class = X_train_with_centers[minority_class_indices]

    P_array = X_train_minority_class
    G = majority_class_count - minority_class_count
    P_index = minority_class_indices

    # GIN_DPC
    centroid, assignment, indexNeighbor, numSharedNeighbor = GIN_DPC(K, cluster, P_array)

    centroid = centroid.tolist()
    assignment = assignment.tolist()
    numSharedNeighbor_vector = numSharedNeighbor

    SimilarNeighbor_K = int(K / 2) + 1
    SimilarNeighbor_vector_indices = [[] for _ in range(len(centroid))]

    for i in range(len(numSharedNeighbor_vector)):
        for j in range(i + 1, len(numSharedNeighbor_vector[i])):
            if numSharedNeighbor_vector[i][j] > SimilarNeighbor_K and assignment[i] == assignment[j]:
                SimilarNeighbor_vector_indices[assignment[i]].append((i, j, numSharedNeighbor_vector[i][j]))

    sorted_SimilarNeighbor_vector_indices = [
        sorted(row, key=lambda x: x[2], reverse=True) for row in SimilarNeighbor_vector_indices
    ]
    cleaned_sorted_SimV = [[(x[0], x[1]) for x in row] for row in sorted_SimilarNeighbor_vector_indices]

    P_assignment = [[] for _ in range(len(centroid))]
    for index in range(len(P_array)):
        if index not in indexNeighbor[assignment[index]]:
            P_assignment[assignment[index]].append(P_array[index])

    weight = []
    for index in range(len(centroid)):
        count = assignment.count(index)
        weight.append(count / majority_class_count)

    syn_num = [int(w * G) for w in weight]

    Synthetic = []
    np.random.seed(42)

    for index in range(len(centroid)):
        selected_points = set()
        arr_P = np.array(P_assignment[index])
        SimVnum = 0

        for num in range(syn_num[index]):
            available_points = [tuple(pt) for pt in arr_P if tuple(pt) not in selected_points]
            if len(available_points) == 0:
                if len(cleaned_sorted_SimV[index]) == 0:
                    break
                elif SimVnum < len(cleaned_sorted_SimV[index]):
                    SimVnum_tuple = cleaned_sorted_SimV[index][SimVnum]
                    Simpoint = [P_array[SimVnum_tuple[0]], P_array[SimVnum_tuple[1]]]
                    synthetic = getCentroid(Simpoint)
                    Synthetic.append(synthetic)
                    SimVnum += 1
                else:
                    break
            else:
                if len(available_points) < 2:
                    point = [available_points[np.random.choice(len(available_points))]]
                else:
                    idx = np.random.choice(len(available_points), 2, replace=False)
                    point = [available_points[i] for i in idx]

                for pt in point:
                    selected_points.add(tuple(pt))

                point.append(P_array[centroid[index]])
                synthetic = getCentroid(point)
                Synthetic.append(synthetic)

    Synthetic = np.array(Synthetic)

    X_csmote = np.r_[X_train_with_centers, Synthetic]
    y_csmote = np.r_[np.array(y_train_with_centers).flatten(), np.ones(Synthetic.shape[0])]

    return X_csmote, y_csmote


In [5]:
data = pd.read_csv('.\data\led7digit-0-2-4-5-6-7-8-9_vs_1.dat')
X = data.iloc[:, data.columns != "Class"]
y = data["Class"].values.ravel()   # 转为1维
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
X_train_cv_TSO_MCN, y_train_cv_TSO_MCN = TSO_MCN(X_train, y_train)

knn = KNeighborsClassifier()
knn.fit(X_train_cv_TSO_MCN, y_train_cv_TSO_MCN)
y_pred = knn.predict(X_test)

auc = roc_auc_score(y_test, y_pred)
print("AUC:", auc)


AUC: 0.9634146341463415
