# 数据预处理

In [52]:
import numpy as np
import pandas as pd

In [53]:
path = "./hcvdat.csv"

In [54]:
data = pd.read_csv(path)
data.head()

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


### 缺失值均值填充

In [55]:
data.isnull().sum()

Unnamed: 0     0
Category       0
Age            0
Sex            0
ALB            1
ALP           18
ALT            1
AST            0
BIL            0
CHE            0
CHOL          10
CREA           0
GGT            0
PROT           1
dtype: int64

In [56]:
data['ALB'] = data['ALB'].fillna(data['ALB'].mean())
data['ALP'] = data['ALP'].fillna(data['ALP'].mean())
data['ALT'] = data['ALT'].fillna(data['ALT'].mean())
data['CHOL'] = data['CHOL'].fillna(data['CHOL'].mean())
data['PROT'] = data['PROT'].fillna(data['PROT'].mean())

In [57]:
# 确保无缺失值
data.isnull().sum()

Unnamed: 0    0
Category      0
Age           0
Sex           0
ALB           0
ALP           0
ALT           0
AST           0
BIL           0
CHE           0
CHOL          0
CREA          0
GGT           0
PROT          0
dtype: int64

### 字符型数值化

In [58]:
# 确定Category和Sex列取值情况
cat_total = list(data['Category'].unique())
sex_total = list(data['Sex'].unique())
cat_total, sex_total

(['0=Blood Donor',
  '0s=suspect Blood Donor',
  '1=Hepatitis',
  '2=Fibrosis',
  '3=Cirrhosis'],
 ['m', 'f'])

In [59]:
# 数值化
data['Category'] = data['Category'].replace(cat_total, range(len(cat_total)))
data['Sex'] = data['Sex'].replace(sex_total, range(len(sex_total)))
data['Category'].unique(), data['Sex'].unique()

(array([0, 1, 2, 3, 4], dtype=int64), array([0, 1], dtype=int64))

In [60]:
data.head()

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0,32,0,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0,32,0,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,3,0,32,0,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,4,0,32,0,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0,32,0,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


### 划分标签

In [61]:
# 得到标签
y = data['Category'].values
y.shape

(615,)

### 划分特征

In [62]:
# 得到特征列
data_x = data.drop(['Unnamed: 0', 'Category'], axis=1)
data_x.head()

Unnamed: 0,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,32,0,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,32,0,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,32,0,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,32,0,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,32,0,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [63]:
x = data_x.values

# K-means

### sklearn实现

In [64]:
from sklearn.cluster import KMeans

In [65]:
k_means = KMeans(n_clusters=5, random_state=39)
k_means.fit(x)
y_pred_skl = k_means.predict(x)

### 手动实现

In [66]:
# 获得样本中心点(均值中心)
def get_cen(sample, k):
    cla_cen = []
    for i in range(k):
        cla_cen.append(np.mean(sample[i], axis=0))
    cla_cen = np.array(cla_cen)
    return cla_cen 

In [67]:
# 获得初始化中心点
def get_init_cen(x, k, num=3):
    ind = np.random.randint(0, x.shape[0], (k, num))  # 随机索引，其中num为每个类别样本数
    init_sample = x[ind]  # 初始化选择样本
    cla_cen = get_cen(init_sample, k)
    return cla_cen

In [68]:
# 获得样本距离中心点的距离矩阵
def get_distance(x, cla_cen, k):
    dis = []
    for i in range(k):
        dis.append(np.linalg.norm(x-cla_cen[i], ord=2, axis=1))
    dis = np.array(dis)
    return dis

In [69]:
# 获得聚类标签
def K_means(x, y, k, epoch):
    # 初始化中心点
    cla_cen = get_init_cen(x, k)
    
    # 迭代
    for i in range(epoch):
        # 获得样本距离中心点距离
        dis = get_distance(x, cla_cen, k)
        
        # 获得各类样本
        cla_sample = []
        ind = np.argmin(dis, axis=0)
        for cla in range(k):
            cla_sample.append(x[np.where(ind==cla)])
        
        # 获得新的中心点
        cla_cen = get_cen(cla_sample, k)
        
    # 迭代训练完成,获得x的聚类标签y_pred
    dis = get_distance(x, cla_cen, k)
    y_pred = np.argmin(dis, axis=0)
    return y_pred 

#### 考虑到k_means聚类算法易受初始化中心点的影响，事实上在实验过程中我们也发现正确率存在很大的波动

#### sklearn中的k_means方法实际上是默认进行十次聚类，取其中效果最佳的方式

#### 因此我们也选择进行若干次聚类，取正确率最高的作为理想聚类方式

In [70]:
k = 5  # 类别数
epoch = 300  # 迭代次数 

In [71]:
# 获得聚类标签
y_pred_self =[]
for i in range(10):
    y_pred_self.append(K_means(x, y, k, epoch))

# 实验结果评估

### 获得有实际意义的聚类标签

#### 聚类是一种无标签算法，其所得聚类标签没有实际含义，但数据集中提供的标签具有实际含义，因此两者取值可能会有所不同

#### 为了便于评价聚类性能，我们希望得到聚类标签经过某种排列能够具有实际含义以便与真实标签比较

#### 实现上，考虑类别较小， 我们遍历各种取值排列，选用正确率最高的排列作为我们进行评估的聚类标签

In [72]:
from itertools import permutations

In [73]:
# 替换类别标签
def change_label(y_pred, ind, arr, k):
    y_change = y_pred.copy()
    # 遍历各类别，替换标签
    for i in range(k):
        y_change[ind[i]] = arr[i]
    
    return y_change

In [74]:
# 获得用于评估的聚类标签
def get_y_eval(y_pred, y, k):
    # 获得所有排列方式
    all_arr = list(permutations(range(k), k))
    
    # 获得聚类标签中各类的索引
    y_ind = []
    for i in range(k):
        y_ind.append(np.where(y_pred==i))
    
    # 遍历获得能使正确率最高的类别排列方式
    best_acc = 0.0
    best_arr = 0
    for i in range(len(all_arr)):
        y_change = change_label(y_pred, y_ind, all_arr[i], k)  # 获取更换后的类别标签
        acc = np.sum(y_change==y) / y.shape[0]
        if acc>best_acc:
            best_acc = acc
            best_arr = i
    
    print('best_acc: '+str(best_acc))
    y_eval = change_label(y_pred, y_ind, all_arr[best_arr], k)
    return y_eval, best_acc


In [75]:
y_eval_skl,_ = get_y_eval(y_pred_skl, y, k=5)

best_acc: 0.8276422764227642


#### 在我们手动代码实现的十次聚类中，选择正确率最高的作为聚类标签

In [76]:
ind = 0
best_acc = 0
for i in range(10):
    y_eval_self, acc = get_y_eval(y_pred_self[i], y, k=5)
    if acc>best_acc:
        best_acc = acc
        ind = i
y_eval_self,_ = get_y_eval(y_pred_self[ind], y, k=5)

best_acc: 0.7983739837398374
best_acc: 0.4813008130081301
best_acc: 0.4764227642276423
best_acc: 0.7983739837398374
best_acc: 0.8227642276422764
best_acc: 0.4682926829268293
best_acc: 0.7983739837398374
best_acc: 0.7983739837398374
best_acc: 0.4764227642276423
best_acc: 0.8227642276422764
best_acc: 0.8227642276422764


### 获得Micro-F1和Macro-F1

#### 我们在求取Macro-F1的实验过程中出现问题：

#### 若聚类标签恰好使得某类的精确率Precison和召回率Recall都为0，即TP为0时，则F1公式的分母为0，导致F1值为nan。

#### F1分数被定义为精确率和召回率的调和平均数，因此我们认为对于这种情况，应当设F1=0或为极小值

In [77]:
# 得到各类的precison
def get_precison(y_eval, y, k):
    class_precison = []
    for i in range(k):
        TP = np.sum((y_eval==i)&(y==i))  # 预测为正，实际也为正
        FP = np.sum((y_eval==i)&(y!=i))  # 预测为正，实际为负
        cla_pre = TP/(TP+FP)
        class_precison.append(cla_pre)
    class_precison = np.array(class_precison)
    return class_precison

In [78]:
# 得到各类的recall
def get_recall(y_eval, y, k):
    class_recall = []
    for i in range(k):
        TP = np.sum((y_eval==i)&(y==i))  # 预测为正，实际也为正
        FN = np.sum((y_eval!=i)&(y==i))  # 预测为负，实际为正
        cla_rec = TP/(TP+FN)
        class_recall.append(cla_rec)
    class_recall = np.array(class_recall)
    return class_recall

In [79]:
# 获得Micro-F1
def get_Micro_F1(y_eval, y, k=5):
    # 获得各类的精确率和召回率
    Precison = get_precison(y_eval, y, k) 
    Recall = get_recall(y_eval, y, k)
    
    # 获得总的精确率和召回率
    Precison = np.sum(Precison) / k
    Recall = np.sum(Recall) / k
    
    # 获得F1分数
    Micro_F1 = (2*Precison* Recall)/(Precison+Recall)
    
    return Micro_F1

In [80]:
# 获得Macro-F1
def get_Macro_F1(y_eval, y, k=5):
    # 获得各类的精确率和召回率
    Precison = get_precison(y_eval, y, k) 
    Recall = get_recall(y_eval, y, k)

    # 获得各类F1分数
    Macro_F1 = (2*Precison* Recall)/(Precison+Recall) 
    # 处理nan值 
    Macro_F1 = np.nan_to_num(Macro_F1)
    # 获得平均F1分数
    Macro_F1 = np.sum(Macro_F1) / k
    
    return Macro_F1

In [83]:
Micro_F1 = get_Micro_F1(y_eval_skl, y, k=5)
Macro_F1 = get_Macro_F1(y_eval_skl, y, k=5)
print('Micro_F1: '+str(Micro_F1))
print('Macro_F1: '+str(Macro_F1))

Micro_F1: 0.39497281179102967
Macro_F1: 0.36719536049569257


  Macro_F1 = (2*Precison* Recall)/(Precison+Recall)


In [84]:
Micro_F1 = get_Micro_F1(y_eval_self, y, k=5)
Macro_F1 = get_Macro_F1(y_eval_self, y, k=5)
print('Micro_F1: '+str(Micro_F1))
print('Macro_F1: '+str(Macro_F1))

Micro_F1: 0.3938601470691342
Macro_F1: 0.365015000197371


  Macro_F1 = (2*Precison* Recall)/(Precison+Recall)


# 实验结果分析

### 我们采用两种方式实现K-means聚类: sklearn库提供的KMeans方法和我们手动实现的方法，

### 分别取得了82.76%, 82.28%的正确率，证明我们的聚类能很好的划分类别

### 另外，两种方法取得的Micro_F1和Macro_F1值介于36%-40%之间

### 证明我们的聚类对样本量较小的类别的分类效果并不理想，模型缺乏对小样本类别较好的划分能力