In [1]:
import numpy as np
import math
import random
import matplotlib.pyplot as plt
import pandas as pd
import copy
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from scipy import stats
from sklearn.metrics import confusion_matrix
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier 

In [3]:
#加载数据 预处理
data = pd.read_csv("../data/gene_version2020/generate/ExperimentalDatasets/MM/MM-MF.csv",index_col=0)
label = data["longevity influence"]
#删除无用列
data = data.drop(["longevity influence"],axis=1)

#初始化基因本体的DAG
DAG = pd.read_csv("../data/gene_version2020/generate/GOPath/MM/MM-MF.csv",index_col=0)

#过滤低纬度特征
for c in data.columns:
    data[c] = data[c].astype('int')
    if(data[c].sum() < 3):
        data.drop(c,axis=1,inplace=True)
DAG = DAG.loc[data.columns,data.columns]

#计算相关性
data["label"] = label
Relevance = pd.Series(0.0,index=DAG.columns)
for name in DAG.columns:
    temp = data[data[name] == 1]
    prob = temp['label'].mean()
    Relevance[name] = (prob-0.5)*(prob-0.5)+(0.5-prob)*(0.5-prob)
data = data.drop(["label"],axis=1)

#计算所有节点的祖先节点
AncDict = {}
relativeList = []
for name in DAG.columns:
    global relativeList
    relativeList = []
    getAncestors(name)
    AncDict[name] = relativeList

In [15]:
data.shape

(130, 503)

In [2]:
def GM_score(x,y): #计算GM 传入numpy
    cm1 = confusion_matrix(x,y)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    return sensitivity1,specificity1

def getPathsToRoot(path,name):#传入list 方便递归
    global pathList
    flag = 0
    path.append(name)
    for anc in DAG.columns:
        if(DAG.loc[anc,name] == 1):
            getPathsToRoot(copy.copy(path),anc)
            flag = 1
    if(flag == 0):
        pathList.append(path)
        
def RPV(x):
    dropList = []
    for name in DAG.columns:
        if(x[name] == 1):
            #遍历当前节点的所有祖先节点
            for anc in AncDict[name]:
                #保留相关性最高特征，其它特征加入dropSet
                if(Relevance[anc] < Relevance[name]):
                    dropList.append(anc)
        else:
            dropList.append(name)
    return list(set(DAG.columns)-set(dropList))

def myknn(train,label,x_test):
    predictList = []
    for j in range(x_test.shape[0]):
        test = x_test.iloc[j,:]
        maxJcd = 0
        prediction = 0
        for i in range(train.shape[0]):
            temp = jaccard_score(train.iloc[i,:],test)
            if(temp > maxJcd):
                prediction = label[i]
                maxJcd = temp
        predictList.append(prediction)
    return predictList

def getAncestors(name):
    global relativeList
    for relative in DAG.columns:
        if(DAG.loc[relative,name] == 1):
            relativeList.append(relative)
            getAncestors(relative)

In [19]:
sensitivity = []
specificity = []
F1 = []
AUC = []

#10折交叉验证
kf = KFold(n_splits=10,shuffle=True)
for train_index ,test_index in kf.split(data):
    trainData = data.iloc[train_index,:]
    testData = data.iloc[test_index,:]
    Y_train = label.values[train_index] #用于训练模型
    Y_test = label.values[test_index]#用于交叉验证
    predicetList = []
    
    
    for i in range(testData.shape[0]):  #对每一个样本进行特征选择和预测
        
        selectFeatures = RPV(testData.iloc[i,:]) #MR特征选择
        
        print(len(selectFeatures))
        if(len(selectFeatures) == 0):
            selectFeatures = DAG.columns
        X_train = trainData.loc[:,selectFeatures]
        X_test = testData.loc[:,selectFeatures].iloc[i,:]
        
        gnb = GaussianNB()
        predicetList.append(gnb.fit(X_train, Y_train).predict(X_test.values.reshape(1,len(selectFeatures)))[0])#把当前样本预测结果加入List
        #knn=KNeighborsClassifier()
        #knn.fit(X_train,Y_train)
        #predicetList.append(knn.predict(X_test.values.reshape(1,len(selectFeatures)))[0])
    print(predicetList)
    print(Y_test)
    sensi,speci= GM_score(np.array(predicetList),Y_test)
    sensitivity.append(sensi)
    specificity.append(speci)
    F1.append(f1_score(np.array(predicetList),Y_test))
    try:
        AUC.append(roc_auc_score(np.array(predicetList),Y_test))
    except ValueError:
        pass

6
4
5
7
0
9
5
13
1
3
1
1
1
[0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0]
[1 0 0 1 0 1 1 1 0 1 0 1 0]
8
2
6
2
8
2
4
1
3
4
3
2
8
[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1 1 1 1 1 1 1 0 1 0 1 1 0]
1
12
8
1
3
2
7
2
9
7
2
4
2
[1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1]
[1 1 1 1 1 1 0 1 0 1 1 1 1]
5
4
3
3
3
9
3
8
4
12
10
2
3
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0 1 0 1 1 1 1 0 1 1 1 0 1]
8
8
3
5
10
1
0
15
4
7
7
3
7
[1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0]
[1 1 1 0 1 1 1 1 1 0 1 1 0]
8
4
12
6
7
13
6
9

  sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])



9
6
10
5
1
[1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1]
[1 1 0 0 0 1 1 1 0 0 1 0 1]
8
3
9
4
4
8
0
1
5
10
2
1
2
[1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1]
[1 0 0 0 1 1 0 1 1 0 0 0 1]
5
1
2
4
9
1
10
6
2
7
0
6
3
[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0]
[1 1 0 1 1 1 1 1 1 1 0 1 1]
2
3
5
2
7
9
6
2
10
6
2
1
0
[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0]
[0 0 1 1 1 1 1 0 0 1 0 0 1]
5
11
8
5
24
2
8
6
17
5
2
2
7
[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0 1 0 1 1 0 1 0 1 0 0 1 1]


In [20]:
a = np.nanmean(sensitivity)
b = np.nanmean(specificity)
print(a)
print(b)
print("GM")
print(math.sqrt(a*b))
print("f1")
print(np.nanmean(F1))
print("AUC")
print(np.nanmean(AUC))

0.5555555555555556
0.7138339438339437
GM
0.6297415447952277
f1
0.764064271641671
AUC
0.6358906525573194
