In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy

In [2]:
def array_to_dist(dataset, feature_name, label, step):
    """Bucket one class's feature values into a discrete distribution.

    The bin edges split the feature's range over the WHOLE dataset (all
    labels) into ``step`` equal-width bins, so distributions computed for
    different labels share the same bins and can be compared bin by bin.

    Bins are left-closed / right-open, except the last one, which is closed
    on both sides so that samples equal to the feature maximum are counted
    instead of being silently dropped.

    Args:
        dataset: DataFrame containing ``feature_name`` and a ``'label'``
            column, or None.
        feature_name: Name of the column to bucket.
        label: Only rows whose ``'label'`` equals this value are counted.
        step: Number of bins.

    Returns:
        A list with one entry per bin. Non-empty bins hold the fraction of
        the selected rows that fall into them; empty bins hold ``0.000001``
        so that later log-ratios stay finite. If no row matches ``label``
        the list is all zeros. An empty list is returned when ``dataset`` is
        None or has no rows, or when ``step`` is 0.
    """
    if dataset is None or len(dataset) == 0:
        return []

    feature = dataset[feature_name]
    # Edges come from the full column, not only from the selected label.
    edges = np.linspace(feature.min(), feature.max(), num=step + 1)
    if step == 0:
        return []

    selected = dataset.loc[dataset['label'] == label, feature_name].to_numpy()
    # np.histogram uses half-open bins with a closed last bin, which keeps
    # the samples that sit exactly on the maximum edge.
    counts, _ = np.histogram(selected, bins=edges)
    total = counts.sum()
    if total == 0:
        # No sample for this label: nothing to normalise.
        return [0] * step

    dist = counts / total
    dist[dist == 0] = 0.000001
    return dist.tolist()
    

def kl_divergence(feature_name, dataset, bins=10, epsilon=1e-7):
    """Estimate the KL divergence D(negative || positive) of a feature, in bits.

    The negative and positive groups generally contain different numbers of
    samples, so their raw values cannot be compared element by element (and
    merely aligning the lengths would not be meaningful either). Both groups
    are therefore bucketed into the same equal-width bins spanning their
    combined value range, and the divergence is computed between the two
    per-bin probability vectors.

    Args:
        feature_name: Name of the feature column.
        dataset: DataFrame holding the feature and a ``'label'`` column
            (rows with label 0 are the negative samples, label 1 the
            positive samples).
        bins: Number of equal-width bins.
        epsilon: Probability written into empty bins so the divergence
            stays finite.

    Returns:
        ``scipy.stats.entropy(neg, pos, base=2)`` of the smoothed per-bin
        probabilities (``entropy`` renormalises both vectors).

    Raises:
        ValueError: If either group has no non-missing value for the feature.
    """
    is_negative = dataset['label'] == 0
    is_positive = dataset['label'] == 1
    neg_feature_data = dataset.loc[is_negative, feature_name].dropna()
    pos_feature_data = dataset.loc[is_positive, feature_name].dropna()
    if neg_feature_data.empty or pos_feature_data.empty:
        raise ValueError(
            "feature {!r} needs at least one value for label 0 and label 1".format(feature_name)
        )

    value_range = (
        min(neg_feature_data.min(), pos_feature_data.min()),
        max(neg_feature_data.max(), pos_feature_data.max()),
    )
    neg_counts, _ = np.histogram(neg_feature_data, bins=bins, range=value_range)
    pos_counts, _ = np.histogram(pos_feature_data, bins=bins, range=value_range)

    # Work with probabilities rather than density=True values: densities are
    # divided by the bin width, so a fixed epsilon would weigh differently
    # depending on the feature's units and could even exceed the real values.
    neg_prob = neg_counts / neg_counts.sum()
    pos_prob = pos_counts / pos_counts.sum()
    neg_prob[neg_prob == 0] = epsilon
    pos_prob[pos_prob == 0] = epsilon
    return entropy(neg_prob, pos_prob, base=2)

# Score every feature column of df by its KL divergence
def get_all_kl_divergence(df):
    """Rank all non-label columns of ``df`` by KL divergence.

    Each column other than ``'label'`` is passed to ``kl_divergence`` and a
    progress line is printed after each one.

    Returns a DataFrame with the columns ``feature_name`` and
    ``kl_divergence``, sorted by ``kl_divergence`` from largest to smallest.
    """
    names = []
    scores = []
    candidates = (column for column in df.columns if column != 'label')
    for done, column in enumerate(candidates, start=1):
        names.append(column)
        scores.append(kl_divergence(column, df))
        print('process:{}'.format(done))

    ranking = pd.DataFrame()
    ranking['feature_name'] = names
    ranking['kl_divergence'] = scores
    return ranking.sort_values('kl_divergence', ascending=False)

In [None]:
def array_to_hist(arr, bins=10, value_range=None):
    """Build a histogram from an array of values.

    The original cell contained only the ``def`` line and a comment, which
    is a syntax error; this is a minimal implementation on top of
    ``numpy.histogram``.
    NOTE(review): the intended return shape was never written down; confirm
    that (counts, bin_edges) is what callers want.

    Args:
        arr: Array-like of numeric values; it is flattened before binning.
        bins: Number of equal-width bins, or an explicit sequence of edges.
        value_range: Optional ``(low, high)`` range passed to
            ``numpy.histogram``; None uses the data's own min and max.

    Returns:
        The ``(counts, bin_edges)`` tuple produced by ``numpy.histogram``.
    """
    values = np.asarray(arr).ravel()
    return np.histogram(values, bins=bins, range=value_range)

In [None]:
# Load the training table; the first CSV column becomes the index.
df = pd.read_csv('final_train_v4.csv', index_col = 0)

# Score every non-label column by KL divergence, largest first.
dd = get_all_kl_divergence(df)

In [None]:
# Number of scored features (one row per non-label column).
len(dd)

In [None]:
# The 30 features with the largest KL divergence (dd is sorted descending).
dd.head(30)

In [None]:
def plot_train(feature_name, dataset):
    """Plot the per-label kernel density estimate of one feature.

    feature_name: name of the feature column to plot
    dataset: the full training DataFrame; curves are coloured by 'label'

    Nothing is drawn when either argument is None.
    """
    if dataset is None or feature_name is None:
        return

    # One filled KDE curve per label value, with a rug of the raw samples.
    grid = sns.displot(
        dataset,
        x=feature_name,
        kind='kde',
        hue='label',
        fill=True,
        rug=True,
    )
    title = u'{} disease group and control group PDF'.format(feature_name)
    grid.ax.set_title(title)

In [None]:
# `df1` is never defined in this notebook, so the call raised NameError on a
# fresh kernel; plot from `df`, the training table loaded above.
# NOTE(review): confirm `df` is the frame that was meant here.
plot_train('FXYD1', df)

In [None]:
# Keep only the features whose KL divergence is greater than 3 and renumber
# the rows from 0.
# NOTE(review): the threshold 3 is a magic number; consider naming it.
choosed_features = dd[dd['kl_divergence'] > 3].reset_index(drop = True)
choosed_features

In [None]:
# How many features passed the KL-divergence threshold.
len(choosed_features)

In [None]:
# Display the selected features (same table as shown when it was built).
choosed_features

In [3]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.utils import shuffle
from datetime import datetime
from tensorflow import keras


def df_to_dataset(dataframe, shuffle=True, batch_size=20):
  """Turn a DataFrame with a 'label' column into a batched tf.data.Dataset.

  The input frame is copied first, so the caller's frame keeps its 'label'
  column. Each dataset element is a (features-dict, label) pair; when
  `shuffle` is true the shuffle buffer covers every row of the frame.
  """
  features = dataframe.copy()
  targets = features.pop('label')
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
  if not shuffle:
    return dataset.batch(batch_size)
  return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

# Takes a DataFrame, turns every non-label column into a numeric
# feature_column and fits a logistic-regression model, so the learned weights
# can show which features matter.
def train_model(data, test_size=0.2, batch_size=20, epochs=100, random_state=None):
    """Fit a single-layer sigmoid (logistic regression) model on ``data``.

    Args:
        data: DataFrame whose ``'label'`` column is the binary target and
            whose other columns are numeric features.
        test_size: Fraction of rows held out for validation.
        batch_size: Batch size for both the training and validation datasets.
        epochs: Number of training epochs.
        random_state: Seed forwarded to ``train_test_split``; None keeps the
            split random on every call.

    Returns:
        The fitted ``tf.keras.Sequential`` model, so its weights can be
        inspected afterwards (previously the model was discarded). None is
        returned when ``data`` is None or has no columns.
    """
    if data is None:
        return None
    if len(data.columns) == 0:
        return None

    # NOTE(review): tf.feature_column / DenseFeatures are deprecated in newer
    # TensorFlow releases; confirm the installed version still provides them.
    feature_columns = [
        tf.feature_column.numeric_column(feature)
        for feature in data.columns
        if feature != 'label'
    ]

    # Build the training and validation datasets.
    train, test = train_test_split(data, test_size=test_size, random_state=random_state)
    train_ds = df_to_dataset(train, batch_size=batch_size)
    # Validation metrics do not depend on row order, so skip the shuffle.
    test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

    model = tf.keras.Sequential([
        tf.keras.layers.DenseFeatures(feature_columns),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['AUC', 'accuracy', 'Recall', 'Precision'])

    # One TensorBoard log directory per run, named by start time.
    logdir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

    model.fit(train_ds,
              validation_data=test_ds,
              epochs=epochs,
              callbacks=[tensorboard_callback])
    return model
    

In [None]:
# Restrict df to the selected features plus the label column.
# Earlier attempts at building the column list, kept for reference:
#df_final = df[choosed_features['feature_name'] + ['label']]
#df_final['label'] = df['label']
#choosed_features['feature_name'].values.insert(['label'])
#choosed_features['feature_name'].append('label')
# np.append returns a new array: the selected feature names followed by 'label'.
index_columns = np.append(choosed_features['feature_name'].values, ['label'])
df_final = df[index_columns]

In [None]:
# Preview the first 10 rows of the reduced table.
df_final.head(10)

In [None]:
# Shuffle the rows (sklearn.utils.shuffle) and train on the selected features only.
df_t2 = shuffle(df_final)
train_model(df_t2)

In [5]:
# Load the full training table (same file and arguments as the earlier load
# cell); the first CSV column becomes the index.
df = pd.read_csv('final_train_v4.csv', index_col = 0)

In [None]:
import os
# Presumably meant to hide every GPU so training runs on the CPU.
# NOTE(review): TensorFlow is imported in an earlier cell; confirm these
# variables still take effect when set this late.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# Shuffle the rows and train on ALL columns of df, not just the selected ones.
df_t2 = shuffle(df)
train_model(df_t2)

Epoch 1/100
