In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy

print(tf.__version__)

2.0.0


In [2]:
def plot_train(feature_name, dataset):
    '''
     feature_name : 特征名称
     dataset: 训练集合的dataFrame的全集
    '''
    if  dataset is None:
        return
    if feature_name is None:
        return
    # 获取最后正样本和负样本两种数据的差异
    #ax.set_title('{}患病和正常组的数据分布图'.format(feature_name))
    #f, ax= plt.subplots()
    g = sns.displot(dataset, x=feature_name, kind='kde', hue='label', fill=True, rug=True)
    #g.set_axis_labels('{}'.format(feature_name))
    #g.set_titles('ontrol group and disease group PDF',)
    #g.set_ylabels("control group and disease group PDF")
    g.ax.set_title(u'{} disease group and control group PDF'.format(feature_name))
    g.savefig("{}.png".format(feature_name))
    #sns.displot(disease_data,kind='kde')

In [3]:
def array_to_dist(dataset, feature_name, label, step):
    # 左闭右开
    if dataset is None:
        return []
    
    min_val = min(dataset[feature_name])
    max_val = max(dataset[feature_name]) 
    
    #print('feature:{},min:{}, max:{}'.format(feature_name, min_val, max_val))
    
    #print('min:{}, max:{}'.format(min_val, max_val))
    val = np.linspace(min_val, max_val, num=step+1)
    val = val.tolist()
    #print('val:{}'.format(val))
    
    dist=[]
    deal_data = dataset.loc[dataset['label']==label]
    if deal_data is None:
        return []
    sum_val = 0
    for i in range (step):
       # print('i:{},i+1:{}'.format(val[i], val[i+1]))
        x = deal_data.loc[(deal_data[feature_name]>=val[i]) & (deal_data[feature_name]<val[i+1])]
        sum_val = sum_val + len(x)
        val_size = len(x)
        dist.append(val_size)
    
    
    for i, val in enumerate(dist):
        if sum_val == 0:
            break
        dist[i] = dist[i]/sum_val
        if dist[i] == 0:
            dist[i] = 0.000001
    return dist
    

def kl_divergence(feature_name, dataset):
    '''
    计算feature_name对应的kl散度
    feature_name:特征名称
    dataset 数据集合
    面临一个问题，normal和disease两个数据长度不一致，需要对齐
    就算长度对齐也不对，需要进行分桶，计算好分布
    '''
    neg = array_to_dist(dataset, feature_name, 0, 20)
    pos = array_to_dist(dataset, feature_name, 1, 20)
    
    #print('neg:{},pos:{}'.format(neg, pos))
    return entropy(neg, pos, base = 2)

# 获取所有df的
def get_all_kl_divergence(df):
    features =[]
    feature_kl = []
    size = len(df.columns)
    i = 0
    for feature in df.columns:
        if feature == 'label':
            continue
        features.append(feature)
        feature_kl.append(kl_divergence(feature, df))
        i = i + 1
        print('process:{}'.format(i))
    kl_feature_dataset = pd.DataFrame()
    kl_feature_dataset['feature_name'] = features
    kl_feature_dataset['kl_divergence'] = feature_kl
    final = kl_feature_dataset.sort_values('kl_divergence', ascending=False)
    return final
    

In [None]:
import math
df = pd.read_csv('final_train.csv', index_col=0)

df.drop(['name'], axis=1, inplace = True)

df3 = get_all_kl_divergence(df)

In [None]:
x = df.columns.to_list()

In [None]:
x[19562]

In [None]:
df4 = df3.reset_index(drop=True)
df4.head(1000)

In [None]:
plot_train('AGER', df)

In [None]:

df['label'] = df['label'].apply(lambda x: int(not x))

df.to_csv('final_train_v2.csv')



# 问题和改进点
发现数据正负样本差别比较多，负样本比较少，正样本比较多，需要对正样本进行负采样

In [None]:
def get_gen_length(matrix, gene_len):
    GFF3 = pd.read_csv(
    filepath_or_buffer='Homo_sapiens.GRCh38.103.gtf', 
    sep='\t', 
    header=None,
    names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'],
    skiprows=[i for i in range(5)])

    GFF3 = GFF3[GFF3['source'].notnull()]

    return GFF3['attributes'].head()

get_gen_length(None, None)

In [None]:
def remove_invalid_gene(file):
    df = pd.read_csv(file, index_col=0)

def non_zero_count(ser):
    val_np = ser.to_numpy()
    #return np.max(val_np)
    return np.percentile(val_np, 90)        
    
df = pd.read_csv('final_train_v2.csv', index_col=0)
gene_name = []
nonzero_count = []
for index, row in df.iteritems():
    if index == 'label':
        continue
   # print('index:{},row:{}'.format(index, row))
    gene_name.append(index)
    nonzero_count.append(row.aggregate(non_zero_count))
df1 = pd.DataFrame({'gene':gene_name, 'nonzero_count':nonzero_count})
df1.sort_values('nonzero_count', inplace = True)

df2 = df1[df1['nonzero_count'] == 0]

df.drop(df2['gene'], axis = 1, inplace = True)

df.to_csv('final_train_v4.csv')

In [None]:
df[df['label'] == 0]['KREMEN1'].describe()

In [None]:
df[df['label'] == 1]['KREMEN1'].describe()

In [None]:
len(df.columns)

In [None]:
df2

In [None]:
df3 = pd.read_csv('final_train_v2.csv', index_col=0)


In [None]:
df3['RBMY1J'].describe()

In [None]:
df3[df3['label'] == 1]['RBMY1J'].describe()

In [None]:
df3[df3['label'] == 0]['RBMY1J'].describe()

In [None]:
df = pd.read_csv('final_train_v4.csv', index_col = 0)

normal_df = df[df['label'] == 0]

disease_df = df[df['label'] == 1].sample(n = 60)

df1 = normal_df.append(disease_df)

x = get_all_kl_divergence(df1)

x

In [None]:
x.reset_index(drop = True)


In [16]:
choosed_features = x[x['kl_divergence'] > 6.5].reset_index(drop = True)
choosed_features

Unnamed: 0,feature_name,kl_divergence
0,EZH2,16.572341
1,RS1,16.393907
2,STX11,16.268993
3,STIL,15.646141
4,GPM6A,15.468277
...,...,...
443,EFCC1,6.541349
444,EBNA1BP2,6.534038
445,TUFM,6.532677
446,NUP35,6.531717


In [17]:
choosed

NameError: name 'choosed' is not defined

In [None]:
plot_train('NPNT', df1)

In [18]:
#choosed_features['feature_name'].append('label')

df_final = df[choosed_features['feature_name'] + ['label']]

KeyError: "None of [Index(['EZH2label', 'RS1label', 'STX11label', 'STILlabel', 'GPM6Alabel',\n       'SRPK1label', 'SFTPClabel', 'PAICSlabel', 'GALNT7label', 'AGERlabel',\n       ...\n       'MEFVlabel', 'LRRK2label', 'GRK5label', 'SEMA6Dlabel', 'ZNF280Clabel',\n       'EFCC1label', 'EBNA1BP2label', 'TUFMlabel', 'NUP35label', 'NPNTlabel'],\n      dtype='object', length=448)] are in the [columns]"

In [None]:
df_final['label'] = df['label']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.utils import shuffle


def df_to_dataset(dataframe, shuffle=True, batch_size=20):
  dataframe1 = dataframe.copy()
  labels = dataframe1.pop('label')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe1), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe1))
  ds = ds.batch(batch_size)
  return ds

# 定义一个函数，输入数据，然后将列作为feature_column 然后构建lr模型来看哪个特征重要
def train_model(data):
    if data is None:
        return None
    if len(data.columns) == 0:
        return None
    feature_columns = []
    for feature in data.columns:
        if feature == 'label':
            continue
        feature_columns.append(tf.feature_column.numeric_column(feature))
    
    # 生成训练数据
    batch_size=10
    train, test = train_test_split(data, test_size=0.2)
    train_ds = df_to_dataset(train, batch_size=batch_size)
    test_ds = df_to_dataset(test, batch_size=batch_size)
    
    model = tf.keras.Sequential([
          tf.keras.layers.DenseFeatures(feature_columns),
          tf.keras.layers.Dense(1, activation = 'sigmoid', kernel_regularizer = 'l1')
    ])
    model.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['AUC'])

    model.fit(train_ds,
          validation_data = test_ds,
          epochs = 20)
    auc = 0
    for m in model.metrics:
        if m.name == 'AUC':
            auc = m.result().numpy()
    return auc
    

df_t2 = shuffle(df_final)
train_model(df_t2)
    

## 画箱线图看看效果

In [None]:
df_t2.boxplot(column = 'CLEC3B', showmeans = True, notch = True, vert = False)

In [None]:
x