In [None]:
import tensorflow as tf
import pandas as pd

from tensorflow import keras


from tensorflow import feature_column
from tensorflow.keras import layers

## 线性模型
使用逻辑回归模型来进行预测

In [None]:
def get_allowed_snp_names():
    """Load the allowed SNP names from ``allowed_snp.csv``.

    Every entry of the ``snp`` column is split on ``'_'`` and only the second
    piece is kept. The resulting names are returned in ascending order.

    Returns:
        list: the sorted SNP names.
    """
    allowed = pd.read_csv('allowed_snp.csv')
    names = allowed['snp'].apply(lambda raw: raw.split('_')[1])
    return names.sort_values().tolist()

In [None]:
def value_to_int(hl):
    """Map ``hl`` to the index of the 10-wide interval that contains it.

    ``[0, 10)`` maps to 0, ``[10, 20)`` to 1, and so on up to ``[80, 90)``
    which maps to 8. Any value that matches none of these intervals
    (negative, 90 or above, NaN) falls back to 2.

    Args:
        hl: numeric value to bucket.

    Returns:
        int: the interval index, or 2 when no interval matches.
    """
    for bucket in range(9):
        lower = bucket * 10
        if hl >= lower and hl < lower + 10:
            return bucket
    # Fallback used when no interval matched.
    return 2

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Convert a pandas DataFrame into a batched ``tf.data.Dataset``.

    The ``caco`` column is used as the label and the ``Simple Name`` column is
    discarded; every remaining column becomes an entry of the feature dict.
    The caller's frame is not modified (a copy is used).

    Args:
        dataframe: frame containing at least ``caco`` and ``Simple Name``.
        shuffle: when true, shuffle with a buffer covering the whole frame.
        batch_size: number of rows per emitted batch.

    Returns:
        tf.data.Dataset: yields ``(features_dict, labels)`` batches.
    """
    dataframe = dataframe.copy()
    labels = dataframe.pop('caco')
    dataframe.pop('Simple Name')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    # Dataset transformations return a new dataset, so a bare `ds.repeat()`
    # has no effect. It is intentionally not applied here: the dataset stays
    # finite and Keras re-iterates it once per epoch.
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    # Batch regardless of `shuffle`, so an unshuffled dataset is batched too.
    return ds.batch(batch_size)

In [None]:
def build_input_fn(file):
    """Build the input dataset from a single CSV file.

    Rows containing missing values are discarded before the frame is handed
    to ``df_to_dataset`` with its default arguments.

    Args:
        file: path (or buffer) accepted by ``pd.read_csv``.

    Returns:
        The dataset produced by ``df_to_dataset``.
    """
    samples = pd.read_csv(file).dropna()
    return df_to_dataset(samples)

In [None]:
def build_all_input_fn():
    """Build the training and test datasets from ``1.csv`` and ``2.csv``.

    Steps:
      1. Read both files, drop rows with missing values and concatenate them.
      2. Keep only consistent rows: cases (``caco == 1``) with ``HL >= 40``
         and controls (``caco == 0``) with ``HL < 40``; then keep ``age > 10``.
      3. Standardise ``CNE``, ``age``, ``BMI`` and ``HL`` (zero mean, unit std).
      4. Shuffle, then split 90% / 10% into train and test rows.

    Side effects:
        Writes the held-out rows to ``test.csv`` (index included).

    Returns:
        tuple: ``(train_dataset, test_dataset)`` as built by ``df_to_dataset``.
    """
    frames = [pd.read_csv(path).dropna() for path in ('1.csv', '2.csv')]
    final_df = pd.concat(frames, ignore_index=True)

    # ---- remove inconsistent rows ----
    # Each comparison must be parenthesised: `&` binds tighter than `<`, so
    # `(caco == 0) & (HL) < 40` would compare the result of the `&` with 40
    # instead of filtering on HL.
    is_case = (final_df['caco'] == 1) & (final_df['HL'] >= 40)
    is_control = (final_df['caco'] == 0) & (final_df['HL'] < 40)
    final_df = pd.concat([final_df[is_case], final_df[is_control]], ignore_index=True)
    # Explicit copy so the column assignments below write to an independent frame.
    final_df = final_df[final_df['age'] > 10].copy()

    # ---- standardise the continuous columns (z-score) ----
    for column in ('CNE', 'age', 'BMI', 'HL'):
        final_df[column] = (final_df[column] - final_df[column].mean()) / final_df[column].std()

    # ---- shuffle and split ----
    final_df = final_df.sample(frac=1).reset_index(drop=True)
    final_train_dataset = final_df.sample(frac=0.9, random_state=0, axis=0)
    final_test_dataset = final_df[~final_df.index.isin(final_train_dataset.index)]
    final_test_dataset.to_csv('test.csv')
    return df_to_dataset(final_train_dataset), df_to_dataset(final_test_dataset)

In [None]:
# Define all feature columns
def get_feature_columns():
    """Build the list of feature columns used by the model.

    The list contains, in this order:
      * numeric columns for ``CNE`` and ``age``;
      * one-hot columns for ``sex``, ``smoke``, ``drink``, ``excercise``, ``HP``;
      * a bucketized ``BMI`` column;
      * one-hot encoded hashed crosses of selected SNPs / ``sex`` / ``smoke``;
      * one one-hot genotype column per SNP from ``get_allowed_snp_names()``.

    Returns:
        list: the feature columns.
    """
    features = []

    # Continuous inputs used as-is.
    features.append(feature_column.numeric_column('CNE'))  # cumulative noise exposure
    features.append(feature_column.numeric_column('age'))  # age

    # Categorical inputs with a fixed integer vocabulary, one-hot encoded.
    integer_vocabularies = (
        ('sex', [1, 2]),
        ('smoke', [1, 0]),
        ('drink', [1, 0]),
        ('excercise', [1, 0]),
        ('HP', [1, 0]),
    )
    for key, vocabulary in integer_vocabularies:
        categorical = feature_column.categorical_column_with_vocabulary_list(key, vocabulary)
        features.append(feature_column.indicator_column(categorical))

    # NOTE(review): build_all_input_fn standardises BMI before training, so
    # these raw-scale boundaries may put (nearly) every sample into the first
    # bucket -- confirm whether BMI should be numeric like CNE and age.
    bmi_feature = feature_column.numeric_column('BMI')
    features.append(feature_column.bucketized_column(bmi_feature, [15, 20, 25, 30, 35, 40]))

    # Hashed feature crosses: (keys to cross, hash bucket size).
    crosses = (
        (['rs1358714', 'rs1200130'], 9),
        (['rs17412009', 'rs1200130'], 9),
        (['rs2070703', 'rs1200130'], 9),
        (['rs6458080', 'rs1200130'], 9),
        (['rs17412009', 'rs1200135'], 9),
        (['rs1200137', 'rs6458080'], 9),
        (['rs17412009', 'rs1358714'], 9),
        (['rs6458080', 'rs1678690', 'sex'], 18),
        (['rs17412009', 'rs6458080', 'sex'], 18),
        (['rs1200137', 'rs17412009', 'rs1200135'], 27),
        (['rs17412009', 'rs1200135', 'rs1358714'], 27),
        (['rs1200137', 'rs17412009', 'rs1358714'], 27),
        (['rs1200137', 'rs1200135', 'rs1358714'], 27),
        (['sex', 'smoke'], 4),
        (['sex', 'rs1200137', 'rs1200130'], 18),
        (['sex', 'rs159153', 'rs1200130'], 18),
        (['sex', 'rs17412009', 'rs1200130'], 18),
        (['sex', 'rs34996498', 'rs1200130'], 18),
        (['sex', 'rs3766031', 'rs1200130'], 18),
    )
    for keys, hash_bucket_size in crosses:
        crossed = feature_column.crossed_column(keys, hash_bucket_size)
        features.append(feature_column.indicator_column(crossed))

    # SNP groups by genotype vocabulary.
    snp_ga = ('rs10091503', 'rs1026435', 'rs10503675',
              'rs11778205', 'rs1200135', 'rs1358714', 'rs1678674',
              'rs1738254', 'rs3737094', 'rs3807154', 'rs3823430',
              'rs4452640', 'rs874808', 'rs9357283', 'rs3745504', 'rs4801822', 'rs34996498')
    snp_gc = ('rs1678690', 'rs919390', 'rs378811', 'rs3872717', 'rs7621556')  # C/C G/C G/G
    # The trailing commas make these real one-element tuples; without them
    # they are plain strings and `snp in ...` becomes a substring test.
    snp_tg = ('rs627491',)
    snp_ta = ('rs7641176',)

    genotype_vocabularies = (
        (snp_ga, ['G/A', 'G/G', 'A/A']),
        (snp_gc, ['C/C', 'G/G', 'G/C']),
        (snp_tg, ['T/G', 'G/G', 'T/T']),
        (snp_ta, ['A/A', 'T/A', 'T/T']),
    )
    # Vocabulary for every SNP that is not listed in one of the groups above.
    default_vocabulary = ['C/C', 'T/C', 'T/T']

    for snp in get_allowed_snp_names():
        vocabulary = next(
            (vocab for group, vocab in genotype_vocabularies if snp in group),
            default_vocabulary,
        )
        snp_feature = feature_column.categorical_column_with_vocabulary_list(snp, vocabulary)
        features.append(feature_column.indicator_column(snp_feature))
    return features

In [None]:
# Build the binary classification model
def build_model(feature_columns):
    """Build and compile the Keras classifier.

    Architecture: ``DenseFeatures`` over ``feature_columns``, a 32-unit ReLU
    hidden layer (``layer1``) and a single sigmoid output unit
    (``last_layer``). With the hidden layer this is a small neural network
    rather than a plain logistic regression.

    Args:
        feature_columns: feature columns consumed by ``DenseFeatures``.

    Returns:
        tf.keras.Sequential: the compiled model (Adagrad optimizer, binary
        focal cross-entropy loss, AUC metric).
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.DenseFeatures(feature_columns))
    model.add(tf.keras.layers.Dense(32, activation='relu', name='layer1'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid', name='last_layer'))
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.001),
                  loss=tf.keras.losses.BinaryFocalCrossentropy(),
                  metrics=['AUC'])
    return model

In [None]:
feature_columns = get_feature_columns()
model = build_model(feature_columns)
# The Sequential model declares no input shape, so it is not built until it
# first sees data, and Keras raises ValueError from summary() on an unbuilt
# model. Only print the summary when it is available.
if model.built:
    model.summary()
else:
    print('Model is not built yet; call model.summary() after it has seen data.')

In [None]:
# Train and evaluate the model
def train_and_eval_model(batch_size=20):
    """Build a fresh model, train it and validate it on the held-out set.

    Args:
        batch_size: number of rows per batch used for training and validation.

    Returns:
        The trained Keras model.
    """
    feature_columns = get_feature_columns()
    model = build_model(feature_columns)
    trainset, test_set = build_all_input_fn()
    # build_all_input_fn batches with df_to_dataset's own default size, so
    # re-batch here to make the `batch_size` argument take effect.
    trainset = trainset.unbatch().batch(batch_size)
    test_set = test_set.unbatch().batch(batch_size)
    model.fit(x=trainset,
              epochs=200,
              validation_data=test_set,
              callbacks=[tf.keras.callbacks.TensorBoard(log_dir='./linear_model_logs')])
    return model

In [None]:
# Train a fresh model and keep it for the cells below. This also rewrites
# test.csv, because train_and_eval_model calls build_all_input_fn.
model = train_and_eval_model()

In [None]:
# Persist the trained model under the given directory.
model.save('./save_models/20220213/v3')

In [None]:
# Display the history attached to the model (presumably the per-epoch
# loss/metric values recorded by the fit call in train_and_eval_model).
model.history.history

In [None]:
def predict_test_set(model):
    """Run ``model`` on the held-out rows stored in ``test.csv``.

    The saved index column (``Unnamed: 0``), ``Simple Name`` and the ``caco``
    label are removed from the features, and rows with missing feature values
    are dropped. A batched dataset of ``(features, labels)`` is built and
    printed for inspection only; the prediction itself is made by calling the
    model directly on the feature columns.

    Args:
        model: a callable Keras model accepting a dict of feature columns.

    Returns:
        The model output for every remaining row of ``test.csv``.
    """
    df = pd.read_csv('test.csv')
    df.pop('Unnamed: 0')
    df.pop('Simple Name')
    labels = df.pop('caco')
    df = df.dropna()
    # Keep labels aligned with the rows that survived dropna(); otherwise
    # features and labels can have different lengths.
    labels = labels.loc[df.index]
    # batch() returns a new dataset, so its result has to be kept.
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels)).batch(20)
    print(ds)
    return model(dict(df))
    

In [None]:
# Show the model output for the rows saved in test.csv.
predict_test_set(model)

In [None]:
# build_all_input_fn returns (train_dataset, test_dataset); keep only the
# training dataset so that `train` is a tf.data.Dataset, not a tuple.
train, _ = build_all_input_fn()

In [None]:
# Limit the dataset to at most its first 10 elements for inspection;
# this requires `train` to be a tf.data.Dataset.
train.take(10)