In [0]:
#@title
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow import feature_column

In [102]:
!git clone https://github.com/jyluo1994/python-binder.git

fatal: destination path 'python-binder' already exists and is not an empty directory.


In [103]:
df = pd.read_csv('python-binder/7y_nona.csv')
df.columns

Index(['death', 'recurrence', 'metastasis', 'progression', 'age', 'gender',
       'treatment', 'IC.N', 'CC.N', 'AC.N', 'pathology', 'T', 'N',
       'UICC.Stage', 'smoking', 'family.history', 'EBVDNA', 'BMI', 'WBC',
       'NEU', 'LYM', 'MON', 'HGB', 'PLT', 'LDH', 'ALB', 'TBIL', 'DBIL', 'IBIL',
       'BUN', 'CRE', 'UA', 'CHO', 'TG', 'HDL.C', 'LDL.C', 'ApoA1', 'ApoB',
       'CRP', 'VCA.IgA', 'EA.IgA'],
      dtype='object')

然后我们来分割数据。这里使用的是 Scikit-learn 中的 train_test_split 函数。指定分割比例即可。
我们先按照 80:20 的比例，把总体数据分成训练集和测试集。

In [0]:
tf.random.set_seed(1)
train, test = train_test_split(df, test_size=0.2, random_state=1)

然后，再把现有训练集的数据，按照 80:20 的比例，分成最终的训练集，以及验证集。

In [0]:
train,valid = train_test_split(train, test_size=0.25, random_state=1)

这里，我们都指定了 random_state ，为的是保证咱们随机分割的结果一致。
我们看看几个不同集合的长度。

In [106]:
print(len(train))
print(len(valid))
print(len(test))

486
162
163


In [107]:
feature_columns = []
numeric_columns = ['age', 'EBVDNA', 'BMI', 'WBC',
       'NEU', 'LYM', 'MON', 'HGB', 'PLT', 'LDH', 'ALB', 'TBIL', 'DBIL', 'IBIL',
       'BUN', 'CRE', 'UA', 'CHO', 'TG', 'HDL.C', 'LDL.C', 'ApoA1', 'ApoB',
       'CRP', 'VCA.IgA', 'EA.IgA']
    
for header in numeric_columns:
  feature_columns.append(
      feature_column.numeric_column(
          header, 
          normalizer_fn=lambda x: (tf.cast(x, dtype=float)-train[header].mean())/train[header].std()))    
    
    
categorical_columns = ['gender',
       'treatment', 'IC.N', 'CC.N', 'AC.N', 'pathology', 'T', 'N',
       'UICC.Stage', 'smoking', 'family.history',]

def get_one_hot_from_categorical(colname):
    categorical = feature_column.categorical_column_with_vocabulary_list(colname, train[colname].unique().tolist())
    return feature_column.indicator_column(categorical)



for col in categorical_columns:
    feature_columns.append(get_one_hot_from_categorical(col))

feature_columns

[NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x7f5c98a2c9d8>),
 NumericColumn(key='EBVDNA', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x7f5c98a2cae8>),
 NumericColumn(key='BMI', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x7f5c98a2cb70>),
 NumericColumn(key='WBC', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x7f5c98a2cbf8>),
 NumericColumn(key='NEU', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x7f5c98a2cc80>),
 NumericColumn(key='LYM', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x7f5c98a2cd08>),
 NumericColumn(key='MON', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x7f5c98a2cd90>),
 NumericColumn(key='HGB', shape=(1,), default_value=None, dtype=tf.float32, normalizer_

In [108]:
from tensorflow.keras import layers
feature_layer = layers.DenseFeatures(feature_columns)
feature_layer

<tensorflow.python.feature_column.feature_column_v2.DenseFeatures at 0x7f5c98bd5d30>

In [120]:
import sklearn.metrics
model = keras.Sequential([
    feature_layer,
    layers.Dense(200, activation='relu'),
    layers.Dense(100, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

      
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('progression')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds      
    
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(valid, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

model.fit(train_ds,
          validation_data=valid_ds,
          epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f5c96d30a58>

In [121]:
pred = model.predict(test_ds)
pred = np.rint(pred)
np.unique(pred)

array([0.], dtype=float32)

In [111]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(test['progression'], pred))

              precision    recall  f1-score   support

           0       0.80      1.00      0.89       130
           1       0.00      0.00      0.00        33

   micro avg       0.80      0.80      0.80       163
   macro avg       0.40      0.50      0.44       163
weighted avg       0.64      0.80      0.71       163



  'precision', 'predicted', average, warn_for)


In [112]:
print(confusion_matrix(test['progression'], pred))

[[130   0]
 [ 33   0]]


In [122]:
x_val = train.
x_train = train_x[10000:]

y_val = train_y[:10000]
y_train = train_y[10000:]

history = model.fit(x_train,y_train,
                   epochs=40, batch_size=5,
                   validation_data=(x_val, y_val),
                   verbose=1)

result = model.evaluate(test_x, text_y)
print(result)



NameError: ignored

In [0]:
import matplotlib.pyplot as plt
history_dict = history.history
history_dict.keys()
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc)+1)

plt.plot(epochs, loss, 'bo', label='train loss')
plt.plot(epochs, val_loss, 'b', label='val loss')
plt.title('Train and val loss')
plt.xlabel('Epochs')
plt.xlabel('loss')
plt.legend()
plt.show()