In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import utils

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import os
import math

import h5py

In [2]:
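# TODO(review): "remove one dropout layer" - each hidden layer below is still followed by two Dropout layers; confirm whether one of each pair should be removed.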
def create_superct_model(n_features, n_targets):
    """Build and compile the fully connected classifier.

    Layer stack: Dense(200, relu) -> Dropout(0.4) x2 -> Dense(100, relu)
    -> Dropout(0.4) x2 -> Dense(n_targets, softmax).

    Args:
        n_features: Width of the input vector (passed as ``input_dim`` of the
            first Dense layer).
        n_targets: Number of output units, i.e. the number of columns of the
            one-hot target matrix.

    Returns:
        A compiled ``Sequential`` model using the SGD optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    model = Sequential()
    model.add(Dense(200, input_dim=n_features, activation='relu'))
    # NOTE(review): two back-to-back Dropout(0.4) layers are kept exactly as in
    # the original stack; confirm whether a single one per hidden layer was meant.
    model.add(Dropout(0.4))
    model.add(Dropout(0.4))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dropout(0.4))
    # softmax (not relu): categorical_crossentropy expects every output row to
    # be a probability distribution over the classes. A relu output can be all
    # zeros, which makes the loss degenerate.
    model.add(Dense(n_targets, activation='softmax'))
    model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [3]:
# Open the preprocessed dataset read-only. The handle is left open because X
# (read from it below) is an HDF5 dataset backed by this file, not an in-memory copy.
data_f = h5py.File('/content/drive/MyDrive/Colab Notebooks/process_dataset.h5ad', 'r')

In [4]:
# List the top-level entries stored in the file.
data_f.keys()

<KeysViewHDF5 ['X', 'labels', 'obs', 'var']>

In [5]:
# Feature matrix handle: rows are observations, columns are features.
X = data_f['X']

In [6]:
# Labels used as the classification target (label-encoded and one-hot encoded below).
y = data_f['labels']

In [7]:
# Sanity check: (number of observations, number of features).
X.shape

(333778, 34947)

In [8]:
# 10-fold splitter with shuffling.
# NOTE: the training cell creates its own KFold under the same name, so this
# object is overwritten before it is ever used.
kfold = KFold(n_splits=10, shuffle=True)

In [9]:
# Learn the label -> integer id mapping and apply it in a single call.
enc = LabelEncoder()
encoded_Y = enc.fit_transform(y)
# Expand the integer ids into a one-hot matrix (one column per class), the
# target format used with the categorical_crossentropy loss.
dummy_y = utils.to_categorical(encoded_Y)

In [10]:
# Sanity check: (number of observations, number of classes) of the one-hot target.
dummy_y.shape

(333778, 104)

In [11]:
# Show the dataset handle's repr (name, shape and dtype).
X

<HDF5 dataset "X": shape (333778, 34947), type "<f4">

In [12]:
# Mini-batch size passed to the Keras estimator.
batch_size = 1024
# Round up the number of epochs: ceil(n_observations / batch_size).
# NOTE(review): this formula is the number of mini-batches in one pass over the
# data, yet the training cell passes the value as `epochs` - confirm that
# training for this many epochs is intended.
num_epoches = math.ceil(X.shape[0]/batch_size)

In [None]:
print('start training')
# scikit-learn compatible wrapper around the model builder; n_features and
# n_targets are handed to create_superct_model when the model is built.
estimator = KerasClassifier(
    build_fn=create_superct_model,
    n_features=X.shape[1],
    n_targets=dummy_y.shape[1],
    epochs=num_epoches,
    batch_size=batch_size,
    verbose=0,
)
# A fixed random_state makes the shuffled 10-fold split identical on every run.
# Weight initialisation and dropout are still unseeded, so the scores
# themselves can vary between runs.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# NOTE(review): X is still the on-disk HDF5 dataset, not a NumPy array;
# presumably scikit-learn has to index and load it for every fold - confirm
# that this fits in memory and finishes in reasonable time.
results = cross_val_score(estimator, X, dummy_y, cv=kfold, verbose=1)

start training


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# Write a short plain-text report of the cross-validation run.
# os.path.basename keeps only 'whole_data.txt', so the report is created
# relative to the current working directory.
# NOTE(review): confirm whether the Drive directory in the path was meant to be
# the output location.
file_name = os.path.basename('/content/drive/MyDrive/Colab Notebooks/whole_data.txt')

# Summary statistics over the per-fold scores, formatted as percentages.
results_mean = str(results.mean() * 100) + '%'
results_std = str(results.std() * 100) + '%'
results_str = 'mean accuracy: ' + results_mean + '\t' + 'std: ' + results_std

print('start writing files')
# One entry per report line; joined with '\n' so the file has no trailing newline.
report_lines = [
    file_name,
    'the number of observations: %d' % X.shape[0],
    'the number of features: %d' % X.shape[1],
    'the number of targets: %d' % dummy_y.shape[1],
    results_str,
]
with open(file_name, 'w') as f:
    f.write('\n'.join(report_lines))
print('finished processing %s' % file_name)