In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import utils

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import os
import math

In [None]:
# Variant with one dropout layer removed (two Dropout(0.4) layers remain).
def create_superct_model(n_features, n_targets):
    """Build and compile a fully connected multi-class classifier.

    Architecture: Dense(200, relu) -> Dropout(0.4) -> Dense(100, relu)
    -> Dropout(0.4) -> Dense(n_targets, softmax). The model is compiled with
    the SGD optimizer, categorical cross-entropy loss and an accuracy metric.

    Args:
        n_features: Number of input features per sample.
        n_targets: Number of output units, i.e. the width of the one-hot
            encoded label vector.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(200, input_dim=n_features, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.4))
    # categorical_crossentropy expects the output to be a probability
    # distribution over the classes, which softmax provides. A 'relu' output
    # is unbounded and can be all zeros, leaving the loss ill-defined.
    model.add(Dense(n_targets, activation='softmax'))
    model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
def train_and_save(file_dir, output_dir, batch_size=128, epochs=None,
                   n_splits=10, random_state=None):
    """Cross-validate the classifier on one CSV dataset and save a summary.

    Reads the CSV at ``file_dir``, one-hot encodes the ``target_id`` column,
    evaluates the model built by ``create_superct_model`` with shuffled K-fold
    cross-validation, and writes a short text report into ``output_dir``. The
    report is named after the dataset, with ``.csv`` replaced by ``.txt``.

    Args:
        file_dir: Path of the CSV dataset to process.
        output_dir: Folder that receives the report; created if missing.
        batch_size: Mini-batch size passed to the Keras estimator.
        epochs: Number of training epochs. ``None`` keeps the original
            heuristic ``ceil(n_rows / batch_size)``.
        n_splits: Number of cross-validation folds.
        random_state: Seed for the K-fold shuffle; ``None`` leaves the split
            unseeded, as before.
    """
    # makedirs also creates missing parent folders, and exist_ok avoids the
    # race between an existence check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    print('start to process %s'%file_dir)

    d_df = pd.read_csv(file_dir)
    # Features are every column except the first and the last two.
    # NOTE(review): presumably 'target_id' is one of the two trailing columns,
    # otherwise the label would leak into X -- confirm against the datasets.
    X = d_df.iloc[:, 1:-2]
    y = d_df['target_id']
    encoded_Y = LabelEncoder().fit_transform(y)
    dummy_y = utils.to_categorical(encoded_Y)

    if epochs is None:
        # NOTE(review): this equals the number of mini-batches in one pass
        # over the data, so it may have been meant as steps-per-epoch rather
        # than an epoch count -- kept as the default to preserve results.
        epochs = math.ceil(X.shape[0] / batch_size)

    # training the model
    print('start training')
    estimator = KerasClassifier(
        build_fn=create_superct_model,
        n_features=X.shape[1],
        n_targets=dummy_y.shape[1],
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
    )
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    results = cross_val_score(estimator, X, dummy_y, cv=kfold, verbose=1)

    # The report shares the dataset's file name with '.csv' swapped for '.txt'.
    file_name = os.path.basename(file_dir)
    history_name = os.path.join(output_dir, file_name.replace('.csv', '.txt'))

    # content to be saved
    results_mean = str(results.mean()*100)+'%'
    results_std = str(results.std()*100)+'%'
    results_str = 'mean accuracy: '+results_mean+'\t'+'std: '+results_std
    summary_lines = [
        file_name,
        'the number of observations: %d'%X.shape[0],
        'the number of features: %d'%X.shape[1],
        'the number of targets: %d'%dummy_y.shape[1],
        results_str,
    ]
    print('start writing files')
    with open(history_name, 'w') as f:
        f.write('\n'.join(summary_lines))
    print('finished processing %s'%file_name)

In [None]:
# Names of the datasets that already have a report in the superct_v1 folder.
# Reports are stored as '<dataset>.txt', so map each one back to its '.csv' name.
processed_file_list = []
for root, dirs, files in os.walk("/content/drive/MyDrive/Colab Notebooks/superct_v1", topdown=False):
    processed_file_list.extend(name.replace('.txt','.csv') for name in files)

In [None]:
# Train on every file under pre_processed_datasets that does not yet have a
# report in superct_v1 (see processed_file_list built in the previous cell).
for root, dirs, files in os.walk("/content/drive/MyDrive/Colab Notebooks/pre_processed_datasets", topdown=False):
    for name in files:
        # Skip anything that was already processed in an earlier run.
        if name in processed_file_list:
            continue
        file_dir = os.path.join(root, name)
        train_and_save(file_dir, '/content/drive/MyDrive/Colab Notebooks/superct_v1')

start to process /content/drive/MyDrive/Colab Notebooks/pre_processed_datasets/TrophoblastStemCells_1.csv
start training


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# Variant with all dropout layers removed.
def create_superct_model(n_features, n_targets):
    """Build and compile a fully connected multi-class classifier (no dropout).

    Architecture: Dense(200, relu) -> Dense(100, relu)
    -> Dense(n_targets, softmax). The model is compiled with the SGD
    optimizer, categorical cross-entropy loss and an accuracy metric.

    Args:
        n_features: Number of input features per sample.
        n_targets: Number of output units, i.e. the width of the one-hot
            encoded label vector.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(200, input_dim=n_features, activation='relu'))
    model.add(Dense(100, activation='relu'))
    # categorical_crossentropy expects the output to be a probability
    # distribution over the classes, which softmax provides. A 'relu' output
    # is unbounded and can be all zeros, leaving the loss ill-defined.
    model.add(Dense(n_targets, activation='softmax'))
    model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def train_and_save(file_dir, output_dir, batch_size=128, epochs=None,
                   n_splits=10, random_state=None):
    """Cross-validate the classifier on one CSV dataset and save a summary.

    Reads the CSV at ``file_dir``, one-hot encodes the ``target_id`` column,
    evaluates the model built by ``create_superct_model`` with shuffled K-fold
    cross-validation, and writes a short text report into ``output_dir``. The
    report is named after the dataset, with ``.csv`` replaced by ``.txt``.

    ``create_superct_model`` is looked up when this function is called, so the
    most recently executed definition of it is the one that gets trained.

    Args:
        file_dir: Path of the CSV dataset to process.
        output_dir: Folder that receives the report; created if missing.
        batch_size: Mini-batch size passed to the Keras estimator.
        epochs: Number of training epochs. ``None`` keeps the original
            heuristic ``ceil(n_rows / batch_size)``.
        n_splits: Number of cross-validation folds.
        random_state: Seed for the K-fold shuffle; ``None`` leaves the split
            unseeded, as before.
    """
    # makedirs also creates missing parent folders, and exist_ok avoids the
    # race between an existence check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    print('start to process %s'%file_dir)

    d_df = pd.read_csv(file_dir)
    # Features are every column except the first and the last two.
    # NOTE(review): presumably 'target_id' is one of the two trailing columns,
    # otherwise the label would leak into X -- confirm against the datasets.
    X = d_df.iloc[:, 1:-2]
    y = d_df['target_id']
    encoded_Y = LabelEncoder().fit_transform(y)
    dummy_y = utils.to_categorical(encoded_Y)

    if epochs is None:
        # NOTE(review): this equals the number of mini-batches in one pass
        # over the data, so it may have been meant as steps-per-epoch rather
        # than an epoch count -- kept as the default to preserve results.
        epochs = math.ceil(X.shape[0] / batch_size)

    # training the model
    print('start training')
    estimator = KerasClassifier(
        build_fn=create_superct_model,
        n_features=X.shape[1],
        n_targets=dummy_y.shape[1],
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
    )
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    results = cross_val_score(estimator, X, dummy_y, cv=kfold, verbose=1)

    # The report shares the dataset's file name with '.csv' swapped for '.txt'.
    file_name = os.path.basename(file_dir)
    history_name = os.path.join(output_dir, file_name.replace('.csv', '.txt'))

    # content to be saved
    results_mean = str(results.mean()*100)+'%'
    results_std = str(results.std()*100)+'%'
    results_str = 'mean accuracy: '+results_mean+'\t'+'std: '+results_std
    summary_lines = [
        file_name,
        'the number of observations: %d'%X.shape[0],
        'the number of features: %d'%X.shape[1],
        'the number of targets: %d'%dummy_y.shape[1],
        results_str,
    ]
    print('start writing files')
    with open(history_name, 'w') as f:
        f.write('\n'.join(summary_lines))
    print('finished processing %s'%file_name)

In [None]:
# Names of the datasets that already have a report in the superct_v2 folder.
# Reports are stored as '<dataset>.txt', so map each one back to its '.csv' name.
processed_file_list = []
for root, dirs, files in os.walk("/content/drive/MyDrive/Colab Notebooks/superct_v2", topdown=False):
    processed_file_list.extend(name.replace('.txt','.csv') for name in files)

In [None]:
# Train on every file under pre_processed_datasets that does not yet have a
# report in superct_v2 (see processed_file_list built in the previous cell).
for root, dirs, files in os.walk("/content/drive/MyDrive/Colab Notebooks/pre_processed_datasets", topdown=False):
    for name in files:
        # Skip anything that was already processed in an earlier run.
        if name in processed_file_list:
            continue
        file_dir = os.path.join(root, name)
        train_and_save(file_dir, '/content/drive/MyDrive/Colab Notebooks/superct_v2')