In [1]:
import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.utils import HDF5Matrix
import math
import os

In [2]:
def create_superct_model(n_features):
    model = Sequential()
    model.add(Dense(200, input_dim = n_features, activation = 'relu'))
    model.add(Dropout(0.6))
    model.add(Dropout(0.6))
    model.add(Dense(100, activation = 'relu'))
    model.add(Dropout(0.6))
    model.add(Dropout(0.6))
    model.add(Dense(1, activation = 'relu'))
    model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [3]:
def load_dataset_from_dir(file_dir):
    d_df = pd.read_csv(file_dir)
    X = d_df.iloc[:,1:-2]
    y = d_df['target_id']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [4]:
def dictionary_list (input_dic):
    return_str = ''
    for k, v in input_dic.items():
        return_str+=str(k)
        return_str+='\n\t'
        float_list = map(str, v) 
        return_str+='\n\t'.join(float_list)
        return_str+='\n'
        return_str+='\n'
    return return_str

In [5]:
def train_and_save(file_dir):
    print('start to process %s'%file_dir)
    X_train, X_test, y_train, y_test = load_dataset_from_dir(file_dir)
    batch_size = 128
    # round up the epches
    num_epoches = math.ceil(X_train.shape[0]/batch_size)
    model = create_superct_model(X_train.shape[1])
    history = model.fit(X_train, y_train, epochs=num_epoches, validation_data=(X_test, y_test))
    file_name = os.path.basename(file_dir)
    file_name = 'superct_v4_'+file_name
    history_name = file_name.replace('.csv', '.txt')
    output_model_name = file_name.replace('.csv', '.hdf5')
    with open(history_name, 'w') as f:
        f.write(file_name)
        f.write('\n')
        f.write('the number of observations: %d'%X_train.shape[0])
        f.write('\n')
        f.write('the number of features: %d'%X_train.shape[1])
        f.write('\n')
        f.write(dictionary_list(history.history))
    model.save(output_model_name)
    print('finished processing %s'%file_name)

In [6]:
 # processed_dge folder contains all new defined datasets
 # get all shapes from processed_dge files
for root, dirs, files in os.walk("/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets", topdown=False):
    for name in files:
        file_dir = os.path.join(root, name)
        train_and_save(file_dir)

print('done')

acy: 0.0115 - val_loss: 5.1667e-07 - val_accuracy: 0.0000e+00
Epoch 34/52
Epoch 35/52
Epoch 36/52
Epoch 37/52
Epoch 38/52
Epoch 39/52
Epoch 40/52
Epoch 41/52
Epoch 42/52
Epoch 43/52
Epoch 44/52
Epoch 45/52
Epoch 46/52
Epoch 47/52
Epoch 48/52
Epoch 49/52
Epoch 50/52
Epoch 51/52
Epoch 52/52
finished processing superct_v4_BoneMarrowcKit_3.csv
start to process /home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Uterus_2.csv
Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11
finished processing superct_v4_Uterus_2.csv
start to process /home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Brain_1.csv
Epoch 1/21
Epoch 2/21
Epoch 3/21
Epoch 4/21
Epoch 5/21
Epoch 6/21
Epoch 7/21
Epoch 8/21
Epoch 9/21
Epoch 10/21
Epoch 11/21
Epoch 12/21
Epoch 13/21
Epoch 14/21
Epoch 15/21
Epoch 16/21
Epoch 17/21
Epoch 18/21
Epoch 19/21
Epoch 20/21
Epoch 21/21
finished processing