In [1]:
import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.utils import HDF5Matrix
import math

In [23]:
def create_superct_model(n_features):
    model = Sequential()
    model.add(Dense(200, input_dim = n_features, activation = 'relu'))
    model.add(Dropout(0.4))
    model.add(Dropout(0.4))
    model.add(Dense(100, activation = 'relu'))
    model.add(Dropout(0.4))
    model.add(Dropout(0.4))
    model.add(Dense(1, activation = 'relu'))
    model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [62]:
def load_dataset_from_dir(file_dir):
    d_df = pd.read_csv(file_dir)
    X = d_df.iloc[:,1:-2]
    y = d_df['target_id']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [63]:
def dictionary_list (input_dic):
    return_str = ''
    for k, v in input_dic.items():
        return_str+=str(k)
        return_str+='\n\t'
        float_list = map(str, v) 
        return_str+='\n\t'.join(float_list)
        return_str+='\n'
        return_str+='\n'
    return return_str

In [67]:
def train_and_save(file_dir):
    print('start to process')
    X_train, X_test, y_train, y_test = load_dataset_from_dir(file_dir)
    batch_size = 128
    # round up the epches
    num_epoches = math.ceil(X_train.shape[0]/batch_size)
    model = create_superct_model(X_train.shape[1])
    history = model.fit(X_train, y_train, epochs=num_epoches, validation_data=(X_test, y_test))
    file_name = os.path.basename(file_dir)
    file_name = 'superct_v0_'+file_name
    history_name = file_name.replace('.csv', '.txt')
    output_model_name = file_name.replace('.csv', '.hdf5')
    with open(history_name, 'w') as f:
        f.write(file_name)
        f.write('\n')
        f.write('the number of observations: %d'%X_train.shape[0])
        f.write('\n')
        f.write('the number of features: %d'%X_train.shape[1])
        f.write('\n')
        f.write(dictionary_list(history.history))
    model.save(output_model_name)
    print('finished processing %s'%file_name)

In [68]:
 # processed_dge folder contains all new defined datasets
 # get all shapes from processed_dge files
 for root, dirs, files in os.walk("/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets", topdown=False):
     for name in files:
        file_dir = os.path.join(root, name)
        train_and_save(file_dir)

start to process
Epoch 1/27
Epoch 2/27
Epoch 3/27
Epoch 4/27
Epoch 5/27
Epoch 6/27
Epoch 7/27
Epoch 8/27
Epoch 9/27
Epoch 10/27
Epoch 11/27
Epoch 12/27
Epoch 13/27
Epoch 14/27
Epoch 15/27
Epoch 16/27
Epoch 17/27
Epoch 18/27
Epoch 19/27
Epoch 20/27
Epoch 21/27
Epoch 22/27
Epoch 23/27
Epoch 24/27
Epoch 25/27
Epoch 26/27
Epoch 27/27
finished processing superct_v0_BoneMarrowcKit_1.csv
start to process
Epoch 1/22
Epoch 2/22
Epoch 3/22
Epoch 4/22
Epoch 5/22
Epoch 6/22
Epoch 7/22
Epoch 8/22
Epoch 9/22
Epoch 10/22
Epoch 11/22
Epoch 12/22
Epoch 13/22
Epoch 14/22
Epoch 15/22
Epoch 16/22
Epoch 17/22
Epoch 18/22
Epoch 19/22
Epoch 20/22
Epoch 21/22
Epoch 22/22
finished processing superct_v0_NeonatalCalvaria_1.csv
start to process


KeyboardInterrupt: 

In [65]:
train_and_save('/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Bladder_1.csv')

Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
finished processing Bladder_1.csv


In [25]:
X_train, X_test, y_train, y_test = load_dataset_from_dir('/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Bladder_1.csv')

In [26]:
model = create_superct_model(X_train.shape[1])

In [28]:
batch_size = 128
# round up the epches
num_epoches = math.ceil(X_train.shape[0]/batch_size)

In [58]:
with open('histor_test.txt','w') as f:
    f.write('/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Bladder_1.csv')
    f.write('\n')
    f.write('the number of observations: %d'%X_train.shape[0])
    f.write('\n')
    f.write('the number of features: %d'%X_train.shape[1])
    f.write('\n')
    f.write(dictionary_list(history.history))

In [31]:
print(os.path.basename('/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Bladder_1.csv'))

Bladder_1.csv


In [32]:
'Bladder_1.csv'.replace('.csv', '.txt')

'Bladder_1txt'

In [39]:
import json

In [41]:
json.dumps(history.history)

'{"loss": [5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07], "accuracy": [0.02504553645849228, 0.024134790524840355, 0.025956284254789352, 0.023679416626691818, 0.019581057131290436, 0.025956284254789352, 0.022313296794891357, 0.0173041895031929, 0.02185792289674282, 0.023679416626691818, 0.02185792289674282, 0.018670309334993362, 0.02185792289674282, 0.02322404459118843, 0.025500910356640816, 0.02504553645849228, 0.018670309334993362, 0.02641165815293789]}'

In [42]:
str(history.history)

"{'loss': [5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07], 'accuracy': [0.02504553645849228, 0.024134790524840355, 0.025956284254789352, 0.023679416626691818, 0.019581057131290436, 0.025956284254789352, 0.022313296794891357, 0.0173041895031929, 0.02185792289674282, 0.023679416626691818, 0.02185792289674282, 0.018670309334993362, 0.02185792289674282, 0.02322404459118843, 0.025500910356640816, 0.02504553645849228, 0.018670309334993362, 0.02641165815293789]}"

In [45]:
print("{" + "\n".join("{!r}: {!r},".format(k, "\n".join(v)) for k, v in history.history.items()) + "}")

TypeError: sequence item 0: expected str instance, float found

In [57]:
def dictionary_list (input_dic):
    return_str = ''
    for k, v in input_dic.items():
        return_str+=str(k)
        return_str+='\n\t'
        float_list = map(str, v) 
        return_str+='\n\t'.join(float_list)
        return_str+='\n'
        return_str+='\n'
    return return_str


In [56]:
dictionary_list(history.history)

'loss\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\n\naccuracy\n0.02504553645849228\t\n0.024134790524840355\t\n0.025956284254789352\t\n0.023679416626691818\t\n0.019581057131290436\t\n0.025956284254789352\t\n0.022313296794891357\t\n0.0173041895031929\t\n0.02185792289674282\t\n0.023679416626691818\t\n0.02185792289674282\t\n0.018670309334993362\t\n0.02185792289674282\t\n0.02322404459118843\t\n0.025500910356640816\t\n0.02504553645849228\t\n0.018670309334993362\t\n0.02641165815293789\n\n'