In [8]:
import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.utils import HDF5Matrix
import math
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import utils

In [10]:
def create_superct_model(n_features, n_targets):
    model = Sequential()
    model.add(Dense(200, input_dim = n_features, activation = 'relu'))
    model.add(Dropout(0.4))
    model.add(Dropout(0.4))
    model.add(Dense(100, activation = 'relu'))
    model.add(Dropout(0.4))
    model.add(Dropout(0.4))
    model.add(Dense(n_targets, activation = 'relu'))
    model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [29]:
def train_and_save(file_dir, output_dir):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    print('start to process %s'%file_dir)

    d_df = pd.read_csv(file_dir)
    X = d_df.iloc[:,1:-2]
    y = d_df['target_id']
    enc = LabelEncoder()
    enc.fit(y)
    encoded_Y = enc.transform(y)
    dummy_y = utils.to_categorical(encoded_Y)

    batch_size = 128
    # round up the epches
    num_epoches = math.ceil(X.shape[0]/batch_size)

    # training the model
    print('start training')
    estimator = KerasClassifier(build_fn=create_superct_model,n_features=X.shape[1], n_targets=dummy_y.shape[1], epochs=num_epoches, batch_size=batch_size, verbose=0)
    kfold = KFold(n_splits=10, shuffle=True)
    results = cross_val_score(estimator, X, dummy_y, cv=kfold)

    # save files
    file_name = os.path.basename(file_dir)
    history_name = file_name.replace('.csv', '.txt')
    history_name = os.path.join(output_dir, history_name)

    # content to be saved
    results_mean = str(results.mean()*100)+'%'
    results_std = str(results.std()*100)+'%'
    results_str = results_mean+'\t'+results_std
    print('start writing files')
    with open(history_name, 'w') as f:
        f.write(file_name)
        f.write('\n')
        f.write('the number of observations: %d'%X.shape[0])
        f.write('\n')
        f.write('the number of features: %d'%X.shape[1])
        f.write('\n')
        f.write('the number of targets: %d'%dummy_y.shape[1])
        f.write('\n')
        f.write(results_str)
    model.save(output_model_name)
    print('finished processing %s'%file_name)

In [28]:
 # processed_dge folder contains all new defined datasets
 # get all shapes from processed_dge files
 for root, dirs, files in os.walk("/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets", topdown=False):
     for name in files:
        file_dir = os.path.join(root, name)
        train_and_save(file_dir, './superct_v0')

start to process /home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/BoneMarrowcKit_1.csv
start training
start writing files


NameError: name 'X_train' is not defined

In [4]:
def dictionary_list (input_dic):
    return_str = ''
    for k, v in input_dic.items():
        return_str+=str(k)
        return_str+='\n\t'
        float_list = map(str, v) 
        return_str+='\n\t'.join(float_list)
        return_str+='\n'
        return_str+='\n'
    return return_str

In [17]:
def train_and_save(file_dir):
    print('start to process %s'%file_dir)
    X_train, X_test, y_train, y_test = load_dataset_from_dir(file_dir)
    batch_size = 128
    # round up the epches
    num_epoches = math.ceil(X_train.shape[0]/batch_size)
    model = create_superct_model(X_train.shape[1])
    history = model.fit(X_train, y_train, epochs=num_epoches, validation_data=(X_test, y_test))
    file_name = os.path.basename(file_dir)
    file_name = 'superct_v0_'+file_name
    history_name = file_name.replace('.csv', '.txt')
    output_model_name = file_name.replace('.csv', '.hdf5')
    with open(history_name, 'w') as f:
        f.write(file_name)
        f.write('\n')
        f.write('the number of observations: %d'%X_train.shape[0])
        f.write('\n')
        f.write('the number of features: %d'%X_train.shape[1])
        f.write('\n')
        f.write(dictionary_list(history.history))
    model.save(output_model_name)
    print('finished processing %s'%file_name)

In [18]:
 # processed_dge folder contains all new defined datasets
 # get all shapes from processed_dge files
 for root, dirs, files in os.walk("/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets", topdown=False):
     for name in files:
        file_dir = os.path.join(root, name)
        train_and_save(file_dir)

Epoch 34/52
Epoch 35/52
Epoch 36/52
Epoch 37/52
Epoch 38/52
Epoch 39/52
Epoch 40/52
Epoch 41/52
Epoch 42/52
Epoch 43/52
Epoch 44/52
Epoch 45/52
Epoch 46/52
Epoch 47/52
Epoch 48/52
Epoch 49/52
Epoch 50/52
Epoch 51/52
Epoch 52/52
finished processing superct_v0_BoneMarrowcKit_3.csv
start to process /home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Uterus_2.csv
Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11
finished processing superct_v0_Uterus_2.csv
start to process /home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Brain_1.csv
Epoch 1/21
Epoch 2/21
Epoch 3/21
Epoch 4/21
Epoch 5/21
Epoch 6/21
Epoch 7/21
Epoch 8/21
Epoch 9/21
Epoch 10/21
Epoch 11/21
Epoch 12/21
Epoch 13/21
Epoch 14/21
Epoch 15/21
Epoch 16/21
Epoch 17/21
Epoch 18/21
Epoch 19/21
Epoch 20/21
Epoch 21/21
finished processing superct_v0_Brain_1.csv
start to process /home/jay/Documents/p

In [65]:
train_and_save('/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Bladder_1.csv')
print('done')

Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
finished processing Bladder_1.csv


In [25]:
X_train, X_test, y_train, y_test = load_dataset_from_dir('/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Bladder_1.csv')

In [26]:
model = create_superct_model(X_train.shape[1])

In [28]:
batch_size = 128
# round up the epches
num_epoches = math.ceil(X_train.shape[0]/batch_size)

In [58]:
with open('histor_test.txt','w') as f:
    f.write('/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Bladder_1.csv')
    f.write('\n')
    f.write('the number of observations: %d'%X_train.shape[0])
    f.write('\n')
    f.write('the number of features: %d'%X_train.shape[1])
    f.write('\n')
    f.write(dictionary_list(history.history))

In [31]:
print(os.path.basename('/home/jay/Documents/projects/todo_files/ml_final_pro/datasets/pre_processed_datasets/Bladder_1.csv'))

Bladder_1.csv


In [32]:
'Bladder_1.csv'.replace('.csv', '.txt')

'Bladder_1txt'

In [39]:
import json

In [41]:
json.dumps(history.history)

'{"loss": [5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07], "accuracy": [0.02504553645849228, 0.024134790524840355, 0.025956284254789352, 0.023679416626691818, 0.019581057131290436, 0.025956284254789352, 0.022313296794891357, 0.0173041895031929, 0.02185792289674282, 0.023679416626691818, 0.02185792289674282, 0.018670309334993362, 0.02185792289674282, 0.02322404459118843, 0.025500910356640816, 0.02504553645849228, 0.018670309334993362, 0.02641165815293789]}'

In [42]:
str(history.history)

"{'loss': [5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07, 5.1315367954885e-07], 'accuracy': [0.02504553645849228, 0.024134790524840355, 0.025956284254789352, 0.023679416626691818, 0.019581057131290436, 0.025956284254789352, 0.022313296794891357, 0.0173041895031929, 0.02185792289674282, 0.023679416626691818, 0.02185792289674282, 0.018670309334993362, 0.02185792289674282, 0.02322404459118843, 0.025500910356640816, 0.02504553645849228, 0.018670309334993362, 0.02641165815293789]}"

In [45]:
print("{" + "\n".join("{!r}: {!r},".format(k, "\n".join(v)) for k, v in history.history.items()) + "}")

TypeError: sequence item 0: expected str instance, float found

In [57]:
def dictionary_list (input_dic):
    return_str = ''
    for k, v in input_dic.items():
        return_str+=str(k)
        return_str+='\n\t'
        float_list = map(str, v) 
        return_str+='\n\t'.join(float_list)
        return_str+='\n'
        return_str+='\n'
    return return_str


In [56]:
dictionary_list(history.history)

'loss\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\t\n5.1315367954885e-07\n\naccuracy\n0.02504553645849228\t\n0.024134790524840355\t\n0.025956284254789352\t\n0.023679416626691818\t\n0.019581057131290436\t\n0.025956284254789352\t\n0.022313296794891357\t\n0.0173041895031929\t\n0.02185792289674282\t\n0.023679416626691818\t\n0.02185792289674282\t\n0.018670309334993362\t\n0.02185792289674282\t\n0.02322404459118843\t\n0.025500910356640816\t\n0.02504553645849228\t\n0.018670309334993362\t\n0.02641165815293789\n\n'