In [1]:
import pandas as pd
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
from tensorflow import random
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from keras.metrics import categorical_accuracy
import keras
import numpy as np
import h5py
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from numpy.random import seed
seed(1)
random.set_seed(2)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from datetime import datetime
import keras.optimizers

# for the scale data good hyperparameters are: nodes=500, batchsize=32, epochs=10
# for the banknote data good parameters are: nodes=500, batchsize=20, n_epochs=3
# for the user data good parameters are: nodes=500, batchsize=3, n_epochs=25
dataset_name = 'user_data'
n_MC = 21
n_nodes = 500
batchsize = 3
n_epochs = 25

In [2]:
def load_train_data(MD_percentage):
    dataset_list = []
    for i in ['a','b','c','d','e']:
        file_path = 'C:/Users/20175878/Documents/DSAI/Y1Q1_RTDM/research_paper/code/' + dataset_name + '/' + MC_ID + '/train' + MD_percentage + i + '.csv'
        df_i = pd.read_csv(file_path, index_col=0)
        data_i = df_i.to_numpy()
        dataset_list.append(data_i)
        
    return dataset_list


def load_test_data():
    file_path = 'C:/Users/20175878/Documents/DSAI/Y1Q1_RTDM/research_paper/code/' + dataset_name + '/' + MC_ID +'/test.csv'
    df_test = pd.read_csv(file_path,index_col=0)
    test_data = df_test.to_numpy()
    
    return test_data


def build_features(dataset):    
    count_dependent_vars = dataset.shape[1]-1
    
    X = dataset[:,0:count_dependent_vars]      
    X = X.astype(float)
    
    y = dataset[:,count_dependent_vars]
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_y = encoder.transform(y)    
    y = to_categorical(encoded_y)
            
    return X, y


def build_NN(X, y, input_dim, output_dim, nodes=n_nodes):    
    
    tf.keras.backend.clear_session()
    
    model = Sequential()    
    model.add(Dense(nodes, input_dim=input_dim, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
        
    model.compile(loss='CategoricalCrossentropy', optimizer='adam', metrics=categorical_accuracy)
    
    model.fit(X, y, epochs=n_epochs, batch_size=batchsize, verbose=False)
 
    return model


def build_NN_custom(MD_percentage, custom_weights):
    
    tf.keras.backend.clear_session()
    
    with open('C:/Users/20175878/Documents/DSAI/Y1Q1_RTDM/research_paper/code/{0}/{1}/model.json'.format(dataset_name, MC_ID), 'r') as json_file:
        json_savedModel = json_file.read()

    pooled_weights = pool_weight_matrices(MD_percentage)
    
    model_j = keras.models.model_from_json(json_savedModel)
    model_j.set_weights(pooled_weights)
    model_j.compile(loss='CategoricalCrossentropy', optimizer='adam', metrics=categorical_accuracy)
    
    return(model_j)
   
    
def save_weights_to_npy(model, MD_percentage, model_tracker):
    weights_filename = 'C:/Users/20175878/Documents/DSAI/Y1Q1_RTDM/research_paper/code/' + dataset_name + '/' + MC_ID + '/model_weights' + MD_percentage + model_tracker + '.npy'
    weights = model.get_weights()
    np.save(weights_filename, weights)
    
    
def save_model_to_json(model):
    model_json = model.to_json()
    json_filename = 'C:/Users/20175878/Documents/DSAI/Y1Q1_RTDM/research_paper/code/' + dataset_name + '/' + MC_ID + '/model.json'
    with open(json_filename, "w") as json_file:
        json_file.write(model_json)     

        
def evaluate_nn_model(model):
    dataset = test_data
    X, y = build_features(dataset)
    
    y_cat = y.round()
    y_cat = np.argmax(y_cat, axis=1)
    
    y_hat = model.predict(X)
    y_hat = y_hat.round()
    y_hat = np.argmax(y_hat, axis=1)
    
    res = pd.DataFrame({'y':y_cat,'y_hat':y_hat,'correct?':(y_cat==y_hat)})    
    accuracy = len(res[res['correct?']==True])/len(res)*100
        
    return round(accuracy,3), res

        
# METHOD 1
def save_model_and_weights(MD_percentage):
    dataset_list = load_train_data(MD_percentage)

    count = 0
    models = ['a','b','c','d','e']
    for dataset in dataset_list:
        model_tracker = models[count]
        X, y = build_features(dataset)
        model = build_NN(X, y, input_dim=X.shape[1], output_dim=y.shape[1])
        
        save_model_to_json(model)
        save_weights_to_npy(model, MD_percentage, model_tracker)
        count += 1
        
        
# METHOD 1
def pool_weight_matrices(MD_percentage):
    weight_matrix_collections = []
    models = ['a','b','c','d','e']

    for i in range(0,5):
        model_tracker = models[i]
        weights_filename = 'C:/Users/20175878/Documents/DSAI/Y1Q1_RTDM/research_paper/code/' + dataset_name+ '/' + MC_ID + '/model_weights' + MD_percentage + model_tracker + '.npy'
        file = np.load(weights_filename, allow_pickle=True)
        weight_matrix_collections.append(file)

    average_model_weights = []
    for array in range(0,len(weight_matrix_collections[0])):
        array_sum = weight_matrix_collections[0][array] + weight_matrix_collections[1][array] + weight_matrix_collections[2][array] + weight_matrix_collections[3][array] + weight_matrix_collections[4][array]
        array_average = array_sum/5
        average_model_weights.append(array_average)
    return average_model_weights


# METHOD 2
def pool_datasets(MD_percentage):
    dataset_list = load_train_data(MD_percentage)

    for i in range(0,5):
        dataset_list[i] = pd.DataFrame(dataset_list[i])
          
        
    y = dataset_list[0].iloc[:,-1]    
    df_summed_x = dataset_list[0].iloc[:,:-1]
        
    for i in range(1,5):
        df_summed_x = df_summed_x + dataset_list[i].iloc[:,:-1]
        
    average_df = df_summed_x/5
    
    y_index = df_summed_x.shape[1]
    
    average_df[y_index] = y
    
    averaged_data = average_df.to_numpy()
    
    return averaged_data


# METHOD 3
def get_bagging_predictions(dataset):
    X, y = build_features(dataset)
        
    model = build_NN(X, y, input_dim=X.shape[1], output_dim=y.shape[1])
    
    evaluation = evaluate_nn_model(model)[1]
    
    y_cat = evaluation['y']
    y_hat = evaluation['y_hat']
    
    return y_hat, y_cat


def run_method1():
    accuracy_list = []
    for MD_percentage in ['10','20','50']:
        save_model_and_weights(MD_percentage)
        custom_weights = pool_weight_matrices(MD_percentage)
                
        model_j = build_NN_custom(MD_percentage, custom_weights)
                
        accuracy_on_test = evaluate_nn_model(model_j)[0]
        accuracy_list.append(accuracy_on_test)
        
    return accuracy_list


def run_method2():
    accuracy_list = []
    for MD_percentage in ['10','20','50']:
        averaged_data = pool_datasets(MD_percentage)
        
        X, y = build_features(averaged_data)
                    
        model = build_NN(X,y,input_dim=X.shape[1],output_dim=y.shape[1])
        
        accuracy_on_test = evaluate_nn_model(model)[0]
        accuracy_list.append(accuracy_on_test)
        
    return accuracy_list


def run_method3():
    accuracy_list = []
    for MD_percentage in ['10','20','50']:
        dataset_list = load_train_data(MD_percentage)
        pred_df = pd.DataFrame()
        for i in range(0,5):
            pred_df[i], y = get_bagging_predictions(dataset_list[i])
        
        final_y_hat = pred_df.mode(axis=1)[0]
                
        compare_df = pd.DataFrame({'y_hat':final_y_hat,'y':y})
        
        res = pd.DataFrame({'y':y,'y_hat':final_y_hat,'correct?':(y==final_y_hat)})    
        accuracy = len(res[res['correct?']==True])/len(res)*100
    
        accuracy_list.append(accuracy)
    return accuracy_list


def run_baseline():      
    file_path = 'C:/Users/20175878/Documents/DSAI/Y1Q1_RTDM/research_paper/code/' + dataset_name + '/' + MC_ID + '/train.csv'
    df_i = pd.read_csv(file_path, index_col=0)
    np_data = df_i.to_numpy()

    X, y = build_features(np_data)
    
    model = build_NN(X, y, input_dim=X.shape[1],output_dim=y.shape[1])
    
    accuracy_on_test = evaluate_nn_model(model)[0]
    
    return accuracy_on_test

In [3]:
method1_df = pd.DataFrame(columns=['10','20','50'])
method2_df = pd.DataFrame(columns=['10','20','50'])
method3_df = pd.DataFrame(columns=['10','20','50'])
baseline_series = pd.Series()

start_time = datetime.now()
for i in range(1,n_MC):
    start_time_i = datetime.now()
    MC_ID = str(i)
    test_data = load_test_data()
    
    method1_res = pd.Series(run_method1(), index = method1_df.columns)
    method1_df = method1_df.append(method1_res, ignore_index=True)
    print('iteration {} method 1 is done'.format(i))
    
    method2_res = pd.Series(run_method2(), index = method2_df.columns)
    method2_df = method2_df.append(method2_res, ignore_index=True)
    print('iteration {} method 2 is done'.format(i))
    
    method3_res = pd.Series(run_method3(), index = method3_df.columns)
    method3_df = method3_df.append(method3_res, ignore_index=True)
    print('iteration {} method 3 is done'.format(i))
    
    baseline_series = baseline_series.append(pd.Series(run_baseline()))
    print('iteration {} baseline is done'.format(i))
    
    print('running iteration {0} took {1}'.format(i, datetime.now() - start_time_i))

print('running the whole MC simulation took {}'.format(datetime.now() - start_time))

  baseline_series = pd.Series()
  arr = np.asanyarray(arr)


iteration 1 method 1 is done
iteration 1 method 2 is done
iteration 1 method 3 is done
iteration 1 baseline is done
running iteration 1 took 0:02:31.904389
iteration 2 method 1 is done
iteration 2 method 2 is done
iteration 2 method 3 is done
iteration 2 baseline is done
running iteration 2 took 0:02:33.201261
iteration 3 method 1 is done
iteration 3 method 2 is done
iteration 3 method 3 is done
iteration 3 baseline is done
running iteration 3 took 0:02:33.700254
iteration 4 method 1 is done
iteration 4 method 2 is done
iteration 4 method 3 is done
iteration 4 baseline is done
running iteration 4 took 0:02:35.034339
iteration 5 method 1 is done
iteration 5 method 2 is done
iteration 5 method 3 is done
iteration 5 baseline is done
running iteration 5 took 0:02:33.168273
iteration 6 method 1 is done
iteration 6 method 2 is done
iteration 6 method 3 is done
iteration 6 baseline is done
running iteration 6 took 0:02:33.754812
iteration 7 method 1 is done
iteration 7 method 2 is done
iterat

In [10]:
method1_df.to_csv('C:/Users/20175878/Documents/DSAI/Y1Q1_RTDM/research_paper/code/MC_results/{0}__nodes{1}_batch{2}_epoch{3}_method1_results.csv'.format(dataset_name, n_nodes, batchsize, n_epochs))

In [11]:
method2_df.to_csv('C:/Users/20175878/Documents/DSAI/Y1Q1_RTDM/research_paper/code/MC_results/{0}__nodes{1}_batch{2}_epoch{3}_method2_results.csv'.format(dataset_name, n_nodes, batchsize, n_epochs))

In [12]:
method3_df.to_csv('C:/Users/20175878/Documents/DSAI/Y1Q1_RTDM/research_paper/code/MC_results/{0}__nodes{1}_batch{2}_epoch{3}_method3_results.csv'.format(dataset_name, n_nodes, batchsize, n_epochs))

In [13]:
baseline_series.to_csv('C:/Users/20175878/Documents/DSAI/Y1Q1_RTDM/research_paper/code/MC_results/{0}__nodes{1}_batch{2}_epoch{3}_baseline_results.csv'.format(dataset_name, n_nodes, batchsize, n_epochs))

In [8]:
method2_df

Unnamed: 0,10,20,50
0,94.215,94.215,91.736
1,90.909,90.909,86.777
2,89.256,89.256,84.298
3,85.124,87.603,80.165
4,91.736,87.603,75.207
5,90.909,90.909,77.686
6,90.083,89.256,82.645
7,89.256,89.256,85.124
8,94.215,93.388,91.736
9,90.083,89.256,76.86


In [9]:
for i in range(3):
    print(run_baseline())

90.909
90.909
91.736
