In [5]:
import os
import random
import statsmodels.api as sm
import pylab as pl
import pandas as pd
import numpy as np
#import xgboost as xgb
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier
from sklearn import metrics, svm, preprocessing
from pandas import DataFrame
from keras import backend as K
import sklearn
from sklearn.model_selection import train_test_split, KFold
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Input, BatchNormalization
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from IPython.display import SVG
from keras.utils import plot_model
import time
from sklearn.manifold import TSNE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import gc
import pickle

In [2]:
def load_data(root = '', MetaFile = 'ADoub_meta.csv', ADoubFile = 'ADoub.npy'):
    """Read the metadata table and the feature array from disk.

    Both file names are appended to ``root`` by plain string concatenation,
    so ``root`` must already end with a path separator when it names a folder.

    Returns
    -------
    tuple
        ``(X, ADoub_meta)``: the array read with ``np.load`` and the
        DataFrame read with ``pd.read_csv``.
    """
    meta_path = root + MetaFile
    array_path = root + ADoubFile

    # The CSV is read first, exactly as before, so a missing metadata file
    # is still the first error reported.
    meta_table = pd.read_csv(meta_path)
    features = np.load(array_path)

    return features, meta_table

def gen_label(Y_raw):
    """One-hot encode a vector of non-negative integer class labels.

    Parameters
    ----------
    Y_raw : array-like of shape (n_samples,)
        Class ids (pandas Series, NumPy array or plain sequence). Values are
        cast to ``int``.

    Returns
    -------
    numpy.ndarray of shape (n_samples, max_label + 1), dtype float64
        Row ``i`` holds a single 1 in column ``Y_raw[i]``. An empty input
        yields an array of shape ``(0, 0)``.

    Raises
    ------
    ValueError
        If any label is negative. A negative id would otherwise be treated as
        an index counted from the end and silently mark the wrong column.
    """
    labels = np.asarray(Y_raw).astype(int).reshape(-1)

    if labels.size == 0:
        return np.zeros((0, 0))
    if labels.min() < 0:
        raise ValueError('gen_label expects non-negative class labels')

    Y = np.zeros((labels.shape[0], labels.max() + 1))
    # One vectorised assignment instead of a Python loop over every sample.
    Y[np.arange(labels.shape[0]), labels] = 1

    return Y

def FCNN(input_shape, out_shape, nodes = [2048, 1024, 512]):
    """Build (without compiling) a fully connected softmax classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed straight to ``keras.layers.Input``.
    out_shape : int
        Number of units in the final softmax layer.
    nodes : sequence of int
        Width of each hidden layer, in order. Every hidden layer is a ReLU
        ``Dense`` layer followed by ``BatchNormalization(axis = 1)``. The
        default list is only iterated, never modified.

    Returns
    -------
    keras.models.Model
        Uncompiled functional model named ``'FCNN'``.
    """
    X_input = Input(input_shape)

    X = X_input

    for node in nodes:
        # `kernel_initializer` is the Keras 2 name of the old `init` argument,
        # which is only accepted through a deprecated compatibility shim.
        X = Dense(node, kernel_initializer = 'glorot_normal', activation = 'relu')(X)
        X = BatchNormalization(axis = 1)(X)

    X = Dense(out_shape, kernel_initializer = 'glorot_normal', activation = 'softmax')(X)

    model = Model(inputs = X_input, outputs = X, name = 'FCNN')

    return model

def batch_generator(X, y, sample_index, batch_size, shuffle):
    """Endlessly yield ``(X_batch, y_batch)`` mini-batches of the selected rows.

    Parameters
    ----------
    X, y : 2-D arrays
        Features and targets, both indexed as ``arr[batch_index, :]``.
    sample_index : sequence of int
        Row indices to draw from. It is copied, so the caller's array is
        never reordered.
    batch_size : int
        Rows per batch. The last batch of a pass is smaller when the number
        of indices is not a multiple of ``batch_size``.
    shuffle : bool
        When true, the index order is reshuffled with ``np.random.shuffle``
        before every pass over the data.

    Yields
    ------
    tuple
        ``(X[batch_index, :], y[batch_index, :])``.

    Raises
    ------
    ValueError
        On the first ``next()`` call if ``sample_index`` is empty. Without
        the check the generator would emit empty batches forever.
    """
    # Private copy: shuffling must not reorder the caller's index array.
    order = np.array(sample_index)
    if len(order) == 0:
        raise ValueError('sample_index must contain at least one row index')

    number_of_batches = int(np.ceil(len(order) / batch_size))

    while True:
        if shuffle:
            np.random.shuffle(order)
        for counter in range(number_of_batches):
            batch_index = order[batch_size*counter:batch_size*(counter+1)]
            yield X[batch_index, :], y[batch_index, :]
            
def CrossValidation(X, Y, fold = 5, CV = True, epoch = 20, batch_size = 4096, nodes = [2048, 1024, 512]):
    """Train a fresh FCNN on each K-fold split and collect the fit histories.

    Parameters
    ----------
    X : 2-D array
        Feature matrix, one row per sample.
    Y : 2-D array
        One-hot targets. ``Y.shape[1]`` sets the size of the output layer.
    fold : int
        Number of splits for the shuffled ``KFold`` (``random_state = 0``).
    CV : bool
        True trains on every fold. False trains on the first fold only.
    epoch : int
        Maximum number of epochs per fold.
    batch_size : int
        Mini-batch size given to ``batch_generator``.
    nodes : sequence of int
        Hidden layer widths forwarded to ``FCNN``.

    Returns
    -------
    list or object
        With ``CV`` true, the list of objects returned by
        ``model.fit_generator``, one per fold. Otherwise the single object
        from the first fold.
    """
    kf = KFold(n_splits=fold, random_state = 0, shuffle = True)

    length = Y.shape[1]

    # Early stopping uses the callback's default monitored quantity.
    earlystopper = EarlyStopping(patience=5, verbose=1)

    # Set a learning rate annealer
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                                patience=3,
                                                verbose=1,
                                                factor=0.5,
                                                min_lr=0.00001)

    histories = []

    for i, (cv_train, cv_test) in enumerate(kf.split(X), start = 1):

        # With CV disabled only the first fold is trained.
        if (not CV) and i > 1:
            break
        print(i)

        # Fancy indexing already returns a new array, so the former extra
        # .copy() only duplicated the validation block a second time.
        X_valid = X[cv_test, :]

        model = FCNN((X.shape[1],), length, nodes = nodes)
        model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

        history = model.fit_generator(generator = batch_generator(X,
                                                                  Y,
                                                                  cv_train,
                                                                  batch_size, True),
                                      epochs = epoch,
                                      # Whole number of steps per epoch; the last
                                      # batch of an epoch may be smaller.
                                      steps_per_epoch = int(np.ceil(len(cv_train) / batch_size)),
                                      validation_data = (X_valid, Y[cv_test, :]),
                                      callbacks = [learning_rate_reduction, earlystopper])

        histories.append(history)

        gc.collect()

    if CV:
        return histories
    return history

def Train(X, Y, root, name = '1_9_1', epoch = 20, batch_size = 4096, nodes = [2048, 1024, 512]):
    """Fit one FCNN on the whole data set and save it as an HDF5 file.

    Parameters
    ----------
    X : 2-D array
        Feature matrix, one row per sample.
    Y : 2-D array
        One-hot targets. ``Y.shape[1]`` sets the size of the output layer.
    root : str
        Prefix of the output path. It is concatenated as-is, so it must end
        with a path separator when it names a folder.
    name : str
        Model tag. The file is written to ``root + 'Decomp_' + name + '.h5'``.
    epoch : int
        Maximum number of epochs.
    batch_size : int
        Mini-batch size given to ``batch_generator``.
    nodes : sequence of int
        Hidden layer widths forwarded to ``FCNN``.

    Notes
    -----
    No validation data is passed to the fit call, so ``val_*`` metrics never
    exist here. Callbacks monitoring them would never trigger, so both
    callbacks watch the training metrics instead.
    """
    length = Y.shape[1]

    earlystopper = EarlyStopping(monitor='loss', patience=5, verbose=1)

    # Set a learning rate annealer
    learning_rate_reduction = ReduceLROnPlateau(monitor='accuracy',
                                                patience=3,
                                                verbose=1,
                                                factor=0.5,
                                                min_lr=0.00001)

    model = FCNN((X.shape[1],), length, nodes = nodes)
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

    model.fit_generator(generator = batch_generator(X,
                                                    Y,
                                                    np.arange(X.shape[0]),
                                                    batch_size, True),
                        epochs = epoch,
                        # Whole number of steps per epoch; the last batch of
                        # an epoch may be smaller.
                        steps_per_epoch = int(np.ceil(X.shape[0] / batch_size)),
                        callbacks = [learning_rate_reduction, earlystopper])

    model.save(root + 'Decomp_' + name + '.h5')
    
def SaveValidation(history, CV, name, root = None):
    """Pickle the training log(s) and print the final validation accuracy.

    Parameters
    ----------
    history : object or list of objects
        With ``CV`` true, a list of fit results, one per fold. Otherwise a
        single fit result. Only the ``.history`` dictionary of each is used.
    CV : bool
        Selects between the list form and the single-result form.
    name : str
        Tag used in the file name ``'training_log_' + name + '.pickle'``.
    root : str, optional
        Path prefix for the log file, concatenated as-is. Defaults to the
        notebook-level ``save_root``.

    Notes
    -----
    The file always holds a list of plain metric dictionaries (one element
    when ``CV`` is false). The fit result objects themselves are not pickled,
    because they carry a reference to the model. Storing plain dictionaries
    in both modes also lets ``CheckAccuracy`` read either kind of log.
    """
    if root is None:
        root = save_root  # notebook-level default output prefix
    log_path = root + 'training_log_' + name + '.pickle'

    if CV:
        logs = [fold_history.history for fold_history in history]
    else:
        logs = [history.history]

    with open(log_path, 'wb') as handle:
        pickle.dump(logs, handle)

    if CV:
        # Last recorded validation accuracy of every fold, then their mean.
        accuracy = [log['val_accuracy'][-1] for log in logs]

        print('Validation Accuracy')
        print(accuracy)
        print(np.mean(accuracy))
    else:
        print(logs[0]['val_accuracy'][-1])
        
def CheckAccuracy(name, root = None):
    """Load a stored training log and print its final validation accuracies.

    Parameters
    ----------
    name : str
        Tag of the log. The file read is
        ``root + 'training_log_' + name + '.pickle'``.
    root : str, optional
        Path prefix of the log file, concatenated as-is. Defaults to the
        notebook-level ``save_root``.

    The file is expected to hold a list of metric dictionaries, each with a
    ``'val_accuracy'`` list. The last entry of every list is printed together
    with the mean over all of them.
    """
    if root is None:
        root = save_root  # notebook-level default output prefix

    # NOTE: unpickling can execute arbitrary code; only open logs that were
    # written by this notebook.
    with open(root + 'training_log_' + name + '.pickle', 'rb') as handle:
        histories = pickle.load(handle)

    # The file is closed before the numbers are computed.
    accuracy = [fold_log['val_accuracy'][-1] for fold_log in histories]

    print(name)
    print('Validation Accuracy')
    print(accuracy)
    print(np.mean(accuracy))

In [8]:
# Input folder with the data files and output folder for models / training logs.
file_root = 'ADoub_new/'
save_root = 'DecompModel/'

# exist_ok avoids the check-then-create race and does nothing when the
# folder is already there.
os.makedirs(save_root, exist_ok = True)

X, ADoub_meta = load_data(file_root, 'ADoub_meta.csv', 'ADoub.npy')

# One-hot targets built from metadata columns 0 and 2.
Y1 = gen_label(ADoub_meta.iloc[:, 0])
Y2 = gen_label(ADoub_meta.iloc[:, 2])

# Quick sanity check: target shapes, feature shape and per-row feature sums.
print(Y1.shape)
print(Y2.shape)
print(X.shape)
print(X.sum(axis = 1))

(400000, 1415)
(400000, 1203)
(400000, 8676)
[1000000. 1000000. 1000000. ... 1000000. 1000000. 1000000.]


# Fully connected neural network (FCNN)

In [4]:
# Cross-validate the FCNN on the first label set (Y1) and store the per-fold logs.
main_name = 'hep_LEC'
CV = True
name = main_name + '_1'

# Banner marking which label set is being processed.
print('*' * 10, 1, '*' * 10, sep = '\n')

history = CrossValidation(
    X, Y1,
    fold = 5,
    CV = CV,
    epoch = 20,
    batch_size = 4096,
    nodes = [2048, 1024, 512],
)

SaveValidation(history, CV, name)

**********
1
**********
1
Instructions for updating:
If using Keras pass *_constraint arguments to layers.





Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 20/20
2




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 19/20
Epoch 20/20
3




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
4




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
5




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Validation Accuracy
[0.9977874755859375, 0.9976999759674072, 0.998324990272522, 0.99795001745224, 0.9965375065803528]
0.9976599931716919


In [5]:
# Re-read the stored cross-validation log of label set 1 and print its accuracies.
name = ''.join((main_name, '_1'))
CheckAccuracy(name)

hep_LEC_1
Validation Accuracy
[0.9977874755859375, 0.9976999759674072, 0.998324990272522, 0.99795001745224, 0.9965375065803528]
0.9976599931716919


In [4]:
# Cross-validate the FCNN on the second label set (Y2) and store the per-fold logs.
main_name = 'hep_LEC'
CV = True
name = main_name + '_2'

# Banner marking which label set is being processed.
print('*' * 10, 2, '*' * 10, sep = '\n')

history = CrossValidation(
    X, Y2,
    fold = 5,
    CV = CV,
    epoch = 20,
    batch_size = 4096,
    nodes = [2048, 1024, 512],
)

SaveValidation(history, CV, name)

**********
2
**********
1
Instructions for updating:
If using Keras pass *_constraint arguments to layers.





Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 00010: early stopping
2




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
3




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20

Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 00020: early stopping
4




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 18/20
Epoch 19/20
Epoch 20/20
5




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation Accuracy
[0.7287499904632568, 0.7648375034332275, 0.8947250247001648, 0.9191250205039978, 0.9050124883651733]
0.8424900054931641


In [5]:
# Re-read the stored cross-validation log of label set 2 and print its accuracies.
name = ''.join((main_name, '_2'))
CheckAccuracy(name)

hep_LEC_2
Validation Accuracy
[0.7287499904632568, 0.7648375034332275, 0.8947250247001648, 0.9191250205039978, 0.9050124883651733]
0.8424900054931641


In [5]:
# Final models: retrain on the full data set, one model per label set
# ('_1' uses Y1, '_2' uses Y2), and save each under save_root.
main_name = 'hep_LEC'

for label_suffix, label_matrix in (('_1', Y1), ('_2', Y2)):
    name = main_name + label_suffix
    Train(X, label_matrix, save_root, name,
          epoch = 20, batch_size = 4096, nodes = [2048, 1024, 512])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.





Epoch 1/20
Epoch 2/20




Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




Epoch 1/20
Epoch 2/20




Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Logistic regression (LR) baseline

In [10]:
# Integer class ids (not one-hot) taken from metadata columns 0 and 2,
# for the scikit-learn classifier.
Y1_raw, Y2_raw = (
    ADoub_meta.iloc[:, column].values.squeeze().astype(int)
    for column in (0, 2)
)

In [None]:
# 5-fold cross-validation of a logistic-regression baseline on both label sets.
# For every fold it records the accuracy on label set 1 (c1), on label set 2
# (c2), and the share of test rows where both predictions are correct (c12).
kf = KFold(n_splits=5, random_state = 0, shuffle = True)
cv_index = kf.split(X)

Accuracy = []

sta = time.time()

for i, (cv_train, cv_test) in enumerate(cv_index, start = 1):

    print(i)

    # Slice the training rows once and reuse them for both label sets instead
    # of fancy-indexing (and therefore copying) the same rows twice.
    X_train = X[cv_train, :]

    print('c1:')
    clf_c1 = LogisticRegression(n_jobs = 24).fit(X = X_train,
                                                 y = Y1_raw[cv_train])

    print('c2:')
    clf = LogisticRegression(n_jobs = 24).fit(X = X_train,
                                              y = Y2_raw[cv_train])

    # Drop the training slice before building the test slice so both copies
    # are never held at the same time.
    del X_train
    X_test = X[cv_test, :]

    test_pred_c1 = clf_c1.predict(X_test)

    t1 = test_pred_c1 == Y1_raw[cv_test]
    c1_accuracy = t1.sum() / test_pred_c1.shape[0]

    test_pred_c2 = clf.predict(X_test)

    t2 = test_pred_c2 == Y2_raw[cv_test]
    c2_accuracy = t2.sum() / test_pred_c2.shape[0]

    # Rows where both label sets were predicted correctly.
    c12_accuracy = np.sum(t1&t2) / test_pred_c2.shape[0]

    print([c1_accuracy, c2_accuracy, c12_accuracy])
    Accuracy.append([c1_accuracy, c2_accuracy, c12_accuracy])

end = time.time()
dura = (end - sta)/3600  # elapsed wall-clock time in hours
print(dura, 'h(s)')
print(np.mean(Accuracy, axis = 0))