## Artificial Neural Network Training/Testing

Description: train and cross-validate a multi-layer perceptron (MLP) classifier using the scikit-learn package

#### === import libraries ===

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  

#### Parameter settings

In [None]:
# Experiment-wide settings.
repeat_n = 5   # how many times the whole cross-validation is repeated
n_cv = 10      # number of cross-validation parts built per repeat
n_process = 5  # size of the worker pool that runs the parts in parallel

### Read input file

In [None]:
import glob

data_source_dir = working_dir+'/vars/'

# Select the latest dated file (track record).
# The matching names are sorted lexicographically and the last one is taken.
# That agrees with chronological order only while every timestamp in the
# file names has the same number of digits.
_candidates = sorted(glob.glob(data_source_dir + 'DF.*.pickle'))
if not _candidates:
    # Fail with a clear message instead of trying to open a bogus path.
    raise IOError('no DF.*.pickle file found in %s' % data_source_dir)
# end if
input_filepath = _candidates[-1]
__nb_logger.info('reading file: %s' % input_filepath)

# NOTE: unpickling can execute arbitrary code; only load files produced by
# this pipeline.  Binary mode plus `with` guarantees the handle is closed.
with open(input_filepath, 'rb') as _fh:
    DF = pickle.load(_fh)
# end with

### Feature selection

In [None]:
# "one-hot" transformation: for each listed source column add a boolean
# column named 'is_<source>' that is True where the source holds
# 'default_value'.  The new column names are collected in `one_hots`.
one_hots = []
for _source_col in ('feature_05', 'feature_06', 'feature_07'):
    _flag_col = 'is_' + _source_col
    DF[_flag_col] = DF[_source_col] == 'default_value'
    one_hots.append(_flag_col)
# end for

In [None]:
# Columns kept for modelling, in this order: label, raw features, 'is_*' flags.
sel_features = [
    '__label',
    'feature_01',
    'feature_02',
    'feature_03',
    'feature_04',
    'is_feature_05',
    'is_feature_06',
    'is_feature_07',
]

In [None]:
# Keep only the columns listed in sel_features (the list includes the
# '__label' column, so the label stays alongside the features).
# DF is rebound to the reduced table; all other columns are dropped from here on.
DF = DF[sel_features]

#### Transformation on one-hot features

In [None]:
# Replace the boolean flags by the soft values 0.95 (True) / 0.05 (False).
# NOTE(review): presumably chosen to keep inputs away from exact 0/1 - confirm.
ds = {True: 0.95, False: 0.05}
for colname in one_hots:
    # Series.map performs the dictionary lookup inside pandas and returns a
    # Series, so it does not rely on the builtin map() returning a list
    # (which is only true on Python 2).
    DF[colname] = DF[colname].map(ds)
# end for

### ANN configurations

In [None]:
a = int((len(DF.columns)-1) * 1.5)
b = int(a / 2)
ml_config = (a, b, 2)
print 'ANN configurations:', ml_config

### Cross-validation: separate into training and testing sets

In [None]:
def run_epoch(ml_config, box_train, box_test, label_col = '__label', n_thr_cuts=100, iter_label = None, iter_min = 50):
    # read training and testing set
    X_train = deepcopy(box_train)
    Y_train = X_train.pop(label_col)
    X_test  = deepcopy(box_test)
    Y_test  = X_test .pop(label_col)

    # separate one-hot and non-one-hot
    print 'Scaling ... '
    p = [_ for _ in X_train if not(_.startswith('is'))]
    q = [_ for _ in X_train if     _.startswith('is') ]

    btp_train = X_train[p]; btq_train = X_train[q]
    btp_test  = X_test [p]; btq_test  = X_test [q]

    # standard scaling fitting
    scaler = StandardScaler()
    scaler.fit(btp_train)
    btp_train = scaler.transform(btp_train)
    btp_test  = scaler.transform(btp_test )
    
    # convert to list type
    btp_train = map(list, btp_train); btq_train = map(list, btq_train.values)
    btp_test  = map(list, btp_test ); btq_test  = map(list, btq_test .values)

    # combine back after separated scaling
    for i in range(len(btp_train)):
        btp_train[i].extend(list(btq_train[i]))
    # end for
    for i in range(len(btq_test )):
        btp_test [i].extend(list(btq_test [i]))
    # end for
    X_train = np.array(btp_train)
    X_test  = np.array(btp_test)
    
    # formatting Y_train to scalar value
    Y_train = Y_train.values

    print 'Starting ANN ...'
    # ANN initialization
    mlp = MLPClassifier(hidden_layer_sizes = ml_config, warm_start=True)

    # ANN fitting
    # ensure the proportion is roughly 50%:50%
    print 'make balanced datasets ...'
    balanced_data = sample_balance_datasets(X_train, Y_train)
    print 'started training ...'
    
    print 'max-iter:',
    mlp_iter_record = [iter_min]
    for j in range(len(balanced_data)):
        x_train, y_train = balanced_data[j]
        
        while True:
            det_repeat = False
            mlp.fit(x_train, y_train)
            
            if j == 0:
                this_n_iter = mlp.n_iter_
            else:
                this_n_iter = mlp.n_iter_ - mlp_iter_record[-1]
            # end if
            
            if this_n_iter < max(iter_min, mlp_iter_record[0]):
                det_repeat = True

            if det_repeat == False:
                if j == 0:
                    mlp_iter_record = [mlp.n_iter_]
                else:
                    mlp_iter_record.append(mlp.n_iter_)
                # end if
                print mlp_iter_record[-1],
                break
            # end if
        # end while
    # end for
    print

    # logging training and testing accuracy
    print '(training) iteration:', iter_label, '| accuracy:', mlp.score(X_train, Y_train)
    print '(testing ) iteration:', iter_label, '| accuracy:', mlp.score(X_test , Y_test )

    # scoring the testing set by ranking
    probs = mlp.predict_proba(X_test)
    probs = np.array(zip(*probs)[0])
    ids = box_test.index.values
    # record the probs
    test_scores = dict(zip(ids, probs))

    # full statistical assessment with varying threshold
    thrs = list(set(map(float, probs)))
    thrs = np.percentile(thrs, np.arange(n_thr_cuts))

    Cs = {}
    for thr in thrs:
        predictions = probs < thr
        labels      = Y_test

        confusion_matrix = compute_confusion_matrix(predictions, labels)
        out = binary_classification_assessment(confusion_matrix)

        Cs[thr] = out
    # end for

    return {'model': {'mlp': mlp, 'scaler': scaler}, 'results': test_scores, 'assessments': Cs}
# end def

In [None]:
# declare memory for MLPs
# Accumulators filled by the cross-validation loop: fitted models,
# per-threshold assessment records and per-row test scores.
MLPs, CV_Recs, test_scores = [], [], []
counter = 0  # NOTE(review): not used anywhere in the visible cells

In [None]:
# parallel computing version
from multiprocessing import Pool

def run_ml_epoch(input_vars):
    """Single-argument adapter so run_epoch can be used with Pool.map.

    `input_vars` must be a 6-item sequence:
    (ml_config, box_train, box_test, label_col, n_thr_cuts, iter_label).
    run_epoch's iter_min is left at its default.
    """
    (config, train_box, test_box, label, n_cuts, tag) = input_vars
    return run_epoch(
        ml_config=config,
        box_train=train_box,
        box_test=test_box,
        label_col=label,
        n_thr_cuts=n_cuts,
        iter_label=tag,
    )
# end def

for _episode in range(repeat_n):
    # separate into training and testing set
    print 'make cross-validation datasets ...'
    boxes = construct_CV_datasets(DF, '__label', n_cv)
    
    inputs  = [(ml_config, boxes[i]['training'], boxes[i]['testing'], '__label', 100, i+1) for i in range(n_cv)]
    pool = Pool(processes=n_process)
    outputs = pool.map(run_ml_epoch, inputs)
    pool.close()
    pool.join()
 
    for output in outputs:
        MLPs       .append(output['model'])
        test_scores.append(output['results'])
        CV_Recs    .append(output['assessments'])
    # end for
# end for

### Assessment: ROC curve

In [None]:
# plot ROC
# Flatten the assessment records of every part and threshold into one table;
# each record contributes one (FPR, TPR) point to the scatter plot.
R = pd.DataFrame([_rec for _part in CV_Recs for _rec in _part.values()])
plt.scatter(R['FPR'].values, R['TPR'].values, s = 0.5)

_ax = plt.gca()
# dashed red diagonal from (0, 0) to (1, 1), where TPR equals FPR
_ax.plot([0, 1], [0, 1], ls = '--', color = 'red')

_ax.set_xlim(0, 1)
_ax.set_ylim(0, 1)
_ax.set_xlabel('FPR')
_ax.set_ylabel('TPR')

_ax.figure.set_size_inches(9, 6)
_ax.set_title('Receiver operating characteristic curve');

### Save to files

In [None]:
# Integer timestamp embedded in the name of every output file written below
# (utc_timestamp is defined outside the visible cells).
_timestamp = int(utc_timestamp())

#### append scores

In [None]:
from collections import defaultdict

# Gather, per row id, every score found in the collected result dicts
# (a row id can appear in several of them) ...
final_scores = defaultdict(list)
for _scores in test_scores:
    for key, val in _scores.iteritems():
        final_scores[key].append(val)
    # end for
# end for
# ... and collapse each list to its mean.  list(...) snapshots the keys so
# the dict is not iterated while its values are being replaced.
for key in list(final_scores):
    final_scores[key] = np.mean(final_scores[key])
# end for
# .get() returns None for rows that never received a score and, unlike
# indexing, does not insert an empty list into the defaultdict.
DF['score'] = [final_scores.get(_row_id) for _row_id in DF.index]

In [None]:
# specify output: DF (now including the 'score' column, if the scoring cell
# ran) is written as a new dated file next to the input files
outfilename = working_dir + '/vars/DF.%s.pickle' % _timestamp
__nb_logger.info('write to output: %s' % outfilename)
# binary mode, and `with` so the data is flushed and the handle closed
with open(outfilename, 'wb') as _fh:
    pickle.dump(DF, _fh)
# end with

#### assessment results

In [None]:
# Destination of the assessment spreadsheet (timestamped file name).
outfilename = ''.join((working_dir, '/results/assessment_records.%s.xlsx' % _timestamp))
_log_line = 'write to output: %s' % outfilename
__nb_logger.info(_log_line)

In [None]:
# One table per entry of CV_Recs: one row per threshold, with the threshold
# stored in its own column.
DFs = []
for part in CV_Recs:
    # keys() and values() of an unmodified dict come in the same order, so
    # each threshold lines up with its assessment record
    thrs = list(part.keys())
    _df = pd.DataFrame(list(part.values()))
    _df['threshold'] = thrs
    DFs.append(_df)
# end for
_df = pd.concat(DFs, axis=0)
# sort_values returns a new frame; the result must be kept for the sort to
# have any effect on the written file
_df = _df.sort_values(by=['threshold', 'ACC'])
_df.set_index('threshold', inplace=True)
_df.to_excel(outfilename)

#### models

In [None]:
# Destination of the pickled models (timestamped file name).
outfilename = ''.join((working_dir, '/models/MLP.%s.pickle' % _timestamp))
_log_line = 'write to output: %s' % outfilename
__nb_logger.info(_log_line)

In [None]:
# MLPs holds one {'mlp': ..., 'scaler': ...} dict per trained part.
# Binary mode, and `with` so the data is flushed and the handle closed.
with open(outfilename, 'wb') as _fh:
    pickle.dump(MLPs, _fh)
# end with