In [1]:
"""Mainly Edited for private usage by:  Ioanna Mitsioni
                                        Ioannis Agriomallos
License: BSD 3 clause
"""
import time
start_time = time.time()
from copy import deepcopy, copy
import math
import scipy.io as sio
import shutil
import os
from random import shuffle
import numpy as np
from pylab import *
# from featext2 import *
import matplotlib.pyplot as plt
%matplotlib inline 
#matplotlib qt
# inline (suitable for ipython only, shown inside browser!) or qt (suitable in general, shown in external window!)
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D #, axes3d
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, ParameterGrid, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, RandomizedLasso
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, f_classif, mutual_info_classif, SelectFdr
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from collections import OrderedDict
import re
import datetime
import urllib
import tarfile
# import joblib
# from joblib import Parallel, delayed, Memory
from tempfile import mkdtemp
import copy_reg
import types
import itertools
from itertools import compress
from collections import Counter


#import multiprocessing
def _pickle_method(m):
    if m.im_self is None:
        return getattr, (m.im_class, m.im_func.func_name)
    else:
        return getattr, (m.im_self, m.im_func.func_name)
# Register _pickle_method with copy_reg as the reduction function used when pickling types.MethodType objects.
# NOTE(review): presumably needed to ship methods to worker processes (see the commented-out multiprocessing import) - confirm.
copy_reg.pickle(types.MethodType, _pickle_method)


h = .2  # step size in the mesh  -- NOTE(review): not referenced by the cells shown here
# Number of samples per window: sizes the per-sample feature name lists (FFT bins, autocorrelation lags)
# and is the default `sample_window` of get_feat_id.
window = 1024

In [2]:
############ Feature Names ############
"""features:                                                                       ||      if       
   |--> time domain      :                                                         || samples = 1024
   |----|---> phinyomark : 11+3{shist} --------------------------> = 14+0.0samples ||             14
   |----|---> golz       : 10+samples{acrol} --------------------> = 10+1.0samples ||           1034
   |--> frequency domain :                                                                          
   |----|---> phinyomark : 3{arco}+4{mf}+2(samples/2+1){RF,IF} --> =  9+1.0samples ||           1033
   |----|---> golz       : 2(samples/2+1){AF,PF} ----------------> =  2+1.0samples ||           1026
   |----|----------------|-------alltogether---------------------> = 35+3.0samples || numfeat = 3107
"""
# The names are appended in the same order in which get_feat_id lays out the index blocks:
# time/Phinyomark, freq/Phinyomark, time/Golz, freq/Golz.
# The number of FFT bins uses floor division (window//2+1): plain `/` yields a float under
# true division (Python 3 or `from __future__ import division`) and range() would then raise TypeError.
## Time Domain Phinyomark feats
featnames = ['intsgnl', 'meanabs', 'meanabsslp', 'ssi', 'var', 'rms', 'rng', 'wavl', 'zerox', 'ssc', 'wamp', 
             'shist1', 'shist2', 'shist3']                                                   # 11+3{shist}
## Frequency Domain Phinyomark feats
featnames += ['arco1', 'arco2', 'arco3', 'mnf', 'mdf', 'mmnf', 'mmdf']                       # 3{arco}+4{mf}
featnames += ['reFFT{:03d}'.format(i) for i in range(window//2+1)]                           # samples/2+1{RF}
featnames += ['imFFT{:03d}'.format(i) for i in range(window//2+1)]                           # samples/2+1{IF}
## Time Domain Golz feats
featnames += ['meanv', 'stdr', 'mx', 'rngx', 'rngy', 'med', 'hjorth', 'sentr', 'se', 'ssk']  # 10
featnames += ['acrol{:04d}'.format(i) for i in range(window)]                                # samples{acrol}
## Frequency Domain Golz feats
featnames += ['amFFT{:03d}'.format(i) for i in range(window//2+1)]                           # samples/2+1{AF}
featnames += ['phFFT{:03d}'.format(i) for i in range(window//2+1)]                           # samples/2+1{PF}

In [3]:
############ Prepare the indices for each feature ############
def get_feat_id(feat_ind, printit=0, sample_window=window):
    """Find where the desired features sit inside the feature vector and
    link them with their names and level of abstraction.

    The index space built here holds two consecutive copies of the same layout
    (first the "Norm" prefeature, then "Ft/Fn"), each one ordered as
    [time/Phinyomark | freq/Phinyomark | time/Golz | freq/Golz].

    -> feat_ind        : iterable of integer feature indices to look up
    -> printit         : when equal to 1, print one
                         (index, name, source, domain, prefeature) tuple per index
    -> sample_window   : samples per window, sizes the per-sample feature blocks
    <- full_path_id    : list with one [index, source, domain, prefeature] entry per
                         element of feat_ind; each flag is a 1-element float array
                         (source: 0 Phinyomark / 1 Golz, domain: 0 time / 1 freq,
                         prefeature: 0 norm / 1 ft/fn)
    <- norm_time_feats : list of indices of the time features of the first (norm) copy
    <- norm_freq_feats : list of indices of the frequency features of the first (norm) copy
    """
    def _source_blocks(first):
        # Four consecutive index blocks starting at `first`. list() keeps them
        # concatenable with `+` on Python 3, where range() is not a list.
        time_phin = list(range(first, first + 14))
        freq_phin = list(range(time_phin[-1] + 1, time_phin[-1] + 9 + sample_window + 1))
        time_golz = list(range(freq_phin[-1] + 1, freq_phin[-1] + 10 + sample_window + 1))
        freq_golz = list(range(time_golz[-1] + 1, time_golz[-1] + 2 + sample_window + 1))
        return time_phin, freq_phin, time_golz, freq_golz

    # get the feat inds wrt their source : 3rd level
    norm_time_phin, norm_freq_phin, norm_time_golz, norm_freq_golz = _source_blocks(0)
    # get the feat inds wrt their domain : 2nd level
    norm_time_feats = norm_time_phin + norm_time_golz
    norm_freq_feats = norm_freq_phin + norm_freq_golz
    # get the feat inds wrt their prefeat: 1st level
    norm_feats = norm_time_feats + norm_freq_feats

    # the ft/fn copy starts right after the last norm index
    disp = norm_feats[-1] + 1
    ftfn_time_phin, ftfn_freq_phin, ftfn_time_golz, ftfn_freq_golz = _source_blocks(disp)
    ftfn_time_feats = ftfn_time_phin + ftfn_time_golz
    ftfn_freq_feats = ftfn_freq_phin + ftfn_freq_golz
    ftfn_feats = ftfn_time_feats + ftfn_freq_feats

    # create the final "reference dictionary"
    # 3 np.arrays of shape (n_total, 1), id_list[0] = level 1 etc
    n_total = len(norm_feats) + len(ftfn_feats)
    id_list = [np.zeros((n_total, 1)) for _ in range(3)]
    id_list[0][:norm_feats[-1]+1] = 0 # 0 signifies norm / 1 signifies ft/fn
    id_list[0][norm_feats[-1]+1:] = 1

    id_list[1][:norm_time_phin[-1]+1] = 0 # 0 signifies time / 1 signifies freq
    id_list[1][norm_time_phin[-1]+1:norm_freq_phin[-1]+1] = 1
    id_list[1][norm_freq_phin[-1]+1:norm_time_golz[-1]+1] = 0
    id_list[1][norm_time_golz[-1]+1:norm_freq_golz[-1]+1] = 1
    id_list[1][norm_freq_golz[-1]+1:ftfn_time_phin[-1]+1] = 0
    id_list[1][ftfn_time_phin[-1]+1:ftfn_freq_phin[-1]+1] = 1
    id_list[1][ftfn_freq_phin[-1]+1:ftfn_time_golz[-1]+1] = 0
    id_list[1][ftfn_time_golz[-1]+1:] = 1

    id_list[2][:norm_freq_phin[-1]+1] = 0 #0 signifies phinyomark / 1 signifies golz
    id_list[2][norm_freq_phin[-1]+1:norm_freq_golz[-1]+1] = 1
    id_list[2][norm_freq_golz[-1]+1:ftfn_freq_phin[-1]+1] = 0
    id_list[2][ftfn_freq_phin[-1]+1:] = 1

    # Entries are built directly; the former pre-fill allocated len(feat_ind)
    # arrays of shape (len(feat_ind), 5) that were all overwritten afterwards.
    full_path_id = []
    feats_per_copy = norm_feats[-1] + 1  # featnames only covers one copy, hence the modulo below
    for val in feat_ind:
        entry = [val, id_list[2][val], id_list[1][val], id_list[0][val]]
        full_path_id.append(entry)
        if printit == 1:
            lvl3 = 'Phin' if entry[1] == 0 else 'Golz'
            lvl2 = 'Time' if entry[2] == 0 else 'Freq'
            lvl1 = 'Norm' if entry[3] == 0 else 'Ft/Fn'
            # Explicit tuple: prints "(idx, 'name', ...)" on both Python 2 and 3
            # (identical to what the Python 2 print statement produced before).
            print((val, featnames[val % feats_per_copy], lvl3, lvl2, lvl1))

    return (full_path_id, norm_time_feats, norm_freq_feats)

In [4]:
def get_feat_occ(feat_masks):
    """Tally how often each feature occurs in ``feat_masks`` (e.g. after SelectKBest).

    -> feat_masks : one flat iterable of hashable feature ids; add the per-model
                    lists together into a single list first, because a list of
                    lists makes Counter fail with the "unhashable type" TypeError
    <- Counter mapping every distinct entry to its number of occurrences
    """
    return Counter(feat_masks)

In [5]:
def get_tot_feats(fs, subfs, r):
    """Collect the feature-selection masks of every stored model of one result set.

    Reads every file matching ``data/results<r>/fs_<fs>_subfs_<subfs>_*.npz``, takes the
    first element of its ``'model'`` entry and asks that model's ``'feature_selection'``
    step for its boolean support mask.

    -> fs, subfs      : values inserted into the file name pattern
    -> r              : suffix of the ``data/results<r>`` directory
    <- bool_tot_feats : list with one boolean mask (as a list) per matching file,
                        in sorted file name order; empty when nothing matches
    """
    import glob
    ###############################################################################################################
    # Version 2, using the bool masks and keeping one mask per stored model
    ###############################################################################################################
    # If checking for FnormAll, you end up with 36 models of (trained_on, tested_on) combinations but TECHNICALLY
    # the features are the same for every trained_on "sixplet" so there's no need to iterate over all the tested_on
    # indices.
    # NOTE(review): the original note says "ts = 2 is chosen arbitrarily", but no `ts` is used here - every
    # file matching the pattern is loaded. Confirm this is intended.

    pattern = "data/results" + str(r) + "/fs_" + str(fs) + "_subfs_" + str(subfs) + "_*.npz"
    # glob returns matches in arbitrary (filesystem) order; sort lexicographically so that
    # position i of the result - which get_common_feats uses for skip_surf - is reproducible.
    filenames = sorted(glob.glob(pattern))

    # the mask kept for the i-th file will be stored in bool_tot_feats[i]
    bool_tot_feats = []
    for filn in filenames:
        # NOTE(security): 'model' holds Python objects, i.e. it is unpickled on access;
        # only load result files from a trusted source.
        # The context manager closes the archive's file handle once the entry is read.
        with np.load(filn) as model_file:
            model = model_file['model']

        # boolean mask of the features kept by the selection step
        selector = model[0].named_steps['feature_selection']
        bool_tot_feats.append(list(selector.get_support(indices = False)))

    return bool_tot_feats

In [6]:
def get_common_feats(bool_tot_feats, skip_surf = 6, print_common_feats = 0):
    """Find the features selected in every mask and print a time/frequency summary.

    -> bool_tot_feats     : sequence of equal-length boolean masks, one per surface
                            (as returned by get_tot_feats)
    -> skip_surf          : position in bool_tot_feats to leave out of the comparison;
                            the default 6 skips nothing only while there are at most
                            6 masks (positions 0-5)
    -> print_common_feats : forwarded to get_feat_id as printit (1 prints every common feature)
    <- common_feats       : list of the indices that are True in all considered masks
                            (empty when there is no common feature or no mask at all)
    """
    kept_masks = [mask for pos, mask in enumerate(bool_tot_feats) if pos != skip_surf]

    # after the transpose: one row per feature, one column per considered surface
    per_feature = np.transpose(kept_masks)
    matches = [np.all(row) for row in per_feature]
    common_feats = [ind for ind, is_common in enumerate(matches) if is_common]

    print("===============================================================")
    print("%d common feats, out of %d total" %(len(common_feats),len(matches)))
    full_names, _, _ = get_feat_id(common_feats, printit = print_common_feats)
    freq_counter, time_counter = freq_time_counter(full_names)

    # Without any common feature the shares are reported as 0% instead of dividing by zero.
    n_common = len(common_feats)
    freq_pct = (float(freq_counter)/n_common)*100 if n_common else 0.0
    time_pct = (float(time_counter)/n_common)*100 if n_common else 0.0
    print("of which, %d (%.2f%%) were Freq features and %d (%.2f%%) were Time features" %(freq_counter, freq_pct, time_counter, time_pct))

    print("===============================================================")

    return common_feats

In [7]:
def freq_time_counter(full_names):
    """Count frequency-domain versus time-domain entries.

    -> full_names : sequence of entries as built by get_feat_id, whose element [2]
                    is the domain flag (1 for frequency, anything else counts as time)
    <- (f_c, t_c) : number of frequency entries, number of time entries
    """
    freq_total = sum(1 for entry in full_names if entry[2] == 1)
    # every entry that is not flagged as frequency is counted as time
    return (freq_total, len(full_names) - freq_total)

In [10]:
### Example: for each result set data/results1 ... data/results5, load the stored selection masks
### and print the features that are common to all of them.
# fs = 0 and subfs = 3 pick which "fs_<fs>_subfs_<subfs>_*.npz" files get_tot_feats reads.
# skip_surf = 6 is get_common_feats' "skip nothing" default; print_common_feats = 1 prints every common feature.

for r in range(1,6):
    tot_feats = get_tot_feats(fs = 0, subfs = 3, r = r)
    common_feats = get_common_feats(bool_tot_feats = tot_feats, skip_surf = 6, print_common_feats= 1)

133 common feats, out of 3107 total
(2, 'meanabsslp', 'Phin', 'Time', 'Norm')
(6, 'rng', 'Phin', 'Time', 'Norm')
(19, 'mmnf', 'Phin', 'Freq', 'Norm')
(20, 'mmdf', 'Phin', 'Freq', 'Norm')
(22, 'reFFT001', 'Phin', 'Freq', 'Norm')
(23, 'reFFT002', 'Phin', 'Freq', 'Norm')
(24, 'reFFT003', 'Phin', 'Freq', 'Norm')
(25, 'reFFT004', 'Phin', 'Freq', 'Norm')
(26, 'reFFT005', 'Phin', 'Freq', 'Norm')
(27, 'reFFT006', 'Phin', 'Freq', 'Norm')
(28, 'reFFT007', 'Phin', 'Freq', 'Norm')
(29, 'reFFT008', 'Phin', 'Freq', 'Norm')
(30, 'reFFT009', 'Phin', 'Freq', 'Norm')
(31, 'reFFT010', 'Phin', 'Freq', 'Norm')
(32, 'reFFT011', 'Phin', 'Freq', 'Norm')
(33, 'reFFT012', 'Phin', 'Freq', 'Norm')
(36, 'reFFT015', 'Phin', 'Freq', 'Norm')
(38, 'reFFT017', 'Phin', 'Freq', 'Norm')
(39, 'reFFT018', 'Phin', 'Freq', 'Norm')
(537, 'imFFT003', 'Phin', 'Freq', 'Norm')
(538, 'imFFT004', 'Phin', 'Freq', 'Norm')
(540, 'imFFT006', 'Phin', 'Freq', 'Norm')
(542, 'imFFT008', 'Phin', 'Freq', 'Norm')
(543, 'imFFT009', 'Phin', 'Fre

203 common feats, out of 3107 total
(2, 'meanabsslp', 'Phin', 'Time', 'Norm')
(6, 'rng', 'Phin', 'Time', 'Norm')
(17, 'mnf', 'Phin', 'Freq', 'Norm')
(19, 'mmnf', 'Phin', 'Freq', 'Norm')
(20, 'mmdf', 'Phin', 'Freq', 'Norm')
(22, 'reFFT001', 'Phin', 'Freq', 'Norm')
(23, 'reFFT002', 'Phin', 'Freq', 'Norm')
(24, 'reFFT003', 'Phin', 'Freq', 'Norm')
(25, 'reFFT004', 'Phin', 'Freq', 'Norm')
(26, 'reFFT005', 'Phin', 'Freq', 'Norm')
(27, 'reFFT006', 'Phin', 'Freq', 'Norm')
(28, 'reFFT007', 'Phin', 'Freq', 'Norm')
(29, 'reFFT008', 'Phin', 'Freq', 'Norm')
(30, 'reFFT009', 'Phin', 'Freq', 'Norm')
(31, 'reFFT010', 'Phin', 'Freq', 'Norm')
(32, 'reFFT011', 'Phin', 'Freq', 'Norm')
(33, 'reFFT012', 'Phin', 'Freq', 'Norm')
(34, 'reFFT013', 'Phin', 'Freq', 'Norm')
(35, 'reFFT014', 'Phin', 'Freq', 'Norm')
(36, 'reFFT015', 'Phin', 'Freq', 'Norm')
(37, 'reFFT016', 'Phin', 'Freq', 'Norm')
(38, 'reFFT017', 'Phin', 'Freq', 'Norm')
(39, 'reFFT018', 'Phin', 'Freq', 'Norm')
(40, 'reFFT019', 'Phin', 'Freq', 'Norm'

(644, 'imFFT110', 'Phin', 'Freq', 'Norm')
(647, 'imFFT113', 'Phin', 'Freq', 'Norm')
(1048, 'stdr', 'Golz', 'Time', 'Norm')
(1049, 'mx', 'Golz', 'Time', 'Norm')
(1051, 'rngy', 'Golz', 'Time', 'Norm')
(1053, 'hjorth', 'Golz', 'Time', 'Norm')
(1056, 'ssk', 'Golz', 'Time', 'Norm')
(1151, 'acrol0094', 'Golz', 'Time', 'Norm')
(1184, 'acrol0127', 'Golz', 'Time', 'Norm')
(1192, 'acrol0135', 'Golz', 'Time', 'Norm')
(1193, 'acrol0136', 'Golz', 'Time', 'Norm')
(1209, 'acrol0152', 'Golz', 'Time', 'Norm')
(1211, 'acrol0154', 'Golz', 'Time', 'Norm')
(1247, 'acrol0190', 'Golz', 'Time', 'Norm')
(1248, 'acrol0191', 'Golz', 'Time', 'Norm')
(1249, 'acrol0192', 'Golz', 'Time', 'Norm')
(2082, 'amFFT001', 'Golz', 'Freq', 'Norm')
(2083, 'amFFT002', 'Golz', 'Freq', 'Norm')
(2084, 'amFFT003', 'Golz', 'Freq', 'Norm')
(2085, 'amFFT004', 'Golz', 'Freq', 'Norm')
(2086, 'amFFT005', 'Golz', 'Freq', 'Norm')
(2087, 'amFFT006', 'Golz', 'Freq', 'Norm')
(2088, 'amFFT007', 'Golz', 'Freq', 'Norm')
(2089, 'amFFT008', 'Golz'