In [37]:
%matplotlib inline

import pandas
import numpy
import sklearn
import sklearn.model_selection
import sklearn.ensemble
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.feature_selection
import sklearn.linear_model


In [83]:
# Return a combined DataFrame, and a dictionary of {featuretype: columns}
def load_data(filename):
    def load_sheet(name):
        df = file.parse(name)
        df.index = df['Index']
        del df['Index']
        return df
    
    file = pandas.ExcelFile(filename)
    dfs = { sheet: load_sheet(sheet) for sheet in file.sheet_names}
    
    # AMT is missing names for columns
    combine = [dfs[k] for k in dfs.keys() if k not in ('AMT')]
    
    combined = dfs['AMT'].copy().add_suffix('-AMT')
    combined = combined.join(combine, lsuffix='', rsuffix='')
    
    # TODO: there are also categories within each wavelet series
    # separated with a _ or -
    feature_categories = {}
    for category, df in dfs.items():
        columns = df.columns
        feature_categories[category] = columns
    
    return combined, feature_categories

# FIXME: load updated dataset
fulldata, categories = load_data('Full_data.xlsx')
assert fulldata.shape[0] == 128, fulldata.shape
assert numpy.count_nonzero(fulldata.Labels.notnull()) == 96

In [3]:
categories.keys()

dict_keys(['Labels', 'AMT', 'WT_originals', 'WT-LLL', 'WT-LLH', 'WT-LHL', 'WT-LHH', 'LBP'])

In [4]:
fulldata.head()

Unnamed: 0_level_0,1-AMT,2-AMT,3-AMT,4-AMT,5-AMT,6-AMT,7-AMT,8-AMT,9-AMT,10-AMT,...,"lbp_24_(24,3)","lbp_25_(24,3)","lbp_2_(24,3)","lbp_3_(24,3)","lbp_4_(24,3)","lbp_5_(24,3)","lbp_6_(24,3)","lbp_7_(24,3)","lbp_8_(24,3)","lbp_9_(24,3)"
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.55144,1.5198,1.45847,1.4394,1.43263,1.44893,1.50741,1.54977,1.59866,1.63075,...,53695,468638,17261,15439,18953,22857,24262,26894,27454,29276
1,1.51458,1.47539,1.45449,1.46195,1.4409,1.47846,1.51607,1.53459,1.51287,1.54192,...,53047,462418,16941,15294,18823,22369,24826,27250,28437,30374
2,1.50565,1.46602,1.444,1.45143,1.42508,1.41574,1.44243,1.47648,1.44198,1.45998,...,53917,469225,17328,15441,19056,22997,24702,26641,27585,29121
3,1.53664,1.50323,1.46508,1.42571,1.4347,1.42078,1.44322,1.4524,1.46457,1.51837,...,53643,465916,16694,15318,18830,22634,24608,26706,27805,29507
4,1.52808,1.47522,1.45618,1.37537,1.37561,1.39346,1.42378,1.46455,1.47553,1.46507,...,48987,456795,15882,14244,17120,20866,23471,26877,28607,31338


In [5]:
def feature_groupings(df, patterns=None, mingroup=2, forbidden=set([])):
    if patterns is None:
        patterns = ['-', '_', '.']

    groups = {}
    names = df.columns
    for feature in names:
        parts = []
        for patt in patterns:
            s = feature.split(patt)
            parts += s
        for part in parts:
            if part in forbidden:
                continue
            if not groups.get(part, None):
                groups[part] = set()
            groups[part] = groups[part].union(set([feature]))
    
    return { k: v for k,v in groups.items() if len(v) >= mingroup }

boring =set(categories.keys()).union(['originals', 'wavelet-LLL', 'LLL', 'wavelet-LHH', 'LLH', 'wavelet-LLH', 'LHH', 'wavelet-LHH', 'LHL', 'wavelet-LHL', 'lbp'])
groups = feature_groupings(fulldata, forbidden=boring, mingroup=6)
pandas.DataFrame(index=groups.keys(), data={'Size': [len(v) for v in groups.values()] }).sort_values('Size', ascending=False)

Unnamed: 0,Size
glcm2,105
glcm1,105
glcm15,105
glcm10,105
glcm3,105
first,95
glrlm,80
glszm,75
gldm,70
Contrast,30


In [6]:
fulldata.shape

(128, 1426)

In [7]:
len(list(fulldata.Labels.values))

128

In [97]:
# Goal: train classifiers that have high test set accuracy using as few features as possible
def evaluate_one(estimator, data, seed=1, n_random=100, cv=3):
    # TODO: clone estimator. Pass in random_state
    train_test_split = sklearn.model_selection.train_test_split
    
    feature_columns = list(set(data) - set(['Labels']))
    use = data[data.Labels.notna()]

    numpy.random.seed(seed)
    test_scores = numpy.array([])
    train_scores = numpy.array([])
    for rng in numpy.random.randint(0, 1000, size=n_random):
        #estimator = estimator_class(**params)
        X = use[feature_columns].astype(float)
        X_train, X_test, Y_train, Y_test = train_test_split(X, use.Labels,
                                                            test_size=0.4, random_state=rng)

        estimator.fit(X_train, Y_train)

        test = sklearn.model_selection.cross_val_score(estimator, X_test, Y_test, cv=cv)
        train = sklearn.model_selection.cross_val_score(estimator, X_train, Y_train, cv=cv)
        test_scores = numpy.concatenate([test_scores, test]) 
        train_scores = numpy.concatenate([train_scores, train])
    
    return train_scores, test_scores

def evaluate_many(models, data, n_random=100, cv=5):
    df = pandas.DataFrame({
        'model': [],
        'parameters': [],
        'train_accuracy_mean': [],
        'train_accuracy_std': [],
        'test_accuracy_mean': [],
        'test_accuracy_std': [],
    })
    for (mtype, mclass) in models:
        train, test = evaluate_one(mclass, data, n_random=n_random, cv=cv)
        d = pandas.DataFrame({
            'model': [ mtype ],
            'parameters': [ 'TODO' ],
            'train_accuracy_mean': [ numpy.mean(train) ],
            'train_accuracy_std': [ numpy.std(train) ],
            'test_accuracy_mean': [ numpy.mean(test) ],
            'test_accuracy_std': [ numpy.std(test) ],
        })
        df = pandas.concat([df, d])
        
    return df



RandomForestClassifier = sklearn.ensemble.RandomForestClassifier

RandomForest = sklearn.pipeline.Pipeline([
  ('classification', RandomForestClassifier(n_estimators=50, min_samples_leaf=0.001))
])

RandomForest5 = sklearn.pipeline.Pipeline([
  ('feature_selection', sklearn.feature_selection.SelectFromModel(RandomForestClassifier(n_estimators=50, min_samples_leaf=0.001), threshold=-numpy.inf, max_features=5)),
  ('classification', RandomForestClassifier(n_estimators=50, min_samples_leaf=0.001))
])

logistic_params = {
    'log__C': [ 1.0, 0.75, 0.5, 0.25 ],
}
GridSearchCV = sklearn.model_selection.GridSearchCV
LogisticRegression = sklearn.linear_model.LogisticRegression
LogisticRegressionL1 = GridSearchCV(sklearn.pipeline.Pipeline([
    ('scale', sklearn.preprocessing.StandardScaler()),
    ('log', LogisticRegression(penalty='l1', max_iter=100, solver='saga', multi_class='multinomial')),
]), param_grid=logistic_params, cv=5)

# Motivations for feature selection
# Getting a more explainable model. PCA/dim reduction. Using only one feature type
# Reducing resources needed for data gathering. Using few feature types.
# Reducing prediction time. Avoid many/expensive feature computations

# TODO: implement k-NN?
# TODO: implement PCA. On all features, grouped by feature type?

models = [
    #('rf-all', RandomForest ),
    #('rf-5', RandomForest5 ),
    ('logistic-l1',  LogisticRegressionL1),
]
    
df = evaluate_many(models, fulldata, n_random=1, cv=3)
df.head()









Unnamed: 0,model,parameters,train_accuracy_mean,train_accuracy_std,test_accuracy_mean,test_accuracy_std
0,logistic-l1,TODO,0.858685,0.026912,0.793956,0.039685


In [95]:
LogisticRegressionL1.best_params_

{'log__C': 1.0}

In [98]:
pandas.DataFrame(LogisticRegressionL1.cv_results_)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_log__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,2.68999,0.073675,0.004595,0.001074,1.0,{'log__C': 1.0},0.846154,1.0,0.909091,0.9,...,0.912281,0.052982,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,2.635399,0.089513,0.00472,0.001298,0.75,{'log__C': 0.75},0.923077,1.0,0.909091,0.9,...,0.929825,0.039097,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,2.582524,0.087914,0.004394,0.001041,0.5,{'log__C': 0.5},0.769231,1.0,0.909091,0.9,...,0.894737,0.078544,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,2.547987,0.089202,0.004863,0.001077,0.25,{'log__C': 0.25},0.769231,0.846154,0.818182,0.8,...,0.789474,0.049083,4,0.977273,0.909091,0.956522,0.957447,0.957447,0.951556,0.022622


In [87]:
LogisticRegressionL1.best_estimator_.steps[1][1].coef_.shape

(6, 1425)

In [91]:
numpy.nonzero(numpy.count_nonzero(LogisticRegressionL1.best_estimator_.steps[1][1].coef_, axis=0))[0].shape

(1047,)

In [29]:
# maybe consider a multi step-approach.
# 0: Evaluate features with different methods
# RF feature importance, Linear/kNN SBS/SFS, SelectKBest scoring funcs 
# For each random iteration, build a dictionary with feature: score
# 1. Build a shortlist of features to consider
# Look at the score statistics
# Extract top-NN
# 2: Train expressive classifiers on the shortlisted features
# RandomForest, SVM rbf, DNN, GBT

In [55]:
def extract_feature_scores(data):
    ret = []
    scorers = [
        sklearn.feature_selection.mutual_info_classif,
        sklearn.feature_selection.f_classif,
        sklearn.feature_selection.chi2,
    ]
    for f in scorers:
        feature_columns = list(set(data) - set(['Labels']))
        use = data[data.Labels.notna()]
        X = use[feature_columns]
        X = sklearn.preprocessing.MinMaxScaler().fit_transform(X)
        y = use.Labels
        scores = numpy.array(f(X, y))
        if scores.shape[0] == 2:
            scores = scores[0] # drop p-values
        # normalize it
        scores = scores/scores.sum()
        ret.append(scores)
    return numpy.array(ret)
        
s = extract_feature_scores(fulldata)
assert s.shape[1] == 1425
print(sorted(numpy.argsort(s, axis=0)[:20]))

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [76]:
# top features
numpy.sort(numpy.argsort(s, axis=1)[:,:20])

array([[  15,   33,   80,  187,  265,  311,  385,  551,  659,  795,  854,
         926,  945,  981,  995,  998, 1051, 1201, 1319, 1345],
       [ 258,  301,  303,  311,  659,  730,  854,  888,  926,  945,  981,
        1001, 1037, 1051, 1088, 1098, 1112, 1201, 1276, 1368],
       [ 187,  265,  311,  373,  531,  659,  676,  795,  854,  866,  888,
         907,  926,  945,  981, 1001, 1037, 1089, 1098, 1201]])