In [1]:
%matplotlib inline

import pandas
import numpy
import sklearn
import sklearn.model_selection
import sklearn.ensemble
import sklearn.pipeline
import sklearn.feature_selection


In [2]:
# Return a combined DataFrame, and a dictionary of {featuretype: columns}
def load_data():
    """Load every sheet of ``Full_data.xlsx`` and join them into one table.

    Each sheet is parsed into a DataFrame indexed by its ``Index`` column.
    The ``AMT`` sheet has no usable column names, so its columns get an
    ``-AMT`` suffix; all other sheets are joined onto it by index.

    Returns
    -------
    combined : pandas.DataFrame
        The suffixed AMT columns followed by the columns of every other sheet.
    feature_categories : dict
        ``{sheet name: column labels}``. The labels are the ones used in
        ``combined`` (i.e. the AMT entry carries the ``-AMT`` suffix), so each
        entry can be used to select columns from ``combined``.

    Raises
    ------
    KeyError
        If a sheet has no ``Index`` column or the workbook has no ``AMT`` sheet.
    """
    def load_sheet(workbook, name):
        # Promote the 'Index' column to the row index (and drop it as a column).
        return workbook.parse(name).set_index('Index')

    # The context manager closes the workbook handle once all sheets are read.
    with pandas.ExcelFile('Full_data.xlsx') as workbook:
        dfs = {sheet: load_sheet(workbook, sheet) for sheet in workbook.sheet_names}

    # AMT is missing names for columns
    amt = dfs['AMT'].copy().add_suffix('-AMT')

    # Compare sheet names for equality; `k not in ('AMT')` would be a substring
    # test against the string 'AMT' and silently drop sheets such as 'A' or 'AM'.
    others = [df for sheet, df in dfs.items() if sheet != 'AMT']
    combined = amt.join(others, lsuffix='', rsuffix='')

    # TODO: there are also categories within each wavelet series
    # separated with a _ or -
    feature_categories = {
        category: (amt.columns if category == 'AMT' else df.columns)
        for category, df in dfs.items()
    }

    return combined, feature_categories

fulldata, categories = load_data()

# Sanity checks on the loaded workbook: 128 rows in total, 96 of them labelled.
assert len(fulldata) == 128, fulldata.shape
assert fulldata['Labels'].notnull().sum() == 96

In [3]:
# One category per workbook sheet (the dictionary is keyed by sheet name).
categories.keys()

dict_keys(['Labels', 'AMT', 'WT_originals', 'WT-LLL', 'WT-LLH', 'WT-LHL', 'WT-LHH', 'LBP'])

In [4]:
# Preview the first rows of the combined table.
fulldata.head()

Unnamed: 0_level_0,1-AMT,2-AMT,3-AMT,4-AMT,5-AMT,6-AMT,7-AMT,8-AMT,9-AMT,10-AMT,...,"lbp_24_(24,3)","lbp_25_(24,3)","lbp_2_(24,3)","lbp_3_(24,3)","lbp_4_(24,3)","lbp_5_(24,3)","lbp_6_(24,3)","lbp_7_(24,3)","lbp_8_(24,3)","lbp_9_(24,3)"
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.55144,1.5198,1.45847,1.4394,1.43263,1.44893,1.50741,1.54977,1.59866,1.63075,...,53695,468638,17261,15439,18953,22857,24262,26894,27454,29276
1,1.51458,1.47539,1.45449,1.46195,1.4409,1.47846,1.51607,1.53459,1.51287,1.54192,...,53047,462418,16941,15294,18823,22369,24826,27250,28437,30374
2,1.50565,1.46602,1.444,1.45143,1.42508,1.41574,1.44243,1.47648,1.44198,1.45998,...,53917,469225,17328,15441,19056,22997,24702,26641,27585,29121
3,1.53664,1.50323,1.46508,1.42571,1.4347,1.42078,1.44322,1.4524,1.46457,1.51837,...,53643,465916,16694,15318,18830,22634,24608,26706,27805,29507
4,1.52808,1.47522,1.45618,1.37537,1.37561,1.39346,1.42378,1.46455,1.47553,1.46507,...,48987,456795,15882,14244,17120,20866,23471,26877,28607,31338


In [5]:
def feature_groupings(df, patterns=None, mingroup=2, forbidden=frozenset()):
    """Group column labels by the name fragments they share.

    Every column label is split on each separator in ``patterns``
    independently; each resulting fragment becomes a group key, and the column
    is added to that fragment's group. A label that does not contain a given
    separator contributes its whole name as a fragment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are grouped. Non-string labels are split on
        their ``str()`` form but stored unchanged in the groups.
    patterns : list of str, optional
        Separators to split on. Defaults to ``['-', '_', '.']``.
    mingroup : int
        Only groups with at least this many columns are returned.
    forbidden : iterable of str
        Fragments that must not be used as group keys.

    Returns
    -------
    dict
        ``{fragment: set of column labels containing that fragment}``.
    """
    if patterns is None:
        patterns = ['-', '_', '.']
    # Accept any iterable and make membership tests O(1).
    forbidden = frozenset(forbidden)

    groups = {}
    for feature in df.columns:
        # Column labels read from a spreadsheet may be ints; split their text form.
        label = str(feature)
        for patt in patterns:
            for part in label.split(patt):
                if part in forbidden:
                    continue
                # Grow the group in place instead of rebuilding the set each time.
                groups.setdefault(part, set()).add(feature)

    return {k: v for k, v in groups.items() if len(v) >= mingroup}

# Fragments that are excluded as group keys: the sheet names plus the
# wavelet-band / LBP prefixes listed here.
boring = set(categories.keys()) | {
    'originals', 'wavelet-LLL', 'LLL', 'wavelet-LHH', 'LLH', 'wavelet-LLH',
    'LHH', 'wavelet-LHH', 'LHL', 'wavelet-LHL', 'lbp',
}
groups = feature_groupings(fulldata, forbidden=boring, mingroup=6)

# Table of group sizes, largest group first.
pandas.DataFrame(
    index=groups.keys(),
    data={'Size': [len(members) for members in groups.values()]},
).sort_values('Size', ascending=False)

Unnamed: 0,Size
glcm2,105
glcm1,105
glcm15,105
glcm10,105
glcm3,105
first,95
glrlm,80
glszm,75
gldm,70
Contrast,30


In [6]:
# (rows, columns) of the combined table.
fulldata.shape

(128, 1426)

In [7]:
# Length of the Labels column; missing labels are counted as well.
len(list(fulldata.Labels.values))

128

In [18]:
# Goal: train classifiers that have high test set accuracy using as few features as possible
def evaluate_one(estimator, data, seed=1, n_random=100, cv=3):
    """Score ``estimator`` over repeated random 60/40 splits of the labelled rows.

    Rows with a missing ``Labels`` value are dropped; every other column of
    ``data`` is used as a feature, in the DataFrame's own column order.

    Parameters
    ----------
    estimator : scikit-learn estimator or pipeline
        Fitted in place on each training partition, so it is left fitted on
        the last split when the function returns.
    data : pandas.DataFrame
        Must contain a ``Labels`` column.
    seed : int
        Seed for the *global* numpy RNG, from which the per-split seeds are drawn.
    n_random : int
        Number of random train/test splits.
    cv : int
        Forwarded to ``cross_val_score`` for both partitions.

    Returns
    -------
    (train_scores, test_scores) : tuple of 1-D numpy arrays
        The cross-validation scores of all splits, concatenated in split order.
        Both arrays are empty when ``n_random`` is 0.
    """
    # TODO: clone estimator. Pass in random_state
    train_test_split = sklearn.model_selection.train_test_split
    cross_val_score = sklearn.model_selection.cross_val_score

    # Keep the frame's column order. Building the list from a set made the
    # order depend on string hashing, which changes between interpreter runs,
    # so seeded runs were not reproducible.
    feature_columns = list(dict.fromkeys(c for c in data.columns if c != 'Labels'))
    use = data[data.Labels.notna()]
    features, labels = use[feature_columns], use.Labels

    numpy.random.seed(seed)
    test_chunks = []
    train_chunks = []
    for rng in numpy.random.randint(0, 10000, size=n_random):
        X_train, X_test, Y_train, Y_test = train_test_split(features, labels,
                                                            test_size=0.4, random_state=rng)

        estimator.fit(X_train, Y_train)

        # NOTE(review): cross_val_score refits clones of the estimator on folds
        # of the partition it is given, so the "test" scores are cross-validation
        # scores *within* the held-out rows rather than scores of the model
        # fitted above. Left as is to keep the returned array shape; confirm
        # whether estimator.score(X_test, Y_test) was intended.
        test_chunks.append(cross_val_score(estimator, X_test, Y_test, cv=cv))
        train_chunks.append(cross_val_score(estimator, X_train, Y_train, cv=cv))

    # Concatenate once at the end; numpy.concatenate rejects an empty list.
    train_scores = numpy.concatenate(train_chunks) if train_chunks else numpy.array([])
    test_scores = numpy.concatenate(test_chunks) if test_chunks else numpy.array([])

    return train_scores, test_scores

def evaluate_many(models, data, n_random=100, cv=5):
    """Evaluate several models with ``evaluate_one`` and tabulate the accuracies.

    Parameters
    ----------
    models : iterable of (name, estimator) pairs
    data : pandas.DataFrame
        Passed through to ``evaluate_one``.
    n_random, cv : int
        Passed through to ``evaluate_one``.

    Returns
    -------
    pandas.DataFrame
        One row per model, in input order, indexed 0..n-1, with the mean and
        standard deviation of the train and test scores. The frame is empty
        (but has the same columns) when ``models`` is empty.
    """
    columns = [
        'model',
        'parameters',
        'train_accuracy_mean',
        'train_accuracy_std',
        'test_accuracy_mean',
        'test_accuracy_std',
    ]

    # Collect plain records and build the frame once: concatenating inside the
    # loop is quadratic and gave every row the same index label (0).
    rows = []
    for (mtype, mclass) in models:
        train, test = evaluate_one(mclass, data, n_random=n_random, cv=cv)
        rows.append({
            'model': mtype,
            'parameters': 'TODO',
            'train_accuracy_mean': numpy.mean(train),
            'train_accuracy_std': numpy.std(train),
            'test_accuracy_mean': numpy.mean(test),
            'test_accuracy_std': numpy.std(test),
        })

    return pandas.DataFrame(rows, columns=columns)



RandomForestClassifier = sklearn.ensemble.RandomForestClassifier

# Baseline: a random forest trained on every feature column.
RandomForest = sklearn.pipeline.Pipeline(steps=[
    ('classification', RandomForestClassifier(n_estimators=50, min_samples_leaf=0.001)),
])

# Reduced model: a first forest selects features, a second forest classifies
# on them. threshold=-inf makes max_features=5 the only selection criterion.
RandomForest5 = sklearn.pipeline.Pipeline(steps=[
    (
        'feature_selection',
        sklearn.feature_selection.SelectFromModel(
            estimator=RandomForestClassifier(n_estimators=50, min_samples_leaf=0.001),
            threshold=-numpy.inf,
            max_features=5,
        ),
    ),
    ('classification', RandomForestClassifier(n_estimators=50, min_samples_leaf=0.001)),
])

models = [('rf-all', RandomForest), ('rf-5', RandomForest5)]

df = evaluate_many(models, fulldata, n_random=10, cv=3)
df.head()

Unnamed: 0,model,parameters,test_accuracy_mean,test_accuracy_std,train_accuracy_mean,train_accuracy_std
0,rf-all,TODO,0.869558,0.106525,0.880518,0.054296
0,rf-5,TODO,0.791251,0.113168,0.848283,0.064511
