In [126]:
%matplotlib inline

import pandas
import numpy
import sklearn
import sklearn.model_selection
import sklearn.ensemble


  from numpy.core.umath_tests import inner1d


In [124]:
# Return a combined DataFrame, and a dictionary of {featuretype: columns}
def load_data():
    def load_sheet(name):
        df = file.parse(name)
        df.index = df['Index']
        del df['Index']
        return df
    
    file = pandas.ExcelFile('Full_data.xlsx')
    dfs = { sheet: load_sheet(sheet) for sheet in file.sheet_names}
    
    # AMT is missing names for columns
    combine = [dfs[k] for k in dfs.keys() if k not in ('AMT')]
    
    combined = dfs['AMT'].copy().add_suffix('-AMT')
    combined = combined.join(combine, lsuffix='', rsuffix='')
    
    # TODO: there are also categories within each wavelet series
    # separated with a _ or -
    feature_categories = {}
    for category, df in dfs.items():
        columns = df.columns
        feature_categories[category] = columns
    
    return combined, feature_categories

fulldata, categories = load_data()
assert fulldata.shape[0] == 128, fulldata.shape
assert numpy.count_nonzero(fulldata.Labels.notnull()) == 96

In [89]:
categories.keys()

dict_keys(['Labels', 'AMT', 'WT_originals', 'WT-LLL', 'WT-LLH', 'WT-LHL', 'WT-LHH', 'LBP'])

In [53]:
fulldata.head()

Unnamed: 0_level_0,Labels,10th perc_first_originals,90th perc_first_originals,Energy_first_originals,Entropy_first_originals,Interquartile Range_first_originals,Kurtosis_first_originals,MAD_first_originals,Maximum_first_originals,Mean_first_originals,...,"lbp_24_(24,3)","lbp_25_(24,3)","lbp_2_(24,3)","lbp_3_(24,3)","lbp_4_(24,3)","lbp_5_(24,3)","lbp_6_(24,3)","lbp_7_(24,3)","lbp_8_(24,3)","lbp_9_(24,3)"
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Australia,89,142,15529230000.0,1.831645,28,2.811141,11.74549,211,115.253652,...,53695,468638,17261,15439,18953,22857,24262,26894,27454,29276
1,Australia,88,142,15523470000.0,1.843345,28,2.732178,11.887582,198,115.227023,...,53047,462418,16941,15294,18823,22369,24826,27250,28437,30374
2,Australia,88,142,15519160000.0,1.850979,28,2.76477,11.931181,208,115.210354,...,53917,469225,17328,15441,19056,22997,24702,26641,27585,29121
3,Australia,89,142,15522210000.0,1.837482,28,2.77299,11.711393,200,115.221868,...,53643,465916,16694,15318,18830,22634,24608,26706,27805,29507
4,Australia,89,143,15522930000.0,1.836921,29,2.697592,12.122661,203,115.225664,...,48987,456795,15882,14244,17120,20866,23471,26877,28607,31338


In [96]:
def feature_groupings(df, patterns=None, mingroup=2, forbidden=set([])):
    if patterns is None:
        patterns = ['-', '_', '.']

    groups = {}
    names = df.columns
    for feature in names:
        parts = []
        for patt in patterns:
            s = feature.split(patt)
            parts += s
        for part in parts:
            if part in forbidden:
                continue
            if not groups.get(part, None):
                groups[part] = set()
            groups[part] = groups[part].union(set([feature]))
    
    return { k: v for k,v in groups.items() if len(v) >= mingroup }

boring =set(categories.keys()).union(['originals', 'wavelet-LLL', 'LLL', 'wavelet-LHH', 'LLH', 'wavelet-LLH', 'LHH', 'wavelet-LHH', 'LHL', 'wavelet-LHL', 'lbp'])
groups = feature_groupings(fulldata, forbidden=boring, mingroup=6)
pandas.DataFrame(index=groups.keys(), data={'Size': [len(v) for v in groups.values()] }).sort_values('Size', ascending=False)

Unnamed: 0,Size
glcm2,105
glcm1,105
glcm15,105
glcm10,105
glcm3,105
first,95
glrlm,80
glszm,75
gldm,70
Contrast,30


In [113]:
fulldata.shape

(96, 926)

In [115]:
len(list(fulldata.Labels.values))

96

In [184]:
def evaluate(estimator, data, seed=1, n_random=5, cv=3):
    train_test_split = sklearn.model_selection.train_test_split
    
    feature_columns = list(set(data) - set(['Labels']))
    use = data[data.Labels.notna()]

    numpy.random.seed(seed)
    test_scores = numpy.array([])
    train_scores = numpy.array([])
    for rng in numpy.random.randint(0, 10000, size=n_random):
        X_train, X_test, Y_train, Y_test = train_test_split(use[feature_columns], use.Labels,
                                                            test_size=0.4, random_state=rng)

        estimator.fit(X_train, Y_train)

        test = sklearn.model_selection.cross_val_score(estimator, X_test, Y_test, cv=cv)
        train = sklearn.model_selection.cross_val_score(estimator, X_train, Y_train, cv=cv)
        test_scores = numpy.concatenate([test_scores, test]) 
        train_scores = numpy.concatenate([train_scores, train])
        

    print(numpy.mean(train_scores), numpy.max(train_scores), numpy.min(train_scores))
    print(numpy.mean(test_scores), numpy.max(test_scores), numpy.min(test_scores))    
    evaluation = data[data.Labels.isna()]
    Y_eval = estimator.predict(evaluation[feature_columns])
    
    pred = pandas.DataFrame({'Id': numpy.arange(1, len(Y_eval)+1), 'Category': Y_eval})
    return pred

randomforest = sklearn.ensemble.RandomForestClassifier(n_estimators=50, min_samples_leaf=0.001, random_state=1)
predictions = evaluate(randomforest, fulldata, n_random=1, cv=3)
predictions.to_csv('predictions.csv', index=False)
predictions

0.8922797189051058 0.9523809523809523 0.8421052631578947
0.7864801864801865 0.9230769230769231 0.6363636363636364


Unnamed: 0,Id,Category
0,1,China
1,2,Australia
2,3,SouthAfrica
3,4,China
4,5,SouthAfrica
5,6,SouthAfrica
6,7,China
7,8,SouthAfrica
8,9,USA
9,10,USA
