### Imports

In [1]:
from __future__ import print_function, division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from ggplot import *
import json

import os
from os.path import join
print('Current directory: {}'.format(os.getcwd()))
import sys
sys.path.append('..')
from glob import glob
from pprint import pprint
from time import time

import subprocess
import re

Current directory: /Users/hmourit/Documents/0project/MScThesis/notebooks


### General constants

In [2]:
# Base folders, given relative to the notebook's working directory.
# RESULTS_PATH is where the "Load results" cell globs the experiment JSON files;
# DATA_PATH is not referenced in the cells visible here.
DATA_PATH = '../../bucket/data/'
RESULTS_PATH = '../../bucket/results/'

### LaTeX table generation

#### Some constants

In [11]:
# Text that process_latex emits inside \caption{...}; save_table replaces it
# with the real caption when one is given.
CAPTION_PLACEHOLDER = r'caption:placeholder'
# Regex passed as `remove_lines` to process_latex by the table cells: it matches
# lines that start with the LaTeX-escaped "n\_features" label.
REMOVE_N_FEATURES = r'^n\\_features*'

# Default output folder of save_table ("~" is expanded there).
TABLES_FOLDER = '~/Dropbox/MSc Thesis/tables'

#### Util functions

In [6]:
def mean_pm_std(mean, std):
    """Render a mean and its standard deviation as one LaTeX-ready string.

    Both numbers are printed with two decimals and joined by the LaTeX
    plus-minus macro written in math mode.
    """
    # Raw string: '\p' is not a valid escape sequence, so the non-raw spelling
    # triggers an invalid-escape warning on recent Python versions.
    return r'{:1.2f}$\pm${:1.2f}'.format(mean, std)
# Vectorised so that it can be applied element-wise to whole columns.
mean_pm_std = np.vectorize(mean_pm_std)

In [7]:
def pbcopy(data):
    """Copy data to the clipboard through the ``pbcopy`` command-line tool.

    ``data`` may be text or bytes. Text is encoded as UTF-8 first, because the
    pipe to the child process only accepts bytes on Python 3.
    """
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    p = subprocess.Popen(['pbcopy'], stdin=subprocess.PIPE)
    # communicate() writes the payload, closes stdin and waits for the process.
    p.communicate(data)

In [8]:
def bold_center(cell):
    """Wrap ``cell`` in an ``mc`` macro call (1 column, ``c`` alignment) with bold text."""
    opening = r'\mc{1}{c}{\textbf{'
    closing = '}}'
    return ''.join((opening, cell, closing))

In [9]:
def process_latex(tabular, remove_lines=None):
    r"""Clean up a LaTeX ``tabular`` string and wrap it in a ``table`` float.

    Processing steps:

    1. The ``\begin{tabular}`` line is always kept, with every ``l`` in its
       column specification turned into ``r``.
    2. Any other line that matches (``re.match``) one of the ``remove_lines``
       patterns is dropped; all remaining lines are kept.
    3. A fixed set of substrings (classifier / filter names, the escaped
       plus-minus marker) is replaced by its display form.
    4. The result is surrounded by ``\begin{table}[!h]`` ... ``\end{table}``
       with placeholder caption and label.

    Parameters
    ----------
    tabular : str
        LaTeX source of a tabular environment (the table cells pass the
        output of ``DataFrame.to_latex``).
    remove_lines : str, list or tuple of str, optional
        Regular expression(s) for lines to drop. ``None`` or an empty value
        means that no line is removed.

    Returns
    -------
    str
        The complete ``table`` environment.
    """
    # Raw strings: '\_' is not a valid escape sequence in a normal literal.
    replacements = {
        ' en ': bold_center('EN'),
        r'svm\_linear\_kernel': bold_center('SVM'),
        r'svm\_linear\_l1': bold_center('L1-SVM'),
        r'\$\textbackslashpm\$': r'$\pm$',
        'clf': bold_center('CLF'),
        r'infogain\_10': bold_center('IG D1'),
        r'infogain\_exp': bold_center('IG D2')
    }

    # Normalise remove_lines to a (possibly empty) sequence of patterns.
    if not remove_lines:
        remove_lines = []
    elif not isinstance(remove_lines, (list, tuple)):
        remove_lines = [remove_lines]

    purged_tabular = []
    for line in tabular.split('\n'):
        if r'\begin{tabular}' in line:
            begin_tabular = line.split('}{')
            line = begin_tabular[0] + '}'
            line += '{' + begin_tabular[1].replace('l', 'r')
            purged_tabular.append(line)
        # With no patterns any() is False, so every line is kept. Previously
        # the whole table body was discarded when remove_lines was None.
        elif not any(re.match(pattern, line) for pattern in remove_lines):
            purged_tabular.append(line)
    tabular = '\n'.join(purged_tabular)

    for x in replacements:
        tabular = tabular.replace(x, replacements[x])

    # Each environment line ends with a newline (hence the trailing '').
    table_env_begin = '\n'.join([r'\begin{table}[!h]',
                                 r'\centering',
                                 r'\footnotesize',
                                 ''])
    table_env_end = '\n'.join([r'\caption{caption:placeholder}',
                               r'\label{fig:placeholder}',
                               r'\end{table}',
                               ''])

    return table_env_begin + tabular + table_env_end

In [12]:
def save_table(tex, filename, caption=None, folder=TABLES_FOLDER):
    """Write a LaTeX table to ``filename`` inside ``folder``.

    The ``fig:placeholder`` label is renamed after the file name (the text
    before its first dot). When ``caption`` is given, the caption placeholder
    is substituted as well. ``~`` in ``folder`` is expanded.
    """
    out_dir = os.path.expanduser(folder)
    label = 'fig:{}'.format(filename.split('.')[0])
    content = tex.replace('fig:placeholder', label)
    if caption is not None:
        content = content.replace(CAPTION_PLACEHOLDER, caption)
    target = os.path.join(out_dir, filename)
    with open(target, 'w') as handle:
        handle.write(content)

### Load results

In [None]:
from sklearn.metrics import accuracy_score

# Collect the result files of the feature-selection experiments. The file name
# is expected to look like "<filter...>_<exp_id>.json".
result_files = glob(join(RESULTS_PATH, '*_*.json'))
result_files = [x for x in result_files if os.path.basename(x).startswith(('anova', 'infogain', 'rfe', 'chi2', 'mrmr'))]

# One record per (file, iteration, feature subset) with train/test accuracy.
results = []
for f in result_files:
    # Experiment id = text after the last underscore, without the extension.
    # splitext is used because rstrip('.json') strips a *set of characters*
    # and would also remove trailing 'j', 's', 'o', 'n' or '.' from the id.
    exp_id = os.path.splitext(f.split('_')[-1])[0]
    try:
        # glob already returns paths that include RESULTS_PATH.
        with open(f, 'r') as fh:
            d = json.load(fh)
    except ValueError as e:
        msg = e
        if os.path.getsize(f) == 0:
            msg = 'File size is 0. Removing.'
            os.remove(f)
        print('{} -> {}'.format(f, msg))
        # Skip this file: otherwise the code below would run with the
        # previous file's `d` (or fail with NameError on the first file).
        continue
    # Every top-level field except the experiments themselves is metadata
    # shared by all records of this file.
    base = {'exp_id': exp_id}
    for k, v in d.items():
        if k != 'experiments':
            base[k] = v
    # For mrmr / rfe files the filter name is taken from the file name.
    if os.path.basename(f).startswith('mrmr'):
        base['filter'] = 'mrmr'
    elif os.path.basename(f).startswith('rfe'):
        base['filter'] = 'rfe'
    for exp in d['experiments']:
        it = exp['iteration']
        if 'subsets' not in exp:
            print('{} -> Field "subsets" not found.'.format(f))
            continue
        for s in exp['subsets']:
            train = accuracy_score(s['train']['y_true'], s['train']['y_pred'])
            test = accuracy_score(s['test']['y_true'], s['test']['y_pred'])
            # Fall back to counting the selected features when the size of
            # the subset is not stored explicitly.
            if 'n_features' in s:
                n_features = s['n_features']
            else:
                n_features = len(s['features'])
            results.append(dict(base, iteration=it, train=train, test=test, n_features=n_features))

df = pd.DataFrame(results)

## Tables

#### Some constants

In [13]:
# Values compared against the `data` column of the results frame.
MDD = 'mdd_raw37'
EPI = 'epi_ad'
# Value compared against the `target` column (used together with EPI).
AD = 'ad.disease.status'

#### d1_vs_d2_mdd

In [None]:
# Table d1_vs_d2_mdd: test accuracy of the two infogain filters on the
# MDD data with target 'stress', per number of features and classifier.
foo = df
foo = foo[((foo.data == MDD)
           & (foo.target == 'stress')
           & (foo['filter'].isin(['infogain_10', 'infogain_exp'])))]
# Mean and std of the test accuracy for every (n_features, filter, clf) group.
foo = foo.groupby(['n_features', 'filter', 'clf']).agg({'test': [np.mean, np.std]})
foo['Accuracy'] = mean_pm_std(foo['test']['mean'], foo['test']['std'])
# Pivot: rows = n_features, columns = (clf, filter).
foo = foo['Accuracy'].unstack().unstack()
foo_latex = foo.to_latex()
foo_latex = process_latex(foo_latex, remove_lines=REMOVE_N_FEATURES)
# (source text, effects) pairs handed to batch_replace.
# NOTE(review): batch_replace is not defined in the cells visible here;
# confirm it is defined before this cell runs.
repl = [('clf', ['b', '2r', 'c']),
        ('en', 'bc'),
        ('svm', 'bc'),
        ('l1 svm', 'bc'),
        
        ('IG1', 'bc'),
        ('IG2', 'bc'),
        ('pm', None)]
foo_latex = batch_replace(foo_latex, repl)
# Blank out the remaining 'filter' level name.
foo_latex = foo_latex.replace('filter', '')
save_table(foo_latex, 'd1_vs_d2_mdd.tex')

#### dis_anal_mdd_en

In [None]:
# Table dis_anal_mdd_en: test accuracy of the 'en' classifier on the MDD data
# with target 'stress', for every filter, with filters grouped by their
# discretization label.
foo = df
def discretization_type(method):
    """Return the discretization label ('D0', 'D1' or 'D2') of a filter name.

    Raises ValueError for a filter name that is not listed below.
    """
    if method in ['infogain_exp', 'chi2']:
        return 'D2'
    elif method in ['infogain_10', 'mrmr']:
        return 'D1'
    elif method in ['anova', 'rfe']:
        return 'D0'
    else:
        raise ValueError('Unknown filter: {!r}'.format(method))
vec_dis = np.vectorize(discretization_type)
# NOTE: foo is still the same object as df here, so this adds the 'disc'
# column to df itself.
foo['disc'] = vec_dis(foo['filter'])
# Every condition is built from foo, so the boolean mask has the same index as
# the frame it filters (a mask taken from df had to be re-aligned by pandas).
foo = foo[((foo.data == MDD)
           & (foo.target == 'stress')
           & (foo.clf == 'en'))]
# Mean and std of the test accuracy for every (n_features, filter, disc) group.
foo = foo.groupby(['n_features', 'filter', 'disc']).agg({'test': [np.mean, np.std]})
foo['Accuracy'] = mean_pm_std(foo['test']['mean'], foo['test']['std'])
# Pivot: rows = n_features, columns = (disc, filter).
foo = foo['Accuracy'].unstack().unstack()
# reindex (rather than .loc) so that a feature count without results becomes
# an empty row instead of raising KeyError.
foo = foo.reindex([37231, 30000, 20000, 10000, 5000, 1000, 500, 100, 50, 10])
foo = foo.dropna(axis=1, how='all')

foo_latex = foo.to_latex(na_rep=' ')
foo_latex = process_latex(foo_latex, remove_lines=REMOVE_N_FEATURES)
# NOTE(review): batch_replace, my_replace and add_clines are not defined in
# the cells visible here; confirm they are defined before this cell runs.
repl = [('clf', ['b', '2r', 'c']),
        ('en', 'bc'),
        ('svm', 'bc'),
        ('l1 svm', 'bc'),
        ('anova', 'bc'),
        ('mrmr', 'bc'),
        ('rfe', 'bc'),
        ('IG1', 'bc'),
        ('IG2', 'bc'),
        ('chi2', 'bc'),
        ('pm', None)]
foo_latex = batch_replace(foo_latex, repl)
foo_latex = my_replace(foo_latex, src='disc', dst=r'\# Feat', effects=['b', '2r', 'c'])
for d in [' D0 ', ' D1 ', ' D2 ']:
    foo_latex = my_replace(foo_latex, src=d, dst=d, effects='bc')
# A stray trailing backtick used to make this line a syntax error.
foo_latex = foo_latex.replace('filter', '')
foo_latex = add_clines(foo_latex, [(1, 2, 3), (1, 4, 5), (1, 6, 7)])
save_table(foo_latex, 'dis_anal_mdd_en.tex')

#### dis_anal_mdd_svm

In [None]:
# Table dis_anal_mdd_svm: test accuracy of the 'svm_linear_kernel' classifier
# on the MDD data with target 'stress', for every filter, with filters grouped
# by their discretization label.
foo = df
def discretization_type(method):
    """Return the discretization label ('D0', 'D1' or 'D2') of a filter name.

    Raises ValueError for a filter name that is not listed below.
    """
    if method in ['infogain_exp', 'chi2']:
        return 'D2'
    elif method in ['infogain_10', 'mrmr']:
        return 'D1'
    elif method in ['anova', 'rfe']:
        return 'D0'
    else:
        raise ValueError('Unknown filter: {!r}'.format(method))
vec_dis = np.vectorize(discretization_type)
# NOTE: foo is still the same object as df here, so this adds the 'disc'
# column to df itself.
foo['disc'] = vec_dis(foo['filter'])
# Every condition is built from foo, so the boolean mask has the same index as
# the frame it filters (a mask taken from df had to be re-aligned by pandas).
foo = foo[((foo.data == MDD)
           & (foo.target == 'stress')
           & (foo.clf.isin(['svm_linear_kernel'])))]
# Mean and std of the test accuracy for every (n_features, filter, disc) group.
foo = foo.groupby(['n_features', 'filter', 'disc']).agg({'test': [np.mean, np.std]})
foo['Accuracy'] = mean_pm_std(foo['test']['mean'], foo['test']['std'])
# Pivot: rows = n_features, columns = (disc, filter).
foo = foo['Accuracy'].unstack().unstack()
# reindex (rather than .loc) so that a feature count without results becomes
# an empty row instead of raising KeyError.
foo = foo.reindex([37231, 30000, 20000, 10000, 5000, 1000, 500, 100, 50, 10])
foo = foo.dropna(axis=1, how='all')
foo_latex = foo.to_latex(na_rep=' ')
foo_latex = process_latex(foo_latex, remove_lines=REMOVE_N_FEATURES)
# NOTE(review): batch_replace, my_replace and add_clines are not defined in
# the cells visible here; confirm they are defined before this cell runs.
repl = [('clf', ['b', '2r', 'c']),
        ('en', 'bc'),
        ('svm', 'bc'),
        ('l1 svm', 'bc'),
        ('anova', 'bc'),
        ('mrmr', 'bc'),
        ('rfe', 'bc'),
        ('IG1', 'bc'),
        ('IG2', 'bc'),
        ('chi2', 'bc'),
        ('pm', None)]
foo_latex = foo_latex.replace('clf', '')
foo_latex = batch_replace(foo_latex, repl)
foo_latex = my_replace(foo_latex, src='disc', dst=r'\# Feat', effects=['b', '2r', 'c'])
for d in [' D0 ', ' D1 ', ' D2 ']:
    foo_latex = my_replace(foo_latex, src=d, dst=d, effects='bc')
foo_latex = foo_latex.replace('filter', '')
foo_latex = add_clines(foo_latex, [(1, 2, 3), (1, 4, 5), (1, 6, 7)])
save_table(foo_latex, 'dis_anal_mdd_svm.tex')

#### dis_anal_mdd_svm_l1

In [None]:
# Table dis_anal_mdd_svm_l1: test accuracy of the 'svm_linear_l1' classifier
# on the MDD data with target 'stress', for every filter, with filters grouped
# by their discretization label.
foo = df
def discretization_type(method):
    """Return the discretization label ('D0', 'D1' or 'D2') of a filter name.

    Raises ValueError for a filter name that is not listed below.
    """
    if method in ['infogain_exp', 'chi2']:
        return 'D2'
    elif method in ['infogain_10', 'mrmr']:
        return 'D1'
    elif method in ['anova', 'rfe']:
        return 'D0'
    else:
        raise ValueError('Unknown filter: {!r}'.format(method))
vec_dis = np.vectorize(discretization_type)
# NOTE: foo is still the same object as df here, so this adds the 'disc'
# column to df itself.
foo['disc'] = vec_dis(foo['filter'])
# Every condition is built from foo, so the boolean mask has the same index as
# the frame it filters (a mask taken from df had to be re-aligned by pandas).
foo = foo[((foo.data == MDD)
           & (foo.target == 'stress')
           & (foo.clf.isin(['svm_linear_l1'])))]
# Mean and std of the test accuracy for every (n_features, filter, disc) group.
foo = foo.groupby(['n_features', 'filter', 'disc']).agg({'test': [np.mean, np.std]})
foo['Accuracy'] = mean_pm_std(foo['test']['mean'], foo['test']['std'])
# Pivot: rows = n_features, columns = (disc, filter).
foo = foo['Accuracy'].unstack().unstack()
# reindex (rather than .loc) so that a feature count without results becomes
# an empty row instead of raising KeyError.
foo = foo.reindex([37231, 30000, 20000, 10000, 5000, 1000, 500, 100, 50, 10])
foo = foo.dropna(axis=1, how='all')
foo_latex = foo.to_latex(na_rep=' ')
foo_latex = process_latex(foo_latex, remove_lines=REMOVE_N_FEATURES)
# NOTE(review): batch_replace, my_replace and add_clines are not defined in
# the cells visible here; confirm they are defined before this cell runs.
repl = [('clf', ['b', '2r', 'c']),
        ('en', 'bc'),
        ('svm', 'bc'),
        ('l1 svm', 'bc'),
        ('anova', 'bc'),
        ('mrmr', 'bc'),
        ('rfe', 'bc'),
        ('IG1', 'bc'),
        ('IG2', 'bc'),
        ('chi2', 'bc'),
        ('pm', None)]
foo_latex = foo_latex.replace('clf', '')
foo_latex = batch_replace(foo_latex, repl)
foo_latex = my_replace(foo_latex, src='disc', dst=r'\# Feat', effects=['b', '2r', 'c'])
for d in [' D0 ', ' D1 ', ' D2 ']:
    foo_latex = my_replace(foo_latex, src=d, dst=d, effects='bc')
foo_latex = foo_latex.replace('filter', '')
foo_latex = add_clines(foo_latex, [(1, 2, 3), (1, 4, 5), (1, 6, 7)])
save_table(foo_latex, 'dis_anal_mdd_svm_l1.tex')

#### mdd_drug_discard

In [None]:
# Table mdd_drug_discard: test accuracy on the MDD data with target 'drug'
# for three selected (classifier, filter) pairs.
foo = df
# Both conditions are taken from foo so mask and frame share one index.
foo = foo[((foo.data == MDD)
           & (foo.target == 'drug'))]
# Mean, std and group size of the test accuracy per (n_features, filter, clf).
# The former `squeeze=True` argument was dropped: it is deprecated and no
# longer accepted by current pandas versions.
foo = foo.groupby(['n_features', 'filter', 'clf']).agg({'test': [np.mean, np.std, len]})
foo['Accuracy'] = mean_pm_std(foo['test']['mean'], foo['test']['std'])
# Pivot: rows = n_features, columns = (clf, filter).
foo = foo['Accuracy'].unstack().unstack()
foo = foo[foo.index.isin([37231, 30000, 10000, 1000, 100, 10])]
# Largest number of features first.
foo = foo.iloc[::-1]
foo = foo[[('en', 'infogain_exp'), ('svm_linear_kernel', 'infogain_10'), ('svm_linear_l1', 'anova')]]
foo_latex = foo.to_latex()
foo_latex = process_latex(foo_latex, remove_lines=REMOVE_N_FEATURES)
# NOTE(review): batch_replace is not defined in the cells visible here;
# confirm it is defined before this cell runs.
repl = [('clf', ['b', '2r', 'c']),
        ('en', 'bc'),
        ('svm', 'bc'),
        ('l1 svm', 'bc'),
        ('IG1', 'bc'),
        ('IG2', 'bc'),
        ('anova', 'bc'),
        ('pm', None)]
foo_latex = batch_replace(foo_latex, repl)
foo_latex = foo_latex.replace('filter', '')
save_table(foo_latex, 'mdd_drug_discard.tex')

#### comparing_tissues

In [None]:
# Table comparing_tissues: test accuracy on the EPI data (target AD) for one
# selected (classifier, filter) pair per tissue.
# NOTE(review): GOOD_CLF_FILTER, CER, EC, FC, STG, WB, KER_SVM, my_replace and
# batch_replace are not defined in the cells visible here; confirm they are
# defined before this cell runs.
foo = df
# foo = foo.query("data == 'epi_ad' & target == 'ad.disease.status'")
foo = foo[((foo.data == EPI) & (foo.target == AD))]
foo = foo.query(GOOD_CLF_FILTER)
foo = foo[foo.n_features.isin([485577, 200000, 100000, 50000, 10000, 5000, 1000, 500, 100, 10])]
# Mean, std and group size of the test accuracy per (tissue, n_features, clf, filter).
foo = foo.groupby(['tissue', 'n_features', 'clf', 'filter']).agg({'test': [np.mean, np.std, len]})
foo['Accuracy'] = mean_pm_std(foo['test']['mean'], foo['test']['std'])
# The 'len' column is not used below: only 'Accuracy' is kept by the pivot.
foo['len'] = foo[('test', 'len')]
# Pivot: rows = n_features, columns = (tissue, clf, filter).
foo = foo['Accuracy'].unstack('tissue').unstack('clf').unstack('filter')
foo = foo.dropna(axis=1, how='all')
# Largest number of features first.
foo = foo.iloc[::-1]
# Keep one (tissue, clf, filter) column per tissue.
foo = foo[[(CER, 'en', 'infogain_exp'),
           (EC, 'en', 'chi2'),
           (FC, KER_SVM, 'infogain_10'), (STG, 'en', 'chi2'), (WB, 'en', 'chi2')]]
# foo = foo.drop(['cerebellum', EC, STG], axis=1)

foo_latex = foo.to_latex(na_rep=' ')
foo_latex = process_latex(foo_latex, remove_lines=REMOVE_N_FEATURES)
# (source text, effects) pairs handed to batch_replace.
repl = [('clf', ['b', '2r', 'c']),
        ('en', 'bc'),
        ('svm', 'bc'),
        ('l1 svm', 'bc'),
        ('anova', 'bc'),
        ('mrmr', 'bc'), 
        ('rfe', 'bc'),
        ('IG1', 'bc'),
        ('IG2', 'bc'),
        ('chi2', 'bc'), ('cer', 'bc'), ('ec', 'bc'), ('fc', 'bc'), ('stg', 'bc'), ('wb', 'bc'),
        ('pm', None)]
# Blank out the 'clf' and 'filter' level names before the other replacements.
foo_latex = foo_latex.replace('clf', '')
foo_latex = foo_latex.replace('filter', '')
foo_latex = my_replace(foo_latex, src='tissue', dst=r'\# Feat', effects=['b', '3r', 'c'])
foo_latex = batch_replace(foo_latex, repl)
save_table(foo_latex, 'comparing_tissues.tex')

#### mdd_rfe

In [None]:
# Table mdd_rfe: test accuracy of the 'rfe' filter on the MDD data with target
# 'stress', per number of features and classifier.
foo = df
foo = foo[(foo.data == MDD) & (foo.target == 'stress') & (foo['filter'] == 'rfe')]
# foo = foo[((foo.data == MDD) 
#            & (foo.target == 'drug'))]
#            & (foo['filter'] == 'rfe'))]
# Mean, std and group size of the test accuracy per (n_features, clf).
foo = foo.groupby(['n_features', 'clf']).agg({'test': [np.mean, np.std, len]})
foo['Accuracy'] = mean_pm_std(foo['test']['mean'], foo['test']['std'])
# Pivot: rows = n_features, columns = clf.
foo = foo['Accuracy'].unstack('clf')
foo = foo[foo.index.isin([37231, 30000, 10000, 1000, 100, 10])]
# Largest number of features first.
foo = foo.iloc[::-1]
# The 'svm_linear' classifier column is left out of the table.
foo = foo.drop('svm_linear', axis=1)
# foo = foo[[('en', 'infogain_exp'), ('svm_linear_kernel', 'infogain_10'), ('svm_linear_l1', 'anova')]]
foo_latex = foo.to_latex()
foo_latex = process_latex(foo_latex, remove_lines=REMOVE_N_FEATURES)
# (source text, effects) pairs handed to batch_replace.
# NOTE(review): batch_replace is not defined in the cells visible here;
# confirm it is defined before this cell runs.
repl = [('clf', ['b', 'c']),
        ('en', 'bc'),
        ('svm', 'bc'),
        ('l1 svm', 'bc'),
        ('IG1', 'bc'),
        ('IG2', 'bc'),
        ('anova', 'bc'),
        ('pm', None)]
foo_latex = batch_replace(foo_latex, repl)
save_table(foo_latex, 'mdd_rfe.tex')

#### ad_rfe

In [None]:
# Table ad_rfe: test accuracy of the 'rfe' filter on the EPI data, per number
# of features, for a selection of (tissue, classifier) columns.
# NOTE(review): CER, EC, FC, STG, WB, KER_SVM, LIN_SVM, L1_SVM, my_replace and
# batch_replace are not defined in the cells visible here; confirm they are
# defined before this cell runs.
foo = df
foo = foo[(foo.data == EPI) & (foo['filter'] == 'rfe')]
# foo = foo[((foo.data == MDD) 
#            & (foo.target == 'drug'))]
#            & (foo['filter'] == 'rfe'))]
# Mean, std and group size of the test accuracy per (n_features, clf, tissue).
foo = foo.groupby(['n_features', 'clf', 'tissue']).agg({'test': [np.mean, np.std, len]})
foo['Accuracy'] = mean_pm_std(foo['test']['mean'], foo['test']['std'])
# Pivot: rows = n_features, columns = (tissue, clf).
foo = foo['Accuracy'].unstack('tissue').unstack('clf')
foo = foo[foo.index.isin([485577, 200000, 100000, 50000, 10000, 5000, 1000, 500, 100, 10])]
# Largest number of features first.
foo = foo.iloc[::-1]
foo = foo.dropna(axis=1, how='all')
# Remove the listed columns; the bare names (FC, WB, EC) presumably drop every
# column of that tissue. TODO confirm against the pandas version in use.
foo = foo.drop([(CER, KER_SVM), (CER, LIN_SVM), (FC, LIN_SVM), FC, (STG, KER_SVM), (STG, L1_SVM), WB, EC], axis=1)
# foo = foo[[('en', 'infogain_exp'), ('svm_linear_kernel', 'infogain_10'), ('svm_linear_l1', 'anova')]]
foo_latex = foo.to_latex()
foo_latex = process_latex(foo_latex, remove_lines=REMOVE_N_FEATURES)
# (source text, effects) pairs handed to batch_replace.
repl = [('clf', ['b', 'c']),
        ('en', 'bc'),
        ('svm', 'bc'),
        ('svm k', 'bc'),
        ('l1 svm', 'bc'),
        ('IG1', 'bc'),
        ('IG2', 'bc'),
        ('anova', 'bc'),
        ('chi2', 'bc'), ('cer', 'bc'), ('ec', 'bc'), ('fc', 'bc'), ('stg', 'bc'), ('wb', 'bc'),
        ('pm', None)]
# Blank out the 'clf' level name before the other replacements.
foo_latex = foo_latex.replace('clf', '')
foo_latex = my_replace(foo_latex, src='tissue', dst=r'\# Feat', effects=['b', '2r', 'c'])
foo_latex = batch_replace(foo_latex, repl)

save_table(foo_latex, 'ad_rfe.tex')

#### mdd_robust

In [None]:
def format_jaccard(jaccard):
    """Format a number with two decimals."""
    return '{:0.2f}'.format(jaccard)
format_jaccard = np.vectorize(format_jaccard)

# Table mdd_robust: 'inter', 'union' and 'jaccard' values per filter and
# subset size.
# NOTE(review): `foo` must already hold the records this table is built from
# (with 'filter', 'size', 'clf', 'inter', 'union' and 'jaccard' fields); no
# visible cell creates them, so this cell depends on state from elsewhere.
# NOTE(review): this rebinds `df`, which the earlier table cells use as the
# results frame; re-running those cells afterwards will not work as before.
df = pd.DataFrame(foo)
df['jaccard'] = format_jaccard(df['jaccard'])
df = df.drop('clf', axis=1)
# Pivot: rows = size, columns = (value, filter), then swapped to (filter, value).
df = df.set_index(['filter', 'size']).unstack('filter')
df = df.swaplevel(0, 1, axis=1)
df = df[[('anova', 'inter'), ('anova', 'union'), ('anova', 'jaccard'),
         ('mrmr', 'inter'), ('mrmr', 'union'), ('mrmr', 'jaccard'),
         ('infogain_10', 'inter'), ('infogain_10', 'union'), ('infogain_10', 'jaccard')]]
# Largest size first.
df = df.iloc[::-1]
foo_latex = df.to_latex(na_rep=' ')
foo_latex = process_latex(foo_latex, remove_lines=r'^size*')
# NOTE(review): batch_replace, my_replace and add_clines are not defined in
# the cells visible here; confirm they are defined before this cell runs.
repl = [('clf', ['b', 'c']),
        ('en', 'bc'),
        ('svm', 'bc'),
        ('l1 svm', 'bc'),
        ('IG1', 'bc'),
        ('IG2', 'bc'),
        ('anova', 'bc'), ('mrmr', 'bc'),
        ('pm', None)]
foo_latex = batch_replace(foo_latex, repl)
# Raw string: '\#' is not a valid escape sequence in a normal literal.
foo_latex = my_replace(foo_latex, src='filter', dst=r'\# Feat', effects=['b', '2r', 'c'])
foo_latex = my_replace(foo_latex, src='inter', dst=r'$\bm{\cap}$', effects='c')
foo_latex = my_replace(foo_latex, src='union', dst=r'$\bm{\cup}$', effects='c')
foo_latex = my_replace(foo_latex, src='jaccard', dst=r'$\bm{J}$', effects='c')
foo_latex = add_clines(foo_latex, [(1, 2, 4), (1, 5, 7), (1, 8, 10)])
save_table(foo_latex, 'mdd_robust.tex')