In [1]:
from multiprocessing import Pool

import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests
from tsfresh.feature_selection.significance_tests import target_binary_feature_real_test, \
    target_real_feature_binary_test, target_real_feature_real_test, target_binary_feature_binary_test
from tsfresh.feature_selection.relevance import calculate_relevance_table, get_feature_type, _calculate_relevance_table_for_implicit_target
from tsfresh import defaults
from tsfresh.utilities.distribution import initialize_warnings_in_workers
from tsfresh.feature_selection import select_features
from functools import partial, reduce

In [2]:
# Load the extracted feature matrix and the target labels from disk.
X = pd.read_hdf('data/features/6000/ach-at-hex_6000_0_eff.h5')
y = pd.read_hdf('data/processed/y_4_class_6000.h5')

# One indicator column per distinct label in y (one-vs-rest targets).
y_bin = pd.get_dummies(y.astype('category'))

In [None]:
# Report the dimensions of the features and the target, one per line.
print(X.shape, y.shape, sep='\n')

# Replace both indices with a fresh 0..n-1 range so X and y carry the same row labels.
# NOTE(review): y_bin was derived from y before this reset and keeps the old
# index - confirm that is intended where y_bin is used together with X.
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

In [None]:
# Display the target labels.
y

In [None]:
# Copy tsfresh's default settings into notebook-level variables so the
# relevance computation can be driven step by step.
n_jobs = defaults.N_PROCESSES
chunksize = defaults.CHUNKSIZE
show_warnings = defaults.SHOW_WARNINGS

fdr_level = defaults.FDR_LEVEL
hypotheses_independent = defaults.HYPOTHESES_INDEPENDENT

test_for_binary_target_binary_feature = defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE
test_for_binary_target_real_feature = defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE
test_for_real_target_binary_feature = defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE
test_for_real_target_real_feature = defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE

# Pick the mapping primitive: a process pool's `map` when worker processes
# are requested, the builtin `map` otherwise.
# NOTE: `pool` stays open after this cell; whoever consumes `map_function`
# is responsible for closing it.
if n_jobs != 0:
    pool = Pool(processes=n_jobs, initializer=initialize_warnings_in_workers, initargs=(show_warnings,))
    map_function = partial(pool.map, chunksize=chunksize)
else:
    map_function = map

In [None]:
# One row per feature column of X, tagged with the type reported by
# tsfresh's get_feature_type.
relevance_table = pd.DataFrame(index=pd.Series(X.columns, name='feature'))
relevance_table['feature'] = relevance_table.index
relevance_table['type'] = pd.Series(
    map(get_feature_type, [X[feature] for feature in relevance_table.index]),
    index=relevance_table.index
)

# Split by feature type. The copies make the sub-tables independent of
# relevance_table, so columns can be added to them safely.
table_real = relevance_table[relevance_table.type == 'real'].copy()
table_binary = relevance_table[relevance_table.type == 'binary'].copy()

# Constant features get no p-value and are marked as not relevant.
table_const = relevance_table[relevance_table.type == 'constant'].copy()
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
table_const['p_value'] = np.nan
table_const['relevant'] = False

In [None]:
# One-vs-rest relevance testing: for every distinct label in y, test each
# feature against the binary target (y == label) and collect one relevance
# table per label.
labels = y.unique()
tables = []
try:
    for label in labels:
        is_label = (y == label)
        _test_real_feature = partial(target_binary_feature_real_test, y=is_label,
                                     test=test_for_binary_target_real_feature)
        _test_binary_feature = partial(target_binary_feature_binary_test, y=is_label)
        tmp = _calculate_relevance_table_for_implicit_target(
            table_real, table_binary, X, _test_real_feature, _test_binary_feature, hypotheses_independent,
            fdr_level, map_function
        )
        tmp = tmp.reset_index(drop=True)
        # Suffix every result column with the label so the per-label tables
        # can be merged side by side; the join keys keep their names.
        tmp.columns = tmp.columns.map(lambda x: x + '_' + str(label) if x != 'feature' and x != 'type' else x)

        tables.append(tmp)
finally:
    # The worker pool is only created when n_jobs != 0 and nothing else
    # shuts it down, so release the worker processes here. Re-run the cell
    # that creates `pool` before re-running this one.
    if n_jobs != 0:
        pool.close()
        pool.join()

# Join the per-label tables on (feature, type).
relevance_table = reduce(
    lambda left, right: pd.merge(left, right, on=['feature', 'type'], how='outer'),
    tables
)

# A feature is kept only if it was significant for every label.
relevance_table['n_significant'] = relevance_table.filter(regex='^relevant_', axis=1).sum(axis=1)
relevance_table['relevant'] = relevance_table['n_significant'] >= len(labels)

In [None]:
# Number of features flagged as relevant (each True counts as 1).
sum(relevance_table['relevant'])

In [None]:
# Names of the features whose `relevant` flag is set.
relevant_features = relevance_table.loc[relevance_table['relevant'], 'feature']

In [None]:
# Display the selected feature names.
relevant_features

In [None]:
# Run tsfresh's stock select_features on the same X and y.
# NOTE(review): presumably a baseline to compare against the per-label
# selection computed in this notebook - confirm.
filtered = select_features(X,y)

In [None]:
# Dimensions of the select_features result.
filtered.shape

In [None]:
# p-value of every feature against each one-vs-rest target column of y_bin,
# using tsfresh's 'mann' test. Each row of p_vector is
# [p for y_bin[0], p for y_bin[1], p for y_bin[2], feature name].
# NOTE(review): only the y_bin columns 0, 1 and 2 are tested - confirm this
# covers every label present in y.
FAILED_P_VALUE = 1000  # sentinel stored when the test cannot be computed
class_columns = (0, 1, 2)

p_vector = []
for feature in X:
    try:
        # Build the whole row at once so that a failure on any class leaves
        # no partial results behind.
        p = [target_binary_feature_real_test(X[feature], y_bin[col], 'mann')
             for col in class_columns]
    except ValueError:
        # Previously the sentinels were appended on top of the p-values that
        # had already succeeded, producing rows of inconsistent length and a
        # misaligned p_vals frame. Replace the row instead.
        # NOTE(review): every ValueError is swallowed here, whatever its
        # cause - confirm that is the intent.
        p = [FAILED_P_VALUE] * len(class_columns)

    p.append(feature)
    p_vector.append(p)

In [None]:
# Tabulate the rows of p_vector; columns get positional integer labels, so
# column 0 holds the p-values computed against y_bin[0].
p_vals = pd.DataFrame(p_vector)
# Display the rows ordered by column 0.
p_vals.sort_values(0)

In [None]:
# Element-wise minimum of the first two p-value columns. The third argument
# (1) is combine's fill_value, used where an index label exists in only one
# of the two series.
# Earlier draft, kept for reference:
# a.p_value = a.p_value.combine(b.p_value, min, 1)

a = p_vals[0].combine(p_vals[1], min, 1)

In [None]:
# Override the FDR level on tsfresh's defaults module.
# NOTE(review): the notebook-level `fdr_level` variable was copied from
# defaults.FDR_LEVEL in an earlier cell and is not changed by this
# assignment - confirm which value the analysis is meant to use.
defaults.FDR_LEVEL = 0.01

In [3]:
from src.features.feature_selection import MulticlassFeatureSelector

In [None]:
# Load the processed dataset; the cells below read its 'id' and 'y' columns.
dataset = pd.read_hdf('data/processed/ach_at_hex_6000.h5')

In [None]:
# Window size used in the names of the label files written further down.
window_size = 6000

# One label per id: keep the first row of each id, index by id, and squeeze
# the resulting single-row frame into a Series ordered by id.
# `axis` is passed by keyword because pandas 2.0 made the arguments of
# sort_index keyword-only; the former positional call raises TypeError there.
y4 = (dataset[['id','y']]
     .drop_duplicates('id')
     .set_index('id')
     .T
     .squeeze()
     .sort_index(axis=0))

In [None]:
# Coarser label sets derived from y4: labels above 2 are merged into 2 ...
y3 = y4.copy()
y3.loc[y3 > 2] = 2

# ... and from that, labels above 1 are merged into 1.
y2 = y3.copy()
y2.loc[y2 > 1] = 1

# Persist each label set under data/processed/y_<k>_class_<window_size>.h5.
for n_classes, label_series in ((4, y4), (3, y3), (2, y2)):
    label_series.to_hdf(
        'data/processed/y_{}_class_{}.h5'.format(n_classes, window_size),
        key='data',
        complevel=9,
    )

In [4]:
# Instantiate the project's MulticlassFeatureSelector with its default arguments.
fs = MulticlassFeatureSelector()

In [5]:
# Fit the selector on the features and labels; whatever fit returns is kept as `filt`.
filt = fs.fit(X,y)

In [7]:
# Apply the fitted selector to X and display the result.
filt.transform(X)

variable,"0__change_quantiles__f_agg_""var""__isabs_True__qh_0.4__ql_0.0",0__ar_coefficient__k_10__coeff_1,0__ar_coefficient__k_10__coeff_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,0.007879,4.900492,-8.159355
2.0,0.007024,4.893328,-8.140396
3.0,0.006491,4.910918,-8.206875
4.0,0.002848,4.898380,-8.159901
5.0,0.006811,4.899040,-8.161483
...,...,...,...
986.0,0.000137,6.530854,-19.585644
987.0,0.000117,6.546576,-19.682495
988.0,0.000093,6.547226,-19.689064
989.0,0.000052,6.562510,-19.775162
