In [90]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats
from os.path import join
import os
import json
import math

In [2]:
def remove_zero_sum_cols(df):
    """Return ``df`` restricted to the columns whose sum is strictly positive.

    Note that the test is ``> 0``: columns summing to zero *and* columns
    with a negative sum are both dropped. Column order is preserved.
    """
    col_totals = df.sum(axis=0)
    kept_cols = [col for col, total in col_totals.items() if total > 0]
    return df[kept_cols]

In [3]:
def normalize_df(df):
    """Return a new DataFrame in which every column of ``df`` sums to 1.

    Each value is divided by the sum of its column. The row and column
    labels are copied over as plain lists. There is no guard against a
    column whose sum is zero.
    """
    raw = df.values
    column_totals = raw.sum(axis=0, keepdims=True)
    row_labels = df.index.values.tolist()
    col_labels = df.columns.values.tolist()
    return pd.DataFrame(raw / column_totals, index=row_labels, columns=col_labels)

In [16]:
# Input locations, all relative to DATA_DIR.
DATA_DIR = "data"
TSF_WINDOW_DIR = join(DATA_DIR, "cis-pd", "training_data_tsf_sample")
# os.listdir returns entries in arbitrary order; sort so that the file list
# (and everything derived from it) is the same on every run and machine.
TSF_WINDOW_FILES = [
    join(TSF_WINDOW_DIR, f)
    for f in sorted(os.listdir(TSF_WINDOW_DIR))
    if f.endswith(".tsf.csv")
]
LABELS_FILE = join(DATA_DIR, "cis-pd", "data_labels", "CIS-PD_Training_Data_IDs_Labels.csv")

In [74]:
# Load the label table, using its first column as the index.
labels_df = pd.read_csv(LABELS_FILE, index_col=0)

# Derive one id per window file by stripping the directory and the
# ".tsf.csv" suffix (presumably the measurement id used as the label index).
TSF_SUFFIX = ".tsf.csv"
m_ids = [ os.path.basename(f)[:-len(TSF_SUFFIX)] for f in TSF_WINDOW_FILES ]

# Keep only the labelled rows that have a window file, highest on_off first.
labels_df = labels_df.loc[m_ids,:]
labels_df = labels_df.sort_values(by="on_off", ascending=False)
labels_df.shape

(327, 4)

In [63]:
# Variable names that select_variables() always removes from its result.
# NOTE(review): the names look like tsfresh feature names — confirm.
nonsense_variables = {
    # counts relative to threshold 0
    "value__count_above__t_0",
    "value__count_below__t_0",
    # duplicate-extremum flags
    "value__has_duplicate_max",
    "value__has_duplicate_min",
    # occurrences of the values -1, 0 and 1
    "value__value_count__value_-1",
    "value__value_count__value_0",
    "value__value_count__value_1",
    # crossings of the levels -1, 0 and 1
    "value__number_crossing_m__m_-1",
    "value__number_crossing_m__m_0",
    "value__number_crossing_m__m_1",
}

In [64]:
def _iter_dim_frames(df):
    """Yield ``(dim, frame)`` for each ``id`` group of ``df``, ready for scoring.

    Each frame has the ``id``, ``window_stop`` and ``window_count`` columns
    dropped, ``window_start`` as its index, and has been passed through
    ``remove_zero_sum_cols``.
    """
    for dim, dim_df in df.groupby("id"):
        dim_df = dim_df.drop(columns=["id", "window_stop", "window_count"])
        dim_df = dim_df.set_index("window_start", drop=True)
        yield dim, remove_zero_sum_cols(dim_df)


def select_variables(tsf_window_file, var_bounds=(0.1, 10), entropy_bounds=(0, 6)):
    """Select the variables of one window file by variance and entropy.

    The CSV is split into groups by its ``id`` column. Within each group a
    variable is kept if either

    * the variance of its column-normalized values lies in ``var_bounds``, or
    * the base-2 entropy of its raw values lies in ``entropy_bounds``.

    Variables whose name starts with ``"value__fft"`` and the names listed in
    ``nonsense_variables`` are never returned.

    Parameters
    ----------
    tsf_window_file : str
        Path to a CSV file; it is read with its first column as the index and
        must contain the columns ``id``, ``window_start``, ``window_stop``
        and ``window_count``.
    var_bounds : tuple, optional
        ``(low, high)``; a variance ``v`` passes when ``low < v <= high``.
    entropy_bounds : tuple, optional
        ``(low, high)``; an entropy ``e`` passes when ``low < e <= high``.

    Returns
    -------
    set
        Names of the selected variables, combined over all ``id`` groups.
    """
    df = pd.read_csv(tsf_window_file, index_col=0)
    non_fft_cols = {c for c in df.columns if not c.startswith("value__fft")}
    var_low, var_high = var_bounds
    ent_low, ent_high = entropy_bounds

    # Only the variable names are needed downstream, so collect them in sets
    # instead of growing a DataFrame row by row (DataFrame.append is
    # deprecated and no longer exists in pandas 2.x).
    var_variables = set()
    ent_variables = set()
    for _dim, dim_df in _iter_dim_frames(df):
        if dim_df.shape[1] == 0:
            # Every column was dropped for this group; nothing to score.
            continue

        # Variance of the column-normalized values.
        variances = normalize_df(dim_df).var()
        var_mask = ((variances > var_low) & (variances <= var_high)).values
        var_variables.update(variances.index[var_mask])

        # Entropy of the raw values (scipy normalizes each column itself).
        values = dim_df.values
        entropies = pd.Series(
            [scipy.stats.entropy(values[:, i], base=2) for i in range(values.shape[1])],
            index=dim_df.columns,
            dtype=float,
        )
        ent_mask = ((entropies > ent_low) & (entropies <= ent_high)).values
        ent_variables.update(entropies.index[ent_mask])

    # A variable passing EITHER criterion is kept (set union).
    # NOTE(review): this set was previously named as an intersection although
    # a union was computed; the union is preserved here — confirm the intent.
    candidate_variables = (var_variables | ent_variables) & non_fft_cols
    return candidate_variables.difference(nonsense_variables)

In [89]:
# For every on_off label value, count in how many of its window files each
# variable is returned by select_variables().
var_counts = {}
for on_off_val in [0.0, 1.0, 2.0, 3.0, 4.0]:
    counts_for_val = {}
    var_counts[on_off_val] = counts_for_val

    filtered_labels_df = labels_df.loc[labels_df["on_off"] == on_off_val]
    for m_id in filtered_labels_df.index.values.tolist():
        window_file = join(TSF_WINDOW_DIR, f"{m_id}.tsf.csv")
        for var_name in select_variables(window_file):
            counts_for_val[var_name] = counts_for_val.get(var_name, 0) + 1
            

In [93]:
# Keep a variable if, for at least one on_off value, it was selected in at
# least `percent_cutoff` of that value's observations (rounded up).
percent_cutoff = 0.25
top_vars = set()
for on_off_val in [0.0, 1.0, 2.0, 3.0, 4.0]:
    filtered_labels_df = labels_df.loc[labels_df["on_off"] == on_off_val]
    num_obs = len(filtered_labels_df)
    num_top_obs = math.ceil(percent_cutoff * num_obs)

    top_val_vars = {
        var_name
        for var_name, n_selected in var_counts[on_off_val].items()
        if n_selected >= num_top_obs
    }
    top_vars |= top_val_vars
len(top_vars)

80

In [87]:
#from upsetplot import from_memberships
#from upsetplot import UpSet
#upset_data = from_memberships(all_vars, data=labels_df)
#upset = UpSet(upset_data, intersection_plot_elements=2, orientation='vertical')
#upset.add_catplot(value='on_off', kind='strip', color='blue')
#upset.plot()

In [94]:
# Show the selected variable names (a bare last expression is the cell output).
top_vars

{'value__agg_linear_trend__f_agg_"max"__chunk_len_10__attr_"slope"',
 'value__agg_linear_trend__f_agg_"max"__chunk_len_50__attr_"slope"',
 'value__agg_linear_trend__f_agg_"max"__chunk_len_5__attr_"slope"',
 'value__agg_linear_trend__f_agg_"mean"__chunk_len_10__attr_"rvalue"',
 'value__agg_linear_trend__f_agg_"mean"__chunk_len_10__attr_"slope"',
 'value__agg_linear_trend__f_agg_"mean"__chunk_len_50__attr_"slope"',
 'value__agg_linear_trend__f_agg_"mean"__chunk_len_5__attr_"rvalue"',
 'value__agg_linear_trend__f_agg_"mean"__chunk_len_5__attr_"slope"',
 'value__agg_linear_trend__f_agg_"min"__chunk_len_10__attr_"rvalue"',
 'value__agg_linear_trend__f_agg_"min"__chunk_len_10__attr_"slope"',
 'value__agg_linear_trend__f_agg_"min"__chunk_len_50__attr_"rvalue"',
 'value__agg_linear_trend__f_agg_"min"__chunk_len_50__attr_"slope"',
 'value__agg_linear_trend__f_agg_"min"__chunk_len_5__attr_"rvalue"',
 'value__agg_linear_trend__f_agg_"min"__chunk_len_5__attr_"slope"',
 'value__agg_linear_trend__f_

In [95]:
# Persist the selected variable names as a JSON list. A set has no stable
# iteration order, so sort the names to get the same file on every run.
with open(join(DATA_DIR, "tsf_window_variables.json"), "w") as f:
    json.dump(sorted(top_vars), f)