## Example Usage of Threshold-Based Feature Selection (TBFS)

In [1]:
import logging
from datetime import datetime, timezone

import numpy as np
import pandas as pd

from tbfs.ranker import TBFSRanker, metrics

# Build a UTC timestamp tag so that every run writes to its own log file.
# datetime.utcnow() is deprecated since Python 3.12 and returns a naive
# datetime; datetime.now(timezone.utc) is timezone-aware and formats to the
# same string. The lowercase "t" in the format is a literal date/time separator.
version = datetime.now(timezone.utc).strftime("%Y_%m_%dt%H_%M_%S")
log_file = f"tbfs.{version}.log"

# Configure the root logger to write DEBUG-and-above records to that file.
logging.basicConfig(filename=log_file, level=logging.DEBUG)

In [2]:
# read the sample data set into a DataFrame
df = pd.read_csv('sample-data.csv')

# pull out the label column, then keep every remaining column as a feature
y = df['class']
x = df.drop(columns=['class'])

# binarize the label: rows equal to 'ACL' become 1, all other rows become 0
y = np.where(y == 'ACL', 1, 0)

In [3]:
# create instance of TBFS Ranker
# NOTE(review): t_delta is presumably the spacing between candidate
# thresholds -- confirm against the TBFSRanker constructor
t_delta = 0.01
tbfs = TBFSRanker(t_delta)

# fit the ranker to our data set (x: feature columns, y: 0/1 class labels);
# the return value is kept in `rankings` but is not used by the cells shown here
rankings = tbfs.fit(x, y)

# save the results for future experiments
tbfs.to_csv('tbfs-results.csv')

In [4]:
# view available metrics
# (`metrics` is imported from tbfs.ranker; evaluating the bare name as the
# last expression of the cell displays it)
metrics

['f-score',
 'odds-ratio',
 'statistical-power',
 'probability-ratio',
 'gini-index',
 'kolmogorov-smirnov',
 'geometric-mean',
 'auc',
 'auprc',
 'mutual-info',
 'deviance',
 'matthews-correlation-coefficient']

In [5]:
# take top K features for a metric
# (here: metric name 'f-score' and K = 10; the cell output lists 10 feature names)
tbfs.top_k_features_by_metric('f-score', 10)

['GENE1609X',
 'GENE1537X',
 'GENE493X',
 'GENE1616X',
 'GENE3945X',
 'GENE3258X',
 'GENE3946X',
 'GENE384X',
 'GENE1296X',
 'GENE1620X']

In [6]:
# load previous results and re-use
# a new ranker is created without passing t_delta and is populated from the
# CSV written earlier with to_csv(), so fit() is not called again
tbfs2 = TBFSRanker()
tbfs2.from_csv('tbfs-results.csv')

# same query as on the fitted ranker; the output lists the same features
tbfs2.top_k_features_by_metric('f-score', 10)

['GENE1609X',
 'GENE1537X',
 'GENE493X',
 'GENE1616X',
 'GENE3945X',
 'GENE3258X',
 'GENE3946X',
 'GENE384X',
 'GENE1296X',
 'GENE1620X']