In [None]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

In [None]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
import random
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import numpy as np
from sklearn.metrics import matthews_corrcoef, roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import time

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.feature_selection import SelectFdr
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

In [None]:
def set_random_seed(seed_value):
    """Seed the global NumPy and Python ``random`` generators.

    Args:
        seed_value: Seed handed to ``np.random.seed`` and then ``random.seed``.
    """
    for seed_generator in (np.random.seed, random.seed):
        seed_generator(seed_value)


# Fix both global seeds once for the whole notebook.
set_random_seed(42)

In [None]:
def init_classifiers():
  """Build a fresh, unfitted instance of every classifier being evaluated.

  Returns:
    list: New estimators in the fixed order GaussianNB, SVC,
    LogisticRegression, RandomForestClassifier, KNeighborsClassifier.
  """
  return [
      GaussianNB(),
      SVC(probability=True),  # the evaluation loop calls predict_proba on every model
      LogisticRegression(),
      RandomForestClassifier(),
      KNeighborsClassifier(),
  ]

In [None]:
def splits(num_of_records):
  """Choose the cross-validation splitter function for a dataset size.

  Args:
    num_of_records: Number of samples in the dataset.

  Returns:
    The splitter function (not its result):
      * fewer than 50 records     -> ``split_LPO``
      * 50 to 99 records          -> ``split_LOO``
      * more than 1000 records    -> ``split_5Fold``
      * otherwise (100 to 1000)   -> ``split_10Fold``
  """
  if num_of_records < 50:
    return split_LPO
  if num_of_records < 100:
    return split_LOO
  return split_5Fold if num_of_records > 1000 else split_10Fold

In [None]:
from sklearn.model_selection import LeaveOneOut

def split_LOO(X,y):
  """Materialise every leave-one-out train/test split of ``X`` and ``y``.

  Args:
    X: Feature table, indexed positionally through ``.iloc``.
    y: Target values, indexed positionally through ``.iloc``.

  Returns:
    tuple: ``(Xs_train, Xs_test, ys_train, ys_test, folds)`` where the first
    four items are lists with one entry per fold and ``folds`` is the number
    of splits reported by the splitter.
  """
  splitter = LeaveOneOut()
  n_folds = splitter.get_n_splits(X)

  # Collect the index pairs once, then slice each of the four outputs from them.
  index_pairs = list(splitter.split(X))
  train_features = [X.iloc[train_rows] for train_rows, _ in index_pairs]
  test_features = [X.iloc[test_rows] for _, test_rows in index_pairs]
  train_targets = [y.iloc[train_rows] for train_rows, _ in index_pairs]
  test_targets = [y.iloc[test_rows] for _, test_rows in index_pairs]

  return train_features, test_features, train_targets, test_targets, n_folds

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

def pca_scores(X):
  """Rank features by their absolute loading on the first principal component.

  Args:
    X: Feature matrix the single-component PCA is fitted on.

  Returns:
    list: 1-based feature positions ordered from the largest absolute loading
    to the smallest; equal loadings are ordered by larger position first.
  """
  first_component = PCA(n_components=1).fit(X).components_[0]
  loadings = abs(first_component)
  positions = range(1, len(loadings) + 1)
  # Sorting (loading, position) pairs in reverse puts the strongest loading first.
  ranked_pairs = sorted(zip(loadings, positions), reverse=True)
  return [position for _, position in ranked_pairs]
  
def DTScore(X, y):
  """Rank features by the importances of a fitted decision tree.

  Args:
    X: Feature matrix used to fit the tree.
    y: Target labels used to fit the tree.

  Returns:
    list: 1-based feature positions ordered from the highest importance to the
    lowest; equal importances are ordered by larger position first.
  """
  tree = DecisionTreeClassifier(random_state=42).fit(X, y)
  importances = tree.feature_importances_
  positions = range(1, len(importances) + 1)
  ranked_pairs = sorted(zip(importances, positions), reverse=True)
  return [position for _, position in ranked_pairs]

def algo2(X, y):
  """Score features by interleaving the decision-tree and PCA rankings.

  The two rankings are merged alternately (tree first, then PCA), duplicates
  are dropped keeping the first occurrence, and each feature's score is the
  reciprocal of its place in the merged ranking.

  Args:
    X: Feature matrix; ``X.shape[1]`` is the number of features.
    y: Target labels (used only by the decision-tree ranking).

  Returns:
    numpy.ndarray: One score per feature, in column order; the best-ranked
    feature gets 1, the second 1/2, and so on.
  """
  pca_ranking = pca_scores(X)
  tree_ranking = DTScore(X, y)

  # Alternate the two rankings: tree_1, pca_1, tree_2, pca_2, ...
  merged_ranking = []
  for tree_feature, pca_feature in zip(tree_ranking, pca_ranking):
    merged_ranking.extend((tree_feature, pca_feature))
  unique_ranking = list(dict.fromkeys(merged_ranking))

  n_features = X.shape[1]
  places = np.zeros(n_features)
  for place in range(1, n_features + 1):
    # Rankings hold 1-based feature positions; convert to a 0-based slot.
    places[unique_ranking[place - 1] - 1] = place
  return 1 / places


In [None]:
from sklearn.model_selection import StratifiedKFold

def split_5Fold(X,y):
  """Materialise the 5 stratified folds of ``X`` and ``y``.

  Args:
    X: Feature table, indexed positionally through ``.iloc``.
    y: Target values; also passed to the splitter for stratification.

  Returns:
    tuple: ``(Xs_train, Xs_test, ys_train, ys_test, folds)`` where the first
    four items are per-fold lists and ``folds`` is the number of splits.
  """
  splitter = StratifiedKFold(n_splits=5)
  n_folds = splitter.get_n_splits(X)

  index_pairs = list(splitter.split(X, y))
  train_features = [X.iloc[train_rows] for train_rows, _ in index_pairs]
  test_features = [X.iloc[test_rows] for _, test_rows in index_pairs]
  train_targets = [y.iloc[train_rows] for train_rows, _ in index_pairs]
  test_targets = [y.iloc[test_rows] for _, test_rows in index_pairs]

  return train_features, test_features, train_targets, test_targets, n_folds

def split_10Fold(X,y):
  """Materialise the 10 stratified folds of ``X`` and ``y``.

  Args:
    X: Feature table, indexed positionally through ``.iloc``.
    y: Target values; also passed to the splitter for stratification.

  Returns:
    tuple: ``(Xs_train, Xs_test, ys_train, ys_test, folds)`` where the first
    four items are per-fold lists and ``folds`` is the number of splits.
  """
  splitter = StratifiedKFold(n_splits=10)
  n_folds = splitter.get_n_splits(X)

  index_pairs = list(splitter.split(X, y))
  train_features = [X.iloc[train_rows] for train_rows, _ in index_pairs]
  test_features = [X.iloc[test_rows] for _, test_rows in index_pairs]
  train_targets = [y.iloc[train_rows] for train_rows, _ in index_pairs]
  test_targets = [y.iloc[test_rows] for _, test_rows in index_pairs]

  return train_features, test_features, train_targets, test_targets, n_folds

In [None]:
from sklearn.model_selection import LeavePOut

def split_LPO(X,y):
  """Materialise every leave-2-out train/test split of ``X`` and ``y``.

  Args:
    X: Feature table, indexed positionally through ``.iloc``.
    y: Target values, indexed positionally through ``.iloc``.

  Returns:
    tuple: ``(Xs_train, Xs_test, ys_train, ys_test, folds)`` where the first
    four items are per-fold lists and ``folds`` is the number of splits.
  """
  splitter = LeavePOut(2)
  n_folds = splitter.get_n_splits(X)

  index_pairs = list(splitter.split(X))
  train_features = [X.iloc[train_rows] for train_rows, _ in index_pairs]
  test_features = [X.iloc[test_rows] for _, test_rows in index_pairs]
  train_targets = [y.iloc[train_rows] for train_rows, _ in index_pairs]
  test_targets = [y.iloc[test_rows] for _, test_rows in index_pairs]

  return train_features, test_features, train_targets, test_targets, n_folds

In [None]:
def get_dataset_files():
  """Return the relative CSV paths of the datasets this notebook evaluates.

  Returns:
    list: A new list of path strings, in processing order.
  """
  dataset_paths = (
      "clean/CLL.csv",
      "clean/COPDSexualDimorphism.data.csv",
      "clean/DLBCL.csv",
      "clean/Leukemia_3c_arff.csv",
      "clean/Leukemia_4c_arff.csv",
      "clean/Prostate.csv",
      "clean/breastCancerVDX.csv",
      "clean/breast_arff.csv",
      "clean/colon.csv",
      "clean/curatedOvarianData.csv",
      "clean/leukemia.csv",
      "clean/lung.csv",
      "clean/lung_arff.csv",
      "clean/lymphoma.csv",
      "clean/lymphoma_arff.csv",
      "clean/misc1.csv",
      "clean/misc2.csv",
      "clean/misc3.csv",
      "clean/misc4.csv",
      "clean/misc5.csv",
  )
  return list(dataset_paths)

def parse_file(dataset_file):
  """Load one dataset CSV and separate the features from the ``target`` column.

  Args:
    dataset_file: Path to the CSV; it must contain a ``target`` column.

  Returns:
    tuple: ``(name, df, X, y)`` where ``name`` is the file name up to its
    first dot, ``df`` the full frame, ``X`` the frame without ``target`` and
    ``y`` the ``target`` column.
  """
  # Last path component, truncated at the first dot ("a/b.data.csv" -> "b").
  file_name = dataset_file.rpartition('/')[2]
  dataset_name = file_name.partition('.')[0]
  print(f'Starting to work on dataset: {dataset_name}')

  frame = pd.read_csv(dataset_file)
  targets = frame['target']
  features = frame.drop(columns=['target'])
  print(f'X shape: {features.shape}')
  return dataset_name, frame, features, targets

def get_splits(X, y):
  """Select a CV scheme from the sample count and materialise its folds.

  Args:
    X: Feature table; ``X.shape[0]`` decides which splitter is used.
    y: Target values.

  Returns:
    tuple: ``(X_trains, X_tests, y_trains, y_tests, n_of_folds, cv_type_name)``
    where ``cv_type_name`` is the ``__name__`` of the chosen splitter function.
  """
  chosen_splitter = splits(X.shape[0])
  cv_type_name = chosen_splitter.__name__
  print(f'CV type: {cv_type_name}')

  train_features, test_features, train_targets, test_targets, n_of_folds = chosen_splitter(X,y)
  print(f'Number of CV folds: {n_of_folds}')
  return train_features, test_features, train_targets, test_targets, n_of_folds, cv_type_name

def get_split(X_trains, X_tests, y_trains, y_tests, fold_i):
  """Pick the data of one fold out of the four per-fold sequences.

  Args:
    X_trains, X_tests, y_trains, y_tests: Sequences indexed by fold.
    fold_i: Index of the fold to extract.

  Returns:
    tuple: ``(X_train, X_test, y_train, y_test)`` for fold ``fold_i``.
  """
  per_fold_sequences = (X_trains, X_tests, y_trains, y_tests)
  return tuple(sequence[fold_i] for sequence in per_fold_sequences)

def calculate_feature_rankings_and_wrap_for_kbest(X, y):
  """Run ``algo2`` once and expose its scores as a ``SelectKBest`` score function.

  Args:
    X: Feature matrix passed to ``algo2``.
    y: Target labels passed to ``algo2``.

  Returns:
    callable: A two-argument function that ignores its arguments and returns
    the scores computed here, so the ranking is not recomputed for every ``k``.
  """
  precomputed_scores = algo2(X, y)

  def kbestwrapper(unused_X, unused_y):
    # Arguments exist only to satisfy the score-function call signature.
    return precomputed_scores

  return kbestwrapper

def create_data_dict_empty():
  """Create the empty column-oriented store for the results table.

  Returns:
    dict: Maps each result column name to its own new empty list.
  """
  column_names = (
      "Dataset Name",
      "Number of samples",
      "Original Number of features",
      "Filtering Algorithm",
      "Learning algorithm",
      "Number of features selected (K)",
      "CV Method",
      "Fold",
      "Measure Type",
      "Measure Value",
      "List of Selected Features Names",
      "Selected Features scores",
  )
  # A comprehension gives every column an independent list.
  return {column_name: [] for column_name in column_names}

def fill_row(data_dict, *args):
  """Append one row of values to the column-oriented results store.

  Args:
    data_dict: Dict mapping each result column name to a list; mutated in place.
    *args: Row values in result-column order. Supplying fewer values than
      columns fills only the leading columns.

  Raises:
    IndexError: If more values than result columns are supplied (raised after
      the first twelve values have been appended).
  """
  column_names = (
      "Dataset Name",
      "Number of samples",
      "Original Number of features",
      "Filtering Algorithm",
      "Learning algorithm",
      "Number of features selected (K)",
      "CV Method",
      "Fold",
      "Measure Type",
      "Measure Value",
      "List of Selected Features Names",
      "Selected Features scores",
  )
  for position, value in enumerate(args):
    data_dict[column_names[position]].append(value)

def format_e(num):
    """Render ``num`` in scientific notation with two decimals (e.g. ``1.23E+03``)."""
    return f"{num:.2E}"


# Element-wise variant of format_e for arrays of scores.
format_e2 = np.vectorize(format_e)

# Change with each notebook
FSMethod_name = "algo2"

# Evaluation driver. For every dataset and CV fold: compute the feature ranking
# once on the training part, then for each k keep the k best features, fit every
# classifier on them and record accuracy, MCC, ROC-AUC, PR-AUC and timings.
# Each recorded measure becomes one row of the results table, which is written
# to CSV after all datasets are done.
result_table_dict = create_data_dict_empty()
for dataset_file in get_dataset_files():
  dataset_name, df, X, y = parse_file(dataset_file)
  n_samples, n_orig_features = X.shape
  # Decided once per dataset from the full label column: more than two classes
  # switches ROC-AUC to its one-vs-rest form.
  is_multiclass = len(y.unique()) > 2
  X_trains, X_tests, y_trains, y_tests, n_of_folds, cv_type_name = get_splits(X, y)
  j = 0  # running count of classifier evaluations for this dataset (log prefix)
  for fold_i in range(n_of_folds):
    print(f'Fold {fold_i + 1}:')
    X_train, X_test, y_train, y_test = get_split(X_trains, X_tests, y_trains, y_tests, fold_i)
    # perf_counter is monotonic, so durations cannot be skewed by clock changes.
    start = time.perf_counter()
    kbestwrapper = calculate_feature_rankings_and_wrap_for_kbest(X_train, y_train)
    FSMethod_time = format(time.perf_counter() - start,'.2E')
    print(f'Time (seconds) took for {FSMethod_name}: {FSMethod_time}')
    for k in [1,2,3,4,5,10,15,20,25,30,50,100][::-1]:
      print(f'k {k}:')
      # kbestwrapper returns the scores precomputed above, so this fit is cheap.
      select_k_best = SelectKBest(kbestwrapper, k=k).fit(X_train, y_train)
      k_feature_names = select_k_best.get_feature_names_out()
      k_feature_scores = select_k_best.scores_[select_k_best.get_support()]
      k_feature_scores = format_e2(k_feature_scores)
      print(k_feature_scores)
      X_train_reduced = select_k_best.transform(X_train)
      print(X_train_reduced.shape)
      X_test_reduced = select_k_best.transform(X_test)
      print(X_test_reduced.shape)
      for clf in init_classifiers():
        j += 1
        start = time.perf_counter()
        clf.fit(X_train_reduced, y_train)
        clf_fit_time = format(time.perf_counter() - start,'.2E')
        clf_name = clf.__class__.__name__
        print(f'Time (seconds) took to fit {clf_name}: {clf_fit_time}')

        start = time.perf_counter()
        y_probas = clf.predict_proba(X_test_reduced)
        clf_inference_time_per_record = (time.perf_counter() - start) / X_test_reduced.shape[0]
        clf_inference_time_per_record = format(clf_inference_time_per_record,'.2E')
        print(f'Time (seconds) took for inference per record: {clf_inference_time_per_record}')

        acc = clf.score(X_test_reduced, y_test)
        # Documented argument order is (y_true, y_pred).
        mcc = matthews_corrcoef(y_test, clf.predict(X_test_reduced))

        # Best effort: a metric that cannot be computed for this fold (for
        # example a test fold containing a single class) is recorded as NaN.
        # Catching Exception instead of using a bare except keeps
        # KeyboardInterrupt/SystemExit able to stop the run.
        try:
          if is_multiclass:
            auc = roc_auc_score(y_test, y_probas, multi_class = 'ovr')
          else:
            auc = roc_auc_score(y_test, y_probas[:,1])
        except Exception:
          auc = np.nan

        try:
          pr_auc = average_precision_score(y_test, y_probas[:, 1])
        except Exception:
          pr_auc = np.nan

        acc = format(acc,'.2E')
        mcc = format(mcc,'.2E')
        auc = format(auc,'.2E')
        pr_auc = format(pr_auc,'.2E')

        print(f'{j} accuracy: {acc}')
        print(f'{j} mcc: {mcc}')
        print(f'{j} auc: {auc}')
        print(f'{j} pr_auc: {pr_auc}')
        fold_iter = fold_i + 1
        # One result row per measure; every other column is shared.
        measures = [
          ("ACC", acc),
          ("MCC", mcc),
          ("AUC", auc),
          ("PR-AUC", pr_auc),
          ("FSMethod time(seconds)", FSMethod_time),
          ("CLF fit time(seconds)", clf_fit_time),
          ("CLF inference time per record(seconds)", clf_inference_time_per_record),
        ]
        for measure_name, measure_value in measures:
          fill_row(result_table_dict, dataset_name, n_samples, n_orig_features, FSMethod_name, clf_name, k, cv_type_name, fold_iter, measure_name, measure_value, k_feature_names, k_feature_scores)

result_table_df = pd.DataFrame(result_table_dict)
result_table_df.to_csv(f'clean/results_table_{FSMethod_name}.csv')