In [None]:
!pip install datomize
!pip install ReliefF
!pip install mrmr_selection

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; every file path used later in this
# notebook lives under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd
import sys
import numpy as np
# Raise NumPy's summarisation threshold to the largest possible value so that
# arrays are always printed in full instead of being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFdr
from sklearn.feature_selection import f_classif
from mrmr import mrmr_classif
from ReliefF import ReliefF

def pca_scores(X):
  """Order features by their absolute weight in the first principal component.

  A one-component PCA is fitted on ``X``. Each feature is identified by its
  1-based column position, and positions are returned from the largest
  absolute component weight to the smallest. Equal weights are ordered by
  descending position.

  Args:
    X: feature matrix accepted by ``PCA.fit``.

  Returns:
    list of int: 1-based feature positions, best first.
  """
  model = PCA(n_components=1)
  model.fit(X)
  weights = abs(model.components_[0])
  positions = range(1, len(weights) + 1)
  # Sorting (weight, position) pairs keeps the tie-break on position.
  ordered_pairs = sorted(zip(weights, positions), reverse=True)
  return [position for _, position in ordered_pairs]
  
def DTScore(X, y):
  """Order features by the importances of a fitted decision tree.

  A ``DecisionTreeClassifier`` (``random_state=42``) is fitted on ``X``/``y``
  and its ``feature_importances_`` are used to sort the 1-based feature
  positions from most to least important. Equal importances are ordered by
  descending position.

  Args:
    X: feature matrix.
    y: class labels.

  Returns:
    list of int: 1-based feature positions, best first.
  """
  tree = DecisionTreeClassifier(random_state=42).fit(X, y)
  importances = tree.feature_importances_
  scored_positions = list(zip(importances, range(1, len(importances) + 1)))
  scored_positions.sort(reverse=True)
  return [position for _, position in scored_positions]

def selected_features_algo2(X, y):
  """Score features by interleaving the decision-tree and PCA rankings.

  The two rankings are merged alternately (tree pick first, then PCA pick),
  repeated features are dropped while keeping their first occurrence, and
  every feature receives the reciprocal of its place in the merged order.
  The best feature therefore scores 1.0 and the score decreases with rank.

  Args:
    X: feature matrix with ``shape[1]`` features.
    y: class labels.

  Returns:
    numpy.ndarray: one score per feature, in column order (higher is better).
  """
  pca_order = pca_scores(X)
  tree_order = DTScore(X, y)

  merged_order = []
  for tree_pick, pca_pick in zip(tree_order, pca_order):
    merged_order.extend((tree_pick, pca_pick))
  # dict preserves insertion order, so this keeps the first occurrence only.
  unique_order = list(dict.fromkeys(merged_order))

  n_features = X.shape[1]
  places = np.zeros(n_features)
  for place in range(1, n_features + 1):
    # Rankings are 1-based feature positions; convert to a 0-based index.
    places[unique_order[place - 1] - 1] = place
  return 1 / places

def selected_features_algo1(X, y, batches = 40):
  """Score features by aggregating RFE rankings over repeated subsamples.

  For every round a 90% training subsample is drawn (seeded with the round
  number), an RFE selector wrapped around a ``LinearSVC`` is fitted on it,
  and the resulting ``ranking_`` is added to a running total. The reciprocal
  of the total is returned, so features with consistently low (good) ranks
  get the largest scores.

  Args:
    X: feature matrix.
    y: class labels.
    batches: number of subsampling rounds.

  Returns:
    numpy.ndarray: one score per feature (higher is better).
  """
  rank_totals = np.zeros(X.shape[1])
  for seed in range(batches):
    X_sub, _, y_sub, _ = train_test_split(X, y, train_size=0.9, random_state=seed)
    svm = LinearSVC(dual=False, C=1.0)
    eliminator = RFE(svm, n_features_to_select=100, step=0.99, verbose=False)
    rank_totals += eliminator.fit(X_sub, y_sub).ranking_
  return 1 / rank_totals

def selected_features_selectFdr(X_train, y_train):
  """Return the per-feature scores computed by a fitted ``SelectFdr``.

  The selector is built with ``f_classif`` as score function and
  ``alpha=0.1``; only its ``scores_`` attribute is returned, the selection
  mask itself is not used.

  Args:
    X_train: feature matrix.
    y_train: class labels.

  Returns:
    The ``scores_`` array of the fitted selector, one value per feature.
  """
  selector = SelectFdr(f_classif, alpha=0.1)
  return selector.fit(X_train, y_train).scores_

def selected_features_rfe(X_train, y_train):
  """Score features from a full RFE elimination around a ``LinearSVC``.

  RFE removes one feature per step until a single feature is left, which
  yields a complete ranking in ``ranking_`` where 1 marks the best feature.
  This function is used as a score function (see ``get_fs_by_name``), and
  score-based selectors such as ``SelectKBest`` keep the HIGHEST values, so
  the reciprocal rank is returned instead of the raw rank. This mirrors
  ``selected_features_algo1``, which also returns ``1 / rank``.

  Args:
    X_train: feature matrix.
    y_train: class labels.

  Returns:
    numpy.ndarray: one score per feature in (0, 1]; higher is better.
  """
  estimator = LinearSVC()
  rfe = RFE(estimator, n_features_to_select=1, step=1, verbose=False)
  rfe = rfe.fit(X_train, y_train)
  # ranking_ is >= 1 for every feature, so the division is always defined.
  return 1 / rfe.ranking_

def selected_features_new_algo2(X, y, batches = 20):
  """Aggregate ``selected_features_algo2`` over repeated 90% subsamples.

  Each round draws a training subsample seeded with the round number, scores
  the features with ``selected_features_algo2`` and adds the scores to a
  running total. The reciprocal of that total is returned.

  NOTE(review): ``selected_features_algo2`` yields higher-is-better scores,
  so the reciprocal returned here is lower-is-better. Confirm that callers
  expect this orientation before using it with a highest-score selector.

  Args:
    X: feature matrix.
    y: class labels.
    batches: number of subsampling rounds.

  Returns:
    numpy.ndarray: one value per feature.
  """
  score_totals = np.zeros(X.shape[1])
  for seed in range(batches):
    X_sub, _, y_sub, _ = train_test_split(X, y, train_size=0.9, random_state=seed)
    score_totals += selected_features_algo2(X_sub, y_sub)
  return 1/ score_totals

def selected_features_relief(X_train, y_train):
  """Score features with ReliefF and return the absolute feature scores.

  ReliefF is fitted with ``n_samples - 1`` neighbours and asked to keep all
  features, so ``feature_scores`` holds one value per column.

  The inputs are converted with ``np.asarray`` rather than ``.to_numpy()``:
  when this function is used as a ``SelectKBest`` score function it receives
  already-validated NumPy arrays, which have no ``.to_numpy`` method, while
  direct callers may still pass a DataFrame / Series.

  Args:
    X_train: feature matrix (DataFrame or 2-D array).
    y_train: class labels (Series or 1-D array).

  Returns:
    numpy.ndarray: absolute ReliefF score per feature.
  """
  features = np.asarray(X_train)
  labels = np.asarray(y_train)
  num_samples, num_features = features.shape
  relief = ReliefF(n_neighbors=num_samples - 1, n_features_to_keep=num_features)
  relief.fit(features, labels)
  return np.asarray(abs(relief.feature_scores))

def ranking_mrmr(X_train,y_train, K=100):
  """Score features from an mRMR selection; higher means selected earlier.

  ``mrmr_classif`` returns the chosen column labels, best first. The first
  pick receives the highest score (the number of picks), the last pick
  receives 1, and every feature that was not picked stays at 0.

  Args:
    X_train: feature matrix. A DataFrame is used as is; anything else (for
      example the NumPy array handed over by ``SelectKBest``) is wrapped in a
      DataFrame with positional column labels, because the mRMR call and the
      column lookup below need labelled columns.
    y_train: class labels (Series or 1-D array).
    K: maximum number of features to let mRMR pick. It is capped at the
      number of available features so narrow inputs do not request more
      features than exist.

  Returns:
    numpy.ndarray: one integer score per feature, in column order.
  """
  if not isinstance(X_train, pd.DataFrame):
    X_train = pd.DataFrame(X_train)
  if not isinstance(y_train, pd.Series):
    # Share X's index so the two objects stay row-aligned.
    y_train = pd.Series(np.asarray(y_train), index=X_train.index)

  num_features = X_train.shape[1]
  picked = mrmr_classif(X=X_train, y=y_train, K=min(K, num_features))

  buckets = [0] * num_features
  for index, feature in enumerate(picked[::-1]):
    buckets[X_train.columns.get_loc(feature)] = index + 1
  return np.asarray(buckets)

def get_fs_by_name(name):
  """Map a filtering-algorithm name to its scoring function.

  Recognised names are "selectFdr", "mRMR", "reliefF", "RFE" and "algo1".
  Any other value falls back to ``selected_features_algo2``.

  Args:
    name: algorithm name.

  Returns:
    The scoring function (not called here).
  """
  known = (
    ("selectFdr", selected_features_selectFdr),
    ("mRMR", ranking_mrmr),
    ("reliefF", selected_features_relief),
    ("RFE", selected_features_rfe),
    ("algo1", selected_features_algo1),
  )
  for known_name, score_function in known:
    if name == known_name:
      return score_function
  return selected_features_algo2

In [25]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

def get_clf_by_name(name):
  """Build a fresh classifier for the given learning-algorithm name.

  Recognised names are "GaussianNB", "SVC" (created with
  ``probability=True``), "LogisticRegression" and "RandomForestClassifier".
  Any other value yields a ``KNeighborsClassifier``.

  Args:
    name: learning-algorithm name.

  Returns:
    A new, unfitted classifier instance.
  """
  builders = (
    ("GaussianNB", GaussianNB),
    ("SVC", lambda: SVC(probability=True)),
    ("LogisticRegression", LogisticRegression),
    ("RandomForestClassifier", RandomForestClassifier),
  )
  for known_name, build in builders:
    if name == known_name:
      return build()
  return KNeighborsClassifier()

In [17]:
# Locations of the per-algorithm result tables on the shared drive. All files
# live in the same folder and differ only in the suffix after
# "results_table_", so the paths are expanded from one template.
algo1, algo2, algo2_new, mrmr, selectFDR, relifeF, rfe = (
  f"/content/drive/Shareddrives/ML./project/output table/results_table_{suffix}.csv"
  for suffix in ("algo1", "algo2", "new_algo2", "mRMR", "selectFdr", "reliefF", "RFE")
)

In [18]:
# Load every result table, using the first CSV column as the row index.
algo1_df, algo2_df, algo2_new_df, mrmr_df, selectFDR_df, relifeF_df, rfe_df = (
  pd.read_csv(csv_path, index_col=0)
  for csv_path in (algo1, algo2, algo2_new, mrmr, selectFDR, relifeF, rfe)
)

In [19]:
# All per-algorithm result tables, in a fixed order.
algos = [
  algo1_df,
  algo2_df,
  algo2_new_df,
  mrmr_df,
  selectFDR_df,
  relifeF_df,
  rfe_df,
]
# The dataset names are taken from the algo1 table only.
dss = algo1_df["Dataset Name"].unique()

In [20]:
# For every dataset, stack the matching rows of all result tables into one
# frame with a fresh 0..n-1 index.
# `DataFrame.append` was removed in pandas 2.0 and re-copies the frame on each
# call, so the per-algorithm slices are collected first and concatenated once.
combim_base_on_dataset = [
  pd.concat(
    [algo[algo['Dataset Name'] == dataset] for algo in algos],
    ignore_index=True,
  )
  for dataset in dss
]

In [21]:
# Average 'Measure Value' over all rows that share the same experiment
# configuration (one averaged row per configuration and measure type).
configuration_columns = [
  'Dataset Name',
  'Number of samples',
  'Original Number of features',
  'Filtering Algorithm',
  'Learning algorithm',
  'Number of features selected (K)',
  'CV Method',
  'Measure Type',
]
gouped_by_folds=[]
for dataset_rows in combim_base_on_dataset:
  # The cast is written back into the combined frame, as before.
  dataset_rows['Measure Value'] = dataset_rows['Measure Value'].astype(float)
  averaged = dataset_rows.groupby(configuration_columns, as_index=False)['Measure Value'].mean()
  gouped_by_folds.append(averaged)

In [22]:
# Keep every AUC row, plus the ACC rows that either use the 'split_LOO' CV
# method or belong to the 'DLBCL' dataset; then order each dataset's rows from
# the highest to the lowest measure value.
res = []
for averaged in gouped_by_folds:
  is_acc = averaged['Measure Type'] == 'ACC'
  is_auc = averaged['Measure Type'] == 'AUC'
  is_loo = averaged['CV Method'] == 'split_LOO'
  is_dlbcl = averaged['Dataset Name'] == 'DLBCL'
  keep = (is_acc & (is_loo | is_dlbcl)) | is_auc
  kept_rows = averaged[keep]
  res.append(kept_rows.sort_values(['Measure Value'], axis=0, ascending=False))


In [23]:
# Take the top-ranked row of every dataset and store it as one column, keyed
# by the dataset's position in `res`.
results = pd.DataFrame()
for position, ranked_rows in enumerate(res):
  results[position] = ranked_rows.iloc[0]

In [24]:
# Flip the table so each dataset's best configuration becomes a row.
# Note: re-running this cell on its own flips the table back.
results = results.T
results

Unnamed: 0,Dataset Name,Number of samples,Original Number of features,Filtering Algorithm,Learning algorithm,Number of features selected (K),CV Method,Measure Type,Measure Value
0,CLL,22,12625,selectFdr,LogisticRegression,10,split_LPO,AUC,0.901786
1,COPDSexualDimorphism,229,14498,mRMR,LogisticRegression,100,split_10Fold,AUC,0.8912
2,DLBCL,194,3583,reliefF,RandomForestClassifier,50,split_10Fold,ACC,0.5047
3,Leukemia_3c_arff,72,7129,mRMR,KNeighborsClassifier,15,split_LOO,ACC,0.986111
4,Leukemia_4c_arff,72,7129,RFE,LogisticRegression,100,split_LOO,ACC,0.958333
5,Prostate,102,5966,algo1,GaussianNB,20,split_10Fold,AUC,0.984
6,breastCancerVDX,344,22284,RFE,LogisticRegression,3,split_10Fold,AUC,0.9643
7,breast_arff,97,24188,algo1,KNeighborsClassifier,20,split_LOO,ACC,0.804124
8,colon,62,2000,reliefF,SVC,30,split_LOO,ACC,0.887097
9,curatedOvarianData,194,3584,algo1,RandomForestClassifier,3,split_10Fold,AUC,0.664


In [None]:
# Import relevant packages
from datomizer import Datomizer, DatoMapper, DatoTrainer, DatoGenerator
from sklearn.datasets import load_iris 
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import KernelPCA



import getpass
import os

# Credentials are taken from the environment, with an interactive prompt as a
# fallback, so that no secret is stored in the notebook itself.
username = os.environ.get("DATOMIZER_USERNAME") or input("Datomizer username: ")
password = os.environ.get("DATOMIZER_PASSWORD") or getpass.getpass("Datomizer password: ")

# For the best configuration of every dataset: reduce the training split to
# the K selected features, append kernel-PCA components of that reduced
# matrix, and send the resulting table through the Datomizer pipeline.
for index, row in results.iterrows():
  ds_name = row["Dataset Name"]
  data = pd.read_csv(f"/content/drive/Shareddrives/ML./project/datasets/clean/{ds_name}.csv")
  X = data.drop(columns = 'target')
  y = data['target']

  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

  fs_name = row["Filtering Algorithm"]
  fs = get_fs_by_name(fs_name)

  clf_name = row["Learning algorithm"]
  clf = get_clf_by_name(clf_name)

  # `results` rows have object dtype, so make sure K is a plain int.
  k = int(row["Number of features selected (K)"])
  select_k_best = SelectKBest(fs, k=k).fit(X_train, y_train)

  # Copy the column slice: new columns are added to it below and it must not
  # be a view-like slice of X_train.
  smaller_X = X_train[select_k_best.get_feature_names_out()].copy()

  # Both kernel PCAs are fitted on the selected features only, before any
  # component column is appended. Transposing gives one row per component.
  pca_linear = KernelPCA(kernel = 'linear')
  added_linear = pca_linear.fit_transform(smaller_X)
  added_linear = np.transpose(added_linear)

  pca_rbf = KernelPCA(kernel = 'rbf')
  added_rbf = pca_rbf.fit_transform(smaller_X)
  added_rbf = np.transpose(added_rbf)

  # One column per component. The previous version reset its counter inside
  # the first loop, so every linear component overwrote 'added_linear0'.
  for i, arr in enumerate(added_linear):
    smaller_X[f'added_linear{i}'] = arr

  # The 'addded_rbf' prefix is kept verbatim.
  for i, arr in enumerate(added_rbf):
    smaller_X[f'addded_rbf{i}'] = arr

  print(smaller_X)

  # Create a Datomizer with your credentials:
  datomizer = Datomizer(username=username, password=password)

  # Create a DatoMapper and analyze the data structure:
  mapper = DatoMapper(datomizer)
  mapper.discover(df=smaller_X)

  # Create a DatoTrainer and train the generative model:
  trainer = DatoTrainer(mapper)
  trainer.train()

  # Create a DatoGenerator and generate output data:
  generator = DatoGenerator(trainer)
  generator.generate()

  dato_df = pd.read_csv(generator.get_generated_data_csv())