[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kostaslazaros/FSIHDD/blob/master/cross_vall_experiment.ipynb)

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
import warnings
warnings.filterwarnings('ignore')

## Import utility functions

In [2]:
import sys
# Put the current working directory on the module search path; presumably this
# is what lets the project-local `helpers` package below be found — TODO confirm.
sys.path.append('./')

# Project-local utilities, aliased as `ufn`.
# NOTE(review): `ufn` is not referenced by any cell visible in this notebook.
from helpers import utility_functions as ufn

## Initialize classifiers

In [3]:
# One estimator per model family; each instance is handed to
# experimental_cross_val further down.
knn = KNeighborsClassifier(n_neighbors=5)
lrc = LogisticRegression(random_state=0)
svmc = svm.SVC(C=1, kernel='linear')
rfc = RandomForestClassifier(n_estimators=100)

In [4]:
def experimental_cross_val(alg, X, Y, folds, num):
    """Repeat k-fold cross-validation and pool the per-fold test scores.

    Parameters
    ----------
    alg : estimator
        Classifier handed to ``cross_validate`` on every repetition.
    X : array-like
        Feature matrix.
    Y : array-like
        Target labels. The ``'f1'`` scorer is requested with its default
        settings, which scikit-learn documents as the binary-target F1.
    folds : int or cross-validation splitter
        Number of folds. A value that is not an ``int`` is forwarded to
        ``cross_validate`` unchanged.
    num : int
        How many times the whole cross-validation is repeated.

    Returns
    -------
    tuple of (list, list)
        ``(acc_list, f1_list)``: test accuracy and test F1 of every fold of
        every repetition, in execution order.
    """
    # Imported here so the function works even if only this cell is re-run.
    from sklearn.model_selection import StratifiedKFold

    acc_list = []
    f1_list = []
    for run in range(1, num + 1):
        print(f'{folds}-fold cross-validation run: {run}', end=' ')
        if isinstance(folds, int):
            # A bare integer ``cv`` yields an unshuffled splitter, so every
            # repetition would score the exact same folds and deterministic
            # models would just return `num` copies of one result. Shuffle
            # with a per-run seed: different folds per repetition, yet the
            # whole experiment stays reproducible.
            splitter = StratifiedKFold(n_splits=folds, shuffle=True,
                                       random_state=run)
        else:
            # Caller supplied its own splitter/iterable; respect it as-is.
            splitter = folds
        alg_scores = cross_validate(alg, X, Y, scoring=('accuracy', 'f1'),
                                    cv=splitter)
        print('Done')
        acc_list.extend(alg_scores['test_accuracy'])
        f1_list.extend(alg_scores['test_f1'])
    return (acc_list, f1_list)

In [5]:
def list2csv(acc_list, f1_list, csv_name, labels=('Original', 'FSC', 'ANOVA')):
    """Write paired accuracy/F1 scores to a CSV file with a dataset label column.

    The scores are expected to be the concatenation of one equally sized run
    of results per entry in ``labels``, in the same order as ``labels``.

    Parameters
    ----------
    acc_list : sequence
        Accuracy scores.
    f1_list : sequence
        F1 scores, paired element-wise with ``acc_list`` (pairs are built
        with ``zip``, so surplus entries of the longer input are ignored).
    csv_name : str or path-like
        Destination passed to ``DataFrame.to_csv``.
    labels : sequence of str, optional
        Values for the ``'Data'`` column, one per consecutive group of rows.

    Raises
    ------
    ValueError
        If ``labels`` is empty or the number of rows cannot be split evenly
        across ``labels``.
    """
    labels = list(labels)
    if not labels:
        raise ValueError('labels must contain at least one entry')

    metric_list_df = pd.DataFrame(list(zip(acc_list, f1_list)),
                                  columns=['Accuracy', 'F1 score'])

    # Derive the group size from the data instead of assuming 100 rows per
    # dataset, so any folds * repetitions combination works.
    rows_per_label, leftover = divmod(len(metric_list_df), len(labels))
    if leftover:
        raise ValueError(
            f'{len(metric_list_df)} score rows cannot be split evenly '
            f'across {len(labels)} labels'
        )

    metric_list_df['Data'] = [label
                              for label in labels
                              for _ in range(rows_per_label)]
    metric_list_df.to_csv(csv_name, index=False)

## Get dataset (all columns) and tags

In [6]:
# Feature table used for the "full dataset" experiments.
X_full = pd.read_csv(filepath_or_buffer='dataset_full.csv')

In [8]:
# Feature table used for the "consensus dataset" experiments.
X_consensus = pd.read_csv(filepath_or_buffer='consensus_data_6000.csv')

In [17]:
# Feature table used for the "ANOVA reduced dataset" experiments.
X_ANOVA = pd.read_csv(filepath_or_buffer='dataset_ANOVA.csv')

In [7]:
# Labels passed as the target to every cross-validation call below.
Y = pd.read_csv(filepath_or_buffer='emtab_tags.csv')

### Get lists for Random Forest (on full dataset -> 18958 features/genes)

In [9]:
# Random forest on the full dataset: 10 repetitions of 10-fold CV.
rfc_acc_full_data, rfc_f1_full_data = experimental_cross_val(
    alg=rfc, X=X_full, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Get lists for Random Forest (on consensus dataset -> 762 features/genes)

In [10]:
# Random forest on the consensus dataset: 10 repetitions of 10-fold CV.
rfc_acc_cons_data, rfc_f1_cons_data = experimental_cross_val(
    alg=rfc, X=X_consensus, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Get lists for Random Forest (on ANOVA reduced dataset -> 10 features/genes)

In [19]:
# Random forest on the ANOVA-reduced dataset: 10 repetitions of 10-fold CV.
rfc_acc_ANOVA_data, rfc_f1_ANOVA_data = experimental_cross_val(
    alg=rfc, X=X_ANOVA, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Get lists for Logistic Regression (on full dataset -> 18958 features/genes)

In [11]:
# Logistic regression on the full dataset: 10 repetitions of 10-fold CV.
lrc_acc_full_data, lrc_f1_full_data = experimental_cross_val(
    alg=lrc, X=X_full, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Get lists for Logistic Regression (on consensus dataset -> 762 features/genes)

In [12]:
# Logistic regression on the consensus dataset: 10 repetitions of 10-fold CV.
lrc_acc_cons_data, lrc_f1_cons_data = experimental_cross_val(
    alg=lrc, X=X_consensus, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Get lists for Logistic Regression (on ANOVA reduced dataset -> 10 features/genes)

In [20]:
# Logistic regression on the ANOVA-reduced dataset: 10 repetitions of 10-fold CV.
lrc_acc_ANOVA_data, lrc_f1_ANOVA_data = experimental_cross_val(
    alg=lrc, X=X_ANOVA, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Get lists for K Nearest Neighbors (on full dataset -> 18958 features/genes)

In [13]:
# K nearest neighbors on the full dataset: 10 repetitions of 10-fold CV.
knn_acc_full_data, knn_f1_full_data = experimental_cross_val(
    alg=knn, X=X_full, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Get lists for K Nearest Neighbors (on consensus dataset -> 762 features/genes)

In [14]:
# K nearest neighbors on the consensus dataset: 10 repetitions of 10-fold CV.
knn_acc_cons_data, knn_f1_cons_data = experimental_cross_val(
    alg=knn, X=X_consensus, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Get lists for K Nearest Neighbors (on ANOVA reduced dataset -> 10 features/genes)

In [21]:
# K nearest neighbors on the ANOVA-reduced dataset: 10 repetitions of 10-fold CV.
knn_acc_ANOVA_data, knn_f1_ANOVA_data = experimental_cross_val(
    alg=knn, X=X_ANOVA, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Get lists for Support Vector Machine (on full dataset -> 18958 features/genes)

In [15]:
# Linear SVM on the full dataset: 10 repetitions of 10-fold CV.
svm_acc_full_data, svm_f1_full_data = experimental_cross_val(
    alg=svmc, X=X_full, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Get lists for Support Vector Machine (on consensus dataset -> 762 features/genes)

In [16]:
# Linear SVM on the consensus dataset: 10 repetitions of 10-fold CV.
svm_acc_cons_data, svm_f1_cons_data = experimental_cross_val(
    alg=svmc, X=X_consensus, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Get lists for Support Vector Machine (on ANOVA reduced dataset -> 10 features/genes)

In [22]:
# Linear SVM on the ANOVA-reduced dataset: 10 repetitions of 10-fold CV.
svm_acc_ANOVA_data, svm_f1_ANOVA_data = experimental_cross_val(
    alg=svmc, X=X_ANOVA, Y=Y, folds=10, num=10
)

10-fold cross-validation run: 1 Done
10-fold cross-validation run: 2 Done
10-fold cross-validation run: 3 Done
10-fold cross-validation run: 4 Done
10-fold cross-validation run: 5 Done
10-fold cross-validation run: 6 Done
10-fold cross-validation run: 7 Done
10-fold cross-validation run: 8 Done
10-fold cross-validation run: 9 Done
10-fold cross-validation run: 10 Done


### Merging lists for Random Forest

In [46]:
# Concatenate in the order full -> consensus -> ANOVA (the order list2csv labels rows).
rfc_acc_data = [*rfc_acc_full_data, *rfc_acc_cons_data, *rfc_acc_ANOVA_data]

In [47]:
# Concatenate in the order full -> consensus -> ANOVA (the order list2csv labels rows).
rfc_f1_data = [*rfc_f1_full_data, *rfc_f1_cons_data, *rfc_f1_ANOVA_data]

### Merging lists for Logistic Regression

In [48]:
# Concatenate in the order full -> consensus -> ANOVA (the order list2csv labels rows).
lrc_acc_data = [*lrc_acc_full_data, *lrc_acc_cons_data, *lrc_acc_ANOVA_data]

In [49]:
# Concatenate in the order full -> consensus -> ANOVA (the order list2csv labels rows).
lrc_f1_data = [*lrc_f1_full_data, *lrc_f1_cons_data, *lrc_f1_ANOVA_data]

### Merging lists for K Nearest Neighbors

In [50]:
# Concatenate in the order full -> consensus -> ANOVA (the order list2csv labels rows).
knn_acc_data = [*knn_acc_full_data, *knn_acc_cons_data, *knn_acc_ANOVA_data]

In [51]:
# Concatenate in the order full -> consensus -> ANOVA (the order list2csv labels rows).
knn_f1_data = [*knn_f1_full_data, *knn_f1_cons_data, *knn_f1_ANOVA_data]

### Merging lists for Support Vector Machine

In [52]:
# Concatenate in the order full -> consensus -> ANOVA (the order list2csv labels rows).
svm_acc_data = [*svm_acc_full_data, *svm_acc_cons_data, *svm_acc_ANOVA_data]

In [53]:
# Concatenate in the order full -> consensus -> ANOVA (the order list2csv labels rows).
svm_f1_data = [*svm_f1_full_data, *svm_f1_cons_data, *svm_f1_ANOVA_data]

### Create dataframe for Random Forest metrics and save it as CSV

In [54]:
# Persist the pooled random forest scores.
list2csv(
    acc_list=rfc_acc_data,
    f1_list=rfc_f1_data,
    csv_name='random_forest_scores.csv',
)

### Create dataframe for Logistic Regression metrics and save it as CSV

In [55]:
# Persist the pooled logistic regression scores.
list2csv(
    acc_list=lrc_acc_data,
    f1_list=lrc_f1_data,
    csv_name='logistic_regression_scores.csv',
)

### Create dataframe for K Nearest Neighbors metrics and save it as CSV

In [56]:
# Persist the pooled k-nearest-neighbors scores.
list2csv(
    acc_list=knn_acc_data,
    f1_list=knn_f1_data,
    csv_name='k_nearest_neighbors_scores.csv',
)

### Create dataframe for Support Vector Machine metrics and save it as CSV

In [57]:
# Persist the pooled support vector machine scores.
list2csv(
    acc_list=svm_acc_data,
    f1_list=svm_f1_data,
    csv_name='support_vector_machine_scores.csv',
)