# BIO-SELECT - Marigliano
## Feature selection using Limma and R

The goal of this notebook is to run the Limma algorithm and add the features it selects to the ones already selected in the features_selection.ipynb notebook.

To use this notebook, you need to have Docker installed.

The steps are the following:
1. Build the Docker image to set up a ready-to-use R environment
2. Run the two docker containers, one for MILE and one for Golub
    1. Run the container
    1. Read the limma CSV file
    1. Sort this file
    1. Convert limma features indices to dataset indices
    1. Append the feature list to the CSV files generated in the features_selection.ipynb notebook



# Build Docker image

In [None]:
# Build the Docker image tagged `rdocker`, using docker-R/ as the build context.
# Execute this every time you change the R scripts.
!cd docker-R && \
docker build -t rdocker .

# Run Limma for Golub

In [None]:
# List the dataset directory (the limma-golub.csv file read below lives here).
!ls -al docker-R/dataset/

In [None]:
# Preview the first 4 lines of the Golub limma CSV before parsing it.
!head -n4 docker-R/dataset/limma-golub.csv

## Parse Limma CSV

In [None]:
import pandas as pd
from datasets.Golub99.GolubDataset import GolubDataset
from algorithms.Algorithm import Algorithm

In [None]:
# Prefix of the export group name; "_limma" is appended to it in the save cell.
GROUP_NAME_GOLUB = "golub" #TODO: change it to match the group name of the previously generated lists (CSV)
# Number of entries kept from the normalized score list.
N_FEATURES = 1000
# Key under which the selected features are stored in the `subsets` dict.
ALG_NAME = "Limma"

In [None]:
# Golub dataset instance; used below to convert limma feature names to dataset indices.
ds_golub = GolubDataset()

In [None]:
# Parse the limma output for Golub and build the `subsets` dict consumed by the
# export cell: {ALG_NAME: {"features", "features_by_rank", "features_by_score"}}.
filename = r"docker-R/dataset/limma-golub.csv"

# Keep only the feature identifier ("ID") and the limma score ("B") columns.
df = pd.read_csv(filename, sep="\t", usecols=["ID", "B"])
df = df.dropna()  # ignore NaN values

# `usecols` does not enforce column order, so order the columns explicitly to
# guarantee that every record below is a (name, score) tuple.
df = df[["ID", "B"]]

# highest score first
df = df.sort_values("B", ascending=False)

# convert pandas dataframe to a list of (name, score) tuples
features_by_score = [tuple(x) for x in df.to_records(index=False)]

# convert features name to features indices
f_names, f_scores = zip(*features_by_score)
f_names = ds_golub.get_features_indices(f_names)
# Materialise the pairs: in Python 3 `zip` returns a one-shot iterator that can
# neither be sliced nor iterated more than once.
features_by_score = list(zip(f_names, f_scores))
print(features_by_score[:3])

# normalize the score and keep the first N_FEATURES entries
features_by_score_normed = Algorithm.normalize_scores(features_by_score)[:N_FEATURES]

# transform the rank tuples in the format: (index, rank), with rank = 1/(1+position)
r = [f[0] for f in features_by_score_normed]
features_by_rank = [(v, 1.0/(1.0+k)) for k, v in enumerate(r)]

# assign the same weight for all features
features = [(f[0], 1) for f in features_by_score_normed]

# prepare the subsets dict to export in CSV
subsets = {
    ALG_NAME: {
        "features": features,
        "features_by_rank": features_by_rank,
        "features_by_score": features_by_score_normed,
    }
}

## Save the features

In [None]:
from utils.CSVFeaturesExporter import CSVFeaturesExporter

# Export the `subsets` dict built in the parsing cell under the group name
# "<GROUP_NAME_GOLUB>_limma".
# NOTE(review): presumably this writes/appends to the CSV feature lists produced
# by features_selection.ipynb - confirm against CSVFeaturesExporter.
group_name = GROUP_NAME_GOLUB + "_limma"
features_exporter = CSVFeaturesExporter(subsets, group_name)
features_exporter.export()

# Run Limma for MILE

In [None]:
# List the dataset directory (the limma-mile.csv file read below lives here).
!ls -al docker-R/dataset/

In [None]:
# Preview the first 3 lines of the MILE limma CSV before parsing it.
!head -n3 docker-R/dataset/limma-mile.csv

## Parse Limma CSV

In [None]:
import pandas as pd
from datasets.MILE.MileDataset import MileDataset
from algorithms.Algorithm import Algorithm

In [None]:
# Prefix of the export group name; "_limma" is appended to it in the save cell.
GROUP_NAME_MILE = "mile" #TODO: change it to match the group name of the previously generated lists (CSV)
# Number of entries kept from the normalized score list.
N_FEATURES = 1000
# Key under which the selected features are stored in the `subsets` dict.
ALG_NAME = "Limma"

In [None]:
# Loading only 20 samples is enough here: the dataset is used solely to convert
# feature names to feature indices.
ds_mile = MileDataset(samples_limit=20)

In [None]:
# Parse the limma output for MILE and build the `subsets` dict consumed by the
# export cell: {ALG_NAME: {"features", "features_by_rank", "features_by_score"}}.
filename = r"docker-R/dataset/limma-mile.csv"

# Keep only the feature identifier ("Genes.ID") and the limma score ("F") columns.
df = pd.read_csv(filename, sep="\t", usecols=["Genes.ID", "F"])
df = df.dropna()  # ignore NaN values

# order the columns so that every record below is a (name, score) tuple
df = df[["Genes.ID", "F"]]

# highest score first
df = df.sort_values("F", ascending=False)

# convert pandas dataframe to a list of (name, score) tuples
features_by_score = [tuple(x) for x in df.to_records(index=False)]

# convert features name to features indices
f_names, f_scores = zip(*features_by_score)
f_names = ds_mile.get_features_indices(f_names)
# Materialise the pairs: in Python 3 `zip` returns a one-shot iterator, which
# would be exhausted if the normalization iterates over it more than once.
features_by_score = list(zip(f_names, f_scores))

# normalize the score and keep the first N_FEATURES entries
features_by_score_normed = Algorithm.normalize_scores(features_by_score)[:N_FEATURES]
print(features_by_score_normed[:10])

# transform the rank tuples in the format: (index, rank), with rank = 1/(1+position)
r = [f[0] for f in features_by_score_normed]
features_by_rank = [(v, 1.0/(1.0+k)) for k, v in enumerate(r)]

# assign the same weight for all features
features = [(f[0], 1) for f in features_by_score_normed]

# prepare the subsets dict to export in CSV
subsets = {
    ALG_NAME: {
        "features": features,
        "features_by_rank": features_by_rank,
        "features_by_score": features_by_score_normed,
    }
}

## Save the features

In [None]:
from utils.CSVFeaturesExporter import CSVFeaturesExporter

# Export the `subsets` dict built in the parsing cell under the group name
# "<GROUP_NAME_MILE>_limma".
# NOTE(review): presumably this writes/appends to the CSV feature lists produced
# by features_selection.ipynb - confirm against CSVFeaturesExporter.
group_name = GROUP_NAME_MILE + "_limma"
features_exporter = CSVFeaturesExporter(subsets, group_name)
features_exporter.export()