# Datasets Visualization

## Initialization

In [None]:
from __future__ import print_function
from sklearn import neighbors, datasets
import pandas as pd
import os
from matplotlib import pyplot as plt
import numpy as np
from sklearn import preprocessing
from scipy import stats
import collections
%matplotlib inline

# Print NumPy floating point values with 2 digits of precision so that
# array output stays compact
np.set_printoptions(precision=2)

# Increase the default font size used by every matplotlib figure
import matplotlib
matplotlib.rcParams.update({'font.size': 11})

In [None]:
def summarize_dataset(ds, ds_name):
    """Print descriptive statistics and basic counts for a dataset.

    Args:
        ds: object exposing ``get_X()`` (the feature rows, one per sample)
            and ``get_y()`` (the class label of each sample).
        ds_name: human readable dataset name, used in the heading only.
    """
    features = ds.get_X()
    labels = ds.get_y()

    def _describe(tag, values):
        # The tag is emitted first, without a newline, so that the
        # scipy summary follows it on the same line.
        print(tag, end="")
        print(stats.describe(values))

    print("Summary of the dataset %s " % ds_name)

    _describe("X : ", features)
    # two spacer lines between the feature and the label statistics
    for _ in range(2):
        print(" ")
    _describe("y : ", labels)

    # Distinct labels = classes, rows = samples, columns of a row = genes
    print("Number of classes: %s" % len(set(labels)))
    print("Number of samples: %s" % len(features))
    print("Number of genes: %s" % len(features[0]))

In [None]:
def plot_hist(ds, ds_name, enabled=False):
    """Optionally plot the mean value of every gene as a bar chart.

    The plot is disabled by default because printing the mean X values
    was not considered useful. Instead of leaving the plotting code
    unreachable behind an unconditional ``return``, it is now opt-in.

    Args:
        ds: object exposing ``get_X()``; only accessed when ``enabled``
            is true.
        ds_name: dataset name shown in the plot title.
        enabled: set to True to actually draw the plot. When False (the
            default) the function returns immediately and does nothing.

    Returns:
        None.
    """
    if not enabled:
        return

    X = ds.get_X()
    # mean over the rows, i.e. one mean value per column (gene)
    gene_means = np.mean(X, axis=0)
    gene_positions = np.arange(len(gene_means))

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)

    ax.bar(gene_positions, gene_means, align='center', alpha=0.5)

    ax.set_ylabel('Value')
    ax.set_xlabel('Genes')

    # add some padding around the bars; the last bar sits at index
    # len(gene_means) - 1
    ax.set_xlim([-100, (len(gene_means) - 1) + 100])
    ax.set_ylim([0, np.max(gene_means) * 1.1])

    ax.set_title('Mean value per gene - %s' % ds_name)

    plt.show()

In [None]:
def plot_classes_repartition(ds, ds_name, ds_encoder=None):
    """Plot a bar chart of the number of samples in each class.

    Bars, count annotations, ticks and tick labels all use the same
    sorted class order, so every label is guaranteed to sit under the bar
    it describes. (Previously the labels were assigned in
    most-common-first order to ticks 0..N-1 while the bars were drawn at
    the class values, which mislabelled the bars whenever the two orders
    differed.)

    Args:
        ds: object exposing ``get_y()``, the class label of each sample.
        ds_name: dataset name shown in the plot title.
        ds_encoder: optional object exposing ``get_label_encoder()``;
            the returned encoder's ``inverse_transform`` is used to turn
            the class values back into display names. When None, the
            class values themselves are used as tick labels.
    """
    y = ds.get_y()
    n_samples = len(y)

    # count the samples per class, then fix one class order for the plot
    samples_per_class = collections.Counter(y)
    class_ids = sorted(samples_per_class)
    class_counts = [samples_per_class[class_id] for class_id in class_ids]

    positions = np.arange(len(class_ids))  # the x locations of the bars
    width = 0.35                           # the width of the bars

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)

    ax.bar(positions, class_counts, align='center', width=width, alpha=0.5)

    ax.set_ylabel('Samples')
    ax.set_xlabel('Classes')

    # add some padding (skipped for an empty dataset, where there is no
    # maximum count to scale the y axis with)
    if class_counts:
        ax.set_xlim([-0.5, len(class_ids) - 0.5])
        ax.set_ylim([0, max(class_counts) * 1.1])

    # add values above the bars
    for position, count in zip(positions, class_counts):
        ax.text(position, count, str(count), ha='center', va='bottom')

    if ds_encoder is not None:
        le = ds_encoder.get_label_encoder()
        tick_labels = le.inverse_transform(class_ids)
    else:
        tick_labels = class_ids

    # ticks and labels share the positions/order of the bars
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)

    ax.set_title('Number of samples per class - %s\nTotal samples: %d' % (ds_name, n_samples))
    plt.show()

In [None]:
def describe_dataset(ds, ds_name, ds_encoder=None):
    """Print a heading for the dataset, then run every description step.

    The steps are, in order: the textual summary, the per-gene histogram
    and the samples-per-class bar chart. ``ds_encoder`` is only forwarded
    to the class chart.
    """
    heading = "Description of dataset %s" % ds_name
    underline = "==================================="
    for line in (heading, underline):
        print(line)

    summarize_dataset(ds, ds_name)
    plot_hist(ds, ds_name)
    plot_classes_repartition(ds, ds_name, ds_encoder)

## Golub 99

In [None]:
from datasets.Golub99.GolubDataset import GolubDataset
from datasets.DatasetEncoder import DatasetEncoder

# Load the Golub 99 dataset through its project-local loader
ds = GolubDataset()

# Encode the dataset's string classes into numbers. The encoder is kept
# because the class plot uses it to turn the numbers back into names.
ds_encoder = DatasetEncoder(ds)
ds = ds_encoder.encode()

# Print the summary and draw the plots for the encoded dataset
describe_dataset(ds, "Golub 99", ds_encoder)

In [None]:
def plot_hist_two_feat(ds, ds_name, ds_encoder, feat_indices=(10, 100, 1000), bins=10):
    """Overlay the value histograms of a few selected features.

    Despite its name, the function plots one histogram per entry of
    ``feat_indices`` (three by default). Each histogram shows the raw
    values of one feature column across the rows of ``ds.get_X()``.

    Args:
        ds: object exposing ``get_X()``; the result is indexed as
            ``X[:, column]``, so it must support 2-D indexing.
        ds_name: unused; kept so existing callers keep working.
        ds_encoder: unused; kept so existing callers keep working.
        feat_indices: column indices of the features to plot. The
            defaults reproduce the previously hard-coded columns.
        bins: number of bins used for every histogram.

    Raises:
        IndexError: if an index in ``feat_indices`` is outside the
            column range of the feature matrix.
    """
    X = ds.get_X()

    # Extract every column up front so that an invalid index fails
    # before any figure is created.
    columns = [X[:, column] for column in feat_indices]

    plt.figure(figsize=(8, 4))
    for rank, values in enumerate(columns):
        plt.hist(values, bins=bins, alpha=0.5, label='feat%d' % rank)
    plt.legend(loc='best')
    plt.show()
    
# Compare the value distributions of a few features of the Golub 99 dataset
plot_hist_two_feat(ds, "Golub 99", ds_encoder)

In [None]:
# Display the values of the feature at column index 1 for every sample
ds.get_X()[:, 1]

## EGEOD22619

In [None]:
from datasets.EGEOD22619.EGEOD22619Dataset import EGEOD22619Dataset
from datasets.DatasetEncoder import DatasetEncoder

# Load the EGEOD22619 dataset through its project-local loader
ds = EGEOD22619Dataset()

# Encode the dataset's string classes into numbers. The encoder is kept
# because the class plot uses it to turn the numbers back into names.
ds_encoder = DatasetEncoder(ds)
ds = ds_encoder.encode()

# Print the summary and draw the plots for the encoded dataset
describe_dataset(ds, "EGEOD22619", ds_encoder)

## MILE

In [None]:
from datasets.MILE.MileDataset import MileDataset
from datasets.DatasetEncoder import DatasetEncoder
from datasets.DatasetBalancer import DatasetBalancer
 
# Alternative loader call kept for quick experiments (presumably caps the
# number of loaded samples -- confirm against MileDataset):
#ds = MileDataset(samples_limit=200)
ds = MileDataset()

# Encode the dataset's string classes into numbers. The encoder is kept
# because the class plot uses it to turn the numbers back into names.
ds_encoder = DatasetEncoder(ds)
ds = ds_encoder.encode()

# Describe the dataset as loaded, before any balancing
describe_dataset(ds, "MILE", ds_encoder)


# Balance the classes using random oversampling
ds_balancer = DatasetBalancer(ds)
ds = ds_balancer.balance()

# Describe the dataset again to show the effect of the balancing.
# NOTE: both descriptions are printed under the same "MILE" name.
describe_dataset(ds, "MILE", ds_encoder)
