In [1]:
import warnings
warnings.filterwarnings('ignore')
    
import os
import traceback
import sys
from time import time

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np 

from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
%matplotlib inline

from visualization.plots import plots
from visualization.plots import post_plots

In [2]:
# Full candidate pool, kept for reference (currently disabled):
#   breast_cancer, Carcinom, chin, quality_control, CLL_SUB_111, SRBCT,
#   Lymphoma, GLIOMA, CNS, colon, Data_Cortex_Nuclear, Leukemia, LSVT,
#   Prostate_GE, lung, MLL, prostate_cancer, ovarian, SMK_CAN_187,
#   TOX_171, GLI_85

# Datasets processed by the cells below; each entry is the stem of a
# '../data/<name>_clean.csv' file.
names = [
    'CLL_SUB_111',
    'CNS',
    'colon',
    'GLI_85',
    'GLIOMA',
    'Leukemia',
    'lung',
    'Lymphoma',
    'MLL',
    'ovarian',
    'Prostate_GE',
    'SRBCT',
    'TOX_171',
]


# Create 4 t-SNE and PCA plots

In [None]:
# Call the project's plots() helper once per dataset, handing it the dataset
# name and the path of its cleaned CSV.
for name in names:
    csv_path = '../data/' + name + '_clean.csv'
    plots(name, csv_path)

# Create CSV with Dataset Info

In [None]:
def ib(counts, n_samples=None):
    """Compute a multi-class imbalance coefficient from per-class sample counts.

    The returned value is::

        1 - 1 / (((k - 1) / k) * sum(c_i / (n - c_i)))

    where ``k`` is the number of classes, ``c_i`` the size of class ``i`` and
    ``n`` the total number of samples. Perfectly balanced classes yield 0.

    Parameters
    ----------
    counts : iterable of numbers
        Number of samples in each class.
    n_samples : number, optional
        Total number of samples. Defaults to ``sum(counts)``. Earlier versions
        of this helper read the total from a notebook-level ``X`` instead,
        which made the result depend on hidden state.

    Returns
    -------
    float
        The imbalance coefficient.

    Raises
    ------
    ZeroDivisionError
        With plain Python numbers, when fewer than two classes are given or
        when a single class accounts for all ``n`` samples.
    """
    class_sizes = list(counts)
    num_class = len(class_sizes)

    # Derive the total from the counts themselves unless the caller overrides it.
    total = sum(class_sizes) if n_samples is None else n_samples

    # Ratio of each class to the rest of the data, accumulated over all classes.
    ratio_sum = sum(c / (total - c) for c in class_sizes)

    coef = (num_class - 1) / num_class

    return 1 - (1 / (coef * ratio_sum))

In [None]:
# Build one summary record per dataset and write them all to a single CSV.
INFO_COLUMNS = ['name', 'no_instances', 'no_features', 'no_classes', 'sbr', 'ir']
STATS_DIR = './stats'

records = []
for name in names:
    df = pd.read_csv('../data/' + name + '_clean.csv')

    # features and target var
    # NOTE: `X` is intentionally kept as a notebook-level name; helper code in
    # this notebook (ib) may look it up from the global namespace.
    X = df.drop('class', axis=1)
    y = df['class']

    # instances (rows) and features (columns) of the feature matrix
    inst, feat = X.shape

    # per-class sample counts, computed once
    counts = y.value_counts().tolist()

    records.append({
        'name': name,
        'no_instances': inst,
        'no_features': feat,
        # number of distinct classes in the dataset
        'no_classes': y.nunique(),
        # average number of points per dimension as percent
        'sbr': round((inst / feat) * 100, 2),
        # imbalance ratio
        'ir': round(ib(counts), 4),
    })

# Passing `columns` keeps the header stable even when `names` is empty.
dataset_info = pd.DataFrame(records, columns=INFO_COLUMNS)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(STATS_DIR, exist_ok=True)
dataset_info.to_csv(os.path.join(STATS_DIR, 'dataset_info.csv'), index=False)

In [None]:
# Print, for every dataset, its name followed by the class labels and the
# number of samples carrying each label.
for name in names:
    df = pd.read_csv('../data/' + name + '_clean.csv')

    # split the frame into the label column and the feature matrix
    y = df['class']
    X = df.drop('class', axis=1)

    # how many distinct labels the dataset has
    num_of_classes = y.nunique()

    # labels and their frequencies, taken from a single value_counts() call
    # so both lists share the same ordering
    label_counts = y.value_counts()
    values = label_counts.keys().tolist()
    counts = label_counts.tolist()

    print(name, values, counts, sep='\n')

# Post-Feature-Selection (FS) t-SNE

In [None]:
# Dataset and feature-selection algorithm whose saved output is plotted.
name = 'TOX_171'
alg_name = 'RELIEFF'

# Only the first row of the results file is read, with no header row.
df = pd.read_csv('./results/' + name + '/' + alg_name + '_selected_feat.csv', nrows=1, header=None)

# Keep the first 200 entries of that row and turn each into a string
# (presumably feature/column identifiers expected by post_plots - confirm).
# Alternative cut-off kept for reference: the leading 80% of the row,
# i.e. results[:int(len(results) * .8)].
results = [str(i) for i in df.values.tolist()[0][:200]]

post_plots(name, '../data/' + name + '_clean.csv', results)