In [129]:
from importlib import reload
import dataset as d; reload(d)
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import ipywidgets as widgets
from ipywidgets import interact, FloatSlider

In [130]:
# Load the data through the local `dataset` module (imported above as `d`).
# NOTE(review): presumably returns a pandas DataFrame — confirm in dataset.py.
df = d.load_database()

In [131]:
# Columns passed to `create_xy` as the prediction targets.
PREDICTED_COLUMNS = ['diagnosis', 'radius_mean']
# String passed to `create_xy` to pick the feature columns; the exact matching
# rule is defined in dataset.create_xy.
FEATURES_PREFIX = 'mean'

# Split the loaded data into features (x) and targets (y).
x, y = d.create_xy(df, FEATURES_PREFIX, PREDICTED_COLUMNS)

# Number of feature columns in x.
N_FEATURES = x.shape[1]

In [132]:
# `diagnosis` is used here and in the interactive callback further down, but it
# was never assigned anywhere in this notebook, so a fresh kernel stopped with a
# NameError on this cell. Define it explicitly.
# NOTE(review): assumes df has a 'diagnosis' column and that create_xy keeps
# df's row index, so the boolean masks line up with x and y — confirm.
diagnosis = df['diagnosis']

# Rows labelled 'B' are the ones treated as healthy here.
healthy_mask = diagnosis == 'B'
x_healthy, y_healthy = x[healthy_mask], y[healthy_mask]

# Comparing Outlier detection methods

- Which features am I taking into account to detect outliers? 
    - Univariate 
        - Box plots, Z-Scores
    - Multivariate
- Can I assume a distribution (or several distributions) of values for my selected features?
    - Parametric 
        - Z-Score, IQR score
    - Non-parametric
        - PCA, LMS, proximity-based models

### Sources of outliers
* Data entry errors
* Measurement errors (instrument errors)
* Experimental errors (data extraction or experiment planning/executing errors)
* Natural variation (not an error; genuine novelties in the data)

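### IQR score

A feature value is a candidate outlier when it lies below $Q_1 - k \cdot IQR$ or above $Q_3 + k \cdot IQR$, where $IQR = Q_3 - Q_1$. A row is flagged as an outlier when the share of its features that are candidates reaches the chosen vote threshold.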

In [133]:
# Per-feature first and third quartiles of x, and their difference (the
# interquartile range). The interactive cell below builds its fences from these.
q1, q3 = (x.quantile(level) for level in (0.25, 0.75))
IQR = q3 - q1

In [159]:
from mpl_toolkits.mplot3d import Axes3D

In [171]:
from sklearn.decomposition import PCA
# Two-component PCA; the callback below uses it to project the features onto a
# plane for the scatter plot.
pca = PCA(n_components=2)

# Both sliders still start at 0 (their minimum), as before. They now carry
# descriptions so the two controls can be told apart, and they only fire when
# the handle is released, so the callback (PCA fit + redraw) is not re-run for
# every intermediate position while dragging.
_slider_style = {'description_width': 'initial'}

k = FloatSlider(
    value=0.0, min=0, max=2.0, step=.05,
    description='k (IQR multiplier)',
    style=_slider_style,
    continuous_update=False,
)
threshold_vote_outlier = FloatSlider(
    value=0.0, min=0, max=1, step=.01,
    description='Vote threshold',
    style=_slider_style,
    continuous_update=False,
)

ui = widgets.HBox([k, threshold_vote_outlier])

def plot(x, outliers_idx, real_outlier_index,
         title='Predicted vs. real outliers', xlabel='Component 1', ylabel='Component 2'):
    """Scatter-plot 2-D points, colouring inliers, predicted outliers and real outliers.

    Parameters
    ----------
    x : array-like
        Points to draw; column 0 is used as the horizontal coordinate and
        column 1 as the vertical one.
    outliers_idx : boolean array-like
        Mask over the rows of ``x``: True marks a predicted outlier. Its
        negation selects the inliers.
    real_outlier_index : array-like
        Row selector for the real outliers; passed to NumPy indexing as given
        (the caller in this notebook passes a boolean mask).
    title, xlabel, ylabel : str, optional
        Text for the figure title and the axis labels.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    # Convert once to plain arrays so the indexing below does not depend on how
    # NumPy coerces pandas objects placed inside an index tuple.
    points = np.asarray(x)
    predicted = np.asarray(outliers_idx, dtype=bool)
    actual = np.asarray(real_outlier_index)

    fig, ax = plt.subplots(figsize=(16, 10))
    ax.scatter(points[~predicted, 0], points[~predicted, 1], alpha=0.4, label='Inliers', c='r')
    ax.scatter(points[predicted, 0], points[predicted, 1], alpha=0.5, label='Predicted outliers', c='g')
    # Drawn last and more transparent, so it overlays the two groups above.
    ax.scatter(points[actual, 0], points[actual, 1], alpha=0.2, label='Real outliers', c='b')

    ax.set(title=title, xlabel=xlabel, ylabel=ylabel)
    ax.legend(loc='upper left')
    ax.grid(True)
    plt.show()
    
def f(k, threshold_vote_outlier):
    """Flag rows of ``x`` as outliers by a per-feature IQR vote, report, and plot.

    A cell of ``x`` is a candidate when it lies below ``q1 - k * IQR`` or above
    ``q3 + k * IQR`` for its column. A row is flagged as an outlier when the
    fraction of its cells that are candidates is at least
    ``threshold_vote_outlier``. Rows whose ``diagnosis`` equals 'M' are treated
    as the real outliers for the agreement figure and for the plot.

    Reads the notebook-level names ``x``, ``q1``, ``q3``, ``IQR``, ``pca``,
    ``diagnosis`` and ``plot``.

    Parameters
    ----------
    k : float
        Multiplier applied to the IQR when building the lower and upper fences.
    threshold_vote_outlier : float
        Minimum fraction of candidate cells for a row to be flagged.

    Returns
    -------
    None. Prints two summary lines and draws the scatter plot.
    """
    # 2-D projection used only for display.
    x_pca = pca.fit_transform(x.values)

    lower_fence = q1 - k * IQR
    upper_fence = q3 + k * IQR
    outliers_candidates = (x < lower_fence) | (x > upper_fence)

    # Row-wise share of candidate cells, computed in one vectorised pass
    # instead of a Python-level lambda per row.
    outliers_idx = outliers_candidates.mean(axis=1) >= threshold_vote_outlier

    real_outliers = diagnosis == 'M'
    # Share of rows where the prediction matches the real label.
    agreement = (outliers_idx == real_outliers).mean()

    print('# Of Outliers %s' % outliers_idx.sum())
    # The label says "%", so report an actual percentage rather than a 0-1 fraction.
    print('%% Of correct outlier predictions: %.1f' % (100 * agreement))
    plot(x_pca, outliers_idx, real_outliers)
    

# Bind each slider to the callback argument of the same name, then show the
# slider row together with the output area the callback renders into.
out = widgets.interactive_output(
    f,
    dict(k=k, threshold_vote_outlier=threshold_vote_outlier),
)
display(ui, out)

In [97]:
# Current value of the IQR-multiplier slider `k`. It reflects wherever the
# slider was last left, so the number shown is not reproducible from code alone.
k.value

0.6