In [1]:
__author__ = 'Tilii: https://kaggle.com/tilii7'


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import os
import time
from datetime import timedelta
from matplotlib.ticker import MultipleLocator

import warnings
with warnings.catch_warnings():
    warnings.filterwarnings('ignore',category=DeprecationWarning)
    import pandas as pd
    import numpy as np
    from datetime import datetime
    from sklearn.ensemble import IsolationForest
    from sklearn.model_selection import cross_val_predict
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.manifold import TSNE
    import pprint



In [2]:
# constants
DATA_PATH = '/kaggle/dev/mercedes-benz-greener-manufacturing-data/raw_data/'
SUBMISSION_PATH = '/kaggle/dev/mercedes-benz-greener-manufacturing-data/submissions'
ENSEMBLE_PATH = '/kaggle/dev/mercedes-benz-greener-manufacturing-data/ensemble/jsardinha/'
ASSET_PATH = '/kaggle/dev/jovan/mercedes-benz-greener-manufacturing/mercedes-benz-greener-manufacturing/assets/'

threshold_list = [[0.0, '0-0'], [0.5, '0-5'], [1.0, '1-0'], [1.5, '1-5'], [2.0, '2-0'],[2.5, '2-5'], [3.0, '3-0'],
                  [3.5, '3-5'], [4.0, '4-0'],[4.5, '4-5']]

In [None]:
# from https://stackoverflow.com/questions/22354094/pythonic-way-of-detecting-outliers-in-one-dimensional-observation-data
def is_outlier(points, thresh=3.5):
    '''
    Returns a boolean array with True if points are outliers and False
    otherwise.

    Parameters:
    -----------
        points : An numobservations by numdimensions array of observations
        thresh : The modified z-score to use as a threshold. Observations with
            a modified z-score (based on the median absolute deviation) greater
            than this value will be classified as outliers.

    Returns:
    --------
        mask : A numobservations-length boolean array.

    References:
    ----------
        Boris Iglewicz and David Hoaglin (1993), 'Volume 16: How to Detect and
        Handle Outliers', The ASQC Basic References in Quality Control:
        Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
    '''
    if len(points.shape) == 1:
        points = points[:,None]
    median = np.median(points, axis=0)
    diff = np.sum((points - median)**2, axis=-1)
    diff = np.sqrt(diff)
    med_abs_deviation = np.median(diff)

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return (modified_z_score, (modified_z_score > thresh) )

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        tmin, tsec = divmod((datetime.now() - start_time).total_seconds(), 60)
        print(' Time taken: %i minutes and %s seconds.' % (tmin, round(tsec,2)))

if __name__ == '__main__':
    
    for i , threshold in enumerate(threshold_list):
        print('------ RUNNING for treshold:', threshold[0])
        RFR = RandomForestRegressor(n_estimators=100)
        tsne = TSNE(n_components=2, n_iter_without_progress=50, init='pca', verbose=2, random_state=1001)

    # Load data set and target values
        start_time = timer(None)
        print('\n# Reading and Processing Data')
        train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'), dtype={'ID': np.int32, 'y': np.float32})
        target = train['y'].values
        train_ids = train['ID'].values
        train = train.drop(['ID', 'y'], axis=1)
        print('\n Initial Train Set Matrix Dimensions: %d x %d' % (train.shape[0], train.shape[1]))
        train_len = len(train)
        test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'), dtype={'ID': np.int32})
        test_ids = test['ID'].values
        test = test.drop(['ID'], axis=1)
        print('\n Initial Test Set Matrix Dimensions: %d x %d' % (test.shape[0], test.shape[1]))

    # Sort out numerical and categorical features
        all_data = pd.concat((train, test))
        numeric_feats = all_data.dtypes[all_data.dtypes != 'object'].index
        categorical_feats = all_data.dtypes[all_data.dtypes == 'object'].index

        print('\n Converting categorical features:')
        for i, col_name in enumerate(categorical_feats):
            print(' Converting %s' % col_name)
            temp_df = pd.get_dummies(all_data[col_name])
            new_features = temp_df.columns.tolist()
            new_features = [col_name + '_' + w for w in new_features]
            temp_df.columns = new_features
            all_data.drop(col_name, axis=1, inplace=True)
            all_data = pd.concat((all_data, temp_df), axis=1)

    # Remove columns where all data points have the same value
        print('\n Number of columns before cleaning: %d' % len(all_data.columns))
        cols = all_data.columns.tolist()
        for column in cols:
            if len(np.unique(all_data[column])) == 1:
                print(' Column %s removed' % str(column))
                all_data.drop(column, axis=1, inplace=True)

    # Remove identical columns where all data points have the same value
        cols = all_data.columns.tolist()
        remove = []
        for i in range(len(cols)-1):
            v = all_data[cols[i]].values
            for j in range(i+1,len(cols)):
                if np.array_equal(v,all_data[cols[j]].values):
                    remove.append(cols[j])
                    print(' Column %s is identical to %s. Removing %s' % (str(cols[i]), str(cols[j]), str(cols[j])))

        all_data.drop(remove, axis=1, inplace=True)
        print('\n Number of columns after cleaning: %d' % len(all_data.columns))

        features = all_data.columns
        print('\n Final Matrix Dimensions: %d x %d' % (all_data.shape[0], all_data.shape[1]))
        train_data = pd.DataFrame(all_data[ : train_len].values, columns=features)
        test_data = pd.DataFrame(all_data[train_len : ].values, columns=features)
        train_data.reset_index(drop=True, inplace=True)
        test_data.reset_index(drop=True, inplace=True)
        timer(start_time)

        start_time = timer(None)
        print('\n Calculating t-SNE embedding:')
        all_data_tsne = tsne.fit_transform(all_data)
        train_data_tsne = pd.DataFrame(all_data_tsne[ : train_len], columns=['tsne_x','tsne_y'])
        test_data_tsne = pd.DataFrame(all_data_tsne[train_len : ], columns=['tsne_x','tsne_y'])
        train_data_tsne.reset_index(drop=True, inplace=True)
        test_data_tsne.reset_index(drop=True, inplace=True)
        timer(start_time)

    #Running isolation forest to remove outliers
        start_time = timer(None)
        clf = IsolationForest(n_estimators=500, max_samples=1.0, random_state=1001, bootstrap=True, contamination=0.02, verbose=0, n_jobs=-1)
        print('\n Running Isolation Forest:')
        clf.fit(train_data.values, target)
        isof = clf.predict(train_data.values)
        train.insert(0, 'y', target)
        train.insert(0, 'ID', train_ids)
        train['isof'] = isof
        myindex = train['isof'] < 0
        train_IF = train.loc[myindex]
        train_IF.reset_index(drop=True, inplace=True)
        train_IF.drop('isof', axis=1, inplace=True)
        #train_IF.to_csv('train-isof-outliers.csv', index=False)
        test.insert(0, 'ID', test_ids)
        test['isof'] = clf.predict(test_data.values)
        myindex = test['isof'] < 0
        test_IF = test.loc[myindex]
        test_IF.reset_index(drop=True, inplace=True)
        test_IF.drop('isof', axis=1, inplace=True)
        #test_IF.to_csv('test-isof-outliers.csv', index=False)
        print('\n Found %d outlier points' % len(train_IF))
        timer(start_time)

        start_time = timer(None)
        print('\n Running Random Forest Regressor (10-fold):')
        target_pred = cross_val_predict(estimator=RFR, X=train_data.values, y=target, cv=10, n_jobs=-1)
        rfr_pred = pd.DataFrame({'ID': train_ids, 'y': target, 'y_pred': target_pred})
        #rfr_pred.to_csv('prediction-train-oof-10fold-RFR.csv', index=False)
        yvalues = np.vstack((target, target_pred)).transpose()
        OL_score, OL = is_outlier(yvalues, threshold[0])
        train['outlier_score'] = OL_score
        myindex = train['outlier_score'] >= threshold[0]
        train_OL = train.loc[myindex]
        train_OL.reset_index(drop=True, inplace=True)
        train_OL.drop(['isof','outlier_score'], axis=1, inplace=True)
        #train_OL.to_csv('train-outliers.csv', index=False)
        train_OL.to_csv(os.path.join(ASSET_PATH, 'train-outliers_' + threshold[1] + '.csv'), index=False)
        timer(start_time)

        start_time = timer(None)
        train_outliers_tsne = train_data_tsne.loc[myindex]
        test_outliers_tsne = test_data_tsne.values
        outlier_list = []
        for k in range(len(train_outliers_tsne)):
            d = ((test_outliers_tsne-train_outliers_tsne.values[k])**2).sum(axis=1)  # compute distances
            ndx = d.argsort() # sort so that smallest distance is first
            print(' Presumed outlier point for train ID = %d is test ID = %d ; their Euclidean distance from t-SNE embedding is %.8f' % (train_OL.iloc[k]['ID'], test.iloc[ndx[0]]['ID'], d[ndx[0]]))
            outlier_list.append(ndx[0])
            print(' Ten closest test points (ID, distance):')
            pprint.pprint(zip(test.iloc[ndx[:10]]['ID'], d[ndx[:10]]))

        test_OL = test.iloc[outlier_list]
        test_OL.drop(['isof'], axis=1, inplace=True)
        test_OL.sort_values(['ID'], inplace=True)
        test_OL.reset_index(drop=True, inplace=True)
        test_OL.to_csv(os.path.join(ASSET_PATH, 'test-outliers_' + threshold[1] + '.csv'), index=False)

        timer(start_time)

------ RUNNING for treshold: 0.0

# Reading and Processing Data

 Initial Train Set Matrix Dimensions: 4209 x 376

 Initial Test Set Matrix Dimensions: 4209 x 376

 Converting categorical features:
 Converting X0
 Converting X1
 Converting X2
 Converting X3
 Converting X4
 Converting X5
 Converting X6
 Converting X8

 Number of columns before cleaning: 579
 Column X16 is identical to X2_ap. Removing X2_ap
 Column X17 is identical to X382. Removing X382
 Column X23 is identical to X2_f. Removing X2_f
 Column X26 is identical to X2_b. Removing X2_b
 Column X28 is identical to X2_n. Removing X2_n
 Column X30 is identical to X2_ag. Removing X2_ag
 Column X31 is identical to X35. Removing X35
 Column X31 is identical to X37. Removing X37
 Column X32 is identical to X2_a. Removing X2_a
 Column X33 is identical to X39. Removing X39
 Column X35 is identical to X37. Removing X37
 Column X36 is identical to X2_z. Removing X2_z
 Column X44 is identical to X302. Removing X302
 Column X48 is identi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



 Found 85 outlier points
 Time taken: 0 minutes and 7.74 seconds.

 Running Random Forest Regressor (10-fold):


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


 Time taken: 0 minutes and 13.8 seconds.
 Presumed outlier point for train ID = 0 is test ID = 3084 ; their Euclidean distance from t-SNE embedding is 0.00048114
 Ten closest test points (ID, distance):
<zip object at 0x7f5636804548>
 Presumed outlier point for train ID = 6 is test ID = 2392 ; their Euclidean distance from t-SNE embedding is 0.55056130
 Ten closest test points (ID, distance):
<zip object at 0x7f56367eba88>
 Presumed outlier point for train ID = 7 is test ID = 5199 ; their Euclidean distance from t-SNE embedding is 0.05340328
 Ten closest test points (ID, distance):
<zip object at 0x7f56370c65c8>
 Presumed outlier point for train ID = 9 is test ID = 4648 ; their Euclidean distance from t-SNE embedding is 0.12928096
 Ten closest test points (ID, distance):
<zip object at 0x7f56367f1c88>
 Presumed outlier point for train ID = 13 is test ID = 6676 ; their Euclidean distance from t-SNE embedding is 0.00664494
 Ten closest test points (ID, distance):
<zip object at 0x7f56367

 Presumed outlier point for train ID = 197 is test ID = 5595 ; their Euclidean distance from t-SNE embedding is 0.02142416
 Ten closest test points (ID, distance):
<zip object at 0x7f56367ace88>
 Presumed outlier point for train ID = 200 is test ID = 7242 ; their Euclidean distance from t-SNE embedding is 0.00000012
 Ten closest test points (ID, distance):
<zip object at 0x7f56367ace88>
 Presumed outlier point for train ID = 202 is test ID = 1054 ; their Euclidean distance from t-SNE embedding is 0.00000040
 Ten closest test points (ID, distance):
<zip object at 0x7f56367acc08>
 Presumed outlier point for train ID = 203 is test ID = 4739 ; their Euclidean distance from t-SNE embedding is 0.00000534
 Ten closest test points (ID, distance):
<zip object at 0x7f56367b9288>
 Presumed outlier point for train ID = 207 is test ID = 117 ; their Euclidean distance from t-SNE embedding is 0.00000009
 Ten closest test points (ID, distance):
<zip object at 0x7f56367aca48>
 Presumed outlier point fo

 Presumed outlier point for train ID = 389 is test ID = 3770 ; their Euclidean distance from t-SNE embedding is 0.00937270
 Ten closest test points (ID, distance):
<zip object at 0x7f56367ae7c8>
 Presumed outlier point for train ID = 391 is test ID = 6402 ; their Euclidean distance from t-SNE embedding is 0.00111080
 Ten closest test points (ID, distance):
<zip object at 0x7f56367b9e48>
 Presumed outlier point for train ID = 394 is test ID = 1590 ; their Euclidean distance from t-SNE embedding is 0.57158130
 Ten closest test points (ID, distance):
<zip object at 0x7f56367ae308>
 Presumed outlier point for train ID = 397 is test ID = 939 ; their Euclidean distance from t-SNE embedding is 0.00000238
 Ten closest test points (ID, distance):
<zip object at 0x7f56367ae688>
 Presumed outlier point for train ID = 398 is test ID = 5846 ; their Euclidean distance from t-SNE embedding is 0.01014796
 Ten closest test points (ID, distance):
<zip object at 0x7f56367aedc8>
 Presumed outlier point fo

 Ten closest test points (ID, distance):
<zip object at 0x7f56367ba288>
 Presumed outlier point for train ID = 568 is test ID = 3943 ; their Euclidean distance from t-SNE embedding is 0.00084964
 Ten closest test points (ID, distance):
<zip object at 0x7f56367ba248>
 Presumed outlier point for train ID = 569 is test ID = 6053 ; their Euclidean distance from t-SNE embedding is 0.00018057
 Ten closest test points (ID, distance):
<zip object at 0x7f5636728788>
 Presumed outlier point for train ID = 574 is test ID = 3761 ; their Euclidean distance from t-SNE embedding is 0.00016007
 Ten closest test points (ID, distance):
<zip object at 0x7f5636728808>
 Presumed outlier point for train ID = 578 is test ID = 6594 ; their Euclidean distance from t-SNE embedding is 0.00044331
 Ten closest test points (ID, distance):
<zip object at 0x7f56367bac88>
 Presumed outlier point for train ID = 582 is test ID = 368 ; their Euclidean distance from t-SNE embedding is 0.01865451
 Ten closest test points (

 Presumed outlier point for train ID = 749 is test ID = 1697 ; their Euclidean distance from t-SNE embedding is 0.00000160
 Ten closest test points (ID, distance):
<zip object at 0x7f56367c2b08>
 Presumed outlier point for train ID = 750 is test ID = 1319 ; their Euclidean distance from t-SNE embedding is 0.00000902
 Ten closest test points (ID, distance):
<zip object at 0x7f56367c2108>
 Presumed outlier point for train ID = 752 is test ID = 3746 ; their Euclidean distance from t-SNE embedding is 0.00071050
 Ten closest test points (ID, distance):
<zip object at 0x7f5636773b88>
 Presumed outlier point for train ID = 753 is test ID = 7890 ; their Euclidean distance from t-SNE embedding is 0.00741298
 Ten closest test points (ID, distance):
<zip object at 0x7f5636773948>
 Presumed outlier point for train ID = 755 is test ID = 267 ; their Euclidean distance from t-SNE embedding is 0.00343545
 Ten closest test points (ID, distance):
<zip object at 0x7f5636773b88>
 Presumed outlier point fo

 Presumed outlier point for train ID = 917 is test ID = 731 ; their Euclidean distance from t-SNE embedding is 0.00346777
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier point for train ID = 919 is test ID = 3176 ; their Euclidean distance from t-SNE embedding is 0.04562208
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a48>
 Presumed outlier point for train ID = 920 is test ID = 4437 ; their Euclidean distance from t-SNE embedding is 0.00161952
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740c08>
 Presumed outlier point for train ID = 922 is test ID = 43 ; their Euclidean distance from t-SNE embedding is 0.00000006
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740a08>
 Presumed outlier point for train ID = 923 is test ID = 6526 ; their Euclidean distance from t-SNE embedding is 0.00059373
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c48>
 Presumed outlier point for 

<zip object at 0x7f5636741bc8>
 Presumed outlier point for train ID = 1122 is test ID = 1152 ; their Euclidean distance from t-SNE embedding is 0.00000043
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741dc8>
 Presumed outlier point for train ID = 1123 is test ID = 3386 ; their Euclidean distance from t-SNE embedding is 0.00040917
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c88>
 Presumed outlier point for train ID = 1127 is test ID = 3386 ; their Euclidean distance from t-SNE embedding is 0.00036954
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740588>
 Presumed outlier point for train ID = 1128 is test ID = 1330 ; their Euclidean distance from t-SNE embedding is 0.00000173
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740fc8>
 Presumed outlier point for train ID = 1129 is test ID = 1380 ; their Euclidean distance from t-SNE embedding is 0.00000224
 Ten closest test points (ID, distance):
<zip object at 0x7f5

 Presumed outlier point for train ID = 1300 is test ID = 399 ; their Euclidean distance from t-SNE embedding is 0.00511150
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e88>
 Presumed outlier point for train ID = 1302 is test ID = 2866 ; their Euclidean distance from t-SNE embedding is 0.00601796
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741848>
 Presumed outlier point for train ID = 1303 is test ID = 1257 ; their Euclidean distance from t-SNE embedding is 0.00000004
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e48>
 Presumed outlier point for train ID = 1304 is test ID = 3655 ; their Euclidean distance from t-SNE embedding is 0.00134779
 Ten closest test points (ID, distance):
<zip object at 0x7f56367412c8>
 Presumed outlier point for train ID = 1305 is test ID = 7148 ; their Euclidean distance from t-SNE embedding is 0.00332594
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741848>
 Presumed outlier poi

 Ten closest test points (ID, distance):
<zip object at 0x7f5636740c08>
 Presumed outlier point for train ID = 1504 is test ID = 3884 ; their Euclidean distance from t-SNE embedding is 0.00023658
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c48>
 Presumed outlier point for train ID = 1505 is test ID = 728 ; their Euclidean distance from t-SNE embedding is 0.00008479
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741808>
 Presumed outlier point for train ID = 1506 is test ID = 1820 ; their Euclidean distance from t-SNE embedding is 0.00027954
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741b48>
 Presumed outlier point for train ID = 1510 is test ID = 5104 ; their Euclidean distance from t-SNE embedding is 0.00002990
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741d88>
 Presumed outlier point for train ID = 1515 is test ID = 5658 ; their Euclidean distance from t-SNE embedding is 0.00000122
 Ten closest test poi

 Presumed outlier point for train ID = 1715 is test ID = 1711 ; their Euclidean distance from t-SNE embedding is 0.00000747
 Ten closest test points (ID, distance):
<zip object at 0x7f56367410c8>
 Presumed outlier point for train ID = 1716 is test ID = 2277 ; their Euclidean distance from t-SNE embedding is 0.00186505
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e48>
 Presumed outlier point for train ID = 1718 is test ID = 5076 ; their Euclidean distance from t-SNE embedding is 0.00001653
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier point for train ID = 1721 is test ID = 1844 ; their Euclidean distance from t-SNE embedding is 0.00144074
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier point for train ID = 1723 is test ID = 6806 ; their Euclidean distance from t-SNE embedding is 0.00003374
 Ten closest test points (ID, distance):
<zip object at 0x7f5636773408>
 Presumed outlier po

 Presumed outlier point for train ID = 1904 is test ID = 267 ; their Euclidean distance from t-SNE embedding is 0.00078926
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e88>
 Presumed outlier point for train ID = 1907 is test ID = 4389 ; their Euclidean distance from t-SNE embedding is 0.00006288
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740f48>
 Presumed outlier point for train ID = 1908 is test ID = 8125 ; their Euclidean distance from t-SNE embedding is 0.00270358
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741748>
 Presumed outlier point for train ID = 1909 is test ID = 2037 ; their Euclidean distance from t-SNE embedding is 0.00006004
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e08>
 Presumed outlier point for train ID = 1910 is test ID = 1189 ; their Euclidean distance from t-SNE embedding is 0.00000001
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741b08>
 Presumed outlier poi

 Presumed outlier point for train ID = 2095 is test ID = 3021 ; their Euclidean distance from t-SNE embedding is 0.00518787
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier point for train ID = 2096 is test ID = 573 ; their Euclidean distance from t-SNE embedding is 0.00000428
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e88>
 Presumed outlier point for train ID = 2097 is test ID = 2604 ; their Euclidean distance from t-SNE embedding is 0.00001738
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f48>
 Presumed outlier point for train ID = 2099 is test ID = 76 ; their Euclidean distance from t-SNE embedding is 0.00831297
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier point for train ID = 2100 is test ID = 2010 ; their Euclidean distance from t-SNE embedding is 0.00001960
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741d48>
 Presumed outlier point

 Presumed outlier point for train ID = 2285 is test ID = 3134 ; their Euclidean distance from t-SNE embedding is 0.00001541
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741b08>
 Presumed outlier point for train ID = 2286 is test ID = 3716 ; their Euclidean distance from t-SNE embedding is 0.00026837
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f48>
 Presumed outlier point for train ID = 2287 is test ID = 711 ; their Euclidean distance from t-SNE embedding is 0.02026709
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741808>
 Presumed outlier point for train ID = 2290 is test ID = 2665 ; their Euclidean distance from t-SNE embedding is 0.00084385
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741dc8>
 Presumed outlier point for train ID = 2293 is test ID = 7161 ; their Euclidean distance from t-SNE embedding is 0.00000733
 Ten closest test points (ID, distance):
<zip object at 0x7f56367416c8>
 Presumed outlier poi

 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier point for train ID = 2475 is test ID = 1629 ; their Euclidean distance from t-SNE embedding is 0.00012565
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a08>
 Presumed outlier point for train ID = 2476 is test ID = 942 ; their Euclidean distance from t-SNE embedding is 0.00434927
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f48>
 Presumed outlier point for train ID = 2477 is test ID = 7286 ; their Euclidean distance from t-SNE embedding is 0.00000030
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier point for train ID = 2479 is test ID = 3002 ; their Euclidean distance from t-SNE embedding is 0.00675555
 Ten closest test points (ID, distance):
<zip object at 0x7f56367416c8>
 Presumed outlier point for train ID = 2482 is test ID = 1578 ; their Euclidean distance from t-SNE embedding is 0.00419393
 Ten closest test poi

 Ten closest test points (ID, distance):
<zip object at 0x7f56367b9188>
 Presumed outlier point for train ID = 2660 is test ID = 2572 ; their Euclidean distance from t-SNE embedding is 0.00000015
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e88>
 Presumed outlier point for train ID = 2661 is test ID = 7740 ; their Euclidean distance from t-SNE embedding is 0.26210697
 Ten closest test points (ID, distance):
<zip object at 0x7f56367411c8>
 Presumed outlier point for train ID = 2666 is test ID = 1201 ; their Euclidean distance from t-SNE embedding is 0.00056180
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741ac8>
 Presumed outlier point for train ID = 2667 is test ID = 2123 ; their Euclidean distance from t-SNE embedding is 0.00099613
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a88>
 Presumed outlier point for train ID = 2668 is test ID = 413 ; their Euclidean distance from t-SNE embedding is 0.00009907
 Ten closest test poi

 Ten closest test points (ID, distance):
<zip object at 0x7f5636740788>
 Presumed outlier point for train ID = 2859 is test ID = 8154 ; their Euclidean distance from t-SNE embedding is 0.04679776
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier point for train ID = 2860 is test ID = 2868 ; their Euclidean distance from t-SNE embedding is 0.03069255
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e48>
 Presumed outlier point for train ID = 2862 is test ID = 2890 ; their Euclidean distance from t-SNE embedding is 0.00001117
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741048>
 Presumed outlier point for train ID = 2863 is test ID = 370 ; their Euclidean distance from t-SNE embedding is 0.00413625
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e88>
 Presumed outlier point for train ID = 2869 is test ID = 7976 ; their Euclidean distance from t-SNE embedding is 0.00882892
 Ten closest test poi

 Presumed outlier point for train ID = 3060 is test ID = 1246 ; their Euclidean distance from t-SNE embedding is 0.00004917
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741b88>
 Presumed outlier point for train ID = 3062 is test ID = 4025 ; their Euclidean distance from t-SNE embedding is 0.00000761
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a48>
 Presumed outlier point for train ID = 3064 is test ID = 7366 ; their Euclidean distance from t-SNE embedding is 0.00333171
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a08>
 Presumed outlier point for train ID = 3065 is test ID = 471 ; their Euclidean distance from t-SNE embedding is 0.00158249
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741048>
 Presumed outlier point for train ID = 3070 is test ID = 3715 ; their Euclidean distance from t-SNE embedding is 0.00002108
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a88>
 Presumed outlier poi

 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier point for train ID = 3278 is test ID = 3132 ; their Euclidean distance from t-SNE embedding is 0.00002870
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f48>
 Presumed outlier point for train ID = 3281 is test ID = 5934 ; their Euclidean distance from t-SNE embedding is 0.00024554
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c08>
 Presumed outlier point for train ID = 3282 is test ID = 3192 ; their Euclidean distance from t-SNE embedding is 0.00000004
 Ten closest test points (ID, distance):
<zip object at 0x7f56367411c8>
 Presumed outlier point for train ID = 3284 is test ID = 7554 ; their Euclidean distance from t-SNE embedding is 0.00016177
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e88>
 Presumed outlier point for train ID = 3286 is test ID = 2481 ; their Euclidean distance from t-SNE embedding is 0.00000000
 Ten closest test po

 Presumed outlier point for train ID = 3448 is test ID = 1029 ; their Euclidean distance from t-SNE embedding is 0.00064529
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a88>
 Presumed outlier point for train ID = 3450 is test ID = 4484 ; their Euclidean distance from t-SNE embedding is 0.00008681
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741fc8>
 Presumed outlier point for train ID = 3451 is test ID = 3562 ; their Euclidean distance from t-SNE embedding is 0.00000000
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a48>
 Presumed outlier point for train ID = 3452 is test ID = 3671 ; their Euclidean distance from t-SNE embedding is 0.00000008
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier point for train ID = 3454 is test ID = 3173 ; their Euclidean distance from t-SNE embedding is 0.00091300
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741fc8>
 Presumed outlier po

 Presumed outlier point for train ID = 3656 is test ID = 3559 ; their Euclidean distance from t-SNE embedding is 0.00000034
 Ten closest test points (ID, distance):
<zip object at 0x7f56367418c8>
 Presumed outlier point for train ID = 3657 is test ID = 6831 ; their Euclidean distance from t-SNE embedding is 0.00025698
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c48>
 Presumed outlier point for train ID = 3658 is test ID = 6071 ; their Euclidean distance from t-SNE embedding is 0.05984924
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741788>
 Presumed outlier point for train ID = 3661 is test ID = 2129 ; their Euclidean distance from t-SNE embedding is 0.00139634
 Ten closest test points (ID, distance):
<zip object at 0x7f56367413c8>
 Presumed outlier point for train ID = 3663 is test ID = 2360 ; their Euclidean distance from t-SNE embedding is 0.00447333
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier po

 Presumed outlier point for train ID = 3826 is test ID = 4465 ; their Euclidean distance from t-SNE embedding is 0.00095249
 Ten closest test points (ID, distance):
<zip object at 0x7f56367419c8>
 Presumed outlier point for train ID = 3827 is test ID = 1611 ; their Euclidean distance from t-SNE embedding is 0.00059784
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f88>
 Presumed outlier point for train ID = 3829 is test ID = 7173 ; their Euclidean distance from t-SNE embedding is 0.00006206
 Ten closest test points (ID, distance):
<zip object at 0x7f56367417c8>
 Presumed outlier point for train ID = 3833 is test ID = 3780 ; their Euclidean distance from t-SNE embedding is 0.00000609
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741d88>
 Presumed outlier point for train ID = 3837 is test ID = 368 ; their Euclidean distance from t-SNE embedding is 0.00028360
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier poi

 Presumed outlier point for train ID = 4019 is test ID = 3515 ; their Euclidean distance from t-SNE embedding is 0.00006831
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f48>
 Presumed outlier point for train ID = 4020 is test ID = 4080 ; their Euclidean distance from t-SNE embedding is 0.00000018
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier point for train ID = 4021 is test ID = 138 ; their Euclidean distance from t-SNE embedding is 0.00013295
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a08>
 Presumed outlier point for train ID = 4022 is test ID = 5256 ; their Euclidean distance from t-SNE embedding is 0.00006462
 Ten closest test points (ID, distance):
<zip object at 0x7f56370c3b88>
 Presumed outlier point for train ID = 4023 is test ID = 4039 ; their Euclidean distance from t-SNE embedding is 0.00025154
 Ten closest test points (ID, distance):
<zip object at 0x7f56370bf788>
 Presumed outlier poi

 Presumed outlier point for train ID = 4189 is test ID = 4232 ; their Euclidean distance from t-SNE embedding is 0.00000000
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741cc8>
 Presumed outlier point for train ID = 4191 is test ID = 5317 ; their Euclidean distance from t-SNE embedding is 0.00018248
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier point for train ID = 4197 is test ID = 4322 ; their Euclidean distance from t-SNE embedding is 0.00664254
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741d88>
 Presumed outlier point for train ID = 4198 is test ID = 1296 ; their Euclidean distance from t-SNE embedding is 0.00417321
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e88>
 Presumed outlier point for train ID = 4199 is test ID = 2991 ; their Euclidean distance from t-SNE embedding is 0.00007729
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier po

 Ten closest test points (ID, distance):
<zip object at 0x7f5636740888>
 Presumed outlier point for train ID = 4375 is test ID = 4196 ; their Euclidean distance from t-SNE embedding is 0.00014260
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c08>
 Presumed outlier point for train ID = 4378 is test ID = 4333 ; their Euclidean distance from t-SNE embedding is 0.03462237
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier point for train ID = 4379 is test ID = 6868 ; their Euclidean distance from t-SNE embedding is 0.07446683
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e88>
 Presumed outlier point for train ID = 4381 is test ID = 2289 ; their Euclidean distance from t-SNE embedding is 0.03565112
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741d48>
 Presumed outlier point for train ID = 4382 is test ID = 6818 ; their Euclidean distance from t-SNE embedding is 0.00000216
 Ten closest test po

 Ten closest test points (ID, distance):
<zip object at 0x7f56367733c8>
 Presumed outlier point for train ID = 4563 is test ID = 3190 ; their Euclidean distance from t-SNE embedding is 0.00000350
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a08>
 Presumed outlier point for train ID = 4564 is test ID = 5912 ; their Euclidean distance from t-SNE embedding is 0.00000403
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741988>
 Presumed outlier point for train ID = 4565 is test ID = 4625 ; their Euclidean distance from t-SNE embedding is 0.00000001
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741748>
 Presumed outlier point for train ID = 4567 is test ID = 3671 ; their Euclidean distance from t-SNE embedding is 0.00000770
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741788>
 Presumed outlier point for train ID = 4568 is test ID = 1729 ; their Euclidean distance from t-SNE embedding is 0.00600230
 Ten closest test po

 Ten closest test points (ID, distance):
<zip object at 0x7f5636740c08>
 Presumed outlier point for train ID = 4768 is test ID = 114 ; their Euclidean distance from t-SNE embedding is 0.00000004
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741748>
 Presumed outlier point for train ID = 4773 is test ID = 3202 ; their Euclidean distance from t-SNE embedding is 0.00025870
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e08>
 Presumed outlier point for train ID = 4775 is test ID = 2624 ; their Euclidean distance from t-SNE embedding is 0.00005928
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a08>
 Presumed outlier point for train ID = 4779 is test ID = 951 ; their Euclidean distance from t-SNE embedding is 0.00002328
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c08>
 Presumed outlier point for train ID = 4781 is test ID = 6831 ; their Euclidean distance from t-SNE embedding is 0.00213750
 Ten closest test poin

 Presumed outlier point for train ID = 4959 is test ID = 4190 ; their Euclidean distance from t-SNE embedding is 0.00026735
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e08>
 Presumed outlier point for train ID = 4961 is test ID = 3979 ; their Euclidean distance from t-SNE embedding is 0.00011255
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f08>
 Presumed outlier point for train ID = 4962 is test ID = 3161 ; their Euclidean distance from t-SNE embedding is 0.00057362
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c08>
 Presumed outlier point for train ID = 4963 is test ID = 5771 ; their Euclidean distance from t-SNE embedding is 0.00816831
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741748>
 Presumed outlier point for train ID = 4964 is test ID = 1527 ; their Euclidean distance from t-SNE embedding is 0.00009552
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f08>
 Presumed outlier po

 Ten closest test points (ID, distance):
<zip object at 0x7f5636741cc8>
 Presumed outlier point for train ID = 5152 is test ID = 1796 ; their Euclidean distance from t-SNE embedding is 0.00113459
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a88>
 Presumed outlier point for train ID = 5153 is test ID = 1044 ; their Euclidean distance from t-SNE embedding is 0.00168437
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e88>
 Presumed outlier point for train ID = 5156 is test ID = 5425 ; their Euclidean distance from t-SNE embedding is 0.00000000
 Ten closest test points (ID, distance):
<zip object at 0x7f56367414c8>
 Presumed outlier point for train ID = 5157 is test ID = 4539 ; their Euclidean distance from t-SNE embedding is 0.15832335
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier point for train ID = 5158 is test ID = 1482 ; their Euclidean distance from t-SNE embedding is 0.95299537
 Ten closest test po

 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier point for train ID = 5335 is test ID = 5232 ; their Euclidean distance from t-SNE embedding is 0.02199692
 Ten closest test points (ID, distance):
<zip object at 0x7f56367416c8>
 Presumed outlier point for train ID = 5337 is test ID = 5466 ; their Euclidean distance from t-SNE embedding is 0.00000030
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740a08>
 Presumed outlier point for train ID = 5339 is test ID = 6926 ; their Euclidean distance from t-SNE embedding is 0.00000208
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740888>
 Presumed outlier point for train ID = 5340 is test ID = 6751 ; their Euclidean distance from t-SNE embedding is 0.00000002
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740888>
 Presumed outlier point for train ID = 5342 is test ID = 1259 ; their Euclidean distance from t-SNE embedding is 0.00004423
 Ten closest test po

 Presumed outlier point for train ID = 5519 is test ID = 4936 ; their Euclidean distance from t-SNE embedding is 0.00000972
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a48>
 Presumed outlier point for train ID = 5520 is test ID = 3011 ; their Euclidean distance from t-SNE embedding is 0.00085393
 Ten closest test points (ID, distance):
<zip object at 0x7f56367417c8>
 Presumed outlier point for train ID = 5523 is test ID = 5676 ; their Euclidean distance from t-SNE embedding is 0.00003456
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c88>
 Presumed outlier point for train ID = 5524 is test ID = 1895 ; their Euclidean distance from t-SNE embedding is 0.00028424
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741b48>
 Presumed outlier point for train ID = 5525 is test ID = 8029 ; their Euclidean distance from t-SNE embedding is 0.00000631
 Ten closest test points (ID, distance):
<zip object at 0x7f56367419c8>
 Presumed outlier po

 Ten closest test points (ID, distance):
<zip object at 0x7f5636740488>
 Presumed outlier point for train ID = 5716 is test ID = 3533 ; their Euclidean distance from t-SNE embedding is 0.02422914
 Ten closest test points (ID, distance):
<zip object at 0x7f56367417c8>
 Presumed outlier point for train ID = 5717 is test ID = 5548 ; their Euclidean distance from t-SNE embedding is 0.00000004
 Ten closest test points (ID, distance):
<zip object at 0x7f56367417c8>
 Presumed outlier point for train ID = 5719 is test ID = 5111 ; their Euclidean distance from t-SNE embedding is 0.00086573
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740fc8>
 Presumed outlier point for train ID = 5720 is test ID = 1016 ; their Euclidean distance from t-SNE embedding is 0.00000107
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740788>
 Presumed outlier point for train ID = 5722 is test ID = 5655 ; their Euclidean distance from t-SNE embedding is 0.00000407
 Ten closest test po

 Presumed outlier point for train ID = 5921 is test ID = 591 ; their Euclidean distance from t-SNE embedding is 0.00031602
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e08>
 Presumed outlier point for train ID = 5922 is test ID = 3226 ; their Euclidean distance from t-SNE embedding is 0.00933576
 Ten closest test points (ID, distance):
<zip object at 0x7f56367416c8>
 Presumed outlier point for train ID = 5923 is test ID = 8280 ; their Euclidean distance from t-SNE embedding is 0.00591470
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c08>
 Presumed outlier point for train ID = 5925 is test ID = 4987 ; their Euclidean distance from t-SNE embedding is 0.00000001
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a48>
 Presumed outlier point for train ID = 5927 is test ID = 4038 ; their Euclidean distance from t-SNE embedding is 0.00000095
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e48>
 Presumed outlier poi

 Ten closest test points (ID, distance):
<zip object at 0x7f5636740788>
 Presumed outlier point for train ID = 6122 is test ID = 2751 ; their Euclidean distance from t-SNE embedding is 0.00000000
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e08>
 Presumed outlier point for train ID = 6124 is test ID = 7633 ; their Euclidean distance from t-SNE embedding is 0.00002915
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier point for train ID = 6126 is test ID = 7631 ; their Euclidean distance from t-SNE embedding is 0.00228269
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e48>
 Presumed outlier point for train ID = 6127 is test ID = 5656 ; their Euclidean distance from t-SNE embedding is 0.00003803
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c88>
 Presumed outlier point for train ID = 6129 is test ID = 5460 ; their Euclidean distance from t-SNE embedding is 0.00009598
 Ten closest test po

 Presumed outlier point for train ID = 6301 is test ID = 6059 ; their Euclidean distance from t-SNE embedding is 0.00091739
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741748>
 Presumed outlier point for train ID = 6302 is test ID = 5229 ; their Euclidean distance from t-SNE embedding is 0.00000072
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741ec8>
 Presumed outlier point for train ID = 6303 is test ID = 8046 ; their Euclidean distance from t-SNE embedding is 0.02241083
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c88>
 Presumed outlier point for train ID = 6305 is test ID = 4012 ; their Euclidean distance from t-SNE embedding is 0.00027716
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier point for train ID = 6306 is test ID = 6407 ; their Euclidean distance from t-SNE embedding is 0.00006478
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741988>
 Presumed outlier po

 Ten closest test points (ID, distance):
<zip object at 0x7f56367417c8>
 Presumed outlier point for train ID = 6480 is test ID = 711 ; their Euclidean distance from t-SNE embedding is 0.01533294
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741848>
 Presumed outlier point for train ID = 6482 is test ID = 2718 ; their Euclidean distance from t-SNE embedding is 0.00008726
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740f48>
 Presumed outlier point for train ID = 6483 is test ID = 6954 ; their Euclidean distance from t-SNE embedding is 0.00001282
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740688>
 Presumed outlier point for train ID = 6485 is test ID = 5208 ; their Euclidean distance from t-SNE embedding is 0.00270633
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740888>
 Presumed outlier point for train ID = 6486 is test ID = 7244 ; their Euclidean distance from t-SNE embedding is 0.00017519
 Ten closest test poi

 Ten closest test points (ID, distance):
<zip object at 0x7f5636740588>
 Presumed outlier point for train ID = 6655 is test ID = 6402 ; their Euclidean distance from t-SNE embedding is 0.00848224
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f88>
 Presumed outlier point for train ID = 6656 is test ID = 7843 ; their Euclidean distance from t-SNE embedding is 0.02165415
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a88>
 Presumed outlier point for train ID = 6658 is test ID = 591 ; their Euclidean distance from t-SNE embedding is 0.00007236
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741548>
 Presumed outlier point for train ID = 6659 is test ID = 7441 ; their Euclidean distance from t-SNE embedding is 0.00014928
 Ten closest test points (ID, distance):
<zip object at 0x7f56367411c8>
 Presumed outlier point for train ID = 6660 is test ID = 6868 ; their Euclidean distance from t-SNE embedding is 0.00862082
 Ten closest test poi

 Ten closest test points (ID, distance):
<zip object at 0x7f5636740888>
 Presumed outlier point for train ID = 6855 is test ID = 1934 ; their Euclidean distance from t-SNE embedding is 0.00000000
 Ten closest test points (ID, distance):
<zip object at 0x7f56367418c8>
 Presumed outlier point for train ID = 6856 is test ID = 3901 ; their Euclidean distance from t-SNE embedding is 0.00144715
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f88>
 Presumed outlier point for train ID = 6857 is test ID = 7741 ; their Euclidean distance from t-SNE embedding is 0.00000888
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741cc8>
 Presumed outlier point for train ID = 6859 is test ID = 7092 ; their Euclidean distance from t-SNE embedding is 0.00003006
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a88>
 Presumed outlier point for train ID = 6860 is test ID = 6867 ; their Euclidean distance from t-SNE embedding is 0.00000004
 Ten closest test po

 Ten closest test points (ID, distance):
<zip object at 0x7f56367413c8>
 Presumed outlier point for train ID = 7053 is test ID = 8101 ; their Euclidean distance from t-SNE embedding is 0.00028494
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741848>
 Presumed outlier point for train ID = 7056 is test ID = 7159 ; their Euclidean distance from t-SNE embedding is 0.00000020
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741cc8>
 Presumed outlier point for train ID = 7058 is test ID = 7107 ; their Euclidean distance from t-SNE embedding is 0.84867364
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier point for train ID = 7059 is test ID = 3567 ; their Euclidean distance from t-SNE embedding is 0.00113764
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier point for train ID = 7062 is test ID = 1771 ; their Euclidean distance from t-SNE embedding is 0.13196918
 Ten closest test po

 Presumed outlier point for train ID = 7262 is test ID = 2474 ; their Euclidean distance from t-SNE embedding is 0.00127093
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741988>
 Presumed outlier point for train ID = 7263 is test ID = 693 ; their Euclidean distance from t-SNE embedding is 0.00748284
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e08>
 Presumed outlier point for train ID = 7265 is test ID = 5591 ; their Euclidean distance from t-SNE embedding is 0.00216881
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741ec8>
 Presumed outlier point for train ID = 7266 is test ID = 7889 ; their Euclidean distance from t-SNE embedding is 0.00001180
 Ten closest test points (ID, distance):
<zip object at 0x7f56367413c8>
 Presumed outlier point for train ID = 7267 is test ID = 2693 ; their Euclidean distance from t-SNE embedding is 0.04681350
 Ten closest test points (ID, distance):
<zip object at 0x7f56367419c8>
 Presumed outlier poi

 Ten closest test points (ID, distance):
<zip object at 0x7f5636741948>
 Presumed outlier point for train ID = 7468 is test ID = 4968 ; their Euclidean distance from t-SNE embedding is 0.00001972
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741b88>
 Presumed outlier point for train ID = 7470 is test ID = 1497 ; their Euclidean distance from t-SNE embedding is 0.01056594
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740488>
 Presumed outlier point for train ID = 7471 is test ID = 7613 ; their Euclidean distance from t-SNE embedding is 0.00151711
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740488>
 Presumed outlier point for train ID = 7474 is test ID = 7521 ; their Euclidean distance from t-SNE embedding is 0.00018125
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740888>
 Presumed outlier point for train ID = 7475 is test ID = 8243 ; their Euclidean distance from t-SNE embedding is 0.00027314
 Ten closest test po

 Presumed outlier point for train ID = 7657 is test ID = 6819 ; their Euclidean distance from t-SNE embedding is 0.00000090
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e48>
 Presumed outlier point for train ID = 7658 is test ID = 7151 ; their Euclidean distance from t-SNE embedding is 0.00333783
 Ten closest test points (ID, distance):
<zip object at 0x7f56367412c8>
 Presumed outlier point for train ID = 7660 is test ID = 812 ; their Euclidean distance from t-SNE embedding is 0.00055209
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741748>
 Presumed outlier point for train ID = 7661 is test ID = 5707 ; their Euclidean distance from t-SNE embedding is 0.00000173
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740488>
 Presumed outlier point for train ID = 7663 is test ID = 96 ; their Euclidean distance from t-SNE embedding is 0.00116786
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f88>
 Presumed outlier point

 Presumed outlier point for train ID = 7841 is test ID = 3834 ; their Euclidean distance from t-SNE embedding is 0.01930347
 Ten closest test points (ID, distance):
<zip object at 0x7f56367415c8>
 Presumed outlier point for train ID = 7845 is test ID = 7049 ; their Euclidean distance from t-SNE embedding is 0.00002153
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741bc8>
 Presumed outlier point for train ID = 7847 is test ID = 4570 ; their Euclidean distance from t-SNE embedding is 0.00004774
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a48>
 Presumed outlier point for train ID = 7848 is test ID = 4299 ; their Euclidean distance from t-SNE embedding is 0.00002812
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741f48>
 Presumed outlier point for train ID = 7849 is test ID = 8030 ; their Euclidean distance from t-SNE embedding is 0.00007724
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740788>
 Presumed outlier po

 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e08>
 Presumed outlier point for train ID = 8041 is test ID = 2921 ; their Euclidean distance from t-SNE embedding is 0.00003440
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741848>
 Presumed outlier point for train ID = 8043 is test ID = 6569 ; their Euclidean distance from t-SNE embedding is 0.00000011
 Ten closest test points (ID, distance):
<zip object at 0x7f56370c3b88>
 Presumed outlier point for train ID = 8045 is test ID = 963 ; their Euclidean distance from t-SNE embedding is 0.00000204
 Ten closest test points (ID, distance):
<zip object at 0x7f56370bf708>
 Presumed outlier point for train ID = 8047 is test ID = 8141 ; their Euclidean distance from t-SNE embedding is 0.00000000
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741788>
 Presumed outlier point for train ID = 8051 is test ID = 2370 ; their Euclidean distance from t-SNE embedding is 0.00023079
 Ten closest test poi

<zip object at 0x7f5636740f48>
 Presumed outlier point for train ID = 8213 is test ID = 7248 ; their Euclidean distance from t-SNE embedding is 0.24991622
 Ten closest test points (ID, distance):
<zip object at 0x7f56367416c8>
 Presumed outlier point for train ID = 8215 is test ID = 210 ; their Euclidean distance from t-SNE embedding is 0.00020924
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741848>
 Presumed outlier point for train ID = 8216 is test ID = 7571 ; their Euclidean distance from t-SNE embedding is 0.01223437
 Ten closest test points (ID, distance):
<zip object at 0x7f56367bab48>
 Presumed outlier point for train ID = 8219 is test ID = 4595 ; their Euclidean distance from t-SNE embedding is 0.00018753
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740fc8>
 Presumed outlier point for train ID = 8220 is test ID = 1316 ; their Euclidean distance from t-SNE embedding is 0.00036270
 Ten closest test points (ID, distance):
<zip object at 0x7f56

 Presumed outlier point for train ID = 8397 is test ID = 1923 ; their Euclidean distance from t-SNE embedding is 0.73691312
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741748>
 Presumed outlier point for train ID = 8399 is test ID = 7616 ; their Euclidean distance from t-SNE embedding is 0.00000016
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741e88>
 Presumed outlier point for train ID = 8402 is test ID = 6947 ; their Euclidean distance from t-SNE embedding is 0.00000402
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741a48>
 Presumed outlier point for train ID = 8403 is test ID = 6547 ; their Euclidean distance from t-SNE embedding is 0.00001819
 Ten closest test points (ID, distance):
<zip object at 0x7f5636741c08>
 Presumed outlier point for train ID = 8405 is test ID = 6445 ; their Euclidean distance from t-SNE embedding is 0.00203628
 Ten closest test points (ID, distance):
<zip object at 0x7f5636740988>
 Presumed outlier po

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


 Time taken: 0 minutes and 9.74 seconds.
------ RUNNING for treshold: 0.5

# Reading and Processing Data

 Initial Train Set Matrix Dimensions: 4209 x 376

 Initial Test Set Matrix Dimensions: 4209 x 376

 Converting categorical features:
 Converting X0
 Converting X1
 Converting X2
 Converting X3
 Converting X4
 Converting X5
 Converting X6
 Converting X8

 Number of columns before cleaning: 579
 Column X16 is identical to X2_ap. Removing X2_ap
 Column X17 is identical to X382. Removing X382
 Column X23 is identical to X2_f. Removing X2_f
 Column X26 is identical to X2_b. Removing X2_b
 Column X28 is identical to X2_n. Removing X2_n
 Column X30 is identical to X2_ag. Removing X2_ag
 Column X31 is identical to X35. Removing X35
 Column X31 is identical to X37. Removing X37
 Column X32 is identical to X2_a. Removing X2_a
 Column X33 is identical to X39. Removing X39
 Column X35 is identical to X37. Removing X37
 Column X36 is identical to X2_z. Removing X2_z
 Column X44 is identical to 

In [4]:
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
train = train.iloc[:,0:1]
test = test.iloc[:,0:1]


def outlier_constructor(outlier_name, outlier_file_train, outlier_file_test):
    global train
    outliers_train = pd.read_csv(os.path.join(ASSET_PATH, outlier_file_train))
    outliers_train[outlier_name] = 1.0
    outliers_train = outliers_train[['ID', outlier_name]]
    train = pd.merge(train, outliers_train, on='ID', how='left')
    train[outlier_name] = train[outlier_name].apply(lambda x: 1 if x==1.0 else 0)
    
    global test
    outliers_test = pd.read_csv(os.path.join(ASSET_PATH, outlier_file_test))    
    outliers_test = outliers_test.drop_duplicates()
    outliers_test[outlier_name] = 1.0
    outliers_test = outliers_test[['ID', outlier_name]]
    test = pd.merge(test, outliers_test, on='ID', how='left')
    test[outlier_name] = test[outlier_name].apply(lambda x: 1 if x==1.0 else 0)
    
    
    return

for i, threshold in enumerate(threshold_list):
    outlier_name = 'is_outlier_' + threshold[1]
    outlier_file_train = 'train-outliers_' + threshold[1] + '.csv'
    outlier_file_test = 'test-outliers_' + threshold[1] + '.csv'
    outlier_constructor(outlier_name, outlier_file_train, outlier_file_test)

In [5]:
train.to_csv(os.path.join(ASSET_PATH, 'train_outlier_list.csv'), index=False)
test.to_csv(os.path.join(ASSET_PATH, 'test_outlier_list.csv'), index=False)