# Euclidian Distance: Threshold for Outlier Detection 

In [1]:
%matplotlib inline
import math
import pandas as pd
import numpy as np
import scipy as sci
from scipy import stats
from sklearn.cluster import KMeans
import matplotlib.pyplot as plot

## Read Data for Top 30 Areas

In [2]:
data = pd.read_csv('dataset_kmeans_clustering_top30.csv')
print(data.shape)
data.head()


(363, 4)


Unnamed: 0,pickup_lat,pickup_lon,count_pickup_loc,class
0,8,220,2,0
1,20,378,2,0
2,66,405,2,0
3,194,493,2,0
4,296,569,2,0


In [3]:
data.describe()

Unnamed: 0,pickup_lat,pickup_lon,count_pickup_loc,class
count,363.0,363.0,363.0,363.0
mean,270.586777,543.438017,25480.47,1.504132
std,188.902164,178.84948,94144.3,1.118335
min,0.0,15.0,2.0,0.0
25%,117.0,427.5,15.0,1.0
50%,227.0,578.0,169.0,2.0
75%,399.5,694.0,4764.5,2.5
max,765.0,785.0,1097844.0,3.0


## Compute Distance among data objects using Euclidian

In [9]:
from scipy.spatial.distance import pdist, squareform

dist = pdist(data[['pickup_lat', 'pickup_lon', 'count_pickup_loc']], 'euclidean')
df_dist = pd.DataFrame(squareform(dist))
df_dist = df_dist.replace(0,100)  # replace 0 with 100 to find the correct min distance
df_dist.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,353,354,355,356,357,358,359,360,361,362
0,100.0,158.455041,193.878828,330.340733,452.487569,338.710791,227.002203,441.184769,260.224903,715.178999,...,282895.456402,306816.340095,328603.407838,346781.662118,378453.590245,449378.000032,456090.11325,555342.232186,628968.501603,1097842.0
1,158.455041,100.0,53.338541,208.568933,335.644157,278.519299,170.299736,379.578714,135.325533,607.415015,...,282895.230997,306816.233405,328603.275058,346781.468976,378453.387731,449378.026264,456090.048854,555342.139659,628968.378541,1097842.0
2,193.878828,53.338541,100.0,155.331903,282.481858,232.215417,132.034087,330.945615,81.987804,554.147995,...,282895.176194,306816.174518,328603.210931,346781.385426,378453.317261,449378.040055,456090.027364,555342.104473,628968.322307,1097842.0
3,330.340733,208.568933,155.331903,100.0,127.200629,142.898565,136.091881,211.596314,73.878278,398.848342,...,282895.067461,306816.059818,328603.075765,346781.188714,378453.15164,449378.11924,456090.000861,555342.030918,628968.184367,1097842.0
4,452.487569,335.644157,282.481858,127.200629,100.0,174.011494,237.472104,171.233758,201.022387,272.007353,...,282895.039345,306816.026403,328603.021,346781.079741,378453.062387,449378.22524,456090.018853,555342.003061,628968.100248,1097842.0


In [10]:
df_dist.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,353,354,355,356,357,358,359,360,361,362
count,363.0,363.0,363.0,363.0,363.0,363.0,363.0,363.0,363.0,363.0,...,363.0,363.0,363.0,363.0,363.0,363.0,363.0,363.0,363.0,363.0
mean,25720.37,25662.05,25639.49,25599.94,25597.22,25617.33,25627.58,25626.81,25613.76,25677.59,...,268448.505204,291183.303067,312009.986603,329486.98538,360111.945399,429083.210429,435647.082333,533258.516437,606073.321508,1072364.0
std,94079.6,94095.13,94101.14,94111.66,94112.36,94106.99,94104.29,94104.43,94107.99,94090.9,...,55189.557653,56663.512642,58158.301663,59348.292838,61315.806808,66555.135835,67000.649694,72415.276881,75693.328945,94141.15
min,80.41144,25.96151,28.30194,34.53983,31.82766,32.15587,23.74868,20.90454,26.64583,5.830952,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
25%,425.3162,326.2186,290.0176,211.3704,226.1456,272.4195,266.3792,306.8859,248.2057,352.6235,...,278212.205459,302133.098118,323873.05598,342051.113775,373723.119146,444648.285818,451360.068571,550579.540756,624205.658342,1093080.0
50%,644.0085,547.9151,499.7649,403.8118,385.2194,430.9722,453.0188,443.555,434.6619,567.1164,...,282736.426489,306657.319909,328438.103438,346616.019988,378288.059135,449213.671686,455925.241851,555175.140789,628801.358111,1097675.0
75%,4785.621,4776.335,4771.555,4765.967,4766.175,4763.327,4765.524,4763.744,4767.287,4777.19,...,282883.117471,306804.0264,328590.655199,346768.674092,378440.622694,449365.898392,456077.597718,555329.079001,628955.139822,1097829.0
max,1097842.0,1097842.0,1097842.0,1097842.0,1097842.0,1097842.0,1097842.0,1097842.0,1097842.0,1097842.0,...,814947.025418,791026.096149,769239.103137,751061.158509,719389.121214,648464.1144,641752.043992,555342.232186,628968.501603,1097842.0


## Find Max Distance

In [21]:
my_max = df_dist.max().max()
print('Max: ', my_max)

Max:  1097842.125719814


## Find Min Distance

In [19]:
my_min = df_dist.min().min()
print('Min: ', my_min)

Min:  5.0990195135927845


## Compute the value for Threshold

In [20]:
threshold = (my_max + my_min ) / 2  # average of min- and max-distance
print('Threshold: ', threshold)

Threshold:  548923.6123696639
