In [2]:
# Standard library
from itertools import product

# Third-party
import numpy as np
import pandas as pd
# Import KNN from the relevant pyod module
from pyod.models.knn import KNN
# Import the euclidean function from scipy
from scipy.spatial.distance import euclidean
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler


In [None]:
# The KNN detector lives in pyod's knn module
from pyod.models.knn import KNN

# Build a 20-neighbour detector with contamination set to 0.005, then fit it on `females`
knn = KNN(n_neighbors=20, contamination=0.005, n_jobs=-1)
knn.fit(females)

# Boolean mask over the training rows: True where the fitted detector's label is 1
is_outlier = (knn.labels_ == 1)

# Keep only the rows selected by the mask
outliers = females[is_outlier]

# Report how many rows were selected
print(len(outliers))

# KNN with outlier probabilities
Since we cannot wholly trust the output when using contamination, let's double-check our work using outlier probabilities. They are more trustworthy.

In [None]:
# Fit a 20-neighbour KNN detector on `females` (no contamination level is passed here)
knn = KNN(n_jobs=-1, n_neighbors=20)
knn.fit(females)

# Per-row probabilities from the fitted detector; column 1 is read below
probs = knn.predict_proba(females)

# Boolean mask: True where the column-1 probability exceeds 55%
is_outlier = (probs[:, 1] > 0.55)

# Keep only the rows selected by the mask
outliers = females[is_outlier]

# Report how many rows were selected
print(len(outliers))

# Finding the euclidean distance manually
Euclidean distance is the most popular distance metric in statistics. Its popularity mainly comes from the fact that it is intuitive to understand. It is the Pythagorean theorem applied in Cartesian coordinates.

In [3]:
# Two 10-element sample vectors
M = np.array([14, 17, 18, 20, 14, 12, 19, 13, 17, 20])
N = np.array([63, 74, 76, 72, 64, 75, 75, 61, 50, 53])

# Element-wise squared differences between N and M
squared_diffs = np.square(N - M)

# Total of the squared differences
sum_diffs = squared_diffs.sum()

# Euclidean distance is the square root of that total
dist_MN = np.sqrt(sum_diffs)

print(dist_MN)

160.63934760823702


# Finding the euclidean distance with SciPy
Instead of writing multiple lines of code to calculate the euclidean distance, you can use SciPy. The library not only contains the euclidean function, but more than 40 other distance metrics—all a single import statement away.

In [6]:
# Same two sample vectors as in the manual calculation
M = np.array([14, 17, 18, 20, 14, 12, 19, 13, 17, 20])
N = np.array([63, 74, 76, 72, 64, 75, 75, 61, 50, 53])

# Let SciPy's `euclidean` compute the distance between M and N in one call
dist_MN = euclidean(M, N)

print(dist_MN)

160.63934760823702


# Practicing standardization
It is dangerous to use KNN on unknown distributions blindly. Its performance suffers greatly when the feature distributions don't have the same scales. Unscaled features will skew distance calculations and thus return unrealistic anomaly scores.

A common technique to counter this is using standardization, which involves removing the mean from a feature and dividing it by the standard deviation. This has the effect of making the feature have a mean of 0 and a variance of 1.

In [7]:
# Initialize a StandardScaler
ss = StandardScaler()

# Split `females` into the feature frame (everything except "weightkg") and the target
X = females.drop("weightkg", axis=1)
y = females["weightkg"]

# Fit the scaler once and keep the transformed values as returned by scikit-learn
X_transformed = ss.fit_transform(X)

# Rebuild a DataFrame from the already-computed values so the column names and
# index are preserved. This replaces a second `fit_transform` call followed by
# `X.loc[:, :] = ...`, which refitted the scaler on identical data.
# NOTE(review): the dtypes of `females` are not visible here; if any feature
# column is integer-typed, assigning floats through `.loc[:, :]` triggers the
# incompatible-dtype warning in recent pandas releases. Building a new frame
# avoids that path.
X = pd.DataFrame(X_transformed, columns=X.columns, index=X.index)

NameError: name 'females' is not defined

# Testing QuantileTransformer
Standardization is prone to the same pitfalls as z-scores. Both use the mean and standard deviation in their calculations, which makes them highly sensitive to extreme values.

To get around this problem, you should use QuantileTransformer which uses quantiles. Quantiles of a distribution stay the same regardless of the magnitude of outliers.

You should use StandardScaler when the data is normally distributed (which can be checked with a histogram). For other distributions, QuantileTransformer is a better choice.

In [None]:
# Quantile transformer configured with a normal output distribution
qt = QuantileTransformer(output_distribution="normal")

# Overwrite the values of X in place with the transformed features
X.loc[:, :] = qt.fit_transform(X)

# Histogram of the transformed "palmlength" column
# NOTE(review): `plt` is not imported anywhere in the visible notebook —
# presumably matplotlib.pyplot; confirm and import it in the top cell.
plt.hist(X["palmlength"], color="red")
plt.xlabel("Palm length")
plt.show()

# Calculating manhattan distance manually
While Euclidean distance is very popular, it does not scale well beyond two- or three-dimensional data. In higher-dimensional cases, you can use Manhattan distance as an alternative. It has the advantage of working exceptionally well with datasets with many categorical features.

In [3]:
# Two 10-element sample vectors
M = np.array([14, 17, 18, 20, 14, 12, 19, 13, 17, 20])
N = np.array([63, 74, 76, 72, 64, 75, 75, 61, 50, 53])

# Element-wise absolute differences between N and M
abs_diffs = np.absolute(N - M)

# Manhattan distance is the total of the absolute differences
manhattan_dist_MN = abs_diffs.sum()

print(manhattan_dist_MN)

499


# Tuning n_neighbors
n_neighbors is the most crucial parameter of KNN. When you are unsure about the number of outliers in the dataset (which happens often), you can't use the rule of thumb that suggests using 20 neighbors when contamination is below 10%.

For such cases, you'll have to tune n_neighbors. Practice the process on the transformed version of the females dataset from the last exercise. It has been loaded as females_transformed. KNN estimator, evaluate_outlier_classifier and evaluate_regressor functions are also loaded.

Here are the function bodies as reminders:

In [4]:
def evaluate_outlier_classifier(model, data, threshold=.75):
    """Fit `model` on `data` and return the rows not flagged as outliers.

    Parameters
    ----------
    model
        Object exposing `fit(data)` and `predict_proba(data)`; column 1 of the
        returned probabilities is compared against `threshold`.
    data
        Container that supports boolean-mask indexing (it is indexed with the
        mask built from the probabilities).
    threshold : float, default 0.75
        Rows whose column-1 probability is less than or equal to this value
        are kept.

    Returns
    -------
    The subset of `data` selected by the mask.
    """
    model.fit(data)

    # Column 1 of predict_proba holds the value compared against the threshold
    outlier_probs = model.predict_proba(data)[:, 1]
    keep_mask = outlier_probs <= threshold

    return data[keep_mask]

def evaluate_regressor(inliers):
    """Train a linear regression on `inliers` and return its hold-out RMSE.

    Parameters
    ----------
    inliers
        Table with a "weightkg" column; that column is the target and every
        other column is used as a feature.

    Returns
    -------
    The root mean squared error on the 20% test split, rounded to 3 decimals.
    """
    X, y = inliers.drop("weightkg", axis=1), inliers[['weightkg']]
    # Fixed random_state keeps the 80/20 split identical across calls
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=10, train_size=0.8
    )

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    preds = lr.predict(X_test)
    # Take the square root explicitly rather than passing `squared=False`:
    # that keyword is deprecated and then removed in newer scikit-learn
    # releases, while this form works on old and new versions alike.
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    return round(rmse, 3)

In [None]:
# Candidate values for n_neighbors, and a dict to collect one RMSE per value
n_neighbors = [5, 10, 20]
scores = {}

for k in n_neighbors:
    # Fresh KNN detector using the current neighbour count
    knn = KNN(n_jobs=-1, n_neighbors=k)

    # Rows of females_transformed kept at a 0.50 probability threshold
    inliers = evaluate_outlier_classifier(knn, females_transformed, .50)

    # RMSE of the regressor trained on those rows, keyed by k
    scores[k] = evaluate_regressor(inliers)

print(scores)

In [None]:
# Neighbour counts and KNN `method` values to combine
n_neighbors = [5, 20]
methods = ['largest', 'mean', 'median']
scores = dict()

# `product` comes from itertools and yields every (k, method) pair exactly once
for k, m in product(n_neighbors, methods):
    # Create a KNN instance for this combination
    knn = KNN(n_neighbors=k, method=m, n_jobs=-1)

    # Rows of females_transformed kept at a 0.55 probability threshold
    inliers = evaluate_outlier_classifier(knn, females_transformed, .55)

    # RMSE of the regressor trained on those rows, keyed by (k, method)
    scores[(k, m)] = evaluate_regressor(inliers)

print(scores)

# LOF for the first time
LOF differs from KNN only in the internal algorithm and the lack of the method parameter. Practice detecting outliers with it using contamination filtering on the scaled version of females dataset from previous exercises.

The dataset has been loaded as females_transformed.

In [None]:
# The LOF detector lives in pyod's lof module
from pyod.models.lof import LOF

# Build a 20-neighbour LOF with contamination set to 0.003 and fit it on females_transformed
lof = LOF(n_neighbors=20, contamination=0.003, n_jobs=-1)
lof.fit(females_transformed)

# Boolean mask over the training rows: True where the fitted detector's label is 1
is_outlier = (lof.labels_ == 1)

# Keep only the rows selected by the mask
outliers = females_transformed[is_outlier]

# Report how many rows were selected
print(len(outliers))

# LOF with outlier probabilities
As always, double-check that the chosen contamination level is trustworthy by filtering the outliers with a probability threshold. The syntax is the same as with KNN.

The LOF estimator has already been imported, and the females_transformed dataset is also available.

In [None]:
# Fit a 20-neighbour LOF detector on females_transformed
lof = LOF(n_neighbors=20)
lof.fit(females_transformed)

# Per-row probabilities from the fitted detector; column 1 is read below
probs = lof.predict_proba(females_transformed)

# Boolean mask: True where the column-1 probability exceeds 0.5
is_outlier = (probs[:, 1] > 0.5)

# Keep only the rows selected by the mask
outliers = females_transformed[is_outlier]

# Report how many rows were selected
print(len(outliers))