# Copula-Based Outlier Detection (COPOD)
### Data
* https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz

### Importing libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from pyod.models.copod import COPOD
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

### Loading data

In [None]:
# Read the training set straight from the remote gzip-compressed CSV
# (pandas infers the compression from the ".gz" suffix) and preview it.
DATA_URL = 'https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz'
train_data = pd.read_csv(DATA_URL)
train_data.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,doy_108,price_locality_name_median,target
0,105.0,3.0,3,50.0,95000,2.456912,0,2.261905,1
1,40.0,3.0,1,19.200001,25000,3.028689,0,1.0,2
2,37.599998,2.64,0,19.0,26000,3.091993,0,0.619048,2
3,80.0,3.0,3,49.0,35000,3.10101,0,1.25,2
4,100.0,3.0,3,49.0,80000,2.495468,0,1.904762,3


### Data normalization
Standardize all features to zero mean and unit variance (`StandardScaler`); remove the target from the data

In [None]:
# Standardize every column except the last one (the target).
# StandardScaler rescales each feature to zero mean and unit variance; the
# result is wrapped back into a DataFrame, so its columns are labelled 0..N-1.
feature_values = train_data.iloc[:, :-1]
scaler = StandardScaler()
train_data_an = pd.DataFrame(scaler.fit_transform(feature_values))

In [None]:
# Preview the first five standardized rows.
train_data_an.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.709793,1.115052,1.469889,1.025395,0.378181,-1.60709,-0.108831,0.278301
1,-0.483202,1.115052,-0.861151,-0.679471,-0.217544,0.337784,-0.108831,-0.132179
2,-0.564174,-0.645106,-2.026671,-0.690542,-0.209033,0.553112,-0.108831,-0.256098
3,0.866333,1.115052,1.469889,0.970042,-0.13244,0.583783,-0.108831,-0.050858
4,1.541101,1.115052,1.469889,0.970042,0.250526,-1.475943,-0.108831,0.162128


### COPOD

In [None]:
# Expected share of outliers in the data (0.1%); COPOD uses it to set the
# threshold that turns outlier scores into the binary `labels_`.
CONTAMINATION = 0.001
copod = COPOD(contamination=CONTAMINATION).fit(train_data_an)

Filter out the anomalous data

In [None]:
# copod.labels_ holds one label per training row: 0 = inlier, 1 = outlier.
# Select rows with a positional boolean mask instead of routing positions
# through index labels, so the filter does not depend on the frame having
# a default RangeIndex.
inlier_mask = copod.labels_ == 0
train_data_filtered = train_data_an[inlier_mask]

# Share of rows flagged as outliers, in percent.
n_outliers = int((copod.labels_ == 1).sum())
print("Percentage of anomalies:", round(100 * n_outliers / len(train_data_an), 2), "%")

In [None]:
# Preview the first five rows that survived the outlier filter.
train_data_filtered.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.709793,1.115052,1.469889,1.025395,0.378181,-1.60709,-0.108831,0.278301
1,-0.483202,1.115052,-0.861151,-0.679471,-0.217544,0.337784,-0.108831,-0.132179
2,-0.564174,-0.645106,-2.026671,-0.690542,-0.209033,0.553112,-0.108831,-0.256098
3,0.866333,1.115052,1.469889,0.970042,-0.13244,0.583783,-0.108831,-0.050858
4,1.541101,1.115052,1.469889,0.970042,0.250526,-1.475943,-0.108831,0.162128


### Comparison with K-means
Build a model on the filtered data

In [None]:
# Fit K-means on the inlier rows only, so the centroids are not pulled
# towards the rows COPOD flagged as outliers. The seed is fixed for
# reproducibility.
kmeans_params = {
    "n_clusters": 100,
    "n_init": 10,
    "max_iter": 100,
    "random_state": 0,
}
kmeans_an = KMeans(**kmeans_params).fit(train_data_filtered)

In [None]:
# Assign every row (outliers included) to a cluster of the fitted model.
# Predict on the feature columns only: this cell adds a column to
# train_data_an, so passing the whole frame would break on a re-run (or once
# later cells have added their own columns) because the model would receive
# more features than it was fitted on.
feature_columns = train_data_filtered.columns
train_data_an["target_cluster_an"] = kmeans_an.predict(train_data_an[feature_columns])

### Evaluation of prediction accuracy
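Assign each row the rounded mean target of its cluster and score the predictions with the mean of exp(|error|) (lower is better). Reference scores: no filtering / ABOD: 4.18, Smirnov: 4.16, ellipsoidal approximation / LOF: 4.19.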

In [None]:
# Re-attach the original (unscaled) target next to the standardized features.
# Both frames carry the default integer index, so rows line up one-to-one.
target_values = train_data["target"]
train_data_an["target"] = target_values

In [None]:
# Per-cluster prediction: the mean target of the cluster, rounded to the
# nearest integer. Select the target column before aggregating so only that
# column is averaged, not every column in the frame.
target_cluster_an = np.round(train_data_an.groupby("target_cluster_an")["target"].mean())

# Look up each row's prediction by its cluster id; Series.map does the
# label lookup in one vectorized step instead of a Python call per row.
train_data_an["target_pred_an"] = train_data_an["target_cluster_an"].map(target_cluster_an)

In [None]:
# Score: exp(|target - prediction|) summed over the rows and divided by the
# row count, so large misses are penalized exponentially.
abs_error = np.abs(train_data_an["target"] - train_data_an["target_pred_an"])
score_an = np.exp(abs_error).sum() / len(train_data_an)
print("Filtering anomalies: ", score_an)