In [1]:
import pandas as pd
import os
import numpy as np
import plotly.express as px
import itertools
import matplotlib.pyplot as plt
import math
import plotly.graph_objects as go
import plotly.colors
from plotly.subplots import make_subplots        
from PIL import ImageColor
import pickle
import time
from scipy.spatial import distance
import dcor

from sklearn.pipeline import Pipeline, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.neighbors import LocalOutlierFactor

from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF

In [2]:
df = pd.read_csv("processed_country_indicators.csv").drop(columns=["Unnamed: 0"])

In [3]:
# to work with 2007 data only
df2007 = df.loc[df["Year"]==2007]

In [4]:
def data_prep_split(data, inputs, outputs):
    # get x and y value(s)
    curr_x = np.array(data[inputs]).reshape(-1, 1)
    curr_y = np.array(data[outputs])

    # train test split
    curr_X_train, curr_X_test, curr_y_train, curr_y_test = train_test_split(curr_x, curr_y, random_state=1,
                                                                            test_size=.3, shuffle=True)
    curr_y_train = curr_y_train.ravel()
    curr_y_test = curr_y_test.ravel()
    
    return curr_X_train, curr_X_test, curr_y_train, curr_y_test

In [5]:
inputs = ["Electric power consumption (kWh per capita)"]
outputs = ["Life expectancy at birth, total (years)"]

In [6]:
curr_data = df2007[["Country Name", "Year"]+inputs+outputs].dropna()

### ABOD

Angle Based Outlier Detection

https://www.dbs.ifi.lmu.de/~zimek/publications/KDD2008/KDD08-ABOD.pdf

'The angle-based outlier factor ABOF(A) is the variance over the
angles between the difference vectors of A to all pairs (B, C) of points in
D weighted by the distance of the points:'\
    $$ \mathrm{ABOF}(A)= \mathrm{Var}(A)_{B, C \in D} \left( \frac{\langle \overline{AB},\overline{AC}\rangle}{||\overline{AB}||^2\cdot||\overline{AC}||^2} \right) ,$$
with $\overline{AB} = B-A$.
![title](ABOD.png)

In [101]:
# have to state the expected rate of outliers = contaminations!
abod_clf = ABOD(contamination=.1)
abod_clf.fit(curr_data[inputs+outputs])

ABOD(contamination=0.1, method='fast', n_neighbors=5)

In [102]:
abod_outlier_yn = [str(label) for label in abod_clf.labels_]

In [103]:
fig = px.scatter(curr_data,
                 x="Electric power consumption (kWh per capita)",
                 y="Life expectancy at birth, total (years)",
                 color=abod_outlier_yn,
                 hover_name="Country Name"
                )

fig.show()


### CBLOF

Clustering Based Local Outlier Factor

In [98]:

cblof_clf = CBLOF(n_clusters = 8, # can specify the number of clusters (default = 8)
                  contamination = .1, # have to state the expected rate of outliers = contaminations!
                  alpha = 0.95, # Coefficient for deciding small and large clusters. The ratio of the number of samples in large clusters to the number of samples in small clusters.
                  beta = 5, # Coefficient for deciding small and large clusters. For a list sorted clusters by size |C1|, |C2|, …, |Cn|, beta = |Ck|/|Ck-1|
                  #use_weights = True,
                  #check_estimator = True
                 )
cblof_clf.fit(curr_data[inputs+outputs])

CBLOF(alpha=0.95, beta=5, check_estimator=False, clustering_estimator=None,
   contamination=0.1, n_clusters=8, n_jobs=None, random_state=None,
   use_weights=False)

In [99]:
cblof_outlier_yn = [str(label) for label in cblof_clf.labels_]

In [100]:
fig = px.scatter(curr_data,
                 x="Electric power consumption (kWh per capita)",
                 y="Life expectancy at birth, total (years)",
                 color=cblof_outlier_yn,
                 hover_name="Country Name"
                )

fig.show()


In [10]:
#X_train, X_test, y_train, y_test = data_prep_split(df2007, inputs, outputs)