# Preliminary operations

In [1]:
# import main libraries
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns

# import isolation forest
from sklearn.ensemble import IsolationForest

In [2]:
# mount Google Drive at /content/drive (Colab only); the dataset paths used below start with drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# load the prepared training features and the matching per-recording information table
# NOTE(review): both paths are relative, so this presumably assumes the working directory is /content — confirm
path_data = "drive/MyDrive/Progetto Data Mining 2/CODICE PROGETTO/Data Understanding and Preparation/DATASET PREPARED/df_prep_TRAIN.xlsx"
path_info = "drive/MyDrive/Progetto Data Mining 2/CODICE PROGETTO/Data Understanding and Preparation/DATASET PREPARED/df_prep_info_TRAIN.xlsx"
df_data = pd.read_excel(path_data, index_col="Unnamed: 0")
df_info = pd.read_excel(path_info, index_col="Unnamed: 0")

In [4]:
# get data values as a NumPy array (one row per sample, one column per feature)
X = df_data.values

In [5]:
# preview the first rows of the feature table
df_data.head()

Unnamed: 0,sum,std,q25,q50,kur,skew,lag1_sum,lag1_kur,lag1_skew,zc_sum,...,mfcc_q95_w4,mfcc_q99_w4,mfcc_kur_w4,sc_std_w4,sc_kur_w4,sc_skew_w4,stft_sum_w4,stft_mean_w4,stft_q95_w4,stft_kur_w4
0,0.604904,-1.113727,1.155846,-0.064505,0.746422,1.597947,0.032809,0.526054,0.663058,-1.434221,...,-0.583254,-1.51548,-0.248903,-0.405919,3.146246,-1.505367,0.202942,1.325865,0.035922,3.971428
1,0.565092,-1.067513,1.155846,-0.064505,0.941449,1.64812,0.032809,0.186319,0.920248,-1.261744,...,-0.816861,-0.814428,0.141589,-0.408215,-0.236257,-0.713717,-0.12076,0.953175,0.035922,-0.381447
2,0.610689,-0.946842,1.89125,-0.064505,1.011526,1.770978,-0.885806,0.614675,0.111653,-1.673729,...,-0.514856,-1.120598,-0.095555,-0.227969,2.741296,-1.301205,-0.232259,1.008646,0.035922,2.877028
3,0.598277,-0.975367,1.155846,-0.064505,1.569306,1.97046,-0.696296,0.5659,0.856282,-1.352954,...,-0.059808,-0.7063,-0.404156,-0.535901,-0.469754,-0.612675,-0.537815,0.888896,0.035922,-0.307128
4,0.595865,-1.306279,1.155846,-0.064505,0.48884,1.426924,-0.696296,0.269613,0.151788,-1.499722,...,-0.503638,-0.784317,0.042496,-0.230801,0.600646,-0.70106,0.212217,0.950638,0.035922,1.797345


In [6]:
# preview the first rows of the information table
df_info.head()

Unnamed: 0,vocal_channel,emotion,emotional_intensity,statement,repetition,actor,sex,filename,frame_count
0,speech,neutral,normal,Kids are talking by the door,1st,1,M,03-01-01-01-01-01-01.wav,158558
1,speech,neutral,normal,Kids are talking by the door,2nd,1,M,03-01-01-01-01-02-01.wav,160160
2,speech,neutral,normal,Dogs are sitting by the door,1st,1,M,03-01-01-01-02-01-01.wav,156956
3,speech,neutral,normal,Dogs are sitting by the door,2nd,1,M,03-01-01-01-02-02-01.wav,152152
4,speech,calm,normal,Kids are talking by the door,1st,1,M,03-01-02-01-01-01-01.wav,169769


# Isolation Forest

In [7]:
# fit an Isolation Forest with 10000 trees on the feature matrix;
# random_state is fixed so that re-running the cell gives the same forest
alg = IsolationForest(random_state=0, n_estimators=10000)
alg.fit(X)

In [8]:
# hard labels from the fitted forest
if_outliers = alg.predict(X)
# magnitude of the raw scores, so that a descending sort puts the largest magnitudes first
# NOTE(review): scikit-learn documents score_samples as "the lower, the more abnormal";
# taking the absolute value only flips the order if all raw scores are non-positive — confirm
if_scores = np.abs(alg.score_samples(X))

In [9]:
def take_second(elem):
    """Return the item at position 1 of ``elem``; used as a sort key."""
    second = elem[1]
    return second

# rank (row position, score) pairs by score, highest first, and keep the first 183
# (183 of the 1828 samples, i.e. roughly the top 10%)
ranked_if = list(enumerate(if_scores))
ranked_if.sort(key=take_second, reverse=True)
sorted_10_out_if = ranked_if[:183]

In [10]:
# keep only the row positions of the selected (position, score) pairs
outliers_10_IF = [position for position, _ in sorted_10_out_if]

# Deviation Based

In [11]:
# transpose data so that each row of X_t holds all the values of one feature
df_data_T = df_data.T
X_t = df_data_T.values

In [12]:
# smoothing factor for the deviation-based approach (adapted from the professor's code)
def sf(x, X):
    """Return how much the variance of ``X`` changes when ``x`` is removed from it.

    Parameters
    ----------
    x : scalar
        Value to leave out. Only its first occurrence in ``X`` is removed.
    X : sequence or 1-D array of numbers
        The full set of values.

    Returns
    -------
    float
        ``abs(np.var(X) - np.var(X without x))``.

    Raises
    ------
    ValueError
        If ``x`` does not occur in ``X``.
    """
    values = np.asarray(X)
    # vectorised lookup of the first occurrence; avoids building a Python list on every call
    matches = np.flatnonzero(values == x)
    if matches.size == 0:
        raise ValueError(f"{x!r} is not in X")
    remaining = np.delete(values, matches[0])
    return np.abs(np.var(values) - np.var(remaining))

In [13]:
# score matrix with one row per feature and one column per sample;
# the shape is taken from the transposed data instead of being hard-coded
deviations = np.empty(X_t.shape)
# iterate features (c is the feature position)
for c, col in enumerate(X_t):
  # iterate elements in the feature
  for i, x in enumerate(col):
    # change in the feature's variance when this single value is left out
    deviations[c][i] = sf(x, col)

In [14]:
# flip the score matrix back to one row per sample and one column per feature,
# then wrap it in a DataFrame labelled with the feature names
deviations_t = deviations.T
df_deviations = pd.DataFrame(data=deviations_t, columns=df_data.columns)

In [15]:
# total deviation per sample: add up the per-feature scores along each row
deviations_sum = df_deviations.sum(axis=1)
# one total per sample is expected; a different length means the sum ran along the wrong axis
assert len(deviations_sum) == 1828, "wrong axis"

In [16]:
# row labels of the 183 samples with the largest total deviation (183 of 1828, roughly the top 10%)
dev_ranked = deviations_sum.sort_values(ascending=False)
outliers_10_DEV = dev_ranked[:183].index

In [17]:
# turn the pandas Index into a plain Python list
outliers_10_DEV = list(outliers_10_DEV)

# Likelihood Approach

In [18]:
# density of a normal distribution with mean mu and standard deviation sigma
def norm_dist(x, mu, sigma):
    """Return the normal probability density at ``x`` for mean ``mu`` and std ``sigma``."""
    # normalising constant 1 / (sqrt(2*pi) * sigma)
    coefficient = 1/(np.sqrt(2*np.pi)*sigma)
    # exponent -(x - mu)^2 / (2 * sigma^2)
    exponent = -((x-mu)**2)/(2*sigma**2)
    return coefficient * np.e**exponent

# density of a uniform distribution over n equally likely values
def unif_dist(x, n):
    """Return ``1/n``; ``x`` is accepted for symmetry with ``norm_dist`` but is not used."""
    density = 1/n
    return density

In [19]:
# M is a NumPy copy of the dataset
M = np.copy(df_data)
# its transpose, so that iterating over M_T yields one feature at a time
M_T = M.T

In [20]:
# anomalous set of each feature: one row per feature, every row initially empty
# (the number of rows is taken from the data rather than hard-coded)
As = np.empty((df_data.shape[1], 0))

In [21]:
# compute the mean of each feature (column-wise)
muMs = M.mean(axis=0)
muMs

array([ 3.49829574e-17, -1.47705820e-16,  1.39931830e-16, -3.88699527e-18,
        3.61490560e-16,  7.77399054e-18,  0.00000000e+00,  4.17851992e-16,
       -8.55138960e-17,  5.63614314e-16,  1.01061877e-16, -4.35343470e-16,
       -1.08835868e-15, -1.23217750e-15, -2.56541688e-16, -5.59727319e-16,
        1.94349764e-16,  6.99659149e-16, -1.05726271e-15,  1.18320136e-14,
        7.96834031e-16, -3.77038541e-16, -1.32157839e-16,  1.55479811e-17,
        2.21947430e-15,  1.17775957e-15, -2.28166622e-15, -8.88955819e-15,
       -7.77399054e-17, -3.34281593e-16, -9.71748818e-19,  5.44179338e-17,
        2.79863660e-16,  9.71748818e-19, -1.08835868e-16,  2.13784740e-17,
       -3.01242134e-17,  1.16609858e-17, -1.39931830e-16, -1.39931830e-16,
        2.33219716e-17, -8.70686941e-16,  0.00000000e+00,  1.76275236e-15,
        6.21919244e-17,  1.02616675e-15,  7.61851073e-16,  8.47364969e-16,
        3.88699527e-17,  5.51953329e-16, -7.77399054e-18,  3.49829574e-17,
       -1.55479811e-17,  

In [22]:
# compute the standard deviation of each feature (column-wise)
sigmaMs = M.std(axis=0)
sigmaMs

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1.])

In [23]:
# weight lambda_par that appears in the likelihood expressions of the following cells
lambda_par = 0.3
# number of elements in each feature's anomalous set
sizeAs = [len(A) for A in As]

In [24]:
# per feature: sum of the normal densities of its values, using that feature's mean and std
# NOTE(review): a log-likelihood would normally add up log-densities; here the raw
# densities are summed — confirm this is intended
sumMs = [
  np.sum([norm_dist(x, muM, sigmaM) for x in f])
  for f, muM, sigmaM in zip(M_T, muMs, sigmaMs)
]

In [25]:
# per feature: sum of the uniform densities of the values in its anomalous set.
# Each set is paired with its own size from sizeAs, so the density never depends on
# a `sizeA` name that may be undefined (fresh kernel) or stale (left over from another cell).
sumAs = [
  np.sum([unif_dist(x, sizeA) for x in A])
  for A, sizeA in zip(As, sizeAs)
]

In [26]:
# baseline value of the likelihood expression for each feature, computed from the
# normal-set term (len(f), sumM) and the anomalous-set term (len(A), sumA)
lls = [
  len(f) * np.log(1-lambda_par) + sumM + len(A) * np.log(lambda_par) + sumA
  for f, A, sumM, sumA in zip(M_T, As, sumMs, sumAs)
]

In [27]:
# delta_lls[z][i]: absolute change of feature z's likelihood expression when its i-th
# value is moved from the normal set to the anomalous set.
# The shape follows M_T (one row per feature, one column per sample) instead of being hard-coded.
delta_lls = np.empty(M_T.shape)
# threshold for the optional per-point report at the end of the inner loop (currently disabled)
c = 36
lambda_par = 0.3

for z, (f, ll) in enumerate(zip(M_T, lls)):

  for i, x in enumerate(f):
    # anomalous set holding only the current value
    A = [x]
    sizeA = len(A)

    # normal set: every value of the feature except the i-th one.
    # It gets its own name so that the dataset copy `M` created earlier is not overwritten.
    M_rest = np.delete(f, i)
    muM = np.mean(M_rest)
    sigmaM = np.std(M_rest)

    # NOTE(review): raw densities are summed here, matching how the baseline `lls` was
    # built; a log-likelihood would normally sum log-densities — confirm this is intended
    sumM = np.sum([norm_dist(v, muM, sigmaM) for v in M_rest])
    sumA = np.sum([unif_dist(v, sizeA) for v in A])

    ll_xi = (len(M_rest) * np.log(1-lambda_par) + sumM + len(A) * np.log(lambda_par) + sumA)
    delta_ll = abs(ll - ll_xi)
    delta_lls[z][i] = delta_ll

    #if delta_ll > c:
      #print(i, delta_ll, 'outlier' )

In [28]:
# flip the matrix back to one row per sample and one column per feature,
# then wrap it in a DataFrame labelled with the feature names
delta_lls_t = delta_lls.T
df_delta_lls = pd.DataFrame(data=delta_lls_t, columns=df_data.columns)

In [29]:
# total likelihood change per sample: add up the per-feature values along each row
delta_lls_sum = df_delta_lls.sum(axis=1)
# one total per sample is expected; a different length means the sum ran along the wrong axis
assert len(delta_lls_sum) == 1828, "wrong axis"

In [30]:
# inspect the per-sample totals
delta_lls_sum

0       32.613218
1       26.661167
2       26.590598
3       27.778535
4       27.383671
          ...    
1823    27.638813
1824    27.418867
1825    29.903592
1826    24.584672
1827    29.512139
Length: 1828, dtype: float64

In [31]:
# row labels of the 183 samples with the largest total likelihood change (183 of 1828, roughly the top 10%)
lik_ranked = delta_lls_sum.sort_values(ascending=False)
outliers_10_LIK = lik_ranked[:183].index

In [32]:
# turn the pandas Index into a plain Python list
outliers_10_LIK = list(outliers_10_LIK)

# ABOD

In [33]:
!pip install pyod

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyod
  Downloading pyod-1.0.9.tar.gz (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.0/150.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... [?25l[?25hdone
  Created wheel for pyod: filename=pyod-1.0.9-py3-none-any.whl size=184112 sha256=53e920822b81dbef5e7884c3899ec00f8efbba7e47fd111539cb0b55dd40d9cc
  Stored in directory: /root/.cache/pip/wheels/83/55/6b/552e083cf5509c0afe808b76cf434f1be284d01a112623bd37
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-1.0.9


In [34]:
from pyod.models.abod import ABOD

In [35]:
# fit PyOD's angle-based outlier detector with its default settings on the feature matrix
clf = ABOD()
clf.fit(X)

ABOD(contamination=0.1, method='fast', n_neighbors=5)

In [36]:
# binary labels from the fitted detector, followed by how many samples received each label
outliers = clf.predict(X)
np.unique(outliers, return_counts=True)

(array([0, 1]), array([1581,  247]))

In [37]:
# pair every row label of df_data with the detector's score for that training sample.
# `outliers` was already computed in the previous cell, so the prediction is not repeated here.
# NOTE(review): these are df_data index labels, whereas the other methods' lists hold row
# positions — the two only agree if the index is 0..n-1; confirm
index = df_data.index
prob = clf.decision_scores_

listatotale = list(zip(index, prob))

In [38]:
# sanity check: number of (label, score) pairs
len(listatotale)

1828

In [39]:
# rank the (label, score) pairs by score, highest first, and keep the first 183
abod_ranked = list(listatotale)
abod_ranked.sort(key=lambda pair: pair[1], reverse=True)
outliers_10_ABOD = abod_ranked[:183]

In [40]:
# keep only the labels of the selected (label, score) pairs
outliers_10_AB = [label for label, _ in outliers_10_ABOD]

# Elliptic Envelope

In [41]:
from sklearn.covariance import EllipticEnvelope

In [42]:
# feature matrix as a NumPy array (same expression as in the preliminary cells)
X = df_data.values

In [43]:
# fit an Elliptic Envelope on the feature matrix with a fixed random seed;
# this rebinds `clf`, which held the ABOD model until here
# NOTE(review): support_fraction=1 presumably makes the covariance estimate use every sample — confirm against the scikit-learn docs
clf = EllipticEnvelope(random_state=0, support_fraction=1)
clf.fit(X)

In [44]:
# hard labels from the fitted envelope
outliers = clf.predict(X)
# magnitude of the raw scores, so that a descending sort puts the largest magnitudes first
scores = np.abs(clf.score_samples(X))

In [45]:
# inspect the predicted labels
outliers

array([-1,  1,  1, ...,  1,  1,  1])

In [46]:
# rank (row position, score) pairs by score, highest first, and keep the first 183.
# The sort key is given inline so this cell does not define take_second a second time.
sorted_10_out_ee = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:183]

In [47]:
# keep only the row positions of the selected (position, score) pairs
outliers_10_EE = [position for position, _ in sorted_10_out_ee]

# Comparison

In [48]:
# 5x5 matrix, initialised to zero, that will hold the pairwise overlap between the five outlier lists
out_comp = np.zeros((5, 5))

In [49]:
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The order and the duplicates of ``lst1`` are preserved.

    Parameters
    ----------
    lst1, lst2 : list
        Collections to compare.

    Returns
    -------
    list
        Items of ``lst1`` for which ``item in lst2`` holds.
    """
    try:
        # a set gives constant-time membership tests instead of scanning lst2 for every item
        members = set(lst2)
        return [value for value in lst1 if value in members]
    except TypeError:
        # unhashable items cannot go through a set: fall back to scanning lst2
        return [value for value in lst1 if value in lst2]

In [50]:
# the five outlier lists, in the row/column order of out_comp
outlier_lists = [outliers_10_IF, outliers_10_DEV, outliers_10_LIK, outliers_10_AB, outliers_10_EE]

for i, alg_r in enumerate(outlier_lists):
  for j, alg_c in enumerate(outlier_lists):
    # fraction of the row method's outliers that the column method selected as well
    out_comp[i][j] = len(intersection(alg_r, alg_c)) / len(alg_r)

In [51]:
# overlap matrix; rows and columns follow the order IF, DEV, LIK, ABOD, EE
out_comp

array([[1.        , 0.74863388, 0.65027322, 0.49180328, 0.67759563],
       [0.74863388, 1.        , 0.86885246, 0.53551913, 0.7431694 ],
       [0.65027322, 0.86885246, 1.        , 0.46994536, 0.69398907],
       [0.49180328, 0.53551913, 0.46994536, 1.        , 0.52459016],
       [0.67759563, 0.7431694 , 0.69398907, 0.52459016, 1.        ]])