# Evaluation Metrics

In [13]:
import pandas as pd
import numpy as np
import copy
import os
from scipy.stats import friedmanchisquare
from scipy.stats import chi2

# change to your local path to project below
project_path = '/home/jorge/Insync/jorgitoje@gmail.com/OneDrive/Documentos/JORGE/EDUCATION/MASTER_DATASCIENCE/Semester1/AdvancedStatistics/GroupProject/KDS_Statistics_GroupProject'
# change to dataset name you want to analyze
dataset_name = 'USAhouseprices2014' # check "features_ranks.csv" "dataset" column for unique values

In [2]:

dataset_path = os.path.join(project_path, 'data/features_ranks_FAKE.csv')
data_long = pd.read_csv(dataset_path, sep=";").query("dataset == @dataset_name")
data_long.head()

Unnamed: 0,dataset,feature_name,method,feature_rank
0,USAhouseprices2014,date,linear_regression,1
1,USAhouseprices2014,bedrooms,linear_regression,2
2,USAhouseprices2014,bathrooms,linear_regression,3
3,USAhouseprices2014,sqft_living,linear_regression,4
4,USAhouseprices2014,sqft_lot,linear_regression,5


In [3]:
data_wide = data_long.pivot( index=['dataset', 'feature_name'], columns=['method'], 
    values=['feature_rank']).reset_index()
data_wide.head()

Unnamed: 0_level_0,dataset,feature_name,feature_rank,feature_rank,feature_rank
method,Unnamed: 1_level_1,Unnamed: 2_level_1,PCA,RandomForest,linear_regression
0,USAhouseprices2014,bathrooms,7,3,3
1,USAhouseprices2014,bedrooms,2,2,2
2,USAhouseprices2014,city,15,15,15
3,USAhouseprices2014,condition,9,9,9
4,USAhouseprices2014,country,17,17,17


## Kendall's W

In [16]:
data_kendall = data_wide['feature_rank'].astype(int)

### Using friedmanchisquare
(DOES NOT seem to work and requires manually putting all columns)

In [18]:

# spRes = friedmanchisquare(data_kendall['PCA'].to_numpy(), data_kendall['RandomForest'].to_numpy(), data_kendall['linear_regression'].to_numpy())
# spRes = friedmanchisquare(*[data_kendall[column] for column in data_kendall.columns])
# friedmanchisquare()
# selData()
# n = data_kendall.shape[0]
# k = data_kendall.shape[1]
# Q = spRes[0]
# print(f"n: {n},k: {k}, Q: {Q}")

In [19]:
# W = Q / (n*(k-1))
# print(f"W: {W}")

### Using direct formula

In [20]:
def calculate_kendallW(data_kendall, tie_correction=False):
    '''_summary_

    Args:
        data_kendall (pd.dataframe): Datafrem containing as columns the feature
        selection methods and as values ranks to each feature in the same order.

    Returns:
        W: Kendall's W coefficient
    '''
    n = data_kendall.shape[0] # number of features/objects
    k = data_kendall.shape[1] # numbe of raters/methods

    # Sum of each item ranks
    sums = data_kendall.sum(axis=1, numeric_only=True).to_numpy()
    # Mean of ranking sums
    Rbar = sums.mean()
    Rbar
    # Sum of squared deviations from the mean
    S = np.sum([(np.array(sums)[x] - Rbar) ** 2 for x in range(n)])
    if not tie_correction:
        W = (12 * S) / (k ** 2 * (n ** 3 - n))
    else:
        # TO DO: if we have many ties, we'll have to implement this also
        return None

    return W

W = calculate_kendallW(data_kendall)
print(f"Kendall's W: {W}")

Kendall's W: 0.9875243664717349


### Hypothesis testing for Kendall's W

In [26]:
n = data_kendall.shape[0]
k = data_kendall.shape[1]

alpha = 0.05
degrees_of_freedom = n - 1
chisquared_alpha = chi2.ppf(q = alpha, df = degrees_of_freedom)
chisquared_statistic = k *(n-1) * W

print(f"Statistic value for alpha={alpha}: {chisquared_statistic}")
print(f"Tabled value for Chi2 for alpha={alpha} and {degrees_of_freedom} degrees of freedom: {chisquared_alpha}")

if chisquared_statistic >= chisquared_alpha:
    print("We reject null hypothesis and conclude that H1: W ≠ 0.")
else:
    print("We cannot reject null hypothesis H0: W = 0")

Statistic value for alpha=0.05: 53.32631578947368
Tabled value for Chi2 for alpha=0.05 and 18 degrees of freedom: 9.390455080688984
We reject null hypothesis and conclude that H1: W ≠ 0.


## Top–k overlap

In [11]:
def calculate_topkoverlap(datatopk, k=5):

    data_all = datatopk.copy(deep=True)

    n = len(data_all)

    assert len(data_all) >= k, f"Maximum value for k is {n}, which is total number of features"

    methods = data_all.columns.to_list()
    data_all['feature_id'] = data_all.index

    all_toplists = []

    for method in methods:
        data_temp = data_all.filter(items=['feature_id', method]) \
            .sort_values(method).head(k)['feature_id'].to_list()
        all_toplists.append(data_temp)
    union_lists= np.unique(all_toplists)
    intersection_lists = set.intersection(*map(set, all_toplists))

    topk_overlap = len(intersection_lists)/len(union_lists)
        
    return topk_overlap


print(f"Top-k overlap: {calculate_topkoverlap(data_kendall)}")

Top-k overlap: 0.6666666666666666
