In [6]:
'''
playground for testing evaluation metrics for consistency with the implementation
of another author (Ben Hamner) and for plausibility checks
'''
import numpy as np
from utilities import *

In [9]:
'''
As a benchmark, create a "dumb" model without any predictive power and test how this model performs,
measured by the MAP@5 metric.
Test set contains 15,610 images. 

compute:
1) a "dumb" model that maps the images to the individuals uniformly at random, 
2) a model that maps the images to the individuals according to a probability distribution 
   that reflects the distribution of frequencies of observed whales (whale #1, 
   with 34 images, will occur 34 times more often than whale #2,500 with one image)
3) like 2), but without the category "new whale" (~800 occurrences) 
'''
from scipy.stats import rv_discrete

def Compute_MAP(probs = 'uniform'):
    """
    Score a random baseline "model" on the training data with MAP@5.

    Every image listed in data/train.csv gets a ranked list of max_pred = 5
    randomly drawn whale indices (positions in the list returned by
    get_whales). These dummy predictions are scored against the true whale
    index of each image with mean_average_precision.

    Parameters
    ----------
    probs : str, optional
        How the whale indices are drawn:
        'uniform'              - uniformly over all indices 0..len(counts)-1
        'weighted'             - proportional to the frequency whales[i][1]
        'weighted_without_new' - like 'weighted', but index 0 is excluded
                                 (presumably the "new whale" category --
                                 verify against get_whales)

    Returns
    -------
    score
        Whatever mean_average_precision returns for the dummy predictions.

    Raises
    ------
    AssertionError
        If probs is not one of the three options above.
    KeyError
        If a label in the training list does not occur in whales.
    """
    # fail fast, before any file is read
    if probs not in ('uniform', 'weighted', 'weighted_without_new'):
        raise AssertionError("unknown probs option: {!r}".format(probs))

    train_list = read_csv(file_name = "data/train.csv")   # for testing whole train data set

    whales, counts = get_whales(train_list)

    # number of ranked predictions per image (the "5" in MAP@5)
    max_pred = 5
    n_images = len(train_list)

    if probs == 'uniform':
        dummy_preds = [np.random.randint(0, len(counts), max_pred)
                       for _ in range(n_images)]
    else:
        # both weighted variants differ only in the first index they may draw
        first = 0 if probs == 'weighted' else 1
        x = np.arange(first, len(whales))
        total_whales = np.sum(counts[first:])
        # probability distribution = normalised distribution of frequencies
        px = [whales[i][1]/total_whales for i in range(first, len(whales))]
        # the distribution does not change between images, so build it once
        sampler = rv_discrete(values=(x, px))
        dummy_preds = [sampler.rvs(size=max_pred) for _ in range(n_images)]

    # map whale name -> index in whales once, instead of scanning the whole
    # list for every image; setdefault keeps the first index if a name repeats
    index_by_name = {}
    for idx, whale in enumerate(whales):
        index_by_name.setdefault(whale[0], idx)

    # true label of an image = index of its whale name (second field of the row)
    true_labels = [index_by_name[img[1]] for img in train_list]

    return mean_average_precision(dummy_preds, true_labels, max_pred)

In [10]:
# Report the MAP@5 baseline for each random assignment scheme.
# Each entry pairs the headline to print with the `probs` option to evaluate.
baseline_runs = [
    ("\n MAP with uniform probability distribution over whales: ",
     'uniform'),
    ("\n MAP with prob. distribution over whales as in training data: ",
     'weighted'),
    ("\n MAP with prob. distribution over whales as in training data, without 'new whale': ",
     'weighted_without_new'),
]

for headline, scheme in baseline_runs:
    print(headline, Compute_MAP(probs = scheme))


 MAP with uniform probability distribution over whales:  0.000548223350254

 MAP with prob. distribution over whales as in training data:  0.0154230118443

 MAP with prob. distribution over whales as in training data, without 'new whale':  0.00119627749577


In [4]:
# code from 
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
# only to double check validity of MAP metric functions (see below)
def apk(actual, predicted, k=10):
    """
    Precision-at-hit score of one ranked prediction list, cut off at k.

    Walks through the first k predictions. Every prediction that is contained
    in `actual` and has not already appeared earlier in the list counts as a
    hit and contributes (hits so far) / (rank of the hit) to the score.

    Note: unlike the reference implementation this was copied from, the sum
    is NOT divided by min(len(actual), k); that normalisation is disabled here.

    Parameters
    ----------
    actual : list
             Elements that are to be predicted (order doesn't matter)
    predicted : list
                Predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements taken into account

    Returns
    -------
    score : double
            The summed precision over all hits within the first k
            predictions; 0.0 if `actual` is empty
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    total = 0.0
    hits = 0.0
    for rank, item in enumerate(top_k, start=1):
        if item not in actual:
            continue
        # a repeated prediction must not be rewarded a second time
        if item in top_k[:rank - 1]:
            continue
        hits += 1.0
        total += hits / rank

    return total if actual else 0.0

def mapk(actual, predicted, k=10):
    """
    Mean of the per-query apk scores at cut-off k.

    Pairs up the i-th entry of `actual` with the i-th entry of `predicted`,
    scores each pair with apk and averages the scores.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean of the apk scores over the input lists
    """
    per_query_scores = []
    for truth, ranked in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranked, k))
    return np.mean(per_query_scores)

In [5]:
'''
evaluation metrics MAP@5
sources: 
https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
https://www.kaggle.com/c/FacebookRecruiting/discussion/2002
https://en.wikipedia.org/wiki/Information_retrieval
Note, that the metric is designed for "document retrieval", 
where many outcomes might be true (= "relevant documents")
Our case is specific, as there is only one "relevant document" per prediction 
(= the true prediction)
'''

# Cross-check my MAP implementation against Ben Hamner's reference code.

# Shared fixtures, defined before they are used so that both implementations
# are scored on exactly the same data: one true label per image and one
# ranked list of five predicted labels per image.
true_lables = [12,15,8]
model_predictions = [[1,13,3,12,8], [5,3,12,3,6], [8,6,11,8,4]]

# single image: the true label 12 appears at ranks 2 and 4; the repeat must not count
my_ap = average_precision([1,12,3,12,8],12,5)
print("average precision my take: ",my_ap)

ref_ap = apk([12],[1,12,3,12,8],5)
print("average precision Ben Hammer: ",ref_ap)

# make the comparison automatic instead of relying on reading the printout
assert np.isclose(my_ap, ref_ap), "average precision implementations disagree"

# several images: mapk expects each true label wrapped in its own list
my_map = mean_average_precision(model_predictions, true_lables, 5)
print("mean average precision my take: ",my_map)

ref_map = mapk([[label] for label in true_lables], model_predictions, 5)
print("mean average precision Ben Hammer: ",ref_map)

assert np.isclose(my_map, ref_map), "mean average precision implementations disagree"

average precision my take:  0.5
average precision Ben Hammer:  0.5
mean average precision my take:  0.416666666667
mean average precision Ben Hammer:  0.416666666667
