In [13]:
from ContentBasedFiltering import ContentBasedFiltering
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error, f1_score
from scipy import stats

import pandas as pd
import numpy as np
import utils_local

## Evaluation functions

In [17]:
def f(df_species: pd.DataFrame):
    """Occurrence-weighted mean percentile rank for one group of predictions.

    Each row's "similarity" is converted to a percentile rank within the group,
    flipped so that the highest similarity gets rank 0. The ranks are then
    averaged using "occurence" as weights.

    Args:
        df_species (pd.DataFrame): rows of one group; must contain the columns
            "occurence" and "similarity".

    Returns:
        The weighted mean rank, or 0 when the "occurence" column sums to zero.
    """
    occurrences = df_species["occurence"]
    total_occurrence = np.sum(occurrences)

    # Nothing to weight by: bail out before dividing by zero.
    if total_occurrence == 0:
        return 0

    scores = df_species["similarity"]
    # percentileofscore gives 100 for the largest score, so 1 - p/100 puts the
    # best-scored row at rank 0.
    rank_from_top = 1 - stats.percentileofscore(scores, scores) / 100

    return np.sum(occurrences * rank_from_top) / total_occurrence

In [18]:
def calc_expected_percentile_rank(df_pred: pd.DataFrame) -> float:
    """Calculate the expected percentile rank as in paper "Collaborative Filtering for Implicit Feedback Datasets"

    Args:
        df_pred (pd.DataFrame): prediction dataframe

    Returns:
        float: expected percentile rank
    """
    expected_ranks = (
        df_pred.sort_values(by="similarity")
        .groupby(by="genus")
        .apply(f, include_groups=False)
    )
    expected_percentile_rank = expected_ranks[expected_ranks > 0].mean()

    return expected_percentile_rank

## Preprocessing train and test data

In [2]:
# Locations of the raw CSV tables and of the pre-processed train/validation splits
PATH_RAW_DATA = "../data/"
PATH_DIR_DATA_PROCESS = "../data/data_processed/"

# Raw site-occurrence table and the per-genus table
df_raw_data = pd.read_csv(PATH_RAW_DATA + "AllSites_SiteOccurrences_AllGenera_26.1.24.csv")
df_genus_data = pd.read_csv(PATH_RAW_DATA + "FossilGenera_MammalMassDiet_Jan24.csv", sep=",")

# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects, so these
# files must come from a trusted source.
# Each split is loaded and converted to a long-format dataframe in turn.
data_train = np.load(PATH_DIR_DATA_PROCESS + "data_train.npy", allow_pickle=True)
df_train = utils_local.conv_dataset_patch2df(data_train)

data_val = np.load(PATH_DIR_DATA_PROCESS + "data_val.npy", allow_pickle=True)
df_val = utils_local.conv_dataset_patch2df(data_val)

print(f"train: {df_train.shape}")
print(f"val: {df_val.shape}")

display(df_train.head(), df_val.head())

train: (304000, 3)
val: (20536, 3)


Unnamed: 0,site,species,occurence
0,564,348,0.0
1,564,349,0.0
2,564,350,0.0
3,564,351,0.0
4,565,348,0.0


Unnamed: 0,site,species,occurence
0,198,116,0.0
1,198,117,0.0
2,198,118,0.0
3,198,119,0.0
4,199,116,0.0


In [3]:
# Load the ordinal encoders (id <-> name) for genera and sites.
# This is category encoding, not encryption.
path_dir_encode = PATH_DIR_DATA_PROCESS

# Genus encoder
path_enc_genera = path_dir_encode + "ordinal_enc_species.json"
enc_genera = utils_local.CategoryDict.from_file(path_enc_genera)

# Site encoder
path_enc_site = path_dir_encode + "ordinal_enc_site.json"
enc_site = utils_local.CategoryDict.from_file(path_enc_site)

In [4]:
def _decode_ids(df_encoded: pd.DataFrame) -> pd.DataFrame:
    """Replace ordinal site/species ids by their names and rename both columns."""
    decoded = df_encoded.assign(
        site=df_encoded["site"].map(enc_site.dict_id2name),
        species=df_encoded["species"].map(enc_genera.dict_id2name),
    )
    return decoded.rename(columns={"site": "SITE_NAME", "species": "genus"})


# Same decoding for both splits
df_train = _decode_ids(df_train)
df_val = _decode_ids(df_val)

display(df_train.head(), df_val.head())

Unnamed: 0,SITE_NAME,genus,occurence
0,Tam Hang,Vespertilio,0.0
1,Tam Hang,Papio,0.0
2,Tam Hang,Cynocephalus,0.0
3,Tam Hang,Melursus,0.0
4,Tam Nang,Vespertilio,0.0


Unnamed: 0,SITE_NAME,genus,occurence
0,Grays Thurrock,Elephas,0.0
1,Grays Thurrock,Hystrix,0.0
2,Grays Thurrock,Cuon,0.0
3,Grays Thurrock,Rhinolophus,0.0
4,Grosse Grotte (Blaubeuren),Elephas,0.0


In [5]:
# Pivot the long (site, genus, occurrence) table into a site-by-genus matrix
# for the algorithm; site/genus combinations without a value become 0.
df_train = df_train.pivot(index="SITE_NAME", columns="genus", values="occurence").fillna(0)
display(df_train.head())

genus,Acinonyx,Aepyosciurus,Aeretes,Ailuropoda,Ailurus,Alactagulus,Alcelaphus,Alces,Algarolutra,Alilepus,...,Villanyia,Viverra,Viverravus,Viverricula,Vormela,Vulpes,Wushanomys,Xenocyon,Yangia,Zygolophodon
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abric Romani,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Acheng_Jiaojie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adler cave,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Adyrgan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Per the original note, the algorithm needs the site information inside the
# matrix, so these raw-data columns are joined onto the site-by-genus table.
cols_redundant = [
    "SITE_NAME",
    "LAT",
    "LONG",
    "ALTITUDE",
    "MAX_AGE",
    "BFA_MAX",
    "BFA_MAX_ABS",
    "MIN_AGE",
    "BFA_MIN",
    "BFA_MIN_ABS",
    "COUNTRY",
    "age_range",
    "Total_Gen_Count",
    "Large_GenCount",
    "Small_GenCount",
    "smallperlarge",
    "smallprop",
    "Herb_GenCount",
    "Nonherb_GenCount",
    "DietRatio",
    "HerbProp",
    "mid_age",
]

df_redundant = df_raw_data.loc[:, cols_redundant]

# Left join: the pivoted table is keyed by its index (site name), the raw
# table by its SITE_NAME column.
df_train = pd.merge(df_train, df_redundant, how="left", left_index=True, right_on="SITE_NAME")
df_train.head()

Unnamed: 0,Acinonyx,Aepyosciurus,Aeretes,Ailuropoda,Ailurus,Alactagulus,Alcelaphus,Alces,Algarolutra,Alilepus,...,Total_Gen_Count,Large_GenCount,Small_GenCount,smallperlarge,smallprop,Herb_GenCount,Nonherb_GenCount,DietRatio,HerbProp,mid_age
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,4,0,0.0,0.0,4,0,,1.0,0.0265
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12,12,0,0.0,0.0,6,5,1.2,0.5,0.055
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7,5,2,0.4,0.285714,5,2,2.5,0.714286,0.21
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10,5,5,1.0,0.5,6,4,1.5,0.6,0.0275
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11,5,6,1.2,0.545455,11,0,,1.0,2.2


## Training the model and making predictions

In [7]:
# Train the algorithm on the training dataset:
# create a ContentBasedFiltering instance with its default arguments and fit it
# on the merged training table (df_train) together with the genus table
# (df_genus_data).
cbf = ContentBasedFiltering()
cbf.fit(df_train, df_genus_data)

  df_genus_info = df_genus_info.replace({False: 0, True: 1})




In [21]:
# Score the validation set; the result holds the true "occurence" next to the
# predicted "similarity". Missing values are replaced by 0.
true_and_pred = cbf.predict(df_val).fillna(0)
true_and_pred.head()

Unnamed: 0,SITE_NAME,genus,occurence,similarity
19801,Aba Zawei,Procapra,0.0,0.784554
15883,Aba Zawei,Bison,0.0,0.755624
6473,Aba Zawei,Ammotragus,0.0,0.642511
15403,Aba Zawei,Hippopotamus,0.0,0.526133
11192,Aba Zawei,Asoletragus,0.0,0.491755


## Evaluating

In [22]:
# Ground truth and predicted score columns shared by the error metrics
y_true = true_and_pred["occurence"]
y_score = true_and_pred["similarity"]

# Ranking quality (expected percentile rank) plus two point-wise error measures
epr = calc_expected_percentile_rank(true_and_pred)
rms = root_mean_squared_error(y_true, y_score)
mae = mean_absolute_error(y_true, y_score)

print("Expected percentile rank:", epr, "RMS:", rms, "MAE:", mae)

Expected percentile rank: 0.3693940390239605 RMS: 0.5111042102644483 MAE: 0.48137775325148974
