In [None]:
from ContentBasedFiltering import ContentBasedFiltering
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error, f1_score
from scipy import stats

from surprise import Dataset, Reader
from surprise import KNNBasic, KNNWithMeans
from surprise.model_selection import train_test_split

import pandas as pd
import numpy as np
import utils_local

## Evaluation functions

In [None]:
def f(df_species: pd.DataFrame):
    """Occurrence-weighted mean percentile rank for the rows of one group.

    Each row's "similarity" is ranked against all similarities in the group
    and converted to ``1 - percentile / 100``, so the highest score maps to 0.
    These ranks are then averaged using "occurence" as the weights.

    Args:
        df_species (pd.DataFrame): rows of a single group (the caller groups
            by genus) with the columns "similarity" and "occurence".

    Returns:
        The weighted mean rank, or the integer 0 when the occurrences of the
        group sum to zero (nothing to weight by).
    """
    occurrences = df_species["occurence"]
    total_occurrence = np.sum(occurrences)
    if total_occurrence == 0:
        return 0

    scores = df_species["similarity"]
    # Percentile of every score within its own group, flipped so that the
    # best-scored row gets rank 0.
    rank_fraction = 1 - stats.percentileofscore(scores, scores) / 100

    return np.sum(occurrences * rank_fraction) / total_occurrence

In [None]:
def calc_expected_percentile_rank(df_pred: pd.DataFrame) -> float:
    """Calculate the expected percentile rank as in paper "Collaborative Filtering for Implicit Feedback Datasets"

    The rank is computed per genus with ``f`` and then averaged over all
    genera that have at least one occurrence. Lower is better: 0 means every
    observed occurrence received the highest score within its genus.

    Args:
        df_pred (pd.DataFrame): prediction dataframe with the columns "genus",
            "similarity" (predicted score) and "occurence" (ground truth)

    Returns:
        float: expected percentile rank, NaN if no genus has any occurrence
    """
    # Remove genera without any occurrence up front. Filtering the per-genus
    # result with `> 0` instead would also discard genera whose rank is
    # legitimately 0 (a perfect ranking) and bias the mean upwards.
    occurrence_per_genus = df_pred.groupby(by="genus")["occurence"].transform("sum")
    df_observed = df_pred[occurrence_per_genus != 0]
    if df_observed.empty:
        return float("nan")

    # `f` ranks scores within each group itself, so no pre-sorting is needed.
    expected_ranks = df_observed.groupby(by="genus").apply(f, include_groups=False)

    return float(expected_ranks.mean())

## Preprocessing train and test data

In [None]:
# Load the raw inputs (per the file names: site occurrences of all genera,
# genus mass/diet information and genus dental traits).
PATH_RAW_DATA = "../data/"

df_raw_data = pd.read_csv(PATH_RAW_DATA + "AllSites_SiteOccurrences_AllGenera_26.1.24.csv")
df_genus_data = pd.read_csv(PATH_RAW_DATA + "FossilGenera_MammalMassDiet_Jan24.csv")  # "," is the default separator
df_dental_data = pd.read_csv(PATH_RAW_DATA + "DentalTraits_Genus_PPPA_ds.csv")

# Show all three frames, in load order.
display(df_raw_data, df_genus_data, df_dental_data)

In [None]:
# Keep only the genus key and the dental trait columns that are merged into
# the genus data later on.
dental_cols = ["Genus", "HY", "LOP", "AL", "OL", "SF", "BUN", "OT", "Excl_AL"]

df_dental_data = df_dental_data.loc[:, dental_cols]
display(df_dental_data)

In [None]:
# With genus info, give the columns you want to use and convert categorical using one-hot-encoding
genus_info_cols = [
    "Genus",
    "Order",
    "Family",
    "Massg",
    "Diet",
    "DietSource"
]

df_genus_data = df_genus_data[genus_info_cols]

dummy_cols = [
    "Order",
    "Family",
    "Diet",
    "DietSource"
]

# The genus column must be the first one in genus data
# dtype=int makes get_dummies emit 0/1 integers directly. This replaces a
# frame-wide `replace({False: 0, True: 1})`, which relies on pandas' deprecated
# implicit downcasting and would otherwise leave object-typed columns.
df_genus_data = pd.get_dummies(df_genus_data, columns=dummy_cols, dtype=int)
# Left merge keeps every genus; genera without dental traits get NaN there.
df_genus_data = df_genus_data.merge(df_dental_data, how="left", on="Genus")


PATH_DIR_DATA_PROCESS = "../data/data_processed/"

# NOTE(review): allow_pickle=True unpickles arbitrary objects and can execute
# code on load - only use it on files produced by this project.
data_train = np.load(PATH_DIR_DATA_PROCESS + "data_train.npy", allow_pickle=True)
data_val = np.load(PATH_DIR_DATA_PROCESS + "data_val.npy", allow_pickle=True)

# Convert the stored arrays into dataframes (see utils_local for the layout).
df_train = utils_local.conv_dataset_patch2df(data_train)
df_val = utils_local.conv_dataset_patch2df(data_val)

print(f"train: {df_train.shape}")
print(f"val: {df_val.shape}")

display(df_genus_data.head())
display(df_train.head())
display(df_val.head())

In [None]:
# Replace every missing value in df_genus_data (e.g. genera without a match in
# the dental table after the left merge) with the placeholder -1.
df_genus_data = df_genus_data.fillna(value=-1)

In [None]:
# Load the category encoders for genus and site (used below to map the
# encoded ids back to names).
path_dir_encode = PATH_DIR_DATA_PROCESS

path_enc_genera = path_dir_encode + "ordinal_enc_species.json"
enc_genera = utils_local.CategoryDict.from_file(path_enc_genera)

path_enc_site = path_dir_encode + "ordinal_enc_site.json"
enc_site = utils_local.CategoryDict.from_file(path_enc_site)

In [None]:
def _decode_site_and_genus(df_encoded: pd.DataFrame) -> pd.DataFrame:
    """Map encoded site/species ids to names and rename the two columns.

    The id columns of ``df_encoded`` are overwritten in place; the returned
    frame is the renamed copy with "SITE_NAME" and "genus" columns.
    """
    df_encoded["site"] = df_encoded["site"].map(enc_site.dict_id2name)
    df_encoded["species"] = df_encoded["species"].map(enc_genera.dict_id2name)

    # Renaming columns
    return df_encoded.rename(columns={"site": "SITE_NAME", "species": "genus"})


df_train = _decode_site_and_genus(df_train)
df_val = _decode_site_and_genus(df_val)

display(df_train.head())
display(df_val.head())

In [None]:
# Keep the long-format frame (it is reused for the k-NN baseline), then
# reshape the training data into a site x genus matrix for the algorithm.
df_train_non_matrix = df_train.copy()
df_train = (
    df_train
    .pivot(index="SITE_NAME", columns="genus", values="occurence")
    .fillna(0)  # site/genus pairs missing from the long frame count as 0
)
display(df_train.head())

In [None]:
# The algorithm expects the site information inside the matrix, so the site
# columns of the raw data are attached to the site x genus matrix.
cols_redundant = [
    "SITE_NAME",
    "LAT",
    "LONG",
    "MAX_AGE",
    "MIN_AGE",
    "age_range",
    "Large_GenCount",
    "Small_GenCount",
    "Herb_GenCount",
    "Nonherb_GenCount",
    "mid_age",
]

df_redundant = df_raw_data[cols_redundant]
# The matrix is indexed by site name, the raw data carries it as a column.
df_train = df_train.merge(df_redundant, how="left", left_index=True, right_on="SITE_NAME")

# Move SITE_NAME to the front and keep the remaining columns in their order.
desired_column_order = ["SITE_NAME"]
desired_column_order.extend(col for col in df_train.columns if col != "SITE_NAME")
df_train = df_train[desired_column_order]

df_train.head()

## Training the model and making predictions

In [None]:
# Fit the content-based filtering model on the training matrix.
# NOTE(review): n_site_columns=10 presumably matches the 10 site columns
# (besides SITE_NAME) merged into df_train above - confirm against
# ContentBasedFiltering.fit.
cbf = ContentBasedFiltering()
cbf.fit(
    df_train,
    df_genus_data,
    n_site_columns=10,
    normalization="min-max",
)

In [None]:
# Predict on the validation data; the result holds true and predicted values.
# Missing values in the result are replaced with 0.
true_and_pred = cbf.predict(df_val).fillna(0)
true_and_pred.head()

## Evaluating

In [None]:
# Score the content-based predictions: ranking quality (expected percentile
# rank) and the errors between true occurrence and predicted similarity.
cbf_truth = true_and_pred["occurence"]
cbf_scores = true_and_pred["similarity"]

epr = calc_expected_percentile_rank(true_and_pred)
rms = root_mean_squared_error(cbf_truth, cbf_scores)
mae = mean_absolute_error(cbf_truth, cbf_scores)

print("Expected percentile rank:", epr, "RMS:", rms, "MAE:", mae)

## Checking the site info and genus info matrices

In [None]:
# Pull the two matrices stored on the fitted model for manual inspection.
site_info, genus_info = (
    cbf.site_info_with_genus_info,
    cbf.genus_info_with_site_info,
)

# Evaluating collaborative filtering (k-NN)

In [None]:
# Drop the last 21 columns of the raw data and index by site.
# NOTE(review): presumably the trailing 21 columns are site metadata and the
# remaining ones are genera - confirm against the CSV header.
df_raw_train = df_raw_data.iloc[:, :-21].set_index("SITE_NAME")

# Long format: one row per (site, column) pair with its value as "presence".
df_raw_train_non_matrix = (
    df_raw_train
    .stack()
    .reset_index()
    .rename(columns={"level_1": "genus", 0: "presence"})
)
df_raw_train_non_matrix

In [None]:
# Fit a user-based (site-based) k-NN collaborative filter on the long-format
# training data.
reader = Reader(rating_scale=(0, 1))

# Surprise requires exactly three columns in the order user, item, rating.
# Select them explicitly so extra or reordered columns in the frame cannot
# silently swap the roles of site and genus.
data = Dataset.load_from_df(
    df_train_non_matrix[["SITE_NAME", "genus", "occurence"]],
    reader,
)

sim_options = {
    'name': "MSD",
    'user_based': True  # True for user-user, False for item-item
}

# Train on all available training ratings (no internal hold-out).
trainset = data.build_full_trainset()

knn = KNNBasic(k=5, min_k=1, sim_options=sim_options)
knn.fit(trainset)


In [None]:
# Turn the validation frame into a Surprise testset. As for training, the
# columns are selected explicitly because Surprise reads them by position
# (user, item, rating).
dataset_val = Dataset.load_from_df(df_val[["SITE_NAME", "genus", "occurence"]], reader)
data_test = dataset_val.build_full_trainset()
testset = data_test.build_testset()

# Get predictions for all user-item pairs
predictions = knn.test(testset)

# Get item scores from the predictions
item_scores = [(prediction.uid, prediction.iid, prediction.est) for prediction in predictions]
knn_pred = pd.DataFrame(item_scores, columns=['SITE_NAME', 'PREDICTED_GENUS', 'similarity'])

knn_pred

In [None]:
# Attach the validation rows (true occurrence) to each k-NN prediction by
# matching on site and genus.
knn_pred = pd.merge(
    knn_pred,
    df_val,
    how="left",
    left_on=["SITE_NAME", "PREDICTED_GENUS"],
    right_on=["SITE_NAME", "genus"],
)
knn_pred

In [None]:
# Score the k-NN predictions with the same metrics as the content-based model.
knn_truth = knn_pred["occurence"]
knn_scores = knn_pred["similarity"]

epr = calc_expected_percentile_rank(knn_pred)
rms = root_mean_squared_error(knn_truth, knn_scores)
mae = mean_absolute_error(knn_truth, knn_scores)

print("Expected percentile rank:", epr, "RMS:", rms, "MAE:", mae)