In [1]:
import pandas as pd
import numpy as np

from scipy import stats

from surprise import Dataset, Reader
from surprise import KNNBasic, KNNWithMeans

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error, f1_score

In [2]:
# Genera and sites
path = "../data/AllSites_SiteOccurrences_AllGenera_26.1.24.csv"

df = pd.read_csv(path)
df.head()

Unnamed: 0,SITE_NAME,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,...,Total_Gen_Count,Large_GenCount,Small_GenCount,smallperlarge,smallprop,Herb_GenCount,Nonherb_GenCount,DietRatio,HerbProp,mid_age
0,Aba Zawei,1,1,1,1,0,0,0,0,0,...,4,4,0,0.0,0.0,4,0,,1.0,0.0265
1,Abric Romani,1,0,1,0,1,1,1,1,1,...,12,12,0,0.0,0.0,6,5,1.2,0.5,0.055
2,Acheng_Jiaojie,0,0,0,0,0,0,1,0,0,...,7,5,2,0.4,0.285714,5,2,2.5,0.714286,0.21
3,Adler cave,1,0,0,0,0,1,0,1,0,...,10,5,5,1.0,0.5,6,4,1.5,0.6,0.0275
4,Adyrgan,1,0,0,1,0,0,0,0,0,...,11,5,6,1.2,0.545455,11,0,,1.0,2.2


In [3]:
# Dental traits genera
path = "../data/DentalTraits_Genus_PPPA.csv"

df_dental = pd.read_csv(path, sep=",")
df_dental.head()

Unnamed: 0,Genus,n,Mass.g_Mean,Diet.Plant_Mean,HYP_Mean,LOP_Mean,FCT_AL_Mean,FCT_OL_Mean,FCT_SF_Mean,BUN_Mean,...,HYP_Mode,LOP_Mode,FCT_AL_Mode,FCT_OL_Mode,FCT_SF_Mode,BUN_Mode,Order,ConsInGenhyp,ConsInGen_bun,Unnamed: 21
0,Addax,1.0,70000.3,100.0,3.0,2.0,0.0,1.0,1.0,0.0,...,3,2,0,1,1.0,0,Cetartiodactyla,True,True,
1,Aepyceros,1.0,52500.1,100.0,3.0,2.0,0.0,1.0,0.0,0.0,...,3,2,0,1,0.0,0,Cetartiodactyla,True,True,
2,Alcelaphus,1.0,171001.5,100.0,3.0,2.0,0.0,1.0,0.0,0.0,...,3,2,0,1,0.0,0,Cetartiodactyla,True,True,
3,Alces,1.0,356998.0,100.0,1.0,2.0,1.0,1.0,0.0,0.0,...,1,2,1,1,0.0,0,Cetartiodactyla,True,True,
4,Allochrocebus,3.0,5708.333333,90.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1,0,0,0,0.0,1,Primates,True,True,


In [4]:
# Mass-diet
path = "../data/FossilGenera_MammalMassDiet_Jan24.csv"

df_mass_diet = pd.read_csv(path, sep=",")
df_mass_diet.head()

Unnamed: 0,Genus,Order,Family,MassSource,Massg,LogMass,LargeSmall,SizeClass,Diet,DietSource
0,Abudhabia,Rodentia,Muridae,Family average,134.3147,2.128124,Small,small,Herbivore,Phylacine
1,Aceratherium,Perissodactyla,Rhinocerotidae,Cooke,1099006.0,6.041,Large,large,Herbivore,Phylacine
2,Acinonyx,Carnivora,Felidae,Phylacine,46700.0,4.669317,Large,large,Non-Herbivore,Phylacine
3,Aepyosciurus,Rodentia,Sciuridae,NOW,286.0,2.456366,Small,small,Herbivore,Phylacine
4,Aeretes,Rodentia,Sciuridae,Phylacine,732.4,2.864748,Small,small,Herbivore,Phylacine


Evaluation similarly as in paper: Collaborative Filtering for Implicit Feedback Datasets ?

## Evalaution

In [5]:
def f(df_species: pd.DataFrame):
    if np.sum(df_species["occurence"]) == 0:
        return 0

    preferences = df_species["pred"]
    percentile_rank = 1 - stats.percentileofscore(preferences, preferences) / 100

    expected_rank_species = np.sum(df_species["occurence"] * percentile_rank) / np.sum(df_species["occurence"])

    return expected_rank_species

In [6]:
def calc_expected_percentile_rank(df_pred: pd.DataFrame) -> float:
    """Calculate the expected percentile rank as in paper "Collaborative Filtering for Implicit Feedback Datasets"

    Args:
        df_pred (pd.DataFrame): prediction dataframe

    Returns:
        float: expected percentile rank
    """
    expected_ranks = (
        df_pred.sort_values(by="pred")
        .groupby(by="genus")
        .apply(f, include_groups=False)
    )
    expected_percentile_rank = expected_ranks[expected_ranks > 0].mean()

    return expected_percentile_rank

## Building content-based filtering

Add an option to recommend n (5) more genera on top of the "normal" cutoff and threshold

In [7]:
def build_genus_info_from_genus_data(df):
    genus_info_cols = [
        "Genus",
        "Order",
        "Family",
        "Massg",
        "Diet",
       # "DietSource"
    ]
    
    df_genus_info = df[genus_info_cols]

    dummy_cols = [
        "Order",
        "Family",
        "Diet",
       # "DietSource"
    ]

    df_genus_info = pd.get_dummies(df_genus_info, columns=dummy_cols)
    df_genus_info = df_genus_info.replace({False: 0, True: 1})
    df_genus_info = df_genus_info.rename(columns={"Genus": "genus"})

    return df_genus_info

In [8]:
def build_site_genus_matrix(df):
    cols_redundant = ['LAT',
    'LONG',
    'ALTITUDE',
    'MAX_AGE',
    'BFA_MAX',
    'BFA_MAX_ABS',
    'MIN_AGE',
    'BFA_MIN',
    'BFA_MIN_ABS',
    'COUNTRY',
    'age_range',
    'Total_Gen_Count',
    'Large_GenCount',
    'Small_GenCount',
    'smallperlarge',
    'smallprop',
    'Herb_GenCount',
    'Nonherb_GenCount',
    'DietRatio',
    'HerbProp',
    'mid_age'
    ]

    df_site_genus = df.drop(columns=cols_redundant).set_index('SITE_NAME')

    return df_site_genus

In [9]:
def build_site_info(df):
    site_info_cols = [
    'SITE_NAME',
    'LAT',
    'LONG',
    'MAX_AGE',
    'MIN_AGE',
    'age_range',
    'Large_GenCount',
    'Small_GenCount',
    'Herb_GenCount',
    'Nonherb_GenCount',
    'mid_age'
    ]

    df_site_info = df[site_info_cols].set_index('SITE_NAME')
    
    return df_site_info

In [10]:
def build_genus_info(df, df_genus_data):
    site_genus = build_site_genus_matrix(df)
    site_genus = site_genus.stack().reset_index().rename(columns={"level_1": "genus", 0: "presence"})
    site_info = build_site_info(df)

    genus_info = site_genus.merge(site_info, on="SITE_NAME", how="left")
    genus_info = genus_info[genus_info["presence"] == 1]

    genus_info = genus_info.drop(["SITE_NAME", "presence"], axis=1)
    genus_info = genus_info.groupby('genus').mean().reset_index().set_index("genus")

    df_genus_data = build_genus_info_from_genus_data(df_genus_data)
    genus_info = genus_info.merge(df_genus_data, left_index=True, right_on="genus", how="left").reset_index(drop=True).set_index("genus")

    return genus_info

In [11]:
# Building infro about the genera for each site (in progress)
def build_site_info_from_genus_data(df, df_mass_diet):
    genus_info = build_genus_info_from_genus_data(df_mass_diet)
    site_genus = build_site_genus_matrix(df)

    site_genus = site_genus.stack().reset_index().rename(columns={"level_1": "genus", 0: "presence"})
    site_genus = site_genus[site_genus["presence"] == 1].drop("presence", axis="columns")

    site_genus = site_genus.merge(genus_info, on="genus", how="left")
    site_genus = site_genus.drop(["genus"], axis=1)
    site_genus = site_genus.groupby('SITE_NAME').mean().reset_index().set_index("SITE_NAME")
    
    return site_genus


In [12]:
def build_site_info_with_genus_info(df, df_genus_data):
    site_info_cols = [
    'SITE_NAME',
    'LAT',
    'LONG',
    'MAX_AGE',
    'MIN_AGE',
    'age_range',
    'Large_GenCount',
    'Small_GenCount',
    'Herb_GenCount',
    'Nonherb_GenCount',
    'mid_age'
    ]

    df_site_info = df[site_info_cols].set_index('SITE_NAME')
    
    df_site_info_by_genera = build_site_info_from_genus_data(df, df_genus_data)
    df_site_info = df_site_info.merge(df_site_info_by_genera, left_index=True, right_on="SITE_NAME", how="left")
    
    return df_site_info

In [13]:
def get_recommendations_for_site(genus_info, site_name, site_indices, genus_site_similarity_matrix, num_recommend = 10):
    idx = site_indices[site_name]

    # Sorted similarity scores
    sim_scores = sorted(list(enumerate(genus_site_similarity_matrix[:,idx])), key=lambda x: x[1], reverse=True)

    # Get the scores of the num_recommend most similar sites
    similar_genus_for_site = sim_scores[:num_recommend]

    # Get the genus indices
    genus_indices = [i[0] for i in similar_genus_for_site]
    genus_site_similarities = [i[1] for i in similar_genus_for_site]

    recommended_genus = genus_info.iloc[genus_indices].index.to_frame(index=False).assign(similarity=genus_site_similarities)
    recommended_genus.insert(0, "SITE_NAME", site_name)

    return recommended_genus

In [14]:
def normalize_columns_min_max(df):
    return (df - df.min()) / (df.max() - df.min())

In [15]:
def normalize_columns_mean(df):
    return (df - df.mean()) / df.std()

In [23]:
def find_recommendations_for_all_sites(df, df_genus_data, n_species_to_recommend, normalization: None):
    genus_info = build_genus_info(df, df_genus_data)
    site_info = build_site_info_with_genus_info(df, df_genus_data)
    
    if normalization != None:
        genus_info = normalization(genus_info)
        site_info = normalization(site_info)
    
    if genus_info.isnull().values.any():
        print("WARNING! Genus info data contains nans. Assigning to zeros")
        genus_info = genus_info.fillna(0)
    
    if site_info.isnull().values.any():
        print("WARNING! Site info data contains nans. Assigning to zeros")
        site_info = site_info.fillna(0)


    site_indices = pd.Series(df.index, index=df["SITE_NAME"]).drop_duplicates()
    sim = cosine_similarity(genus_info, site_info)

    recommendations = []
    for site, idx in site_indices.items():
        site_recommendations = get_recommendations_for_site(
            genus_info=genus_info,
            site_name=site,
            site_indices=site_indices,
            genus_site_similarity_matrix=sim,
            num_recommend=n_species_to_recommend                        
        )

        recommendations.append(site_recommendations)
    
    return pd.concat(recommendations).reset_index(drop=True)

In [40]:
all_site_recommendations = find_recommendations_for_all_sites(df, df_mass_diet, 414, normalization=normalize_columns_min_max)
all_site_recommendations_matrix = pd.pivot(all_site_recommendations, index="SITE_NAME", columns="genus", values="similarity").fillna(0)
all_site_recommendations_matrix.head()

  df_genus_info = df_genus_info.replace({False: 0, True: 1})
  df_genus_info = df_genus_info.replace({False: 0, True: 1})




genus,Acinonyx,Aepyosciurus,Aeretes,Ailuropoda,Ailurus,Alactagulus,Alcelaphus,Alces,Algarolutra,Alilepus,...,Villanyia,Viverra,Viverravus,Viverricula,Vormela,Vulpes,Wushanomys,Xenocyon,Yangia,Zygolophodon
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,0.180043,0.364299,0.445845,0.219695,0.212262,0.390973,0.550427,0.553821,0.0,0.389376,...,0.379635,0.214788,0.213867,0.216596,0.168772,0.17373,0.37853,0.164339,0.223829,0.407516
Abric Romani,0.566466,0.356161,0.379461,0.572111,0.487954,0.354519,0.456357,0.582281,0.416029,0.306577,...,0.326927,0.482849,0.482316,0.488196,0.499435,0.655447,0.352848,0.605723,0.0,0.297664
Acheng_Jiaojie,0.422511,0.453466,0.516303,0.470611,0.456165,0.461507,0.400687,0.617983,0.475462,0.663275,...,0.463813,0.462437,0.45384,0.459359,0.556993,0.423214,0.469126,0.405154,0.369447,0.447486
Adler cave,0.397478,0.378853,0.451871,0.389543,0.38686,0.45296,0.358536,0.496963,0.351992,0.591665,...,0.401224,0.39338,0.399579,0.385794,0.408016,0.578033,0.405792,0.525692,0.0,0.30986
Adyrgan,0.466275,0.7785,0.558725,0.361456,0.0,0.732205,0.638578,0.479227,0.33381,0.709398,...,0.883731,0.331066,0.0,0.318378,0.40969,0.331502,0.816011,0.455917,0.769099,0.726559


### Performance

In [43]:
site_genus_matrix = build_site_genus_matrix(df)
df_site_genus = site_genus_matrix.stack().reset_index().rename(columns={"level_1": "genus", 0: "presence"})

all_site_recommendations_and_real = all_site_recommendations.merge(
    df_site_genus,
    how="outer",
    on=["SITE_NAME", "genus"]
).fillna(0)

all_site_recommendations_and_real = all_site_recommendations_and_real.rename(
    columns={"similarity": "pred", "presence": "occurence"}
)

all_site_recommendations_and_real_sorted = all_site_recommendations_and_real.sort_values(by=["SITE_NAME", "pred"], ascending=[True, False]).head(10)
all_site_recommendations_and_real_sorted.head(10)

Unnamed: 0,SITE_NAME,genus,pred,occurence
82,Aba Zawei,Coelodonta,0.808066,1
129,Aba Zawei,Equus,0.794929,1
364,Aba Zawei,Rhinoceros,0.774805,0
99,Aba Zawei,Dicerorhinus,0.753962,0
336,Aba Zawei,Procapra,0.735801,0
405,Aba Zawei,Stephanorhinus,0.733861,0
372,Aba Zawei,Saiga,0.725962,0
282,Aba Zawei,Ovis,0.720584,0
281,Aba Zawei,Ovibos,0.716926,0
350,Aba Zawei,Pseudois,0.715576,0


In [55]:
def recommend_n(recommendation_and_real: pd.DataFrame, n: int=10):
    def top_n(df, n, column='pred'):
        return df.nlargest(n, column)
    
    recommendation_and_real = recommendation_and_real.groupby(
        'SITE_NAME', group_keys=False
    ).apply(
        top_n, n
    ).sort_values(
        by=["SITE_NAME", "pred"], ascending=[True, False]
    )

    return recommendation_and_real

recommend_n(all_site_recommendations_and_real, 10)

  ).apply(


Unnamed: 0,SITE_NAME,genus,pred,occurence
82,Aba Zawei,Coelodonta,0.808066,1
129,Aba Zawei,Equus,0.794929,1
364,Aba Zawei,Rhinoceros,0.774805,0
99,Aba Zawei,Dicerorhinus,0.753962,0
336,Aba Zawei,Procapra,0.735801,0
...,...,...,...,...
324196,zhoukoudian_Loc18,Eirictis,0.805448,0
324263,zhoukoudian_Loc18,Hypolagus,0.800639,1
324475,zhoukoudian_Loc18,Sivapanthera,0.788398,0
324399,zhoukoudian_Loc18,Pliolagus,0.788361,0


In [56]:
def recommend_using_cutoff(recommendation_and_real: pd.DataFrame, cutoff: float=0.75):
    recommendation_and_real = recommendation_and_real.groupby(
        'SITE_NAME', group_keys=False
    ).apply(
        lambda x: x[x['pred'] >= cutoff]
    ).sort_values(
        by=["SITE_NAME", "pred"], ascending=[True, False]
    )
    
    return recommendation_and_real

recommend_using_cutoff(all_site_recommendations_and_real, 0.75)

  ).apply(


Unnamed: 0,SITE_NAME,genus,pred,occurence
82,Aba Zawei,Coelodonta,0.808066,1
129,Aba Zawei,Equus,0.794929,1
364,Aba Zawei,Rhinoceros,0.774805,0
99,Aba Zawei,Dicerorhinus,0.753962,0
2150,Adyrgan,Promimomys,0.891514,1
...,...,...,...,...
324533,zhoukoudian_Loc18,Xenocyon,0.760099,1
324358,zhoukoudian_Loc18,Nyctereutes,0.759231,0
324085,zhoukoudian_Loc18,Aepyosciurus,0.756777,0
324157,zhoukoudian_Loc18,Chardinomys,0.753189,0


In [59]:
def recommend_n_more(recommendations_and_real: pd.DataFrame, n: int):
    def filter_rows(group):
        sorted_group = group.sort_values(by='pred', ascending=False)
        occurence_1 = sorted_group[sorted_group['occurence'] == 1]
        occurence_0_top_n = sorted_group[sorted_group['occurence'] == 0].head(n)
        return pd.concat([occurence_1, occurence_0_top_n])
    
    recommendations_and_real = recommendations_and_real.groupby('SITE_NAME').apply(filter_rows)
    recommendations_and_real.reset_index(drop=True, inplace=True)
    
    return recommendations_and_real

test_df = recommend_n_more(all_site_recommendations_and_real, 5)
test_df.head(50)

  recommendations_and_real = recommendations_and_real.groupby('SITE_NAME').apply(filter_rows)


Unnamed: 0,SITE_NAME,genus,pred,occurence
0,Aba Zawei,Coelodonta,0.808066,1
1,Aba Zawei,Equus,0.794929,1
2,Aba Zawei,Bos,0.712633,1
3,Aba Zawei,Gazella,0.681797,1
4,Aba Zawei,Rhinoceros,0.774805,0
5,Aba Zawei,Dicerorhinus,0.753962,0
6,Aba Zawei,Procapra,0.735801,0
7,Aba Zawei,Stephanorhinus,0.733861,0
8,Aba Zawei,Saiga,0.725962,0
9,Abric Romani,Vulpes,0.655447,1


#### Expected percentile rank

In [26]:
calc_expected_percentile_rank(all_site_recommendations_and_real)

0.3158818321427803

In [27]:
# Find missing columns from recommendations

df_site_genus = build_site_genus_matrix(df)
missing_columns = df_site_genus.columns.difference(all_site_recommendations_matrix.columns)

for col in missing_columns:
    all_site_recommendations_matrix[col] = 0

In [28]:
column_order = df_site_genus.columns
all_site_recommendations_matrix = all_site_recommendations_matrix[column_order]
all_site_recommendations_matrix

genus,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,Homo,...,Euarctos,Paracervulus,Eostyloceros,Cervocerus,Antispiroides,Sinoryx,Prospalax,Pliopetaurista,Predicrostonyx,Boocercus
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,0.794929,0.808066,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0
Abric Romani,0.000000,0.000000,0.0,0.0,0.0,0.655447,0.0,0.644547,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0
Acheng_Jiaojie,0.000000,0.642129,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0
Adler cave,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0
Adyrgan,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zuurland (-42 to -46 m),0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0
Zverinogolovskoe,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.833241,0.0,0
kudaro 3 cave l.3,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0
lang trang,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0


In [29]:
all_site_recommendations_matrix[all_site_recommendations_matrix > 0.5] = 1


In [31]:
rms = root_mean_squared_error(df_site_genus, all_site_recommendations_matrix)
mae = mean_absolute_error(df_site_genus, all_site_recommendations_matrix)
# f1 = f1_score(df_site_genus, all_site_recommendations_matrix, zero_division=1.0, average='macro')
print(rms, mae)#, f1)

0.17743505886287805 0.04533368242018128


## Building collaborative filtering

In [None]:
def fit_knn_collaborative_filtering(df, site_based=True, similarity="cosine", k=5, min_k=1):
    site_genus_matrix = build_site_genus_matrix(df)
    df_site_genus = site_genus_matrix.stack().reset_index().rename(columns={"level_1": "genus", 0: "presence"})

    reader = Reader(rating_scale=(0, 1))
    data = Dataset.load_from_df(df_site_genus, reader) # Column order must be user, item, rating

    sim_options = {
    'name': similarity,
    'user_based': site_based  # True for user-user, False for item-item
    }

    trainset = data.build_full_trainset()

    knn = KNNBasic(k=k, min_k=min_k, sim_options=sim_options)
    knn.fit(trainset)

    return knn, trainset

In [None]:
knn, trainset = fit_knn_collaborative_filtering(
    df, 
    site_based=True, 
    similarity="cosine",
    k=5,
    min_k=1
)

In [None]:
def predict_collaborative_filtering(algotihm, trainset):
    # Testset equal to trainset
    testset = trainset.build_testset()

    # Get predictions for all user-item pairs
    predictions = algotihm.test(testset)

    # Get item scores from the predictions
    item_scores = [(prediction.uid, prediction.iid, prediction.est) for prediction in predictions]
    return pd.DataFrame(item_scores, columns =['SITE_NAME', 'PREDICTED_GENUS', 'SCORE'])

In [None]:
predictions = predict_collaborative_filtering(knn, trainset)

In [None]:
display(predictions.head(10))

In [None]:
items_scores_matrix_knn = pd.pivot(predictions, index="SITE_NAME", columns="PREDICTED_GENUS", values="SCORE")
items_scores_matrix_knn.head()

### Performance

In [None]:
predictions = predictions.rename(
    columns={"PREDICTED_GENUS": "genus"}
)

site_genus_matrix = build_site_genus_matrix(df)
df_site_genus = site_genus_matrix.stack().reset_index().rename(columns={"level_1": "genus", 0: "presence"})

all_site_recommendations_and_real = predictions.merge(
    df_site_genus,
    how="outer",
    on=["SITE_NAME", "genus"]
).fillna(0)

all_site_recommendations_and_real = all_site_recommendations_and_real.rename(
    columns={"SCORE": "pred", "presence": "occurence"}
)

all_site_recommendations_and_real.head()

In [None]:
calc_expected_percentile_rank(all_site_recommendations_and_real)

In [None]:
column_order = df_site_genus.columns
items_scores_matrix_knn = items_scores_matrix_knn[column_order]
items_scores_matrix_knn

In [None]:
items_scores_matrix_knn[items_scores_matrix_knn > 0.5] = 1
items_scores_matrix_knn[items_scores_matrix_knn <= 0.5] = 0

In [None]:
items_scores_matrix_knn

In [None]:
rms = root_mean_squared_error(df_site_genus, items_scores_matrix_knn)
mae = mean_absolute_error(df_site_genus, items_scores_matrix_knn)
f1 = f1_score(df_site_genus, items_scores_matrix_knn, zero_division=1.0, average='macro')
print(rms, mae, f1)

## Hybrid algorithm content-knn