In [1]:
import pandas as pd
import numpy as np

from scipy import stats

from surprise import Dataset, Reader
from surprise import KNNBasic, KNNWithMeans

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error, f1_score

In [2]:
# Genera and sites
# Site-level table: the preview shows one row per SITE_NAME, 0/1 columns per genus,
# followed by per-site summary columns (counts, ratios, mid_age).
path = "../data/AllSites_SiteOccurrences_AllGenera_26.1.24.csv"

df = pd.read_csv(path)
df.head()

Unnamed: 0,SITE_NAME,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,...,Total_Gen_Count,Large_GenCount,Small_GenCount,smallperlarge,smallprop,Herb_GenCount,Nonherb_GenCount,DietRatio,HerbProp,mid_age
0,Aba Zawei,1,1,1,1,0,0,0,0,0,...,4,4,0,0.0,0.0,4,0,,1.0,0.0265
1,Abric Romani,1,0,1,0,1,1,1,1,1,...,12,12,0,0.0,0.0,6,5,1.2,0.5,0.055
2,Acheng_Jiaojie,0,0,0,0,0,0,1,0,0,...,7,5,2,0.4,0.285714,5,2,2.5,0.714286,0.21
3,Adler cave,1,0,0,0,0,1,0,1,0,...,10,5,5,1.0,0.5,6,4,1.5,0.6,0.0275
4,Adyrgan,1,0,0,1,0,0,0,0,0,...,11,5,6,1.2,0.545455,11,0,,1.0,2.2


In [52]:
# Dental traits genera
# NOTE: `path` is reused from the previous cell and overwritten here.
# `df_dental` is loaded for inspection; it is not referenced again in the visible cells.
path = "../data/DentalTraits_Genus_PPPA.csv"

df_dental = pd.read_csv(path, sep=",")
df_dental.head()

Unnamed: 0,Genus,n,Mass.g_Mean,Diet.Plant_Mean,HYP_Mean,LOP_Mean,FCT_AL_Mean,FCT_OL_Mean,FCT_SF_Mean,BUN_Mean,...,HYP_Mode,LOP_Mode,FCT_AL_Mode,FCT_OL_Mode,FCT_SF_Mode,BUN_Mode,Order,ConsInGenhyp,ConsInGen_bun,Unnamed: 21
0,Addax,1.0,70000.3,100.0,3.0,2.0,0.0,1.0,1.0,0.0,...,3,2,0,1,1.0,0,Cetartiodactyla,True,True,
1,Aepyceros,1.0,52500.1,100.0,3.0,2.0,0.0,1.0,0.0,0.0,...,3,2,0,1,0.0,0,Cetartiodactyla,True,True,
2,Alcelaphus,1.0,171001.5,100.0,3.0,2.0,0.0,1.0,0.0,0.0,...,3,2,0,1,0.0,0,Cetartiodactyla,True,True,
3,Alces,1.0,356998.0,100.0,1.0,2.0,1.0,1.0,0.0,0.0,...,1,2,1,1,0.0,0,Cetartiodactyla,True,True,
4,Allochrocebus,3.0,5708.333333,90.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1,0,0,0,0.0,1,Primates,True,True,


In [54]:
# Mass-diet
# Genus-level table (Order, Family, Massg, Diet, ...) used below to build
# one-hot genus features via build_genus_info_from_genus_data.
path = "../data/FossilGenera_MammalMassDiet_Jan24.csv"

df_mass_diet = pd.read_csv(path, sep=",")
df_mass_diet.head()

Unnamed: 0,Genus,Order,Family,MassSource,Massg,LogMass,LargeSmall,SizeClass,Diet,DietSource
0,Abudhabia,Rodentia,Muridae,Family average,134.3147,2.128124,Small,small,Herbivore,Phylacine
1,Aceratherium,Perissodactyla,Rhinocerotidae,Cooke,1099006.0,6.041,Large,large,Herbivore,Phylacine
2,Acinonyx,Carnivora,Felidae,Phylacine,46700.0,4.669317,Large,large,Non-Herbivore,Phylacine
3,Aepyosciurus,Rodentia,Sciuridae,NOW,286.0,2.456366,Small,small,Herbivore,Phylacine
4,Aeretes,Rodentia,Sciuridae,Phylacine,732.4,2.864748,Small,small,Herbivore,Phylacine


Open question: evaluate in the same way as the paper "Collaborative Filtering for Implicit Feedback Datasets" (Hu, Koren & Volinsky, 2008)?

## Evaluation

In [39]:
def f(df_species: pd.DataFrame):
    """Occurrence-weighted mean percentile rank for one group of predictions.

    Args:
        df_species (pd.DataFrame): rows of a single group; must contain the
            columns "pred" (predicted score) and "occurence" (observed value).

    Returns:
        0 when the "occurence" column sums to zero; otherwise the average of
        the percentile ranks of "pred" within the group, weighted by
        "occurence". A rank of 0 corresponds to the highest score in the group.
    """
    occurrences = df_species["occurence"]
    total_occurrences = np.sum(occurrences)

    # Nothing observed in this group: signal it with the sentinel value 0.
    if total_occurrences == 0:
        return 0

    scores = df_species["pred"]
    # percentileofscore gives 100 for the best score, so flip it to make 0 the best rank.
    rank_from_top = 1 - stats.percentileofscore(scores, scores) / 100

    return np.sum(occurrences * rank_from_top) / total_occurrences

In [40]:
def calc_expected_percentile_rank(df_pred: pd.DataFrame) -> float:
    """Calculate the expected percentile rank as in paper "Collaborative Filtering for Implicit Feedback Datasets"

    The rank is computed per genus (via ``f``) and the per-genus values are
    then averaged. Genera with no observed occurrence are excluded from the
    average; lower values mean that observed occurrences received higher scores.

    Args:
        df_pred (pd.DataFrame): prediction dataframe with the columns
            "genus", "pred" and "occurence"

    Returns:
        float: expected percentile rank (NaN when no genus has any occurrence)
    """
    # Drop genera without any occurrence *before* ranking. Filtering the results
    # with `> 0` afterwards would also discard genera whose expected rank is a
    # genuine 0.0 (observed only where the score is highest), which biases the
    # metric upwards.
    occurrences_per_genus = df_pred.groupby(by="genus")["occurence"].transform("sum")
    has_occurrence = occurrences_per_genus > 0
    if not has_occurrence.any():
        return float("nan")

    # No sorting is needed: the percentile rank does not depend on row order.
    expected_ranks = (
        df_pred[has_occurrence]
        .groupby(by="genus")
        .apply(f, include_groups=False)
    )

    return float(expected_ranks.mean())

## Building content-based filtering

Add an option to recommend n (5) more genera on top of the "normal" cutoff and threshold

In [75]:
def build_genus_info_from_genus_data(df):
    """Build a numeric genus feature table from the mass/diet table.

    Args:
        df (pd.DataFrame): table with at least the columns "Genus", "Order",
            "Family", "Massg" and "Diet".

    Returns:
        pd.DataFrame: one row per input row with the columns "genus", "Massg"
        and 0/1 integer indicator columns "Order_*", "Family_*" and "Diet_*".
        A missing category value yields all-zero indicators for that group.
    """
    # "DietSource" is intentionally not part of the selection.
    genus_info_cols = ["Genus", "Order", "Family", "Massg", "Diet"]
    dummy_cols = ["Order", "Family", "Diet"]

    # dtype=int emits 0/1 directly; the previous bool -> int `replace` step
    # triggered pandas' downcasting FutureWarning.
    df_genus_info = pd.get_dummies(df[genus_info_cols], columns=dummy_cols, dtype=int)

    return df_genus_info.rename(columns={"Genus": "genus"})

In [76]:
# Preview the one-hot encoded genus features built from the mass/diet table.
build_genus_info_from_genus_data(df_mass_diet)

  df_genus_info = df_genus_info.replace({False: 0, True: 1})


Unnamed: 0,genus,Massg,Order_Artiodactyla,Order_Carnivora,Order_Cetartiodactyla,Order_Chiroptera,Order_Dermoptera,Order_Eulipotyphla,Order_Hyracoidea,Order_Lagomorpha,...,Family_Talpidae,Family_Tapiridae,Family_Tragulidae,Family_Tupaiidae,Family_Ursidae,Family_Vespertilionidae,Family_Viverravidae,Family_Viverridae,Diet_Herbivore,Diet_Non-Herbivore
0,Abudhabia,1.343147e+02,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,Aceratherium,1.099006e+06,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,Acinonyx,4.670000e+04,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Aepyosciurus,2.860000e+02,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,Aeretes,7.324000e+02,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,Xenocyon,4.759825e+04,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
570,Yangia,1.343147e+02,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
571,Yanshuella,7.842381e+01,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
572,Zelceina,1.145527e+01,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
def build_site_genus_matrix(df):
    """Return the site x genus matrix indexed by SITE_NAME.

    Every column listed below is removed from ``df``; whatever remains
    (presumably the per-genus presence columns) is returned with
    'SITE_NAME' as the index. A KeyError is raised if a listed column or
    'SITE_NAME' is missing from ``df``.
    """
    non_genus_columns = [
        'LAT', 'LONG', 'ALTITUDE',
        'MAX_AGE', 'BFA_MAX', 'BFA_MAX_ABS',
        'MIN_AGE', 'BFA_MIN', 'BFA_MIN_ABS',
        'COUNTRY', 'age_range',
        'Total_Gen_Count', 'Large_GenCount', 'Small_GenCount',
        'smallperlarge', 'smallprop',
        'Herb_GenCount', 'Nonherb_GenCount',
        'DietRatio', 'HerbProp',
        'mid_age',
    ]

    return df.set_index('SITE_NAME').drop(columns=non_genus_columns)

In [6]:
def build_site_info(df):
    """Select the per-site feature columns, indexed by SITE_NAME.

    Returns a frame with the ten columns listed below (in that order) and
    'SITE_NAME' as the index. A KeyError is raised if any of them is missing.
    """
    feature_columns = [
        'LAT', 'LONG',
        'MAX_AGE', 'MIN_AGE', 'age_range',
        'Large_GenCount', 'Small_GenCount',
        'Herb_GenCount', 'Nonherb_GenCount',
        'mid_age',
    ]

    return df.set_index('SITE_NAME')[feature_columns]

In [7]:
def build_genus_info(df):
    """Describe each genus by the mean features of the sites where it is present.

    The site x genus matrix is reshaped to long format, joined with the site
    features from ``build_site_info`` and restricted to rows with presence == 1.
    The site features are then averaged per genus.

    Returns:
        pd.DataFrame: indexed by "genus", with the same columns as
        ``build_site_info(df)``. Genera that are present at no site do not appear.
    """
    presence_long = (
        build_site_genus_matrix(df)
        .stack()
        .reset_index()
        .rename(columns={"level_1": "genus", 0: "presence"})
    )

    with_site_features = presence_long.merge(build_site_info(df), on="SITE_NAME", how="left")
    present_only = with_site_features[with_site_features["presence"] == 1]

    # groupby(...).mean() already yields a frame indexed by "genus".
    return present_only.drop(["SITE_NAME", "presence"], axis=1).groupby('genus').mean()

In [78]:
# Building info about the genera for each site (in progress)
def build_site_info_from_genus_data(df):
    """Placeholder: not implemented yet; the body is empty and returns None."""
    pass

# Prototype: describe each site by the mean feature vector of the genera present there.
genus_info = build_genus_info_from_genus_data(df_mass_diet)

# Long format (SITE_NAME, genus), keeping only the pairs where the genus is present.
site_genus = build_site_genus_matrix(df)
site_genus = site_genus.stack().reset_index().rename(columns={"level_1": "genus", 0: "presence"})
site_genus = site_genus[site_genus["presence"] == 1].drop("presence", axis="columns")
# Left merge: genera without a row in genus_info get NaN features.
site_genus = site_genus.merge(genus_info, on="genus", how="left")
site_genus = site_genus.drop(["genus"], axis=1)
# Column-wise mean per site (this includes the raw "Massg" column).
site_genus = site_genus.groupby('SITE_NAME').mean().reset_index().set_index("SITE_NAME")
site_genus

  df_genus_info = df_genus_info.replace({False: 0, True: 1})


Unnamed: 0_level_0,Massg,Order_Artiodactyla,Order_Carnivora,Order_Cetartiodactyla,Order_Chiroptera,Order_Dermoptera,Order_Eulipotyphla,Order_Hyracoidea,Order_Lagomorpha,Order_Perissodactyla,...,Family_Talpidae,Family_Tapiridae,Family_Tragulidae,Family_Tupaiidae,Family_Ursidae,Family_Vespertilionidae,Family_Viverravidae,Family_Viverridae,Diet_Herbivore,Diet_Non-Herbivore
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,1.007188e+06,0.000000,0.000000,0.500000,0.0,0.0,0.000000,0.0,0.000000,0.500000,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,1.000000,0.000000
Abric Romani,4.223222e+05,0.000000,0.333333,0.416667,0.0,0.0,0.000000,0.0,0.000000,0.166667,...,0.0,0.000000,0.0,0.0,0.083333,0.0,0.0,0.000000,0.500000,0.416667
Acheng_Jiaojie,4.923990e+05,0.000000,0.285714,0.285714,0.0,0.0,0.000000,0.0,0.142857,0.142857,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.714286,0.285714
Adler cave,4.688946e+04,0.000000,0.200000,0.100000,0.0,0.0,0.200000,0.0,0.200000,0.100000,...,0.1,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.600000,0.400000
Adyrgan,9.638690e+05,0.090909,0.000000,0.090909,0.0,0.0,0.000000,0.0,0.000000,0.181818,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zuurland (-42 to -46 m),1.083352e+02,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.0,0.000000,0.000000,...,0.6,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,1.000000
Zverinogolovskoe,8.863084e+02,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.222222,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,1.000000,0.000000
kudaro 3 cave l.3,4.185366e+04,0.000000,0.687500,0.312500,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.062500,0.0,0.0,0.000000,0.312500,0.687500
lang trang,5.754996e+05,0.000000,0.375000,0.208333,0.0,0.0,0.000000,0.0,0.000000,0.083333,...,0.0,0.041667,0.0,0.0,0.125000,0.0,0.0,0.041667,0.541667,0.416667


In [8]:
def get_recommendations_for_site(genus_info, site_name, site_indices, genus_site_similarity_matrix, num_recommend = 10):
    """Return the genera with the highest similarity to one site.

    Args:
        genus_info: frame whose index holds the genus labels, in the same order
            as the rows of ``genus_site_similarity_matrix``.
        site_name: label of the site to recommend for.
        site_indices: mapping from site name to the site's column in the
            similarity matrix.
        genus_site_similarity_matrix: 2-D array indexed as [genus, site].
        num_recommend: number of genera to return.

    Returns:
        pd.DataFrame: columns "SITE_NAME", the genus index name, and
        "similarity", ordered from most to least similar.
    """
    site_column = genus_site_similarity_matrix[:, site_indices[site_name]]

    # Genus positions ordered by descending similarity (stable for ties).
    ranked_positions = sorted(
        range(len(site_column)),
        key=lambda pos: site_column[pos],
        reverse=True,
    )
    top_positions = ranked_positions[:num_recommend]
    top_similarities = [site_column[pos] for pos in top_positions]

    top_genera = genus_info.iloc[top_positions].index.to_frame(index=False)
    recommended_genus = top_genera.assign(similarity=top_similarities)
    recommended_genus.insert(0, "SITE_NAME", site_name)

    return recommended_genus

In [9]:
def normalize_columns_min_max(df):
    """Rescale each column to the range [0, 1] (min-max scaling).

    Args:
        df: numeric DataFrame (a Series also works).

    Returns:
        Object of the same shape with ``(x - min) / (max - min)`` per column.
        A constant column has a zero range; it is mapped to 0 instead of
        NaN (0/0), so the result can still be fed to distance computations.
    """
    col_min = df.min()
    col_range = df.max() - col_min

    # Guard against division by zero for constant columns.
    if isinstance(col_range, pd.Series):
        col_range = col_range.replace(0, 1)
    elif col_range == 0:
        col_range = 1

    return (df - col_min) / col_range

In [10]:
def normalize_columns_mean(df):
    """Standardise each column to zero mean and unit (sample) standard deviation.

    Args:
        df: numeric DataFrame (a Series also works).

    Returns:
        Object of the same shape with ``(x - mean) / std`` per column.
        A constant column has a standard deviation of zero; it is mapped to 0
        instead of NaN (0/0).
    """
    col_std = df.std()

    # Guard against division by zero for constant columns.
    if isinstance(col_std, pd.Series):
        col_std = col_std.replace(0, 1)
    elif col_std == 0:
        col_std = 1

    return (df - df.mean()) / col_std

In [11]:
def find_recommendations_for_all_sites(df, n_species_to_recommend, normalization=None):
    """Content-based recommendations: the most similar genera for every site.

    Genera and sites are described by the same feature columns
    (``build_genus_info`` / ``build_site_info``) and compared with cosine
    similarity.

    Args:
        df (pd.DataFrame): raw site table.
        n_species_to_recommend (int): number of genera returned per site.
        normalization: optional callable ``frame -> frame`` applied to the
            genus features and to the site features; ``None`` skips it.

    Returns:
        pd.DataFrame: one row per (site, recommended genus) with the columns
        "SITE_NAME", "genus" and "similarity".
    """
    genus_info = build_genus_info(df)
    site_info = build_site_info(df)

    if normalization is not None:
        # NOTE(review): the two frames are scaled independently, each with its
        # own column statistics -- confirm this is intended.
        genus_info = normalization(genus_info)
        site_info = normalization(site_info)

    # Map each site name to its *position* in site_info, i.e. its column in the
    # similarity matrix. Using df.index labels is only correct for a default
    # RangeIndex. Duplicate names are collapsed to their first occurrence so
    # every lookup yields a scalar.
    site_indices = pd.Series(np.arange(len(site_info)), index=site_info.index)
    site_indices = site_indices[~site_indices.index.duplicated(keep="first")]

    sim = cosine_similarity(genus_info, site_info)

    recommendations = [
        get_recommendations_for_site(
            genus_info=genus_info,
            site_name=site,
            site_indices=site_indices,
            genus_site_similarity_matrix=sim,
            num_recommend=n_species_to_recommend,
        )
        for site in site_indices.index
    ]

    return pd.concat(recommendations).reset_index(drop=True)

In [12]:
# Top-10 content-based recommendations per site, using min-max scaled features.
all_site_recommendations = find_recommendations_for_all_sites(df, 10, normalization=normalize_columns_min_max)
# Wide site x genus matrix of similarities; pairs that were not recommended become 0.
all_site_recommendations_matrix = pd.pivot(all_site_recommendations, index="SITE_NAME", columns="genus", values="similarity").fillna(0)
all_site_recommendations_matrix.head()

genus,Acinonyx,Aepyosciurus,Aeretes,Ailuropoda,Ailurus,Alactagulus,Alcelaphus,Alces,Algarolutra,Alilepus,...,Villanyia,Viverra,Viverravus,Viverricula,Vormela,Vulpes,Wushanomys,Xenocyon,Yangia,Zygolophodon
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abric Romani,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.926382,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Acheng_Jiaojie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adler cave,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.954767,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adyrgan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.985751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.970305,0.0


### Performance

In [44]:
# Long-format ground truth: one row per (SITE_NAME, genus) with its 0/1 presence.
# NOTE: df_site_genus is the *long* table here; a later cell rebinds the same
# name to the wide matrix.
site_genus_matrix = build_site_genus_matrix(df)
df_site_genus = site_genus_matrix.stack().reset_index().rename(columns={"level_1": "genus", 0: "presence"})

# Outer merge keeps every pair from either side; after fillna(0) a pair without a
# recommendation has similarity 0 and a pair missing from the ground truth has presence 0.
all_site_recommendations_and_real = all_site_recommendations.merge(
    df_site_genus,
    how="outer",
    on=["SITE_NAME", "genus"]
).fillna(0)

# Column names expected by calc_expected_percentile_rank ("occurence" spelling included).
all_site_recommendations_and_real = all_site_recommendations_and_real.rename(
    columns={"similarity": "pred", "presence": "occurence"}
)

all_site_recommendations_and_real.head()

Unnamed: 0,SITE_NAME,genus,pred,occurence
0,Aba Zawei,Acinonyx,0.0,0
1,Aba Zawei,Aepyosciurus,0.0,0
2,Aba Zawei,Aeretes,0.0,0
3,Aba Zawei,Ailuropoda,0.0,0
4,Aba Zawei,Ailurus,0.0,0


### Expected percentile rank

In [48]:
# Expected percentile rank of the content-based recommendations.
calc_expected_percentile_rank(all_site_recommendations_and_real)

0.30402448945255994

In [13]:
# Find missing columns from recommendations

# df_site_genus is rebound to the wide site x genus matrix here.
df_site_genus = build_site_genus_matrix(df)
# Genera that exist in the data but were never recommended for any site.
missing_columns = df_site_genus.columns.difference(all_site_recommendations_matrix.columns)

# Add them as all-zero columns so both matrices cover the same genera.
for col in missing_columns:
    all_site_recommendations_matrix[col] = 0

In [14]:
# Align the predictions with the ground truth on BOTH axes. The sklearn metrics
# used below compare the two frames position by position and ignore labels, so
# the rows must follow the same site order as df_site_genus, not just the columns.
column_order = df_site_genus.columns
all_site_recommendations_matrix = all_site_recommendations_matrix.loc[df_site_genus.index, column_order]
all_site_recommendations_matrix

genus,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,Homo,...,Euarctos,Paracervulus,Eostyloceros,Cervocerus,Antispiroides,Sinoryx,Prospalax,Pliopetaurista,Predicrostonyx,Boocercus
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,0.0,0.944944,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abric Romani,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Acheng_Jiaojie,0.0,0.975505,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adler cave,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adyrgan,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zuurland (-42 to -46 m),0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zverinogolovskoe,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
kudaro 3 cave l.3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lang trang,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Binarise the similarities at 0.5 so they can be scored against the 0/1 ground truth.
# Both sides of the threshold are set explicitly: a leftover fractional score would
# make the prediction matrix non-binary.
all_site_recommendations_matrix[all_site_recommendations_matrix > 0.5] = 1
all_site_recommendations_matrix[all_site_recommendations_matrix <= 0.5] = 0


In [16]:
# Score the binarised content-based predictions against the wide ground-truth matrix.
# df_site_genus must be the wide site x genus matrix here, with rows and columns in
# the same order as the prediction matrix.
rms = root_mean_squared_error(df_site_genus, all_site_recommendations_matrix)
mae = mean_absolute_error(df_site_genus, all_site_recommendations_matrix)
# Macro F1: one F1 per genus column, averaged; zero_division=1.0 sets the value
# used when a division by zero occurs.
f1 = f1_score(df_site_genus, all_site_recommendations_matrix, zero_division=1.0, average='macro')
print(rms, mae, f1)

0.1864828321767479 0.0479083984519437 0.1386704617611954


## Building collaborative filtering

In [17]:
def fit_knn_collaborative_filtering(df, site_based=True, similarity="cosine", k=5, min_k=1):
    """Fit a surprise ``KNNBasic`` model on the full site x genus presence data.

    Args:
        df (pd.DataFrame): raw site table.
        site_based (bool): passed as ``user_based``; True compares sites,
            False compares genera.
        similarity (str): name of the similarity measure given to surprise.
        k (int): ``k`` argument of ``KNNBasic``.
        min_k (int): ``min_k`` argument of ``KNNBasic``.

    Returns:
        tuple: the fitted model and the trainset it was fitted on.
    """
    # Long format; surprise expects the column order user, item, rating.
    presence_long = (
        build_site_genus_matrix(df)
        .stack()
        .reset_index()
        .rename(columns={"level_1": "genus", 0: "presence"})
    )

    ratings = Dataset.load_from_df(presence_long, Reader(rating_scale=(0, 1)))
    trainset = ratings.build_full_trainset()

    knn = KNNBasic(
        k=k,
        min_k=min_k,
        sim_options={'name': similarity, 'user_based': site_based},
    )
    knn.fit(trainset)

    return knn, trainset

In [18]:
# Site-based KNN with cosine similarity; k=5 and min_k=1 are handed on to KNNBasic.
knn, trainset = fit_knn_collaborative_filtering(
    df, 
    site_based=True, 
    similarity="cosine",
    k=5,
    min_k=1
)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [19]:
def predict_collaborative_filtering(algotihm, trainset):
    """Score every (site, genus) pair of the trainset with a fitted model.

    The test set is built from the trainset itself, so the scores are
    in-sample predictions.

    Args:
        algotihm: fitted model exposing ``test(testset)``.
        trainset: object exposing ``build_testset()``.

    Returns:
        pd.DataFrame: columns 'SITE_NAME', 'PREDICTED_GENUS' and 'SCORE', one
        row per prediction (taken from ``uid``, ``iid`` and ``est``).
    """
    scored_pairs = algotihm.test(trainset.build_testset())

    rows = []
    for scored in scored_pairs:
        rows.append((scored.uid, scored.iid, scored.est))

    return pd.DataFrame(rows, columns=['SITE_NAME', 'PREDICTED_GENUS', 'SCORE'])

In [20]:
# In-sample KNN scores for every (site, genus) pair of the trainset.
predictions = predict_collaborative_filtering(knn, trainset)

In [21]:
# Preview the first ten scored pairs.
display(predictions.head(10))

Unnamed: 0,SITE_NAME,PREDICTED_GENUS,SCORE
0,Aba Zawei,Equus,1.0
1,Aba Zawei,Coelodonta,1.0
2,Aba Zawei,Bos,0.647551
3,Aba Zawei,Gazella,1.0
4,Aba Zawei,Ursus,0.0
5,Aba Zawei,Vulpes,0.0
6,Aba Zawei,Cervus,0.353843
7,Aba Zawei,Canis,0.0
8,Aba Zawei,Sus,0.0
9,Aba Zawei,Homo,0.0


In [22]:
# Wide site x genus matrix of KNN scores. This must run while `predictions` still
# has the PREDICTED_GENUS column (a later cell renames it to "genus").
items_scores_matrix_knn = pd.pivot(predictions, index="SITE_NAME", columns="PREDICTED_GENUS", values="SCORE")
items_scores_matrix_knn.head()

PREDICTED_GENUS,Acinonyx,Aepyosciurus,Aeretes,Ailuropoda,Ailurus,Alactagulus,Alcelaphus,Alces,Algarolutra,Alilepus,...,Villanyia,Viverra,Viverravus,Viverricula,Vormela,Vulpes,Wushanomys,Xenocyon,Yangia,Zygolophodon
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abric Romani,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.365156,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.453297,0.0,0.0,0.0,0.0
Acheng_Jiaojie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.317348,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.183209,0.0,0.0,0.0,0.0
Adler cave,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.832358,0.0,0.157586,0.0,0.0
Adyrgan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.424503,0.0,0.0,0.0,0.0,0.147793,0.0,0.0,0.0,0.0


### Performance

In [50]:
# Rename so the predictions can be merged with the ground truth on "genus".
# Re-running this cell is harmless: the rename is a no-op once applied.
predictions = predictions.rename(
    columns={"PREDICTED_GENUS": "genus"}
)

# Long-format ground truth: one row per (SITE_NAME, genus) with its 0/1 presence.
# NOTE: this rebinds df_site_genus to the *long* table; any later cell that needs
# the wide site x genus matrix has to rebuild it with build_site_genus_matrix(df).
site_genus_matrix = build_site_genus_matrix(df)
df_site_genus = site_genus_matrix.stack().reset_index().rename(columns={"level_1": "genus", 0: "presence"})

# Outer merge keeps every pair from either side; missing values become 0.
all_site_recommendations_and_real = predictions.merge(
    df_site_genus,
    how="outer",
    on=["SITE_NAME", "genus"]
).fillna(0)

# Column names expected by calc_expected_percentile_rank ("occurence" spelling included).
all_site_recommendations_and_real = all_site_recommendations_and_real.rename(
    columns={"SCORE": "pred", "presence": "occurence"}
)

all_site_recommendations_and_real.head()

Unnamed: 0,SITE_NAME,genus,pred,occurence
0,Aba Zawei,Acinonyx,0.0,0
1,Aba Zawei,Aepyosciurus,0.0,0
2,Aba Zawei,Aeretes,0.0,0
3,Aba Zawei,Ailuropoda,0.0,0
4,Aba Zawei,Ailurus,0.0,0


In [51]:
# Expected percentile rank of the KNN scores (computed on in-sample predictions).
calc_expected_percentile_rank(all_site_recommendations_and_real)

0.0219643529308981

In [23]:
# Rebuild the wide site x genus ground truth first: the performance cell above
# rebinds df_site_genus to the long (SITE_NAME, genus, presence) table, so on a
# top-to-bottom run its columns are not genus names and the selection below
# would raise a KeyError.
df_site_genus = build_site_genus_matrix(df)

# Align the scores with the ground truth on BOTH axes; the sklearn metrics used
# below compare the two frames position by position and ignore labels.
column_order = df_site_genus.columns
items_scores_matrix_knn = items_scores_matrix_knn.loc[df_site_genus.index, column_order]
items_scores_matrix_knn

PREDICTED_GENUS,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,Homo,...,Euarctos,Paracervulus,Eostyloceros,Cervocerus,Antispiroides,Sinoryx,Prospalax,Pliopetaurista,Predicrostonyx,Boocercus
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,1.000000,1.000000,0.647551,1.000000,0.000000,0.000000,0.353843,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abric Romani,1.000000,0.000000,1.000000,0.000000,1.000000,0.453297,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Acheng_Jiaojie,0.176293,0.000000,0.000000,0.000000,0.493641,0.183209,1.000000,0.359502,0.500557,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adler cave,1.000000,0.161531,0.000000,0.000000,0.319117,0.832358,0.319117,0.804836,0.161531,0.157586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adyrgan,1.000000,0.127992,0.127992,0.872008,0.000000,0.147793,0.000000,0.147793,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zuurland (-42 to -46 m),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zverinogolovskoe,0.000000,0.167554,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
kudaro 3 cave l.3,0.544506,0.182855,0.172964,0.000000,1.000000,1.000000,1.000000,0.818662,0.172964,0.188687,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lang trang,0.000000,0.000000,0.576834,0.000000,0.613459,0.000000,1.000000,0.000000,1.000000,0.372874,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Binarise the KNN scores at 0.5: strictly greater becomes 1, everything else 0.
# The order of the two assignments does not matter because the 1s written by the
# first line are not <= 0.5.
items_scores_matrix_knn[items_scores_matrix_knn > 0.5] = 1
items_scores_matrix_knn[items_scores_matrix_knn <= 0.5] = 0

In [25]:
# Inspect the binarised KNN prediction matrix.
items_scores_matrix_knn

PREDICTED_GENUS,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,Homo,...,Euarctos,Paracervulus,Eostyloceros,Cervocerus,Antispiroides,Sinoryx,Prospalax,Pliopetaurista,Predicrostonyx,Boocercus
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abric Romani,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Acheng_Jiaojie,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adler cave,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adyrgan,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zuurland (-42 to -46 m),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zverinogolovskoe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
kudaro 3 cave l.3,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lang trang,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Score the binarised KNN predictions against the wide ground-truth matrix.
# df_site_genus must be the wide site x genus matrix here, with rows and columns in
# the same order as items_scores_matrix_knn.
rms = root_mean_squared_error(df_site_genus, items_scores_matrix_knn)
mae = mean_absolute_error(df_site_genus, items_scores_matrix_knn)
# Macro F1: one F1 per genus column, averaged; zero_division=1.0 sets the value
# used when a division by zero occurs.
f1 = f1_score(df_site_genus, items_scores_matrix_knn, zero_division=1.0, average='macro')
print(rms, mae, f1)

0.12981215592416928 0.030708457613331033 0.21787198677551572


## Hybrid algorithm content-knn