In [1]:
%cd ../

/Users/macos/Uni/1st_year/period_3/DSProj/code/models


In [7]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

import evaluation
import utils

In [3]:
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'font.size': 8})

In [8]:
PATH_DIR_DATA_PROCESS = Path("data_processed")

## 0. Load data and encoding

In [4]:
path = "/Users/macos/Uni/1st_year/period_3/DSProj/data/AllSites_SiteOccurrences_AllGenera_26.1.24.csv"

df = pd.read_csv(path)

df.head()

Unnamed: 0,SITE_NAME,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,...,Total_Gen_Count,Large_GenCount,Small_GenCount,smallperlarge,smallprop,Herb_GenCount,Nonherb_GenCount,DietRatio,HerbProp,mid_age
0,Aba Zawei,1,1,1,1,0,0,0,0,0,...,4,4,0,0.0,0.0,4,0,,1.0,0.0265
1,Abric Romani,1,0,1,0,1,1,1,1,1,...,12,12,0,0.0,0.0,6,5,1.2,0.5,0.055
2,Acheng_Jiaojie,0,0,0,0,0,0,1,0,0,...,7,5,2,0.4,0.285714,5,2,2.5,0.714286,0.21
3,Adler cave,1,0,0,0,0,1,0,1,0,...,10,5,5,1.0,0.5,6,4,1.5,0.6,0.0275
4,Adyrgan,1,0,0,1,0,0,0,0,0,...,11,5,6,1.2,0.545455,11,0,,1.0,2.2


In [5]:
path_embd_site = "data_processed/embedding_mf/embd_site_mf_regression.npy"
path_embd_species = "data_processed/embedding_mf/embd_species_mf_regression.npy"

emb_site = np.load(path_embd_site)
emb_species = np.load(path_embd_species)

In [9]:
path_dir_encode = PATH_DIR_DATA_PROCESS / "encoder"

path_enc_genera = path_dir_encode / "ordinal_enc_genera.json"
path_enc_site = path_dir_encode / "ordinal_enc_site.json"

enc_genera = utils.CategoryDict.from_file(path_enc_genera)
enc_site = utils.CategoryDict.from_file(path_enc_site)

In [14]:
dict_species_embd = {enc_genera.ids2names(idx)[0]: emb for idx, emb in enumerate(emb_species)}

dict_species_embd['Equus']

array([ 0.48133853, -0.885859  ,  0.5315192 , -1.2709452 ,  1.2409518 ,
       -0.2803186 , -0.48638493,  0.37118807, -1.0038189 , -0.19006532,
       -0.7072423 , -0.49020004,  1.2547532 ,  0.30509952,  0.96412003,
        0.08104583, -0.77335626,  1.1636934 , -0.2723879 , -0.7135465 ,
        1.4242524 , -1.9960505 , -0.8124531 ,  1.0922112 ,  0.52428555,
        1.040531  , -0.39423501,  0.00695098,  0.6585643 , -0.15677549,
       -1.7565837 ,  0.4915725 ,  0.756618  , -0.4780721 , -0.6011286 ,
       -1.3769549 , -0.14506647,  0.60927707,  0.31258953, -0.34109437,
        0.2901921 ,  1.6217002 ,  0.26392817,  1.0714238 ,  0.42034328,
        0.94025475,  0.04049683,  0.30804008, -1.7388293 , -0.9920322 ,
        1.6728448 ,  1.1007311 ,  0.28074533,  0.8518006 , -0.93585443,
       -0.9983699 , -0.25397107,  0.23555003,  2.185074  ,  2.1986005 ,
        0.08198294,  0.22334267,  0.27503908, -1.5106016 ], dtype=float32)

In [16]:
data_val = np.load(PATH_DIR_DATA_PROCESS / "trainval/data_val.npy", allow_pickle=True)
data_val[0]

{'occurence': array([[0., 0., 0., 0.],
        [0., 0., 0., 0.]], dtype=float32),
 'sites': [198, 199],
 'genera': [116, 117, 118, 119]}

In [17]:
list_val = []

for x in data_val:
    for i_site, site in enumerate(x['sites']):
        for i_gen, genera in enumerate(x['genera']):
            if x['occurence'][i_site, i_gen] == 1:
                list_val.append({'site': site, 'genera': genera})

In [18]:
df_val = pd.DataFrame.from_records(list_val)

df_val.head()

Unnamed: 0,site,genera
0,707,96
1,200,148
2,200,150
3,201,148
4,201,150


In [19]:
df_val.loc[:, 'genera_name'] = enc_genera.ids2names(df_val['genera'])

df_val.head()

Unnamed: 0,site,genera,genera_name
0,707,96,Pseudaxis
1,200,148,Bubalus
2,200,150,Rhizomys
3,201,148,Bubalus
4,201,150,Rhizomys


## 2. Start validating

In [20]:
emb_species_normed = emb_species / np.clip(np.linalg.norm(emb_species, axis=1)[:, None], a_max=10, a_min=1e-6)

In [21]:
N = 15
N_SITES = 10

In [25]:
results = []

for species in df_val['genera_name'].unique():
    emb = dict_species_embd[species]

    # Calculate cosine sim
    emb = emb / np.clip(np.linalg.norm(emb), a_min=1e-6, a_max=10)
    sim = emb_species_normed @ emb[:, None]

    # Get top N similar species
    idx_top = np.argpartition(sim.squeeze(), -N)[-N:]
    species_sim = enc_genera.ids2names(idx_top)

    # Sample several sites to calculate co-occurence
    sites_available = df[df[species] == 1][['SITE_NAME', species]]['SITE_NAME']
    if len(sites_available) <= N_SITES:
        sites_sample = sites_available
    else:
        sites_sample = sites_available.sample(N_SITES, replace=False)
    results.extend(df[df['SITE_NAME'].isin(sites_sample)][species_sim].mean(axis=1).tolist())


In [27]:
print(f"MSE : {evaluation.calc_mse(results):.6f}")
print(f"RMSE: {evaluation.calc_rmse(results):.6f}")
print(f"TPR : {evaluation.calc_tpr(results):.6f}")

MSE : 0.787281
RMSE: 0.887289
TPR : 0.000000
