In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to imported modules (e.g. `models`) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

from models import models, utils

In [None]:
# Load the input table; the first CSV column becomes the row index.
path = "../data/AllEurasia_NOWinput_Feb24_U3O4_siteinfo.csv"
dataframe = pd.read_csv(path, index_col=0)

# Preview the first rows as the cell output.
dataframe.head()

In [None]:
# NOTE(review): disabled alternative indexing step — presumably superseded by
# `index_col=0` in the `read_csv` call; confirm and delete if no longer needed.
# dataframe = dataframe.drop(columns=['LIDNUM']).set_index('NAME')

## 1. Test with Matrix Factorization

In [None]:
# Shared settings reused by the model cells in this notebook.
output_prob = True   # forwarded as `output_prob=` to the recommend/metrics calls
include_tnr = True   # together with `output_prob`, gates the True Negative Rate cells
dim_hid = 10         # forwarded as `dim_hid=` to the matrix-factorization calls

In [None]:
# Build the matrix-factorization recommendation list from the full table
# and preview its first rows.
out = models.get_recommend_list_mf(
    dataframe,
    output_prob=output_prob,
    num_epochs=120,
    lr=4e-3,
    dim_hid=dim_hid,
)
out.head()

#### NOTE

If `output_prob` is False, the `True Negative Rate` is not calculated, even if the flag `include_tnr` is set to True.

In [None]:
# Matrix-factorization metrics on the full table, using the shared settings.
models.get_metrics_mf(
    dataframe,
    output_prob=output_prob,
    dim_hid=dim_hid,
)

### Calculate True Negative Rate (TNR)
#### WARNING

This is only used if both flags `output_prob` and `include_tnr` are True.

#### NOTE:
Note that, when calculating TNR, we are only concerned with the `true_negative_rate` in the output of the cell below. The model was not trained on the dataframe `df_tnr`, so evaluating `expected_percentile_rank` and `true_positive_rate` on this dataframe is not meaningful.

To calculate TNR, we need a separate dataframe.

In [None]:
# True Negative Rate for matrix factorization; runs only when both flags are True.
if output_prob is True and include_tnr is True:
    path_tnr = "../data/AllSites_truenegatives_Feb24.csv"

    df_tnr_raw = pd.read_csv(path_tnr)

    # The unnamed first CSV column holds the location names and becomes the index.
    # Flip every flag (1 -> 0, 0 -> 1) so that non-occurrence is encoded as 0,
    # matching the code in function calc_tnr(). Vectorized arithmetic is used
    # instead of a per-element `.map(lambda x: 1 - x)`: same values, no Python
    # call per cell, and no dependency on `DataFrame.map` (pandas >= 2.1).
    df_tnr = 1 - (
        df_tnr_raw
        .rename(columns={'Unnamed: 0': 'loc_name'})
        .set_index('loc_name')
    )

    df_tnr_mf = utils.create_test_tnr(df_tnr)

    # NOTE(review): this call is nested inside `if`, so its return value is not
    # auto-displayed by the notebook; if the metrics are returned rather than
    # printed, wrap the call in `display(...)` — confirm against `models`.
    models.get_metrics_mf(df_tnr_mf, output_prob=output_prob, dim_hid=dim_hid, include_tnr=include_tnr)

## 2. Test with KNN

In [None]:
# Build the KNN recommendation list from the full table and preview it.
out = models.get_recommend_list_knn(
    dataframe,
    output_prob=output_prob,
)
out.head()

In [None]:
# KNN metrics on the full table, using the shared `output_prob` setting.
models.get_metrics_knn(
    dataframe,
    output_prob=output_prob,
)

### Calculate True Negative Rate (TNR)
#### WARNING:
This is only used if both flags `output_prob` and `include_tnr` are True.

#### NOTE:
Note that, when calculating TNR, we are only concerned with the `true_negative_rate` in the output of the cell below. The model was not trained on the dataframe `df_tnr`, so evaluating `expected_percentile_rank` and `true_positive_rate` on this dataframe is not meaningful.

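To calculate TNR with KNN, the input dataframe `df_tnr` is passed directly to `models.get_metrics_knn`, without the `utils.create_test_tnr` step used for Matrix Factorization.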

In [None]:
# True Negative Rate for KNN; runs only when both flags are True.
if output_prob is True and include_tnr is True:
    path_tnr = "../data/AllSites_truenegatives_Feb24.csv"

    df_tnr_raw = pd.read_csv(path_tnr)

    # The unnamed first CSV column holds the location names and becomes the index.
    # Flip every flag (1 -> 0, 0 -> 1) so that non-occurrence is encoded as 0,
    # matching the code in function calc_tnr(). Vectorized arithmetic is used
    # instead of a per-element `.map(lambda x: 1 - x)`: same values, no Python
    # call per cell, and no dependency on `DataFrame.map` (pandas >= 2.1).
    df_tnr = 1 - (
        df_tnr_raw
        .rename(columns={'Unnamed: 0': 'loc_name'})
        .set_index('loc_name')
    )

    # NOTE(review): this call is nested inside `if`, so its return value is not
    # auto-displayed by the notebook; if the metrics are returned rather than
    # printed, wrap the call in `display(...)` — confirm against `models`.
    models.get_metrics_knn(df_tnr, output_prob=output_prob, include_tnr=include_tnr)

## 3. Test with Content-Based Filtering

In [None]:
# Genus data is necessary for content based filtering: one table of genus
# attributes and one table of dental traits, both read from CSV.
genus_data_path = "../data/FossilGenera_MammalMassDiet_Jan24.csv"
dental_data_path = "../data/DentalTraits_Genus_PPPA_ds.csv"

# "," is already the default separator of `read_csv`, so it is left implicit.
df_genus_data = pd.read_csv(genus_data_path)
df_dental_data = pd.read_csv(dental_data_path)

# Genus data must be preprocessed beforehand.
# Keep only the "Genus" key and the dental-trait columns listed here.
dental_cols = ["Genus", "HY", "LOP", "AL", "OL", "SF", "BUN", "OT", "Excl_AL"]

df_dental_data = df_dental_data.loc[:, dental_cols]

# With genus info, give the columns you want to use; the categorical ones are
# converted with one-hot encoding below.
genus_info_cols = [
    "Genus",
    "Order",
    "Family",
    "Massg",
    "Diet",
    "DietSource",
]

df_genus_data = df_genus_data[genus_info_cols]

# Categorical columns to one-hot encode.
dummy_cols = [
    "Order",
    "Family",
    "Diet",
    "DietSource",
]

# The genus column must be the first one in genus data (get_dummies keeps the
# non-encoded columns in front and appends the indicator columns after them).
# `dtype=int` makes the indicator columns 0/1 integers directly, replacing the
# former frame-wide `replace({False: 0, True: 1})`, which relied on the
# deprecated downcasting behaviour of `replace`.
df_genus_data = pd.get_dummies(df_genus_data, columns=dummy_cols, dtype=int)

# Merging dental data to other genus data; a left join keeps every genus row
# even when it has no dental record.
df_genus_data = df_genus_data.merge(df_dental_data, how="left", on="Genus")

# The site information must be included into matrix for the algorithm.
# Candidate site fields left disabled in the original list: LAT, LONG,
# age_range, Large_GenCount, Small_GenCount, Herb_GenCount, Nonherb_GenCount.
site_columns = ["NAME", "MAX_AGE", "MIN_AGE", "MID_AGE"]

# Count of trailing columns of `dataframe` that are treated as site information.
NUM_OF_SITE_INFO_COLS = 8

# Turn the index back into a regular column, then separate the leading columns
# (everything except the trailing site-information block) from the site columns.
dataframe_cleaned = dataframe.reset_index()
genus_columns = dataframe_cleaned.columns[:-NUM_OF_SITE_INFO_COLS].tolist()
df_occurences = dataframe_cleaned[genus_columns]
df_site = dataframe_cleaned[site_columns]

In [None]:
# Content-based recommendation list built from the occurrence table, the site
# table and the prepared genus features.
out = models.get_recommend_list_content_base(
    df_occurences,
    df_site,
    df_genus_data,
)

out

In [None]:
# Metrics are computed on the dataframe without the site information:
# the trailing NUM_OF_SITE_INFO_COLS columns are sliced off.
models.get_metrics_content_base(
    dataframe.iloc[:, :-NUM_OF_SITE_INFO_COLS]
)

## 4. Test create train/test data

In [None]:
# Split the full table into train and test parts.
df_train, df_test = utils.split_traintest(
    dataframe,
    is_packed=False,
    is_encoded=False,
)

In [None]:
# Display the training part returned by `utils.split_traintest`.
df_train

In [None]:
# Display the test part returned by `utils.split_traintest`.
df_test