In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

from models import models, utils

In [None]:
# Load the input table. Despite the .csv extension the file is tab-separated.
path = "../data/data_occ.csv"
dataframe = pd.read_csv(path, sep="\t")

# Preview the first rows.
dataframe.head()

## 1. Test with Matrix Factorization

In [None]:
# Build the recommendation list with the matrix-factorization model
# (200 training epochs, output_prob disabled) and preview it with .head().
out = models.get_recommend_list_mf(dataframe, output_prob=False, num_epochs=200)
out.head()

In [None]:
# Compute the metrics for the matrix-factorization model on the same table.
# NOTE(review): num_epochs is not passed here, whereas the recommendation
# call uses num_epochs=200 — confirm the default matches what is intended.
models.get_metrics_mf(dataframe, output_prob=False)

## 2. Test with KNN

In [None]:
# Build the recommendation list with the KNN model (output_prob disabled).
out = models.get_recommend_list_knn(dataframe, output_prob=False)

# Display the full result (the matrix-factorization cell shows only .head()).
out

In [None]:
# Compute the metrics for the KNN model on the same table.
models.get_metrics_knn(dataframe, output_prob=False)

## 3. Test with Content-Based Filtering

In [None]:
# Genus data is necessary for content based filtering
genus_data_path = "../data/FossilGenera_MammalMassDiet_Jan24.csv"
dental_data_path = "../data/DentalTraits_Genus_PPPA_ds.csv"

# Both files are comma-separated, which is the default separator of read_csv.
df_genus_data = pd.read_csv(genus_data_path)
df_dental_data = pd.read_csv(dental_data_path)

# Genus data must be preprocessed beforehand.
# Keep the "Genus" key plus the columns taken from the dental-traits file.
dental_cols = ["Genus", "HY", "LOP", "AL", "OL", "SF", "BUN", "OT", "Excl_AL"]

df_dental_data = df_dental_data.loc[:, dental_cols]

# With genus info, give the columns you want to use and convert the
# categorical ones using one-hot encoding.
genus_info_cols = [
    "Genus",
    "Order",
    "Family",
    "Massg",
    "Diet",
    "DietSource",
]

df_genus_data = df_genus_data[genus_info_cols]

# Categorical columns that are expanded into one indicator column per category.
dummy_cols = [
    "Order",
    "Family",
    "Diet",
    "DietSource",
]

# The genus column must be the first one in genus data.
# Ask get_dummies for 0/1 integers directly (dtype=int) instead of creating
# boolean indicators and converting them afterwards with
# DataFrame.replace({False: 0, True: 1}): that replace ran over every column
# of the frame and depended on replace's implicit downcasting, which pandas
# 2.2 deprecates with a FutureWarning.
df_genus_data = pd.get_dummies(df_genus_data, columns=dummy_cols, dtype=int)

# Merging dental data to other genus data. A left join keeps every genus row
# even when no dental record matches; `how` is passed by keyword for clarity.
df_genus_data = df_genus_data.merge(df_dental_data, how="left", on="Genus")

# The site information must be included into matrix for the algorithm.
# Other candidate site columns, currently not selected: LAT, LONG, age_range,
# Large_GenCount, Small_GenCount, Herb_GenCount, Nonherb_GenCount.
site_columns = ["NAME", "MAX_AGE", "MIN_AGE", "MID_AGE"]

# Every column except the trailing 10 is taken as a genus column
# (assumes the last 10 columns of `dataframe` hold site metadata — TODO confirm).
genus_columns = dataframe.columns[:-10].tolist()
dataframe_cleaned = dataframe[genus_columns + site_columns]

# Site name must be the first column
desired_column_order = ["NAME"]
desired_column_order += [label for label in dataframe_cleaned.columns if label != "NAME"]
dataframe_cleaned = dataframe_cleaned[desired_column_order]
dataframe_cleaned.head()

In [None]:
# Build the recommendation list with the content-based model from the cleaned
# site/genus table and the genus feature table.
# n_site_info_cols=3 presumably counts the site columns other than NAME
# (MAX_AGE, MIN_AGE, MID_AGE) — confirm against the models module.
out = models.get_recommend_list_content_base(dataframe_cleaned, df_genus_data, n_site_info_cols=3)

# Display the full result.
out

In [None]:
# Compute the metrics for the content-based model.
# NOTE(review): this passes the raw `dataframe`, while the recommendation call
# uses `dataframe_cleaned` together with `df_genus_data` — confirm that
# get_metrics_content_base really expects the unprocessed table.
models.get_metrics_content_base(dataframe)

## 4. Test creating train/test data

In [None]:
# Split the table into train and test parts with the project helper
# (is_packed and is_encoded both disabled).
df_train, df_test = utils.split_traintest(dataframe, is_packed=False, is_encoded=False)

In [None]:
# Display the training part returned by utils.split_traintest.
df_train

In [None]:
# Display the test part returned by utils.split_traintest.
df_test