In [1]:
# autoreload mode 2: re-import all modules before each cell executes, so edits
# to local code (e.g. `utils`) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.contrib.itertools import product

import utils

In [3]:
# Pack geometry: every record built below covers a block of this many sites
# (rows) by this many genera (columns) of the occurrence table.
NUM_GENERA_EACH_PACK: int = 4
NUM_SITES_EACH_PACK: int = 2

# Root folder under which the encoders and the train/val splits are written.
PATH_DIR_DATA_PROCESS: Path = Path("data_processed")

# 1. Load data

In [4]:
# Site-occurrence table for all genera, located relative to this notebook.
path = "../../data/AllSites_SiteOccurrences_AllGenera_26.1.24.csv"
df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,SITE_NAME,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,...,Total_Gen_Count,Large_GenCount,Small_GenCount,smallperlarge,smallprop,Herb_GenCount,Nonherb_GenCount,DietRatio,HerbProp,mid_age
0,Aba Zawei,1,1,1,1,0,0,0,0,0,...,4,4,0,0.0,0.0,4,0,,1.0,0.0265
1,Abric Romani,1,0,1,0,1,1,1,1,1,...,12,12,0,0.0,0.0,6,5,1.2,0.5,0.055
2,Acheng_Jiaojie,0,0,0,0,0,0,1,0,0,...,7,5,2,0.4,0.285714,5,2,2.5,0.714286,0.21
3,Adler cave,1,0,0,0,0,1,0,1,0,...,10,5,5,1.0,0.5,6,4,1.5,0.6,0.0275
4,Adyrgan,1,0,0,1,0,0,0,0,0,...,11,5,6,1.2,0.545455,11,0,,1.0,2.2


# 2. Preprocess

## 2.1. Remove redundant columns

In [5]:
# Every column listed here is discarded; whatever remains after the drop is
# used downstream as the set of genus columns.
cols_redundant = [
    'LAT', 'LONG', 'ALTITUDE',
    'MAX_AGE', 'BFA_MAX', 'BFA_MAX_ABS',
    'MIN_AGE', 'BFA_MIN', 'BFA_MIN_ABS',
    'COUNTRY', 'age_range',
    'Total_Gen_Count', 'Large_GenCount', 'Small_GenCount',
    'smallperlarge', 'smallprop',
    'Herb_GenCount', 'Nonherb_GenCount', 'DietRatio', 'HerbProp',
    'mid_age',
]

# Remove them, then key the table by site name.
df = df.drop(columns=cols_redundant)
df = df.set_index('SITE_NAME')

df.head()

Unnamed: 0_level_0,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,Homo,...,Euarctos,Paracervulus,Eostyloceros,Cervocerus,Antispiroides,Sinoryx,Prospalax,Pliopetaurista,Predicrostonyx,Boocercus
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Abric Romani,1,0,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
Acheng_Jiaojie,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Adler cave,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Adyrgan,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2.2. Ordinal-encode sites and genera

In [6]:
# Row labels (sites) and column labels (genera) of the cleaned table.
list_sites, list_genera = df.index, df.columns

In [7]:
# Build one encoder per axis from the label lists. They are used further down
# through names2ids() to translate genus / site names into encoded ids.
# NOTE(review): the exact id assignment is defined by utils.CategoryDict — confirm there.
enc_genera = utils.CategoryDict.from_list(list_genera)
enc_site = utils.CategoryDict.from_list(list_sites)

# 3. Create train/val data

In [8]:
def iterate_pack(values: list, n: int):
    """Yield consecutive, non-overlapping slices of ``values`` holding ``n`` items each.

    Being a generator, the length check only runs once iteration starts.

    Args:
        values: Sliceable sequence to cut into packs.
        n: Number of items per pack.

    Yields:
        ``values[k:k + n]`` for k = 0, n, 2n, ...

    Raises:
        AssertionError: If ``len(values)`` is not a multiple of ``n``.
    """
    size = len(values)
    assert size % n == 0

    yield from (values[lo:lo + n] for lo in range(0, size, n))


# Materialise both pack sequences up front: tqdm's `product` can only work out
# the progress-bar total when every iterable supports len(), which generators
# do not (the bar would otherwise be an open-ended "?it/s" counter).
site_packs = list(iterate_pack(list_sites, NUM_SITES_EACH_PACK))
genus_packs = list(iterate_pack(list_genera, NUM_GENERA_EACH_PACK))

# One record per (site pack, genus pack) pair: the occurrence sub-matrix as
# float32 together with the encoded ids of its rows (sites) and columns (genera).
# The 'occurence' key spelling is deliberately left untouched so that anything
# reading these records by key keeps working.
data = []
for sites, genera in product(site_packs, genus_packs):
    occurence = df.loc[sites, genera].to_numpy().astype(np.float32)
    sites_encoded = enc_site.names2ids(sites)
    genera_encoded = enc_genera.names2ids(genera)

    data.append({
        'occurence': occurence,
        'sites': sites_encoded,
        'genera': genera_encoded
    })


0it [00:00, ?it/s]

In [9]:
# Fixed seed so the train/validation partition is the same on every fresh run.
SEED_SPLIT = 42
# Number of shuffled records assigned to training; the remainder is validation.
NUM_TRAIN_RECORDS = 38000

# A dedicated generator makes the shuffle reproducible without touching the
# global `random` state. `data` is still shuffled in place.
random.Random(SEED_SPLIT).shuffle(data)

data_train, data_val = data[:NUM_TRAIN_RECORDS], data[NUM_TRAIN_RECORDS:]

# 4. Save processed data

## 4.1. Save encoding

In [12]:
# Folder for the saved encoders; created together with any missing parents,
# and left alone if it already exists.
path_dir_encode = PATH_DIR_DATA_PROCESS.joinpath("encoder")
path_dir_encode.mkdir(parents=True, exist_ok=True)

In [13]:
# Write each encoder to its own JSON file inside the encoder folder.
# NOTE(review): presumably save_dict dumps the name-to-id mapping — confirm in utils.CategoryDict.
enc_genera.save_dict(path_dir_encode / "ordinal_enc_genera.json")
enc_site.save_dict(path_dir_encode / "ordinal_enc_site.json")

## 4.2. Save training/validation data

In [14]:
# Folder for the train/val splits; created together with any missing parents,
# and left alone if it already exists.
path_dir_data = PATH_DIR_DATA_PROCESS.joinpath("trainval")
path_dir_data.mkdir(parents=True, exist_ok=True)

In [15]:
# Each split is a list of dict records, so NumPy stores it as a pickled object
# array; reading it back needs np.load(..., allow_pickle=True).
for file_name, records in (
    ("data_train.npy", data_train),
    ("data_val.npy", data_val),
):
    np.save(path_dir_data / file_name, records)