In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from sklearn.metrics import mean_squared_error

from itertools import combinations

In [4]:
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'font.size': 8})

## 1. Prepare data

In [16]:
path = "/home/tanakaki/Desktop/fossilNOW/FossilNOW/data/AllSites_SiteOccurrences_AllGenera_26.1.24.csv"

df_train = pd.read_csv(path)

df_train.head()

df_train.columns[450:]

Index(['Pliopetaurista', 'Predicrostonyx', 'Boocercus', 'LAT', 'LONG',
       'ALTITUDE', 'MAX_AGE', 'BFA_MAX', 'BFA_MAX_ABS', 'MIN_AGE', 'BFA_MIN',
       'BFA_MIN_ABS', 'COUNTRY', 'age_range', 'Total_Gen_Count',
       'Large_GenCount', 'Small_GenCount', 'smallperlarge', 'smallprop',
       'Herb_GenCount', 'Nonherb_GenCount', 'DietRatio', 'HerbProp',
       'mid_age'],
      dtype='object')

## 2. Collaborative Filtering

In [21]:
df_pivot = df_train.iloc[:, :453]
df_pivot = df_pivot.set_index("SITE_NAME")
df_pivot.head()

Unnamed: 0_level_0,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,Homo,...,Euarctos,Paracervulus,Eostyloceros,Cervocerus,Antispiroides,Sinoryx,Prospalax,Pliopetaurista,Predicrostonyx,Boocercus
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Abric Romani,1,0,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
Acheng_Jiaojie,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Adler cave,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Adyrgan,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2.1. For each user, find top 10 most similar users

In [48]:
# TODO: HoangLe [Feb-29]: Do something
def calc_dis_cos(a, b):

    a = a.fillna(a.mean())
    b = b.fillna(b.mean())

    dis = scipy.spatial.distance.euclidean(a, b)
    cos = scipy.spatial.distance.cosine(a,b)

    return dis, cos

distances = {}
sites = {}

for i in df_pivot.index:
    sites[str(i)] = []

for i in combinations(df_pivot.index, 2):
    distances[(i[0], i[1])] = calc_dis_cos(df_pivot.loc[i[0]], df_pivot.loc[i[1]])

sorted_distances = sorted(distances.items(), key=lambda x:x[1])

for i in sorted_distances:
    a = i[0][0]
    b = i[0][1]
    sites[str(a)].append(b)
    sites[str(b)].append(a)

### 2.2. For each pair of user and item between which the rating is NaN, aggregate the mean of rating similar users rated that item

In [35]:
# TODO: HoangLe [Feb-29]: Do something

site_1 = "Akali"
genera_1 = "Equus"

top10 = sites[site_1][:10]

y_pred = np.mean(df_pivot.loc[top10, genera_1])

mean_squared_error([df_pivot.loc[site_1, genera_1]], [y_pred])

0.4


0.36

In [53]:
for site, genera in df_pivot.iterrows():
    for genera_name in df_pivot.columns:
        similar_sites = sites[site][:10]
        y_pred = np.mean(df_pivot.loc[similar_sites, genera_name])
        df_pivot.at[site, genera_name] = y_pred

df_pivot

Unnamed: 0_level_0,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,Homo,...,Euarctos,Paracervulus,Eostyloceros,Cervocerus,Antispiroides,Sinoryx,Prospalax,Pliopetaurista,Predicrostonyx,Boocercus
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,0.978035,0.909039,0.410665,0.420968,0.017058,0.007239,0.477399,0.064005,0.037107,0.043670,...,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
Abric Romani,0.671293,0.292276,0.393628,0.040367,0.464165,0.196783,0.727557,0.378941,0.421602,0.206780,...,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
Acheng_Jiaojie,0.673009,0.363521,0.136172,0.045930,0.237651,0.062250,0.763885,0.183437,0.360925,0.037262,...,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
Adler cave,0.890801,0.600465,0.170891,0.059560,0.241584,0.178944,0.366412,0.352058,0.101980,0.005242,...,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
Adyrgan,0.901392,0.687698,0.229454,0.332778,0.044091,0.009555,0.393061,0.109569,0.019918,0.014337,...,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zhoukoudian_Upper Cave_sapiens,0.521219,0.296766,0.248939,0.073846,0.481412,0.240615,0.778430,0.275464,0.551403,0.131179,...,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.006689
Ziyang_B site,0.342496,0.239342,0.103211,0.070927,0.219449,0.014872,0.841182,0.049381,0.607921,0.383074,...,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
Zuurland,0.254258,0.158346,0.053157,0.037036,0.016059,0.021797,0.129571,0.033256,0.014747,0.004591,...,0,0,0,0,0,0,0.000234,0.000234,0.000234,0.000000
Zuurland (-42 to -46 m),0.255340,0.160434,0.052099,0.035651,0.016801,0.023841,0.123836,0.034068,0.014650,0.004908,...,0,0,0,0,0,0,0.000404,0.000404,0.000404,0.000000


In [102]:
df_train_trimmed = df_train.set_index("SITE_NAME")
df_train_trimmed = df_train_trimmed.iloc[:, :452]

mse = mean_squared_error(df_train_trimmed.values, df_pivot.values)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.019257188640569516


In [111]:
df_pivot = df_pivot.reset_index(names=["SITE_NAME"])

df_results = pd.concat([df_pivot, df_train.iloc[:, 453:]], axis=1)

df_results.to_csv("data/Collaborative_filtering_data.csv", index=False)