In [5]:
import pandas as pd

from surprise import Dataset, Reader
from surprise import KNNBasic, KNNWithMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [3]:
path = "../data/AllSites_SiteOccurrences_AllGenera_26.1.24.csv"

df = pd.read_csv(path)
df.head()

Unnamed: 0,SITE_NAME,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,...,Total_Gen_Count,Large_GenCount,Small_GenCount,smallperlarge,smallprop,Herb_GenCount,Nonherb_GenCount,DietRatio,HerbProp,mid_age
0,Aba Zawei,1,1,1,1,0,0,0,0,0,...,4,4,0,0.0,0.0,4,0,,1.0,0.0265
1,Abric Romani,1,0,1,0,1,1,1,1,1,...,12,12,0,0.0,0.0,6,5,1.2,0.5,0.055
2,Acheng_Jiaojie,0,0,0,0,0,0,1,0,0,...,7,5,2,0.4,0.285714,5,2,2.5,0.714286,0.21
3,Adler cave,1,0,0,0,0,1,0,1,0,...,10,5,5,1.0,0.5,6,4,1.5,0.6,0.0275
4,Adyrgan,1,0,0,1,0,0,0,0,0,...,11,5,6,1.2,0.545455,11,0,,1.0,2.2


In [4]:
cols_redundant = ['LAT',
 'LONG',
 'ALTITUDE',
 'MAX_AGE',
 'BFA_MAX',
 'BFA_MAX_ABS',
 'MIN_AGE',
 'BFA_MIN',
 'BFA_MIN_ABS',
 'COUNTRY',
 'age_range',
 'Total_Gen_Count',
 'Large_GenCount',
 'Small_GenCount',
 'smallperlarge',
 'smallprop',
 'Herb_GenCount',
 'Nonherb_GenCount',
 'DietRatio',
 'HerbProp',
 'mid_age'
 ]

df_site_genus = df.drop(columns=cols_redundant).set_index('SITE_NAME')

print(df_site_genus.shape)
df_site_genus.head()

(718, 452)


Unnamed: 0_level_0,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,Homo,...,Euarctos,Paracervulus,Eostyloceros,Cervocerus,Antispiroides,Sinoryx,Prospalax,Pliopetaurista,Predicrostonyx,Boocercus
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Abric Romani,1,0,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
Acheng_Jiaojie,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Adler cave,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Adyrgan,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Building content-based filtering (by sites)

In [68]:
site_info_cols = [
 'SITE_NAME',
 'LAT',
 'LONG',
 'MAX_AGE',
 'MIN_AGE',
 'age_range',
 'Large_GenCount',
 'Small_GenCount',
 'Herb_GenCount',
 'Nonherb_GenCount',
 'mid_age'
 ]

df_site_info = df[site_info_cols].set_index('SITE_NAME')
df_site_info.head()

Unnamed: 0_level_0,LAT,LONG,MAX_AGE,MIN_AGE,age_range,Large_GenCount,Small_GenCount,Herb_GenCount,Nonherb_GenCount,mid_age
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Aba Zawei,33.25,102.416667,0.0295,0.0235,0.006,4,0,4,0,0.0265
Abric Romani,41.530754,1.679613,0.07,0.04,0.03,12,0,6,5,0.055
Acheng_Jiaojie,45.351944,127.088056,0.266,0.154,0.112,5,2,5,2,0.21
Adler cave,49.25,16.667,0.045,0.01,0.035,5,5,6,4,0.0275
Adyrgan,43.05,80.2,2.5,1.9,0.6,5,6,11,0,2.2


In [69]:
cosine_sim = cosine_similarity(df_site_info, df_site_info)

In [70]:
cosine_sim.shape

(718, 718)

In [71]:
indices = pd.Series(df.index, index=df["SITE_NAME"]).drop_duplicates()
indices

SITE_NAME
Aba Zawei                           0
Abric Romani                        1
Acheng_Jiaojie                      2
Adler cave                          3
Adyrgan                             4
                                 ... 
Zhoukoudian_Upper Cave_sapiens    713
Ziyang_B site                     714
Zuurland                          715
Zuurland (-42 to -46 m)           716
Zverinogolovskoe                  717
Length: 718, dtype: int64

In [72]:
def get_recommendations(site_name, cosine_sim=cosine_sim, num_recommend = 10):
    idx = indices[site_name]

    # Get the pairwise similarity scores and sort the sites by them
    sim_scores = sorted(list(enumerate(cosine_sim[idx])), key=lambda x: x[1], reverse=True)

    # Get the scores of the num_recommend most similar sites
    top_similar = sim_scores[1:num_recommend+1]

    # Get the site indices
    site_indices = [i[0] for i in top_similar]

    return df["SITE_NAME"].iloc[site_indices]

In [78]:
def find_recommended_species_for_site(site_name, n_species_to_recommend, n_sites_use=len(indices)-1):
    # Find the n_sites_use number of similar sites
    similar_sites = list(get_recommendations(site_name, cosine_sim, n_sites_use))

    # For the site wanted to examine (site_name), find the species living there
    target_row = df_site_genus.loc[site_name]
    site_species = set(target_row[target_row == 1].index.tolist())
    site_species_recommended = set()

    similar_site_species = []
    
    # For the similar sites, find the species living there
    for site in similar_sites:
        target_row = df_site_genus.loc[site]
        similar_site_species += target_row[target_row == 1].index.tolist()
    
    for species in similar_site_species:
        if len(site_species_recommended) >= n_species_to_recommend:
            break
        
        if (species not in site_species) & (species not in site_species_recommended):
            site_species_recommended.add(species)

    return site_species, site_species_recommended

Find n number of recommendations instead of n sites?

In [81]:
find_recommended_species_for_site("Adyrgan", 3)

({'Anancus',
  'Cricetulus',
  'Equus',
  'Gazella',
  'Gigantocamelus',
  'Meriones',
  'Mimomys',
  'Orientalomys',
  'Promimomys',
  'Sinotherium',
  'Villanyia'},
 {'Capreolus', 'Coelodonta', 'Ochotona'})

In [42]:
get_recommendations('Aba Zawei', num_recommend = 10)[0:5]

77     Bolshaja Chukochja #25
78     Bolshaja Chukochja #27
79     Bolshaja Chukochja #36
278             Krestovka l.7
610                   Ushki I
Name: SITE_NAME, dtype: object

In [104]:
def find_recommendations_for_all_sites(n_species_to_recommend):
    sites_recommendations = df_site_genus.copy()
    sites_recommendations.loc[:, :] = 0
    for site, idx in indices.items():
        true_species, recommendations = find_recommended_species_for_site(site_name=site,
                                                            n_species_to_recommend=n_species_to_recommend)
        
        recommendations = list(recommendations)
        
        for recommendation in recommendations:
            sites_recommendations.at[site, recommendation] = 1
    
    return sites_recommendations        

In [105]:
test = find_recommendations_for_all_sites(5)
test.head()

Unnamed: 0_level_0,Equus,Coelodonta,Bos,Gazella,Ursus,Vulpes,Cervus,Canis,Sus,Homo,...,Euarctos,Paracervulus,Eostyloceros,Cervocerus,Antispiroides,Sinoryx,Prospalax,Pliopetaurista,Predicrostonyx,Boocercus
SITE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aba Zawei,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Abric Romani,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acheng_Jiaojie,1,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Adler cave,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Adyrgan,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Building collaborative filtering

In [21]:
df_site_genus_non_matrix = df_site_genus.stack().reset_index().rename(columns={"level_1": "genus", 0: "presence"})
df_site_genus_non_matrix = df_site_genus_non_matrix[df_site_genus_non_matrix["presence"] != 0].reset_index(drop=True)
df_site_genus_non_matrix

Unnamed: 0,SITE_NAME,genus,presence
0,Aba Zawei,Equus,1
1,Aba Zawei,Coelodonta,1
2,Aba Zawei,Bos,1
3,Aba Zawei,Gazella,1
4,Abric Romani,Equus,1
...,...,...,...
9925,Zverinogolovskoe,Spermophilus,1
9926,Zverinogolovskoe,Marmota,1
9927,Zverinogolovskoe,Sicista,1
9928,Zverinogolovskoe,Pliolagus,1


In [25]:
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(df_site_genus_non_matrix, reader) # Column order must be user, item, rating

In [27]:
sim_options = {
    'name': 'cosine',
    'user_based': True  # True for user-user, False for item-item
}

trainset = data.build_full_trainset()

knn = KNNBasic()
knn.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1449ccdd0>

In [32]:
trainset.all_users()

range(0, 718)

In [37]:
site_recommendations = {}
for site_id in trainset.all_users():
    site_name = trainset.to_raw_uid(site_id)
    raw_recommendations = knn.get_neighbors(site_id, k=5)
    # print(knn.get_neighbors(site_name, k=10))
    recommendations = [trainset.to_raw_uid(uid) for uid in raw_recommendations]
    site_recommendations[site_name] = recommendations

In [38]:
site_recommendations

{'Aba Zawei': ['Abric Romani',
  'Adler cave',
  'Adyrgan',
  'Akali',
  'Akhalkalaki'],
 'Abric Romani': ['Aba Zawei',
  'Acheng_Jiaojie',
  'Adler cave',
  'Adyrgan',
  'Akali'],
 'Acheng_Jiaojie': ['Abric Romani',
  'Adler cave',
  'Akali',
  'Akhalkalaki',
  'Akhshtyrskaja cave: l. 2'],
 'Adler cave': ['Aba Zawei',
  'Abric Romani',
  'Acheng_Jiaojie',
  'Adyrgan',
  'Akali'],
 'Adyrgan': ['Aba Zawei',
  'Abric Romani',
  'Adler cave',
  'Akali',
  'Akhalkalaki'],
 'Akali': ['Aba Zawei',
  'Abric Romani',
  'Acheng_Jiaojie',
  'Adler cave',
  'Adyrgan'],
 'Akhalkalaki': ['Aba Zawei',
  'Abric Romani',
  'Acheng_Jiaojie',
  'Adler cave',
  'Adyrgan'],
 'Akhshtyrskaja cave: l. 2': ['Abric Romani',
  'Acheng_Jiaojie',
  'Adler cave',
  'Akali',
  'Akhalkalaki'],
 'Akhstyrskaya Cave': ['Abric Romani',
  'Acheng_Jiaojie',
  'Adler cave',
  'Akali',
  'Akhalkalaki'],
 'Alekseevsk': ['Abric Romani',
  'Acheng_Jiaojie',
  'Adler cave',
  'Akali',
  'Akhalkalaki'],
 'Almenara-Casablanca 1':