## Loading the dataset and building vectors for machine learning models

In [27]:
import os
import pandas as pd
import numpy as np
from IPython.display import display
pd.options.display.max_columns=10
# random seed for reproducibility
np.random.seed(42)

## Visualizing the data

In [28]:
from sklearn.model_selection import train_test_split
from glcdataset import build_environmental_data
from sklearn.preprocessing import StandardScaler

In [6]:
# working on a subset of Pl@ntNet Trusted: 2500 occurrences
df = pd.read_csv('example_occurrences.csv',
             sep=';', header='infer', quotechar='"', low_memory=True)

df = df[['Longitude','Latitude','glc19SpId','scName']]\
       .dropna(axis=0, how='all')\
       .astype({'glc19SpId': 'int64'})

# target pandas series of the species identifiers (there are 505 labels)
target_df = df['glc19SpId']
# correspondence table between ids and the species taxonomic names
# (Taxref names with year of discoverie)
# taxonomic_names = pd.read_csv('../data/occurrences/taxaName_glc19SpId.csv',
#                              sep=';',header='infer', quotechar='"',low_memory=True)
print(len(df), 'occurrences in the dataset')
print(len(target_df.unique()), 'number of species\n')
duplicated_df = df[df.duplicated(subset=['Latitude','Longitude'],keep=False)]
print(f'{len(duplicated_df)} entries observed at interfering locations')
display(df.head(3))

2499 occurrences in the dataset
505 number of species

30 entries observed at interfering locations


Unnamed: 0,Longitude,Latitude,glc19SpId,scName
1746,6.724279,47.59021,30905,Datura stramonium L.
1935,4.462297,49.17102,33024,Euphorbia marginata Pursh
311,2.277133,48.8101,30144,Geum urbanum L.


One example of two interfering examples: at index 383 and index 1200: lat,lng =(44.978460,-1.075745) and species ids = 31867 (Arenaria montana L.) and 31734 (Tuberaria guttata (L.) Fourr.)

Building the environmental data: concatenated (lat,lng)+ environmental variables

In [30]:
# building the environmental data
env_df = build_environmental_data(df[['Latitude','Longitude']],patches_dir='example_envtensors')
display(env_df.head(3))
display(target_df.head(5))

Unnamed: 0,Latitude,Longitude,alti,awc_top,bs_top,...,etp,oc_top,pd_top,proxi_eau_fast,text
0,2.118889,43.95195,189.375,165.0,85.0,...,1219.375,1.0,2.0,0.0,2.0
1,-0.5925,45.10639,45.625,120.0,35.0,...,1140.625,1.0,1.0,0.0,1.0
2,-4.534861,48.38958,69.375,0.0,85.0,...,800.625,2.0,2.0,0.0,0.0


0    30021
1    31997
2    31385
3    33228
4    33228
Name: glc19SpId, dtype: int64

Getting the data (numpy arrays) and scaling the data to have a mean of 0 and unit variance: this is necessary for most of ML models to work as expected

In [32]:
X = env_df.values
y = target_df.values
# Standardize the features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X = scaler.fit_transform(X)

## Vector model

In [33]:
from vector_model import VectorModel
# Evaluate as the average accuracy on one train/split random sample:
print("Test vector model, euclidean metric")
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
classifier = VectorModel(metric='euclidean')
classifier.fit(X_train,y_train)
y_predicted = classifier.predict(X_test)
print(f'Top30 score:{classifier.top30_score(y_predicted, y_test)}')
print(f'MRR score:{classifier.mrr_score(y_predicted, y_test)}')
print('Params:',classifier.get_params())

print("\nTest vector model, cosine metric")

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
classifier = VectorModel(metric='cosine')
classifier.fit(X_train,y_train)
y_predicted = classifier.predict(X_test)
print(f'Top30 score:{classifier.top30_score(y_predicted, y_test)}')
print(f'MRR score:{classifier.mrr_score(y_predicted, y_test)}')
print('Params:',classifier.get_params())

Test vector model, euclidean metric
Top30 score:0.246
MRR score:0.05718168788586186
Params: {'metric': 'euclidean', 'ranking_size': 30}

Test vector model, cosine metric
Top30 score:0.258
MRR score:0.057060165142736534
Params: {'metric': 'cosine', 'ranking_size': 30}


## K-nearest neighbors model

In [34]:
from knn_model import KNearestNeighborsModel

# Evaluate as the average accuracy on one train/split random sample:
print("Test KNN model, uniform weights")
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
classifier = KNearestNeighborsModel(n_neighbors=150, weights='uniform')
classifier.fit(X_train,y_train)
y_predicted = classifier.predict(X_test)
print(f'Top30 score:{classifier.top30_score(y_predicted, y_test)}')
print(f'MRR score:{classifier.mrr_score(y_predicted, y_test)}')
print('Params:',classifier.get_params())

print("\nTest KNN model, distance weights")
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
classifier = KNearestNeighborsModel(n_neighbors=150, weights='distance')
classifier.fit(X_train,y_train)
y_predicted = classifier.predict(X_test)
print(f'Top30 score:{classifier.top30_score(y_predicted, y_test)}')
print(f'MRR score:{classifier.mrr_score(y_predicted, y_test)}')
print('Params:',classifier.get_params())

print("\nExample of predict proba:")
print(f"occurrence:\n{X_test[12]}")
y_pred, y_probas = classifier.predict(X_test[12].reshape(1,-1), return_proba=True)
print(f'predicted labels:\n{y_pred}')
print(f'predicted probas:\n{y_probas}')

Test KNN model, uniform weights
Top30 score:0.27
MRR score:0.058909794847547436
Params: {'metric': 'minkowski', 'n_neighbors': 150, 'p': None, 'ranking_size': 30, 'weights': 'uniform'}

Test KNN model, distance weights
Top30 score:0.268
MRR score:0.05116993690634731
Params: {'metric': 'minkowski', 'n_neighbors': 150, 'p': None, 'ranking_size': 30, 'weights': 'distance'}

Example of predict proba:
occurrence:
[ 0.04019058  1.08806812 -0.32832856  1.03770293  0.65384761 -0.73433122
 -0.56597094 -0.54729947 -0.46501198 -0.44076546 -0.88653399  0.38336852
 -0.98292161 -0.9797477   0.35280841  0.56206479 -0.36632291  0.55868843
  0.31648711 -0.04025585  0.04143279 -0.6445334   0.55454116 -0.94042465
 -1.56363946 -0.79182766  1.41840965 -1.7189894   0.28012152  0.75700134
 -0.03892492 -0.8874651   1.01713738 -0.34948162  0.94649875]
predicted labels:
[[30025 33042 32516 30683 29979 30003 30463 29980 30591 31347 30363 31218
  33900 30750 30728 32668 29981 30634 30905 30184 30425 31453 30925 3

In [35]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

class ClusteringModel():

    def _load_data(self, sklearn_load_ds):
        
        data = sklearn_load_ds
        X = pd.DataFrame(data.data)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, data.target, test_size=0.3, random_state=42)

    def __init__(self, sklearn_load_ds):
        self._load_data(sklearn_load_ds)

    def classify(self, model=LogisticRegression(random_state=42)):
        model.fit(self.X_train, self.y_train)
        y_pred = model.predict(self.X_test)
        print('Accuracy: {}'.format(accuracy_score(self.y_test, y_pred)))

    def clusterize(self, output='add'):
        n_clusters = len(np.unique(self.y_train))
        clf = KMeans(n_clusters = n_clusters, random_state=42)
        clf.fit(self.X_train)
        y_labels_train = clf.labels_
        y_labels_test = clf.predict(self.X_test)
        if output == 'add':
            self.X_train['km_clust'] = y_labels_train
            self.X_test['km_clust'] = y_labels_test
        elif output == 'replace':
            self.X_train = y_labels_train[:, np.newaxis]
            self.X_test = y_labels_test[:, np.newaxis]
        else:
            raise ValueError('output should be either add or replace')
        return self
