In [79]:
import numpy as np # array and vector manipulation
import pandas as pd # data manipulation
from sklearn.preprocessing import StandardScaler
import math

In [80]:
# Load the biodiversity project dataset into a DataFrame.
# Paper: https://doi.org/10.1080/0952813X.2021.1871972
# Full dataset: https://data.mendeley.com/datasets/9x62992sw6/2
# NOTE(review): the path below is an absolute local path; point it at wherever
# cleanDS.csv lives on your machine before running.
data_teste = pd.read_csv('/tmp/dataset/cleanDS.csv')
data_teste  # last expression of the cell, so the frame is rendered as output

Unnamed: 0,alt,temp2m,temp2mrange,humidity,precip,atm,wind,m.fapar,classe
0,0.361674,0.719384,0.719384,0.614403,0.594679,0.979630,0.192118,0.665940,0
1,0.067043,0.916464,0.916464,0.579856,0.516432,0.994444,0.170624,0.365190,0
2,0.039864,0.856448,0.856448,0.818496,0.624413,1.000000,0.123337,0.798657,0
3,0.064889,0.821573,0.821573,0.872974,0.632238,1.000000,0.173509,0.999390,0
4,0.025681,0.788321,0.788321,1.000000,0.871674,1.000000,0.283287,0.896407,0
...,...,...,...,...,...,...,...,...,...
860,0.118726,0.142741,0.142741,0.707414,0.397496,0.112963,0.250426,0.623997,0
861,0.014968,0.197891,0.197891,0.718044,0.369327,0.112963,0.252041,0.455267,1
862,0.129396,0.142741,0.142741,0.680308,0.367762,0.061111,0.242492,0.627834,0
863,0.000000,0.213301,0.213301,0.712463,0.350548,0.061111,0.240703,0.397279,0


In [35]:
# creating our version of euclidean distance

def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    x, y : numpy.ndarray or pandas.Series
        Vectors of equal length. They must support element-wise subtraction
        and exponentiation.

    Returns
    -------
    float
        Square root of the sum of squared element-wise differences.
    """
    difference = x - y
    squared = difference ** 2
    total = np.sum(squared)
    return np.sqrt(total)

In [44]:
# a simple KNN version 

def knn(train, test, k=1, n_features=8, distance=None):
    """Return the positions of the ``k`` training rows closest to a query.

    Parameters
    ----------
    train : pandas.DataFrame
        Training instances, one per row. Only the first ``n_features``
        columns are used to measure distance; any remaining columns (for
        example the class label) are ignored.
    test : pandas.Series or array-like
        The query instance. Its first ``n_features`` values are compared,
        position by position, with the features of every training row.
    k : int, default 1
        Number of neighbours to return. Must be at least 1.
    n_features : int, default 8
        Number of leading columns that hold the features.
    distance : callable, optional
        Function ``distance(a, b)`` returning the distance between two 1-D
        arrays. When omitted, the notebook's ``euclidean_distance`` is used.

    Returns
    -------
    numpy.ndarray
        Integer row *positions* (suitable for ``train.iloc``, not index
        labels) of the nearest rows, closest first. Contains fewer than ``k``
        entries when ``train`` has fewer than ``k`` rows.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    # A non-positive k would silently produce a wrong answer through slicing
    # (k=0 -> nothing, k=-1 -> everything but the farthest row).
    if k < 1:
        raise ValueError("k must be a positive integer")
    if distance is None:
        distance = euclidean_distance

    # Convert once to plain arrays: slicing the query is loop-invariant, and
    # iterating over array rows avoids building a pandas Series per row.
    features = train.iloc[:, 0:n_features].to_numpy(dtype=float)
    query = np.asarray(test[0:n_features], dtype=float)

    distances = np.array([distance(row, query) for row in features], dtype=float)

    # A stable sort makes tie-breaking deterministic: among equally distant
    # rows, the one that appears first in `train` wins.
    return np.argsort(distances, kind="stable")[0:k]

In [81]:
# selecting an instance to be used as test
# Work on an independent copy: a plain `train = data_teste` only creates a
# second name for the same frame, so removing the test row from `train` in
# place would remove it from `data_teste` as well.
train = data_teste.copy()
index_test = 4  # row position of the instance held out as the query
teste = train.iloc[index_test]

In [85]:
# removing it from the dataset
# Build a new frame from `data_teste` instead of dropping in place: an
# in-place drop removes one more row every time this cell is re-run and also
# alters any other name bound to the same frame.
train = data_teste.drop(data_teste.index[[index_test]])
train

Unnamed: 0,alt,temp2m,temp2mrange,humidity,precip,atm,wind,m.fapar,classe
0,0.361674,0.719384,0.719384,0.614403,0.594679,0.979630,0.192118,0.665940,0
1,0.067043,0.916464,0.916464,0.579856,0.516432,0.994444,0.170624,0.365190,0
2,0.039864,0.856448,0.856448,0.818496,0.624413,1.000000,0.123337,0.798657,0
3,0.064889,0.821573,0.821573,0.872974,0.632238,1.000000,0.173509,0.999390,0
5,0.007400,0.847526,0.847526,0.855700,0.818466,1.000000,0.131243,0.793861,0
...,...,...,...,...,...,...,...,...,...
860,0.118726,0.142741,0.142741,0.707414,0.397496,0.112963,0.250426,0.623997,0
861,0.014968,0.197891,0.197891,0.718044,0.369327,0.112963,0.252041,0.455267,1
862,0.129396,0.142741,0.142741,0.680308,0.367762,0.061111,0.242492,0.627834,0
863,0.000000,0.213301,0.213301,0.712463,0.350548,0.061111,0.240703,0.397279,0


In [48]:
# Run our knn: find the 5 training rows nearest to the held-out instance.
# `output` holds row positions (the result of np.argsort inside knn), not
# index labels.
output = knn(train, teste, k=5)

In [54]:
# Checking the nearest instances. `.iloc` is used because knn returns row
# positions rather than index labels.
train.iloc[output]

Unnamed: 0,alt,temp2m,temp2mrange,humidity,precip,atm,wind,m.fapar,classe
26,0.04575,0.817518,0.817518,0.9519,0.940532,0.994444,0.268053,0.809121,1
20,0.046128,0.807786,0.807786,0.954026,0.99687,0.994444,0.253599,0.906348,0
16,0.046247,0.83455,0.83455,0.888121,0.790297,0.994444,0.266611,0.945239,1
6,0.026737,0.845093,0.845093,0.89317,0.784038,1.0,0.265399,0.724538,1
40,0.022491,0.822384,0.822384,0.859421,0.72457,0.97963,0.170191,0.92187,1


# Using sklearn

In [55]:
# loading the necessary functions
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [58]:
# creating a stratified hold-out split (90% train / 10% test)
y = data_teste['classe']
# Keep the features under their own name instead of overwriting `data_teste`,
# so this cell can be re-run without a KeyError on the missing 'classe' column.
X = data_teste.drop(columns=['classe'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y, shuffle=True
)

In [75]:
# Create the classifier with the hyperparameter k = 10 (number of neighbours
# that vote on each prediction); every other setting is left unspecified.
lazy = KNeighborsClassifier(n_neighbors = 10)

In [76]:
# Start modelling the problem.
# As k-NN is a lazy learner, "fitting" basically stores the training data
# (presumably what "KB", knowledge base, referred to here); the real work
# happens at prediction time.
lazy.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [77]:
# Let's check the performance of our model by classifying the held-out test
# examples; for a classifier, `score` reports the mean accuracy.
lazy.score(X_test, y_test)

0.5747126436781609