In [19]:
import xarray as xr
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split

## Notebook for the implementation and evaluation of the k-Nearest Neighbor algorithm

**TODO:** add basic information about the file structure and how to run this notebook.

#### Reading the data from 2025 & 2024

In [2]:
# Open the flagged KVS deployment files for 2025 and 2024.
# The paths are relative, so the notebook must be run from the directory
# that contains the "files/" folder.
ds_2025 = xr.open_dataset("files/2025_KVS_deployment_flagged.nc")
ds_2024 = xr.open_dataset("files/2024_KVS_deployment_flagged.nc")

We calculate the difference between the 1 m temperature and the surface temperature so that training and classification are based on the relationship between the two, rather than only on outliers in the 1 m temperature.

#### A simple example of the k-NN classification algorithm

In [None]:
def euclidean_distance(point1,point2):
    '''
    Euclidean (straight-line) distance between two points.

    Parameters
    ----------

    point1, point2 : array-like
        Coordinates of the two points. Both are converted to NumPy
        arrays before subtracting, so plain lists and scalars work too.

    Returns
    -------

    distance : float
        Square root of the summed squared coordinate differences.

    '''
    offset = np.array(point1) - np.array(point2)
    squared_total = np.sum(np.square(offset))
    return np.sqrt(squared_total)



In [30]:
def kNN_predict(X_train,X_lab,test_point,k):
    '''
    K Nearest Neighbor prediction function

    Computes the Euclidean distance from ``test_point`` to every training
    sample, selects the ``k`` closest samples and returns the label that
    occurs most often among them.

    Parameters
    ----------

    X_train : sequence of array-like
        Training data, one entry per sample

    X_lab : sequence
        Labels for training data, one per entry of ``X_train``
        (e.g. 0 for normal and 1 for outlier)

    test_point : array-like
        The data point which we are classifying; it must be
        broadcastable against a single training sample

    k : int
        Hyperparameter for k-NN algorithm: the number of nearest
        neighbors that vote. If ``k`` exceeds the number of training
        samples, all samples vote.

    Returns
    -------

    label
        The predicted label for the test point. When several labels
        receive the same number of votes, the one whose first vote came
        from the closest neighbor is returned.

    Raises
    ------

    ValueError
        If ``k`` is smaller than 1, if ``X_train`` is empty, or if
        ``X_train`` and ``X_lab`` have different lengths.

    '''
    # A negative k would silently slice from the end and k == 0 would leave
    # no voters at all, so both are rejected up front.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    n_train = len(X_train)
    if n_train == 0:
        raise ValueError("X_train must contain at least one sample")
    if len(X_lab) != n_train:
        raise ValueError(
            f"X_train and X_lab must have the same length, "
            f"got {n_train} and {len(X_lab)}"
        )

    distances = np.array(
        [euclidean_distance(test_point, X_train[i]) for i in range(n_train)]
    )

    # Stable sort: equally distant samples keep their training order.
    # NumPy also places NaN distances last, so samples with missing values
    # are only used when fewer than k valid samples exist.
    nearest = np.argsort(distances, kind="stable")[:k]
    k_nearest_labels = [X_lab[int(i)] for i in nearest]

    return Counter(k_nearest_labels).most_common(1)[0][0]

In [17]:
# Toy 2-D data set: three points labelled "A" and two labelled "B".
X_train = [[1,2],[2,3],[3,4],[6,7],[7,8]]
X_lab = ["A","A","A","B","B"]
# Point to classify.
test_point = [4,5]
# Number of nearest neighbours that vote.
k = 3

In [18]:
# Classify the toy test point; the bare name on the last line displays the result.
prediction = kNN_predict(X_train,X_lab,test_point,k)
prediction

'A'

#### Using sklearn

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:
# 2025 deployment: 1 m temperature, its quality flag and the snow-surface temperature.
ds_2025_1m_temp = ds_2025["temp_1m_calibrated"]
ds_2025_1m_labels = ds_2025["temp_1m_quality_flag"]
ds_2025_ir_temp = ds_2025["temp_snowsurface"]
# Difference between the 1 m temperature and the snow-surface temperature.
ds_2025_temp_diff = ds_2025_1m_temp - ds_2025_ir_temp

# 2024 deployment: the same quantities.
# NOTE(review): the surface temperature is read from "temp_snowsurface_calibrated"
# here but from "temp_snowsurface" for 2025, and the label variable is named
# `_lab` here but `_labels` above -- confirm that both differences are intentional.
ds_2024_1m_temp = ds_2024["temp_1m_calibrated"]
ds_2024_1m_lab = ds_2024["temp_1m_quality_flag"]
ds_2024_ir_temp = ds_2024["temp_snowsurface_calibrated"]
ds_2024_temp_diff = ds_2024_1m_temp - ds_2024_ir_temp

#### Choosing a k value

On temperature data

In [None]:
# Hold out 20 % of the 2025 1 m temperature series (position 2 along the
# "trajectory" dimension) for testing. A fixed random_state makes the shuffled
# split identical on every run, so later results are reproducible after
# Restart & Run All.
# NOTE: this rebinds X_train, which previously held the toy example list.
X_train, X_test = train_test_split(
    ds_2025_1m_temp.isel(trajectory=2),
    test_size=0.2,
    random_state=42,
)


In [25]:
# Sanity check: show the sizes of the training and test splits.
print(np.shape(X_train),np.shape(X_test))

(2516,) (629,)


Load the temperature dataset and test `kNN_predict` using the labels and data from 2025 and test points from the 2024 dataset (probably also do a train/test split on the 2025 data first, and then move on to 2024).

When that is done, we try to visualize the results and add all of those plots to the Overleaf document.

Then, using 2025 as training data and 2024 as test data, we run the actual use case.