# Задача 1

In [35]:
import numpy as np
import pandas as pd
from itertools import islice
from scipy.stats import mode
from sklearn.preprocessing import MinMaxScaler

In [36]:
def kNN(x, data, k=5):
    # ddata = copy.deepcopy(data)
    ddata = data.copy()

    labels = ddata["label"].unique()
    vectors = ddata.drop("label", axis=1)

    # computing distances to other vectors
    ddata["distance"] = np.linalg.norm(vectors.sub(np.array(x)), axis=1)

    # sorting by distance to x
    ddata = ddata.sort_values(by="distance", axis=0)
    first_k = ddata.iloc[0:k]

    return labels[np.argmax([first_k[first_k["label"] == label].shape[0] for label in labels])]


data_spam = pd.read_csv("../data/spam.csv")
data_cancer = pd.read_csv("../data/cancer.csv")

# Задача 2

$ \operatorname { LOO } ( k , D ) = \frac { \sum _ { \mathbf { x } _ { i } \in D } \left[ h \left( \mathbf { x } _ { i } ; D \backslash \mathbf { x } _ { i } ; k \right) \neq y _ { i } \right] } { | D | } $

In [5]:
def LOOs(data):
    ddata = data.copy(deep=True)
    LOOs = np.zeros(10, dtype=int)

    neighbours = list()
    for x in ddata.itertuples():
        test_vec = ddata.iloc[[x.Index]].drop("label", axis=1)
        vectors = ddata.drop("label", axis=1)
        indexed_distances = zip(np.linalg.norm(vectors.sub(np.array(test_vec)), axis=1), range(0, len(vectors)))
        neighbours = list(list(zip(*islice(sorted(indexed_distances), 1, 11)))[1])

        for k, neighbour_index in enumerate(neighbours):
            if (x.label != mode(ddata.loc[neighbours[0:k+1]].label.to_list()).mode[0]):
                LOOs[k] += 1
    return pd.DataFrame({'k': range(1, 11), 'LOO(k)': LOOs/len(ddata)})


LOOs(data_cancer)



Unnamed: 0,k,LOO(k)
0,1,0.084359
1,2,0.077329
2,3,0.073814
3,4,0.073814
4,5,0.066784
5,6,0.070299
6,7,0.068541
7,8,0.070299
8,9,0.066784
9,10,0.070299


In [6]:
LOOs(data_spam)

Unnamed: 0,k,LOO(k)
0,1,0.168876
1,2,0.190393
2,3,0.184742
3,4,0.192349
4,5,0.185394
5,6,0.199957
6,7,0.195827
7,8,0.201043
8,9,0.202782
9,10,0.20213


# Задача 3

In [6]:
def radius_neighbours(x, data, r=10):
    ddata = data.copy()

    labels = ddata["label"].unique()
    vectors = ddata.drop("label", axis=1)

    # computing distances to other vectors
    ddata["distance"] = np.linalg.norm(vectors.sub(np.array(x)), axis=1)

    # getting neighbours in radius r
    neighbours = ddata[ddata["distance"] < r]

    if neighbours.empty:
        # if there are no neighbours in a sphere then use least frequent class
        return labels.value_counts()[-1]
    else:
        return labels[np.argmax([neighbours[neighbours["label"] == label].shape[0] for label in labels])]

# Задача 4

In [72]:
def LOO_radius(data, radii = [10]):
    ddata = data.copy(deep=True)
    labels = ddata["label"].unique()
    LOOs = np.zeros(len(radii), dtype=int)
    
    neighbours = list()
    for x in ddata.itertuples():
        test_vec = ddata.iloc[[x.Index]].drop("label", axis=1)
        vectors = ddata.drop("label", axis=1)
        
        ddata["distance"] = np.linalg.norm(vectors.sub(np.array(test_vec)), axis=1)
        #print(ddata)

        neighbours = [ ddata[ddata["distance"] < radius][1:] for radius in radii ]
        #print(neighbours)
        #indexed_distances = zip(np.linalg.norm(vectors.sub(np.array(test_vec)), axis=1), range(0, len(vectors)))
        #neighbours = list(list(zip(*islice(sorted(indexed_distances), 1, 11)))[1])

        for k, radius_neighbours in enumerate(neighbours):
            if radius_neighbours.empty:
                if x.label != labels[-1]:
                    LOOs[k] += 1
            else:
                if (x.label != mode(radius_neighbours.label.to_list()).mode[0]):
                    LOOs[k] += 1
    #print(LOOs)
    return pd.DataFrame({'R': radii, 'LOO(R)': LOOs/len(ddata)})

LOO_radius(data_cancer, radii = [1, 10, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 10000])

Unnamed: 0,R,LOO(R)
0,1,0.372583
1,10,0.372583
2,100,0.121265
3,200,0.112478
4,300,0.110721
5,400,0.11775
6,500,0.11775
7,600,0.128295
8,700,0.130053
9,800,0.13181


In [64]:
LOO_radius(data_spam, radii = [1, 10, 20, 30, 40, 50, 100])

Unnamed: 0,R,LOO(R)
0,1,0.275158
1,10,0.244729
2,50,0.277548
3,100,0.284721


# Задача 5

In [32]:
scaler = MinMaxScaler()
scaled_spam = data_spam.copy()
scaled_cancer = data_cancer.copy()

scaled_spam.loc[:, ~scaled_spam.columns.isin(['label'])] = scaler.fit_transform(scaled_spam.loc[:, ~scaled_spam.columns.isin(['label'])])
scaled_cancer.loc[:, ~scaled_cancer.columns.isin(['label'])] = scaler.fit_transform(scaled_cancer.loc[:, ~scaled_cancer.columns.isin(['label'])])

LOOs(scaled_cancer)

  return self.partial_fit(X, y)


Unnamed: 0,k,LOO(k)
0,1,0.047452
1,2,0.038664
2,3,0.029877
3,4,0.02812
4,5,0.033392
5,6,0.029877
6,7,0.029877
7,8,0.031634
8,9,0.029877
9,10,0.029877


In [33]:
LOOs(scaled_spam)

Unnamed: 0,k,LOO(k)
0,1,0.087155
1,2,0.101065
2,3,0.094762
3,4,0.1015
4,5,0.094979
5,6,0.102586
6,7,0.099326
7,8,0.102369
8,9,0.101934
9,10,0.105195


In [76]:
LOO_radius(scaled_cancer, radii = [0.1, 0.2, .6, 0.7, 0.8, 0.9])

Unnamed: 0,R,LOO(R)
0,0.1,0.372583
1,0.2,0.369069
2,0.6,0.080844
3,0.7,0.073814
4,0.8,0.082601
5,0.9,0.087873


In [78]:
LOO_radius(scaled_spam, radii = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

Unnamed: 0,R,LOO(R)
0,0.05,0.197348
1,0.1,0.173006
2,0.15,0.159313
3,0.2,0.157357
4,0.25,0.167355
5,0.3,0.182569
6,0.4,0.237992
7,0.5,0.299283
8,0.6,0.345794
9,0.7,0.367311
