## Import packages

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

## Data Loading

In [3]:
seed = 42
test_ratio = 0.2
scaler = StandardScaler()

In [4]:
data_df = (
    pd.read_csv('data.csv')
    .drop(columns=['id', 'Unnamed: 32'])
)
data_df.shape

(569, 31)

In [5]:
data_df.diagnosis.value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

## Data Preprocessing

In [6]:
X = scaler.fit_transform(data_df.drop(columns=['diagnosis']))
y = data_df['diagnosis'].map({'M': 0, 'B': 1}).values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True, random_state=seed, test_size=test_ratio)

In [8]:
X_m = X[y == 0].copy()
X_b = X[y == 1].copy()

n_m = X_m.shape[0]
n_b = X_b.shape[0]

p_m = n_m / X.shape[0]
p_b = n_b / X.shape[0]

In [9]:
X_m_train, X_m_test = train_test_split(X_m, random_state=seed, test_size=test_ratio)
X_b_train, X_b_test = train_test_split(X_b, random_state=seed, test_size=test_ratio)

In [10]:
X_m_train.shape, np.vstack((X_m_train, X_b_train,)).shape

((169, 30), (454, 30))

In [11]:
X_m_mean = np.mean(X_m_train, axis=0)
X_b_mean = np.mean(X_b_train, axis=0)

X_m_cov = np.cov(X_m_train, rowvar=False)
X_b_cov = np.cov(X_b_train, rowvar=False)

## Task 1

В першій задачі  зробити "вручну" класифікацію,  використовуючи ядерні оцінки щільності

a) нормальна щільність,  багатовимірний випадок

In [12]:
def compute_ln_f(x, x_mean, S, p=None):
    if p is None:
        p = 1
    x_centered = x - x_mean
    return -0.5 * (x.shape[0] * np.log(2 * np.pi) + np.log(np.linalg.det(S)) + x_centered.T @ np.linalg.inv(S) @ x_centered) + np.log(p)

In [13]:
def predict(X, x_m_mean, x_b_mean, x_m_cov, x_b_cov, p_m=None, p_b=None):
    result = []
    for row in X:
        result.append(
            int(compute_ln_f(row, x_b_mean, x_b_cov, p=p_b) > compute_ln_f(row, x_m_mean, x_m_cov, p=p_m))
        )
    return result

In [14]:
predictions = predict(np.vstack((X_m_test, X_b_test,)), X_m_mean, X_b_mean, X_m_cov, X_b_cov, p_m, p_b)
print(classification_report(
    np.asarray([0]*X_m_test.shape[0] + [1]*X_b_test.shape[0]),
    predictions
))

              precision    recall  f1-score   support

           0       0.89      0.98      0.93        43
           1       0.99      0.93      0.96        72

    accuracy                           0.95       115
   macro avg       0.94      0.95      0.95       115
weighted avg       0.95      0.95      0.95       115



б) взяти radius.mean  і використати  ядро $K(u)= (\pi)^{-1} sin^2 (u)/u^2$

In [15]:
def sin_kernal(u):
    return np.sin(u) ** 2 / ((u ** 2) * np.pi)

In [16]:
def estimate(linspace, sample, n, h, kernal):
    return np.sum(kernal((linspace - sample) / h), axis=1) / (n*h)

In [17]:
def predict(linspace1, linspace2, x, n, h, kernal, p1=1, p2=1):
    predict1 = estimate(linspace1, x, n, h, kernal)
    predict2 = estimate(linspace2, x, n, h, kernal)
    return (p2*predict2 > p1*predict1).astype(int)

In [18]:
data_df['radius_mean'].min(), data_df['radius_mean'].max()

(6.981, 28.11)

In [19]:
n = 100
h = 0.1
x_linspace = np.linspace(start=5, stop=30, num=n, endpoint=True)

In [20]:
prediction_m = predict(X_m_train[:, 0], X_b_train[:, 0], X_m_test[:, 0][:, None], n, h, sin_kernal, p1=p_m, p2=p_b)
prediction_b = predict(X_m_train[:, 0], X_b_train[:, 0], X_b_test[:, 0][:, None], n, h, sin_kernal, p1=p_m, p2=p_b)

print(classification_report(
    np.asarray([0]*X_m_test.shape[0] + [1]*X_b_test.shape[0]),
    np.concatenate((prediction_m, prediction_b,))
))

              precision    recall  f1-score   support

           0       0.57      0.91      0.70        43
           1       0.91      0.58      0.71        72

    accuracy                           0.70       115
   macro avg       0.74      0.75      0.70       115
weighted avg       0.78      0.70      0.71       115



  return np.sin(u) ** 2 / ((u ** 2) * np.pi)


## Task 2

Записати алгоритм для методу Fix & Hodges.

In [21]:
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.spatial import distance_matrix

In [22]:
class KNearestNeighbor(BaseEstimator, ClassifierMixin):
    
    def __init__(self, n_neighbors: int = 1, p: int = 2):
        self.n_neighbors = n_neighbors
        self.p = p

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.proportions = np.unique(y, return_counts=True)[1] / len(y)

    def predict(self, X):
        return self._predict_labels(distance_matrix(X, self.X_train, p=self.p))

    def _predict_labels(self, dists):
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            y_ind = np.argsort(dists[i, :], axis=0)
            closest_y = self.y_train[y_ind[:self.n_neighbors]]
            
            bins = np.bincount(closest_y)
            y_pred[i] = np.argmax(bins * self.proportions[:len(bins)])

        return y_pred

In [23]:
knn_clf = KNearestNeighbor(n_neighbors=3)
knn_clf.fit(X_train, y_train);

prediction = knn_clf.predict(X_test)

print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       1.00      0.95      0.98        42
           1       0.97      1.00      0.99        72

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

