In [234]:
from typing import Optional, Tuple

import numpy as np
import seaborn as sns
from scipy.stats import mode
from sklearn.metrics import confusion_matrix

# Location of the input CSV; a relative path, so it is resolved against the current working directory
csv_path = "iris.csv"

In [235]:
def load_csv(csv_path: str) -> Tuple[np.ndarray, np.ndarray]:
    """Read a comma-separated numeric file and return its rows shuffled.

    Args:
        csv_path: Path of the CSV file to parse with ``np.genfromtxt``.

    Returns:
        A ``(features, labels)`` pair: ``features`` holds every column except
        the last one, ``labels`` holds the last column. Both share the same
        shuffled row order.

    Note:
        The global NumPy random state is re-seeded with 42 on every call, so
        the shuffle is the same each time.
    """
    # Fixed seed: the row order (and any split derived from it) is reproducible
    np.random.seed(42)
    table = np.genfromtxt(csv_path, delimiter=',')
    # Shuffle the rows in place, because the data is later split into a
    # training part and a testing part
    np.random.shuffle(table)
    # Separate the input parameters from the expected value (label)
    features = table[:, :-1]
    labels = table[:, -1]
    return features, labels

In [236]:
# Load the dataset: x gets every column except the last, y gets the last column
x, y = load_csv(csv_path)

In [237]:
# Per-column mean and variance (np.var is the variance, not the standard deviation).
# NaN entries propagate, so every column comes out as NaN in the output below.
np.mean(x, axis=0), np.var(x, axis=0)

(array([nan, nan, nan, nan]), array([nan, nan, nan, nan]))

In [238]:
# The same per-column mean and variance, this time ignoring NaN entries
np.nanmean(x, axis=0), np.nanvar(x, axis=0)

(array([ 355.46503497, -280.09189189,    2.95      ,   21.74726027]),
 array([1.73561968e+07, 1.18405444e+07, 1.51049922e+04, 6.11729208e+04]))

In [239]:
# Replace NaN entries in place with the constant 3.5 (the rows are kept, not removed)
x[np.isnan(x)] = 3.5

In [240]:
# Count the outliers: entries above 10.0 and entries below 0.0 (nothing is removed here)
(x > 10.0).sum(), (x < 0.0).sum()

(4, 2)

In [241]:
# Show the outlier values themselves: everything above 10.0 or below 0.0
x[(x > 10.0) | (x < 0.0)]

array([ -1111.,    100.,   1000.,  50000.,   3000., -42000.])

In [242]:
# Locate the outliers: each result is a (row indices, column indices) pair
less_than = np.nonzero(x < 0.0)
higher_than = np.nonzero(x > 10.0)
less_than, higher_than

((array([  4, 140]), array([2, 1])),
 (array([14, 27, 28, 62]), array([1, 2, 0, 3])))

In [243]:
# Drop every row that contains an outlier (a value below 0.0 or above 10.0);
# only whole rows are removed, never single entries.
# The row mask is computed once, from the untouched x, and applied to both
# arrays, so features and labels stay aligned. Deleting from y in several
# steps with indices taken from the unmodified x removes the wrong labels,
# because the first deletion renumbers the remaining rows.
outlier_rows = np.any((x < 0.0) | (x > 10.0), axis=1)
x, y = x[~outlier_rows], y[~outlier_rows]
x.shape, y.shape

((144, 4), (144,))

Train test split

In [244]:
def train_test_split(features: np.ndarray, labels: np.ndarray,
                     test_split_ratio: float) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Split features and labels into a leading training part and a trailing test part.

    No shuffling happens here; the rows are split in the order they are given.

    Args:
        features: Samples, one per row (indexed along the first axis).
        labels: Labels, one per sample, in the same order as ``features``.
        test_split_ratio: Fraction of the samples reserved for testing,
            between 0 and 1. The test size is rounded down.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)``.

    Raises:
        ValueError: If ``test_split_ratio`` is outside ``[0, 1]`` or if
            ``features`` and ``labels`` differ in length.
    """
    if not 0.0 <= test_split_ratio <= 1.0:
        raise ValueError("test_split_ratio must be between 0 and 1")
    # A length mismatch would otherwise yield silently misaligned splits
    if len(features) != len(labels):
        raise ValueError("Size mismatch")

    test_size = int(len(features) * test_split_ratio)
    train_size = len(features) - test_size

    # Slice along the first axis only, so any number of feature dimensions works
    x_train, y_train = features[:train_size], labels[:train_size]
    x_test, y_test = features[train_size:], labels[train_size:]

    return x_train, y_train, x_test, y_test

In [245]:
# Hold out the trailing 20% of the rows as the test set; the rest is used for training
x_train, y_train, x_test, y_test = train_test_split(x, y, 0.2)

In [246]:
def euclidean(points:np.ndarray, element_of_x:np.ndarray) -> np.ndarray:
    """Euclidean distance from every row of ``points`` to ``element_of_x``.

    Args:
        points: Array with one point per row.
        element_of_x: The single point the distances are measured to.

    Returns:
        One distance per row of ``points``.
    """
    offsets = points - element_of_x
    # Sum of squared coordinate differences along each row
    squared_lengths = np.sum(offsets * offsets, axis=1)
    return np.sqrt(squared_lengths)

In [247]:
def predict(x_train: np.ndarray, y_train: np.ndarray, k: int,
            x_test: Optional[np.ndarray] = None) -> np.ndarray:
    """Classify test samples with a k-nearest-neighbours majority vote.

    Args:
        x_train: Training features, one row per sample.
        y_train: Training labels, aligned with the rows of ``x_train``.
        k: Number of nearest neighbours that vote; must be at least 1.
        x_test: Samples to classify, one row per sample. When omitted, the
            module-level ``x_test`` variable is used, which is what this
            function read before the parameter existed.

    Returns:
        An ``int64`` array with one predicted label per test sample.

    Raises:
        ValueError: If ``k`` is smaller than 1, or if no test samples were
            passed and no module-level ``x_test`` is defined.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if x_test is None:
        # Backward compatibility with calls of the form predict(x_train, y_train, k)
        x_test = globals().get("x_test")
        if x_test is None:
            raise ValueError("x_test was not passed and no global x_test is defined")

    y_train = np.asarray(y_train)
    labels_pred = []
    for x_test_element in x_test:
        distances = euclidean(x_train, x_test_element)
        # Order by distance, ties broken by label (the last lexsort key is the primary one)
        nearest = np.lexsort((y_train, distances))[:k]
        # Keep the vote in its own variable: reusing labels_pred here would
        # replace the result list with a scalar and break the append below
        voted_label = mode(y_train[nearest], keepdims=False).mode
        labels_pred.append(voted_label)
    return np.array(labels_pred, dtype=np.int64)

In [248]:
def accuracy(y_test: np.ndarray, y_preds: np.ndarray) -> float:
    """Percentage of predictions that equal the expected label.

    Args:
        y_test: Expected labels.
        y_preds: Predicted labels, compared element by element with ``y_test``.

    Returns:
        The share of matching positions, scaled to 0-100.
    """
    is_match = y_test == y_preds
    n_matches = is_match.sum()
    n_samples = len(y_test)
    return n_matches / n_samples * 100

In [None]:
def plot_confusion_matric(y_test:np.ndarray, y_preds:np.ndarray) -> None:
    """Draw the confusion matrix of the predictions as an annotated heatmap.

    Args:
        y_test: Expected labels.
        y_preds: Predicted labels for the same samples.
    """
    # Confusion matrix computed by scikit-learn from expected vs. predicted labels
    conf_matrix = confusion_matrix(y_test, y_preds)
    # annot=True writes each cell's value into the heatmap
    sns.heatmap(conf_matrix, annot=True)