In [None]:
# Required for importing modules from other directories: the parent of the
# notebook's directory is put on sys.path (presumably where the project
# packages imported below live -- confirm against the repository layout).
import os,sys,inspect
# Use the kernel's working directory instead of inspecting the current frame.
# Recent ipykernel versions compile each cell under a temporary file name, so
# inspect.getfile(inspect.currentframe()) resolves into a temp directory there.
# Jupyter starts the kernel in the notebook's directory by default.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Guard the insert so re-running this cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0,parentdir)


import numpy as np
#from KNNRegressor import KNNRegressor
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler

import config
from common import DataParser
from KNN.KNNRegressor import KNNRegressor

In [None]:
# Toy sanity check for KNNRegressor: three 3-d samples with known targets.
samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
values = [0, 0.5, 2.5]
neigh = KNNRegressor(n_neighbors=2, p=1,debug=True)
neigh.fit(samples, values)
result = neigh.predict(np.array([[0, 0., .1]]))
print()
# With p=1 (Manhattan distance) the query is 0.1, 0.6 and 2.4 away from the
# three samples, so the two nearest neighbours carry the targets 0 and 0.5.
# Any 2-NN prediction therefore lies in [0, 0.5]; 2.5 (the target of the
# farthest sample) can never be the answer. 0.25 is the plain mean.
# NOTE(review): assumes KNNRegressor averages neighbours uniformly by default
# -- confirm; a distance-weighted default would give a different value.
should_be = 0.25
print(f"Result: (should be {should_be})")
# Compare with a tolerance: exact == on floats is brittle, and the prediction
# is array-like, so reduce the comparison to a single boolean.
print(f"{result} -> {np.allclose(result, should_be)}")

# House Prices Test

In [None]:
# Smoke test of KNNRegressor on the house-prices data: fit on the training
# part and predict the first three training rows again.
x_train, y_train = DataParser.parse_test_housePrices(splitData=True)

k = 3
p = 1
# NOTE(review): `weights` is assigned here but never passed to KNNRegressor
# below, so it has no effect on the fitted model.
weights = "uniform" #"distance"
reg = KNNRegressor(n_neighbors=k, p=p, debug=True)
reg.fit(x_train, y_train)

display(reg)

# predict() is unpacked into (predictions, distances) when ret_distances is
# given. NOTE(review): the meaning of the value 2 is not visible from this
# notebook -- confirm against KNNRegressor.predict.
y_pred, distances = reg.predict(x_train.iloc[0:3,:], ret_distances=2)
y_pred

# Moneyball

In [None]:
# Load the Moneyball data set into `df`.
# NOTE(review): splitData=False presumably returns one unsplit frame rather
# than separate parts -- confirm against DataParser.parse_moneyball.
df = DataParser.parse_moneyball(splitData=False)

In [None]:
# Columns used as model inputs; the regression target "W" is kept separately.
numeric_features = ["RS", "RA", "OBP", "SLG", "BA", "Playoffs", "G", "OOBP", "OSLG"]

In [None]:
# Keep only the feature columns plus the target "W" and drop incomplete rows.
# Selection and dropna are chained so a fresh frame is produced; calling
# dropna(inplace=True) on a column slice triggers SettingWithCopyWarning.
df = df[numeric_features + ["W"]].dropna()
x = df[numeric_features]
y = df["W"]
x, y

In [None]:
def split_XY(x, y):
    """Split features and targets into train, validation and test parts.

    40% of the rows are held out first, and that hold-out is then cut in
    half, giving roughly a 60/20/20 split. No random_state is passed, so the
    shuffling differs from call to call.

    Returns
    -------
    tuple
        (x_train, x_valid, x_test, y_train, y_valid, y_test)
    """
    x_train, x_rest, y_train, y_rest = train_test_split(x, y, test_size=0.4)
    # Second pass: divide the held-out rows evenly into validation and test.
    halves = train_test_split(x_rest, y_rest, test_size=0.5)
    x_valid, x_test, y_valid, y_test = halves
    return x_train, x_valid, x_test, y_train, y_valid, y_test

def split_df(df):
    """Split a whole frame into train, validation and test frames.

    40% of the rows are held out first, and that hold-out is then cut in
    half, giving roughly a 60/20/20 split. No random_state is passed, so the
    shuffling differs from call to call.

    Returns
    -------
    tuple
        (train, valid, test)
    """
    train, rest = train_test_split(df, test_size=0.4)
    valid, test = train_test_split(rest, test_size=0.5)
    # Return the frames produced here. The previous return statement named
    # y_train / y_valid / y_test, which are not locals of this function.
    return train, valid, test

def preprocess(df, scaler=None, ret_df=True):
    """Select the model columns, drop incomplete rows and separate X from y.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric feature columns listed below and "W".
        The caller's frame is not modified.
    scaler : object, optional
        Anything exposing ``fit_transform``. When truthy it is fitted on all
        remaining rows and ``x`` becomes whatever ``fit_transform`` returns.
        Because the fit sees every row passed in, statistics of rows that are
        later used for validation/testing leak into the scaling.
    ret_df : bool, optional
        Accepted for backward compatibility; it is not consulted and the
        cleaned frame is always returned.

    Returns
    -------
    tuple
        (x, y, df): the features, the "W" target and the cleaned frame.
    """
    numeric_features = ["RS", "RA", "OBP", "SLG", "BA", "Playoffs", "G", "OOBP", "OSLG"]
    # Chain selection and dropna so a fresh frame is produced; calling
    # dropna(inplace=True) on a column slice triggers SettingWithCopyWarning.
    df = df[numeric_features + ["W"]].dropna()
    x = df[numeric_features]
    if scaler:
        x = scaler.fit_transform(x)
    y = df["W"]
    return x, y, df

def get_time(start, scale=1e-9):
    """Return the time elapsed since ``start``, multiplied by ``scale``.

    ``start`` is an earlier reading of ``timer`` (``time.time_ns`` in this
    notebook), so the default scale of 1e-9 turns nanoseconds into seconds.
    """
    elapsed = timer() - start
    return elapsed * scale

In [None]:
from time import time_ns as timer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor as skKNN
from sklearn.model_selection import train_test_split
import pandas as pd


# Column order of the summary table returned by experiment().
COLS = ["name", "param", "r2", "mse", "train_time", "inference_time"]

def experiment(df, models, scaler=None):
    """Fit every model on one train split and compare them on one validation split.

    The frame is cleaned with ``preprocess`` and split once with ``split_XY``,
    so all models see the same rows. Each model's predictions are drawn into a
    shared figure together with the true validation targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data, forwarded to ``preprocess``.
    models : dict
        Maps a display name to an estimator providing ``get_params`` (with an
        "n_neighbors" entry), ``fit`` and ``predict``.
    scaler : object, optional
        Forwarded to ``preprocess``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns in ``COLS``. The two timing columns
        hold ``get_time`` results (seconds with its default scale).
    """
    x, y, _ = preprocess(df, scaler, ret_df=True)
    # The test part of the split is not used here.
    x_train, x_valid, _, y_train, y_valid, _ = split_XY(x, y)

    plt.figure(figsize=(10,8))

    # Collect one record per model and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies it on every iteration.
    records = []

    for name, model in models.items():
        params = model.get_params()
        print(name)
        display(model)

        train_time = timer()
        model.fit(x_train, y_train)
        train_time = get_time(train_time)

        inference_time = timer()
        predictions = model.predict(x_valid)
        inference_time = get_time(inference_time)

        r2 = r2_score(y_valid, predictions)
        mse = mean_squared_error(y_valid, predictions)

        result = {
            "name": name,
            "param": params["n_neighbors"],
            "r2": r2,
            "mse": mse,
            "train_time": train_time,
            "inference_time": inference_time
        }
        records.append(result)
        display(result)

        # The own implementation gets a distinct marker and line style.
        marker = "x" if name=="my_knn" else "p"
        ls = "--" if name=="my_knn" else "-."
        plt.plot(range(len(predictions)), predictions, linestyle=ls, marker=marker, markersize=5, label=name);

    plt.plot(range(len(y_valid)), y_valid, linestyle="", marker="o", markersize=8, label="y_valid");
    plt.legend()
    # columns=COLS fixes the column order and keeps the header when `models` is empty.
    return pd.DataFrame(records, columns=COLS)

In [None]:
# Build the estimators to compare: the project's own KNN regressor and
# scikit-learn's KNeighborsRegressor with three neighbour-search algorithms.
k = 5
p = 2
# NOTE(review): k and p are not passed on; the regressor runs with its defaults.
reg = KNNRegressor() #(k=k, p=p)

knn = skKNN(algorithm="brute")
knn2 = skKNN(algorithm="kd_tree")
knn3 = skKNN(algorithm="ball_tree")

# Show the parameters of the own implementation next to the brute-force
# baseline. The loop variable is deliberately not called `x`, which would
# overwrite the notebook-level feature frame of the same name.
for model in (reg, knn):
    display(str(model)+": ")
    display(model.get_params())
    print("_"*30)

# Name -> estimator mapping in the form experiment() expects.
models = {"my_knn": reg, "sk_knn_brute": knn, "sk_knn_kd": knn2, "sk_knn_ball": knn3}

In [None]:

# Run the comparison on the unscaled features; swap in StandardScaler() to
# have preprocess() scale them first.
scaler = None #StandardScaler()
results = experiment(df, models, scaler=scaler)

In [None]:
# Display the per-model summary table returned by experiment().
results