In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from tqdm import tqdm

In [7]:
# read in datasets
train_df = pd.read_csv('train.csv')
validation_df = pd.read_csv('validation.csv')
test_df = pd.read_csv('test.csv')

# split into features and labels
X_train = train_df.drop(columns='profit_margin')
y_train = train_df['profit_margin']
X_validation = validation_df.drop(columns='profit_margin')
y_validation = validation_df['profit_margin']
X_test = test_df.drop(columns='profit_margin')
y_test = test_df['profit_margin']

# combine train and validation datasets for cross validation
X_full_train = pd.concat([X_train, X_validation])
y_full_train = pd.concat([y_train, y_validation])

In [8]:
print(X_train.shape)
print(y_train.shape)
print(X_validation.shape)
print(y_validation.shape)
print(X_test.shape)
print(y_test.shape)

(6058, 72)
(6058,)
(757, 72)
(757,)
(758, 72)
(758,)


In [9]:
# base sklearn knn: tune k
errors = []

for k in range(1, 11):
    knn = KNeighborsRegressor(n_neighbors=k)
    y_validation_pred = knn.fit(X_train, y_train).predict(X_validation)
    rmse = mean_squared_error(y_validation, y_validation_pred, squared=False)
    mae = mean_absolute_error(y_validation, y_validation_pred)
    errors.append([k, rmse, mae])

errors_df = pd.DataFrame(errors, columns=['k', 'rmse', 'mae'])
errors_df.set_index('k', inplace=True)
errors_df

Unnamed: 0_level_0,rmse,mae
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.982311,0.770256
2,0.815751,0.637314
3,0.76211,0.600762
4,0.743401,0.592645
5,0.729701,0.586215
6,0.72623,0.58396
7,0.719044,0.581447
8,0.715052,0.57935
9,0.713323,0.585036
10,0.711666,0.58603


In [10]:
# plot error against k
fig = px.bar(errors_df, x=errors_df.index, y=['rmse', 'mae'], barmode='group')
fig.update_layout(yaxis_title='error')
fig.show()

In [11]:
# base sklearn knn: model
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

In [12]:
# base sklearn knn: validation error
y_val_pred = knn.predict(X_validation)
rmse = mean_squared_error(y_validation, y_val_pred, squared=False)
mae = mean_absolute_error(y_validation, y_val_pred)
print('base sklearn knn - validation rmse:', rmse)
print('base sklearn knn - validation mae:', mae)

base sklearn knn - validation rmse: 0.7297008670625598
base sklearn knn - validation mae: 0.5862147672013993


In [13]:
# base sklearn knn: test error
y_test_pred = knn.predict(X_test)
rmse = mean_squared_error(y_test, y_test_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_pred)
print('base sklearn knn - test rmse:', rmse)
print('base sklearn knn - test mae:', mae)

base sklearn knn - test rmse: 0.74848615637268
base sklearn knn - test mae: 0.6205282207673316


In [15]:
# base sklearn knn: cross validation
knn = KNeighborsRegressor() # using default n_neighbors = 5
knn_scores = cross_validate(knn, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('base sklearn knn - cross val:', knn_scores['test_r2'].mean())

base sklearn knn - cross val: -0.0007576566474793766


In [41]:
# base sklearn knn: t test
t_stat, p_val = stats.ttest_ind(y_test_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.995505416713554
t-value: 0.0056340848500864805


In [None]:
# pca sklearn knn: tune n_components
errors_pca = []

for components in range(1, 11):
    pca_pipe = Pipeline([('pca', PCA(n_components=components)), ('knn', KNeighborsRegressor())])
    pca_pipe.fit(X_train, y_train)
    y_validation_pca_pred = pca_pipe.predict(X_validation)
    rmse = mean_squared_error(y_validation, y_validation_pca_pred, squared=False)
    mae = mean_absolute_error(y_validation, y_validation_pca_pred)
    errors_pca.append([components, rmse, mae])

errors_pca_df = pd.DataFrame(errors_pca, columns=['components', 'rmse', 'mae'])
errors_pca_df.set_index('components', inplace=True)
errors_pca_df

Unnamed: 0_level_0,rmse,mae
components,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.816,0.682462
2,0.80025,0.656975
3,0.779946,0.641774
4,0.794494,0.644978
5,0.783474,0.637067
6,0.762806,0.619811
7,0.770304,0.622595
8,0.771942,0.621853
9,0.775164,0.622783
10,0.790706,0.641209


In [29]:
# visualize pca components
pca = PCA(n_components=10)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train))

labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

fig = px.scatter_matrix(
    X_train_pca,
    labels=labels,
    dimensions=range(2)
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [30]:
# pca: explained variance
sum = 0
for i, var in enumerate(pca.explained_variance_ratio_ * 100):
    print(i + 1, 'components - total explained variance:', var)
    sum += var
print('total explained variance:', sum)

1 components - total explained variance: 7.753244052655848
2 components - total explained variance: 7.343951711391061
3 components - total explained variance: 4.718023581634088
4 components - total explained variance: 4.467043346941544
5 components - total explained variance: 3.8663804198351825
6 components - total explained variance: 3.3713363835642642
7 components - total explained variance: 2.9463434461891365
8 components - total explained variance: 2.7628218562851834
9 components - total explained variance: 2.5391353639397023
10 components - total explained variance: 2.4004147425468627
total explained variance: 42.16869490498288


In [35]:
# pca sklearn knn: model
pca_pipe = Pipeline([('pca', PCA(n_components=3)), ('knn', KNeighborsRegressor())])
pca_pipe.fit(X_train, y_train)

In [36]:
# pca sklearn knn: validation error
y_validation_pca_pred = pca_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, y_validation_pca_pred, squared=False)
mae = mean_absolute_error(y_validation, y_validation_pca_pred)
print('pca sklearn knn - validation rmse:', rmse)
print('pca sklearn knn - validation mae:', mae)

pca sklearn knn - validation rmse: 0.7798101402187685
pca sklearn knn - validation mae: 0.6419030218177331


In [37]:
# pca sklearn knn: test error
y_test_pca_pred = pca_pipe.predict(X_test)
rmse = mean_squared_error(y_test, y_test_pca_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_pca_pred)
print('pca sklearn knn - test rmse:', rmse)
print('pca sklearn knn - test mae:', mae)

pca sklearn knn - test rmse: 0.8151604067174215
pca sklearn knn - test mae: 0.6749646995428829


In [38]:
# pca sklearn knn: cross validation
knn_pca_scores = cross_validate(pca_pipe, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('pca sklearn knn - cross val:', knn_pca_scores['test_r2'].mean())

pca sklearn knn - cross val: -0.15607564384683265


In [40]:
# pca sklearn knn: t test
t_stat, p_val = stats.ttest_ind(y_test_pca_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.3223471726570667
t-value: 0.9899695809714202


In [43]:
# poly sklearn knn: model
poly_pipe = Pipeline([('poly', PolynomialFeatures()), ('knn', KNeighborsRegressor())])
poly_pipe.fit(X_train, y_train)

In [44]:
# poly sklearn knn: validation error
y_validation_poly_pred = poly_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, y_validation_poly_pred, squared=False)
mae = mean_absolute_error(y_validation, y_validation_poly_pred)
print('poly sklearn knn - validation rmse:', rmse)
print('poly sklearn knn - validation mae:', mae)

poly sklearn knn - validation rmse: 0.7517339572147149
poly sklearn knn - validation mae: 0.6107020927695729


In [45]:
# poly sklearn knn: test error
y_test_poly_pred = poly_pipe.predict(X_test)
rmse = mean_squared_error(y_test, y_test_poly_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_poly_pred)
print('poly sklearn knn - test rmse:', rmse)
print('poly sklearn knn - test mae:', mae)

poly sklearn knn - test rmse: 0.7663638165652618
poly sklearn knn - test mae: 0.6364074364308808


In [46]:
# poly sklearn knn: cross validation
knn_poly_scores = cross_validate(poly_pipe, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('poly sklearn knn - cross val:', knn_poly_scores['test_r2'].mean())

poly sklearn knn - cross val: -0.05303269738774391


In [48]:
# poly sklearn knn: t test
t_stat, p_val = stats.ttest_ind(y_test_poly_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.8814473079754256
t-value: -0.14915992337653647


In [50]:
# nmf sklearn knn: model
nmf_pipe = Pipeline([('nmf',  NMF(n_components=2)), ('knn', KNeighborsRegressor())])
nmf_pipe.fit(X_train, y_train)


Maximum number of iterations 200 reached. Increase it to improve convergence.



In [51]:
# nmf sklearn knn: validation error
y_validation_nmf_pred = nmf_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, y_validation_nmf_pred, squared=False)
mae = mean_absolute_error(y_validation, y_validation_nmf_pred)
print('nmf sklearn knn - validation rmse:', rmse)
print('nmf sklearn knn - validation mae:', mae)

nmf sklearn knn - validation rmse: 0.8053322833711429
nmf sklearn knn - validation mae: 0.6634551905358342


In [52]:
# nmf sklearn knn: test error
y_test_nmf_pred = nmf_pipe.predict(X_test)
rmse = mean_squared_error(y_test, y_test_nmf_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_nmf_pred)
print('nmf sklearn knn - test rmse:', rmse)
print('nmf sklearn knn - test mae:', mae)

nmf sklearn knn - test rmse: 0.8283467781594301
nmf sklearn knn - test mae: 0.6918294048723355


In [53]:
# nmf sklearn knn: cross validation
knn_nmf_scores = cross_validate(nmf_pipe, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('nmf sklearn knn - cross val:', knn_nmf_scores['test_r2'].mean())


Maximum number of iterations 200 reached. Increase it to improve convergence.


Maximum number of iterations 200 reached. Increase it to improve convergence.


Maximum number of iterations 200 reached. Increase it to improve convergence.


Maximum number of iterations 200 reached. Increase it to improve convergence.


Maximum number of iterations 200 reached. Increase it to improve convergence.


Maximum number of iterations 200 reached. Increase it to improve convergence.



nmf sklearn knn - cross val: -0.16895020044590098



Maximum number of iterations 200 reached. Increase it to improve convergence.



In [54]:
# nmf sklearn knn: t test
t_stat, p_val = stats.ttest_ind(y_test_nmf_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.9073576240999198
t-value: 0.11639160571431567


In [56]:
# tsne sklearn knn: model
tsne = TSNE()
X_train_tsne = pd.DataFrame(tsne.fit_transform(X_train))
X_validation_tsne = pd.DataFrame(tsne.fit_transform(X_validation))
X_test_tsne = pd.DataFrame(tsne.fit_transform(X_test))

knn_tsne = KNeighborsRegressor()
knn_tsne.fit(X_train_tsne, y_train)

In [57]:
# tsne sklearn knn: validation error
y_validation_tsne_pred = knn_tsne.predict(X_validation_tsne)
rmse = mean_squared_error(y_validation, y_validation_tsne_pred, squared=False)
mae = mean_absolute_error(y_validation, y_validation_tsne_pred)
print('tsne sklearn knn - validation rmse:', rmse)
print('tsne sklearn knn - validation mae:', mae)

tsne sklearn knn - validation rmse: 0.8470213750836924
tsne sklearn knn - validation mae: 0.6998822107961936


In [58]:
# tsne sklearn knn: test error
y_test_tsne_pred = knn_tsne.predict(X_test_tsne)
rmse = mean_squared_error(y_test, y_test_tsne_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_tsne_pred)
print('tsne sklearn knn - test rmse:', rmse)
print('tsne sklearn knn - test mae:', mae)

tsne sklearn knn - test rmse: 0.8482210593488367
tsne sklearn knn - test mae: 0.7138840550033047


In [59]:
# tsne sklearn knn: cross validation
knn_tsne_scores = cross_validate(knn_tsne, pd.concat([X_train_tsne, X_validation_tsne]), y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('nmf sklearn knn - cross val:', knn_tsne_scores['test_r2'].mean())

nmf sklearn knn - cross val: -0.16195582378108037


In [60]:
# tsne sklearn knn: t test
t_stat, p_val = stats.ttest_ind(y_test_tsne_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.22920015262052207
t-value: -1.2029076042892595


In [62]:
# pt sklearn knn: model
pt_pipe = Pipeline([('pt',  PowerTransformer()), ('knn', KNeighborsRegressor())])
pt_pipe.fit(X_train, y_train)

In [63]:
# pt sklearn knn: validation error
y_validation_pt_pred = pt_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, y_validation_pt_pred, squared=False)
mae = mean_absolute_error(y_validation, y_validation_pt_pred)
print('pt sklearn knn - validation rmse:', rmse)
print('pt sklearn knn - validation mae:', mae)

pt sklearn knn - validation rmse: 0.7316729981444196
pt sklearn knn - validation mae: 0.593990689025661


In [64]:
# pt sklearn knn: test error
y_test_pt_pred = pt_pipe.predict(X_test)
rmse = mean_squared_error(y_test, y_test_pt_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_pt_pred)
print('pt sklearn knn - test rmse:', rmse)
print('pt sklearn knn - test mae:', mae)

pt sklearn knn - test rmse: 0.755002947696682
pt sklearn knn - test mae: 0.6176852843828139


In [65]:
# pt sklearn knn: cross validation
knn_pt_scores = cross_validate(pt_pipe, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('pt sklearn knn - cross val:', knn_pt_scores['test_r2'].mean())

pt sklearn knn - cross val: -0.0018148411094721872


In [66]:
# pt sklearn knn: t test
t_stat, p_val = stats.ttest_ind(y_test_pt_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.8245972756459519
t-value: 0.2216743733890712


In [16]:
# scratch knn implementation
class KNNRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors: int = 5):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y
        return self
    
    def predict(self, X):
        check_is_fitted(self)
        X = check_array(X)
        
        y = np.zeros(X.shape[0])

        for i, x in tqdm(enumerate(X)):
            dist = np.linalg.norm(self.X_ - x, axis=1)
            # print(dist)
            k_idx = np.argpartition(dist, self.n_neighbors)[: self.n_neighbors]
            # print(self.)
            k_avg = np.mean(self.y_[k_idx])
            y[i] = k_avg

        return y

In [17]:
# base scratch knn: model
scratch_knn = KNNRegressor()
scratch_knn.fit(X_train, y_train)

In [18]:
# base scratch knn: validation error
scratch_y_val_pred = scratch_knn.predict(X_validation)
rmse = mean_squared_error(y_validation, scratch_y_val_pred, squared=False)
mae = mean_absolute_error(y_validation, scratch_y_val_pred)
print('base scratch knn - validation rmse:', rmse)
print('base scratch knn - validation mae:', mae)

757it [00:00, 1059.10it/s]

base scratch knn - validation rmse: 0.7297008670625598
base scratch knn - validation mae: 0.5862147672013993





In [19]:
# base scratch knn: test error
scratch_y_test_pred = scratch_knn.predict(X_test)
rmse = mean_squared_error(y_test, scratch_y_test_pred, squared=False)
mae = mean_absolute_error(y_test, scratch_y_test_pred)
print('base scratch knn - test rmse:', rmse)
print('base scratch knn - test mae:', mae)

758it [00:00, 1099.68it/s]

base scratch knn - test rmse: 0.74848615637268
base scratch knn - test mae: 0.6205282207673316





In [20]:
# base scratch knn: cross validation
scratch_knn = KNNRegressor() # using default n_neighbors = 5
knn_scores = cross_validate(scratch_knn, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('base scratch knn - cross val:', knn_scores['test_r2'].mean())

682it [00:00, 975.16it/s]
6133it [00:06, 982.61it/s] 
682it [00:00, 1005.49it/s]
6133it [00:05, 1059.55it/s]
682it [00:00, 1011.20it/s]
6133it [00:06, 993.51it/s] 
682it [00:00, 979.53it/s]
6133it [00:05, 1025.34it/s]
682it [00:00, 1010.93it/s]
6133it [00:06, 1018.92it/s]
681it [00:00, 1051.56it/s]
6134it [00:05, 1052.48it/s]
681it [00:00, 1040.23it/s]
6134it [00:05, 1041.06it/s]
681it [00:00, 1027.88it/s]
6134it [00:06, 1009.87it/s]
681it [00:00, 923.56it/s]
6134it [00:06, 989.01it/s] 
681it [00:00, 1032.13it/s]
6134it [00:06, 1002.79it/s]

base scratch knn - cross val: -0.0007576566474793989





In [67]:
# base scratch knn: t test
t_stat, p_val = stats.ttest_ind(scratch_y_test_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.995505416713554
t-value: 0.0056340848500864805


In [21]:
# pca scratch knn: model
scratch_pca_pipe = Pipeline([('pca', PCA(n_components=3)), ('knn', KNNRegressor())])
scratch_pca_pipe.fit(X_train, y_train)

# pca scratch knn: validation error
scratch_y_validation_pca_pred = scratch_pca_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, scratch_y_validation_pca_pred, squared=False)
mae = mean_absolute_error(y_validation, scratch_y_validation_pca_pred)
print('pca scratch knn - validation rmse:', rmse)
print('pca scratch knn - validation mae:', mae)

# pca scratch knn: test error
scratch_y_test_pca_pred = scratch_pca_pipe.predict(X_test)
rmse = mean_squared_error(y_test, scratch_y_test_pca_pred, squared=False)
mae = mean_absolute_error(y_test, scratch_y_test_pca_pred)
print('pca scratch knn - test rmse:', rmse)
print('pca scratch knn - test mae:', mae)

# pca scratch knn: cross validation
scratch_knn_pca_scores = cross_validate(scratch_pca_pipe, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('pca scratch knn - cross val:', scratch_knn_pca_scores['test_r2'].mean())

757it [00:00, 4454.38it/s]


pca scratch knn - validation rmse: 0.7817740954987719
pca scratch knn - validation mae: 0.6439336256303356


758it [00:00, 4236.98it/s]


pca scratch knn - test rmse: 0.8185756224618655
pca scratch knn - test mae: 0.6771197477786206


682it [00:00, 4054.81it/s]
6133it [00:01, 4064.56it/s]
682it [00:00, 4278.33it/s]
6133it [00:01, 4195.17it/s]
682it [00:00, 4044.28it/s]
6133it [00:01, 4200.71it/s]
682it [00:00, 4144.44it/s]
6133it [00:01, 4314.08it/s]
682it [00:00, 6002.46it/s]
6133it [00:01, 6128.59it/s]
681it [00:00, 5779.02it/s]
6134it [00:01, 5722.04it/s]
681it [00:00, 5673.39it/s]
6134it [00:01, 6092.39it/s]
681it [00:00, 5814.30it/s]
6134it [00:01, 6011.23it/s]
681it [00:00, 5732.37it/s]
6134it [00:01, 6021.41it/s]
681it [00:00, 6351.73it/s]
6134it [00:00, 6221.88it/s]

pca scratch knn - cross val: -0.15740627997345893





In [68]:
# pca scratch knn: t test
t_stat, p_val = stats.ttest_ind(scratch_y_test_pca_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.35773881512680794
t-value: 0.9199626753799351


In [23]:
# poly scratch knn: model
scratch_poly_pipe = Pipeline([('poly', PolynomialFeatures()), ('knn', KNNRegressor())])
scratch_poly_pipe.fit(X_train, y_train)

# poly scratch knn: validation error
scratch_y_validation_poly_pred = scratch_poly_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, scratch_y_validation_poly_pred, squared=False)
mae = mean_absolute_error(y_validation, scratch_y_validation_poly_pred)
print('poly scratch knn - validation rmse:', rmse)
print('poly scratch knn - validation mae:', mae)

# poly scratch knn: test error
scratch_y_test_poly_pred = scratch_poly_pipe.predict(X_test)
rmse = mean_squared_error(y_test, scratch_y_test_poly_pred, squared=False)
mae = mean_absolute_error(y_test, scratch_y_test_poly_pred)
print('poly scratch knn - test rmse:', rmse)
print('poly scratch knn - test mae:', mae)

# poly scratch knn: cross validation
scratch_knn_poly_scores = cross_validate(scratch_poly_pipe, X_full_train, y_full_train, cv=5, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('poly scratch knn - cross val:', scratch_knn_pca_scores['test_r2'].mean())

757it [00:23, 31.56it/s]


poly scratch knn - validation rmse: 0.7517339572147149
poly scratch knn - validation mae: 0.6107020927695729


758it [00:23, 32.06it/s]


poly scratch knn - test rmse: 0.7663638165652618
poly scratch knn - test mae: 0.6364074364308808


1363it [00:38, 35.50it/s]
5452it [02:34, 35.38it/s]
1363it [00:38, 35.51it/s]
5452it [02:33, 35.44it/s]
1363it [00:38, 35.31it/s]
5452it [02:41, 33.79it/s]
1363it [00:40, 33.29it/s]
5452it [02:44, 33.16it/s]
1363it [00:40, 33.49it/s]
5452it [02:40, 34.04it/s]

poly scratch knn - cross val: -0.15740627997345893





In [69]:
# poly scratch knn: t test
t_stat, p_val = stats.ttest_ind(scratch_y_test_poly_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.8814473079754256
t-value: -0.14915992337653647


In [24]:
# nmf scratch knn: model
scratch_nmf_pipe = Pipeline([('nmf', NMF(n_components=2)), ('knn', KNNRegressor())])
scratch_nmf_pipe.fit(X_train, y_train)

# nmf scratch knn: validation error
scratch_y_validation_nmf_pred = scratch_nmf_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, scratch_y_validation_nmf_pred, squared=False)
mae = mean_absolute_error(y_validation, scratch_y_validation_nmf_pred)
print('nmf scratch knn - validation rmse:', rmse)
print('nmf scratch knn - validation mae:', mae)

# nmf scratch knn: test error
scratch_y_test_nmf_pred = scratch_nmf_pipe.predict(X_test)
rmse = mean_squared_error(y_test, scratch_y_test_nmf_pred, squared=False)
mae = mean_absolute_error(y_test, scratch_y_test_nmf_pred)
print('nmf scratch knn - test rmse:', rmse)
print('nmf scratch knn - test mae:', mae)

# nmf scratch knn: cross validation
scratch_knn_nmf_scores = cross_validate(scratch_nmf_pipe, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('nmf scratch knn - cross val:', scratch_knn_pca_scores['test_r2'].mean())


Maximum number of iterations 200 reached. Increase it to improve convergence.

757it [00:00, 8708.37it/s]


nmf scratch knn - validation rmse: 0.8053322833711429
nmf scratch knn - validation mae: 0.6634551905358343


758it [00:00, 8486.17it/s]


nmf scratch knn - test rmse: 0.8283467781594301
nmf scratch knn - test mae: 0.6918294048723355


682it [00:00, 8387.65it/s]
6133it [00:00, 8439.76it/s]
682it [00:00, 8772.30it/s]
6133it [00:00, 8505.66it/s]

Maximum number of iterations 200 reached. Increase it to improve convergence.

682it [00:00, 8604.17it/s]
6133it [00:00, 8518.35it/s]

Maximum number of iterations 200 reached. Increase it to improve convergence.

682it [00:00, 7971.56it/s]
6133it [00:00, 8265.59it/s]
682it [00:00, 8184.10it/s]
6133it [00:00, 8565.70it/s]

Maximum number of iterations 200 reached. Increase it to improve convergence.

681it [00:00, 8628.33it/s]
6134it [00:00, 8735.68it/s]

Maximum number of iterations 200 reached. Increase it to improve convergence.

681it [00:00, 8572.60it/s]
6134it [00:00, 8729.35it/s]

Maximum number of iterations 200 reached. Increase it to improve convergence.

681it [00:00, 8706.78it/s]
6134it [00:00, 8587.04it/s]

Maximum number of iterations 200 reached. Increase it to improve convergence.

681it [00:00, 8303.84it/s]
6134it [00:00, 8468.30it/s]

Maximum number of iterat

nmf scratch knn - cross val: -0.15740627997345893





In [70]:
# nmf scratch knn: t test
t_stat, p_val = stats.ttest_ind(scratch_y_test_nmf_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.9073576240999198
t-value: 0.11639160571431567


In [27]:
# tsne scratch knn: model
scratch_knn_tsne = KNNRegressor()
scratch_knn_tsne.fit(X_train_tsne, y_train)

# tsne scratch knn: validation error
scratch_y_validation_tsne_pred = scratch_knn_tsne.predict(X_validation_tsne)
rmse = mean_squared_error(y_validation, scratch_y_validation_tsne_pred, squared=False)
mae = mean_absolute_error(y_validation, scratch_y_validation_tsne_pred)
print('tsne scratch knn - validation rmse:', rmse)
print('tsne scratch knn - validation mae:', mae)

# tsne scratch knn: test error
scratch_y_test_tsne_pred = scratch_knn_tsne.predict(X_test_tsne)
rmse = mean_squared_error(y_test, scratch_y_test_tsne_pred, squared=False)
mae = mean_absolute_error(y_test, scratch_y_test_tsne_pred)
print('tsne scratch knn - test rmse:', rmse)
print('tsne scratch knn - test mae:', mae)

# nmf scratch knn: cross validation
scratch_knn_tsne_scores = cross_validate(scratch_knn_tsne, pd.concat([X_train_tsne, X_validation_tsne]), y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('nmf scratch knn - cross val:', scratch_knn_tsne_scores['test_r2'].mean())

757it [00:00, 7067.58it/s]


tsne scratch knn - validation rmse: 0.8618173197973306
tsne scratch knn - validation mae: 0.7107605969598132


758it [00:00, 8469.96it/s]


tsne scratch knn - test rmse: 0.8664791887807098
tsne scratch knn - test mae: 0.7252987861431605


682it [00:00, 9409.62it/s]
6133it [00:00, 9838.96it/s]
682it [00:00, 9534.89it/s]
6133it [00:00, 9451.68it/s]
682it [00:00, 9465.26it/s]
6133it [00:00, 9593.78it/s]
682it [00:00, 9214.24it/s]
6133it [00:00, 9501.87it/s]
682it [00:00, 9634.97it/s]
6133it [00:00, 9649.81it/s]
681it [00:00, 8826.13it/s]
6134it [00:00, 9120.20it/s]
681it [00:00, 9149.80it/s]
6134it [00:00, 9174.26it/s]
681it [00:00, 9101.87it/s]
6134it [00:00, 9185.18it/s]
681it [00:00, 9085.83it/s]
6134it [00:00, 9109.55it/s]
681it [00:00, 9845.34it/s]
6134it [00:00, 9564.35it/s]

nmf scratch knn - cross val: -0.1635470913252107





In [71]:
# tsne scratch knn: t test
t_stat, p_val = stats.ttest_ind(scratch_y_test_tsne_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.47826165053953573
t-value: -0.7092772512587519


In [28]:
# pt scratch knn: model
scratch_pt_pipe = Pipeline([('pt', PowerTransformer()), ('knn', KNNRegressor())])
scratch_pt_pipe.fit(X_train, y_train)

# pt scratch knn: validation error
scratch_y_validation_pt_pred = scratch_pt_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, scratch_y_validation_pt_pred, squared=False)
mae = mean_absolute_error(y_validation, scratch_y_validation_pt_pred)
print('pt scratch knn - validation rmse:', rmse)
print('pt scratch knn - validation mae:', mae)

# pt scratch knn: test error
scratch_y_test_pt_pred = scratch_pt_pipe.predict(X_test)
rmse = mean_squared_error(y_test, scratch_y_test_pt_pred, squared=False)
mae = mean_absolute_error(y_test, scratch_y_test_pt_pred)
print('pt scratch knn - test rmse:', rmse)
print('pt scratch knn - test mae:', mae)

# pt scratch knn: cross validation
scratch_knn_pt_scores = cross_validate(scratch_pt_pipe, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('pt scratch knn - cross val:', scratch_knn_pt_scores['test_r2'].mean())

757it [00:00, 1640.16it/s]


pt scratch knn - validation rmse: 0.7316729981444197
pt scratch knn - validation mae: 0.593990689025661


758it [00:00, 1467.50it/s]


pt scratch knn - test rmse: 0.755002947696682
pt scratch knn - test mae: 0.617685284382814


682it [00:00, 1516.87it/s]
6133it [00:04, 1497.97it/s]
682it [00:00, 1441.95it/s]
6133it [00:03, 1548.07it/s]
682it [00:00, 1441.47it/s]
6133it [00:04, 1493.74it/s]
682it [00:00, 1477.74it/s]
6133it [00:04, 1522.11it/s]
682it [00:00, 1417.90it/s]
6133it [00:04, 1405.89it/s]
681it [00:00, 1518.27it/s]
6134it [00:04, 1506.75it/s]
681it [00:00, 1414.76it/s]
6134it [00:04, 1470.61it/s]
681it [00:00, 1503.12it/s]
6134it [00:04, 1519.92it/s]
681it [00:00, 1537.90it/s]
6134it [00:04, 1506.33it/s]
681it [00:00, 1555.70it/s]
6134it [00:03, 1574.05it/s]

pt scratch knn - cross val: -0.0018148411094722427





In [72]:
# pt scratch knn: t test
t_stat, p_val = stats.ttest_ind(scratch_y_test_pt_pred, y_test)
alpha = 0.05
print('p-value:', p_val)
print('t-value:', t_stat)
if p_val < alpha:
    print("result is statistically significant")

p-value: 0.8245972756459519
t-value: 0.22167437338907126
