In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

In [81]:
# read in datasets
train_df = pd.read_csv('train.csv')
validation_df = pd.read_csv('validation.csv')
test_df = pd.read_csv('test.csv')

# split into features and labels
X_train = train_df.drop(columns='profit_margin')
y_train = train_df['profit_margin']
X_validation = validation_df.drop(columns='profit_margin')
y_validation = validation_df['profit_margin']
X_test = test_df.drop(columns='profit_margin')
y_test = test_df['profit_margin']

# combine train and validation datasets for cross validation
X_full_train = pd.concat([X_train, X_validation])
y_full_train = pd.concat([y_train, y_validation])

In [118]:
print(X_train.shape)
print(y_train.shape)
print(X_validation.shape)
print(y_validation.shape)
print(X_test.shape)
print(y_test.shape)

(6058, 72)
(6058,)
(757, 72)
(757,)
(758, 72)
(758,)


In [78]:
# base sklearn knn: tune k
errors = []

for k in range(1, 11):
    knn = KNeighborsRegressor(n_neighbors=k)
    y_validation_pred = knn.fit(X_train, y_train).predict(X_validation)
    rmse = mean_squared_error(y_validation, y_validation_pred, squared=False)
    mae = mean_absolute_error(y_validation, y_validation_pred)
    errors.append([k, rmse, mae])

errors_df = pd.DataFrame(errors, columns=['k', 'rmse', 'mae'])
errors_df.set_index('k', inplace=True)
errors_df

Unnamed: 0_level_0,rmse,mae
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.982311,0.770256
2,0.815751,0.637314
3,0.76211,0.600762
4,0.743401,0.592645
5,0.729701,0.586215
6,0.72623,0.58396
7,0.719044,0.581447
8,0.715052,0.57935
9,0.713323,0.585036
10,0.711666,0.58603


In [79]:
# plot error against k
fig = px.bar(errors_df, x=errors_df.index, y=['rmse', 'mae'], barmode='group')
fig.update_layout(yaxis_title='error')
fig.show()

In [86]:
# base sklearn knn: model
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

In [87]:
# base sklearn knn: validation error
y_val_pred = knn.predict(X_validation)
rmse = mean_squared_error(y_validation, y_val_pred, squared=False)
mae = mean_absolute_error(y_validation, y_val_pred)
print('base sklearn knn - validation rmse:', rmse)
print('base sklearn knn - validation mae:', mae)

base sklearn knn - validation rmse: 0.7297008670625598
base sklearn knn - validation mae: 0.5862147672013993


In [88]:
# base sklearn knn: test error
y_test_pred = knn.predict(X_test)
rmse = mean_squared_error(y_test, y_test_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_pred)
print('base sklearn knn - test rmse:', rmse)
print('base sklearn knn - test mae:', mae)

base sklearn knn - test rmse: 0.74848615637268
base sklearn knn - test mae: 0.6205282207673316


In [51]:
# base sklearn knn: cross validation
knn = KNeighborsRegressor() # using default n_neighbors = 5
knn_scores = cross_validate(knn, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('base sklearn knn - cross val:', knn_scores['test_r2'].mean())

base sklearn knn - cross val: -0.0007576566474793766


In [57]:
# pca sklearn knn: tune n_components
errors_pca = []

for components in range(1, 11):
    pca_pipe = Pipeline([('pca', PCA(n_components=components)), ('knn', KNeighborsRegressor())])
    pca_pipe.fit(X_train, y_train)
    y_validation_pca_pred = pca_pipe.predict(X_validation)
    rmse = mean_squared_error(y_validation, y_validation_pca_pred, squared=False)
    mae = mean_absolute_error(y_validation, y_validation_pca_pred)
    errors_pca.append([components, rmse, mae])

errors_pca_df = pd.DataFrame(errors_pca, columns=['components', 'rmse', 'mae'])
errors_pca_df.set_index('components', inplace=True)
errors_pca_df

Unnamed: 0_level_0,rmse,mae
components,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.816,0.682462
2,0.80025,0.656975
3,0.779946,0.641774
4,0.794494,0.644978
5,0.783474,0.637067
6,0.762806,0.619811
7,0.770304,0.622595
8,0.771942,0.621853
9,0.775164,0.622783
10,0.790706,0.641209


In [106]:
# visualize pca components
pca = PCA(n_components=3)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train))

labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

fig = px.scatter_matrix(
    X_train_pca,
    labels=labels,
    dimensions=range(2)
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [107]:
# pca: explained variance
sum = 0
for i, var in enumerate(pca.explained_variance_ratio_ * 100):
    print(i + 1, 'components - total explained variance:', var)
    sum += var
print('total explained variance:', sum)

1 components - total explained variance: 7.753249386733228
2 components - total explained variance: 7.343958899810564
3 components - total explained variance: 4.718960496555792
total explained variance: 19.816168783099585


In [108]:
# pca sklearn knn: model
pca_pipe = Pipeline([('pca', PCA(n_components=3)), ('knn', KNeighborsRegressor())])
pca_pipe.fit(X_train, y_train)

In [109]:
# pca sklearn knn: validation error
y_validation_pca_pred = pca_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, y_validation_pca_pred, squared=False)
mae = mean_absolute_error(y_validation, y_validation_pca_pred)
print('pca sklearn knn - validation rmse:', rmse)
print('pca sklearn knn - validation mae:', mae)

pca sklearn knn - validation rmse: 0.7833362747729539
pca sklearn knn - validation mae: 0.645123612171877


In [123]:
# pca sklearn knn: test error
y_test_pca_pred = pca_pipe.predict(X_test)
rmse = mean_squared_error(y_test, y_test_pca_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_pca_pred)
print('pca sklearn knn - validation rmse:', rmse)
print('pca sklearn knn - validation mae:', mae)

pca sklearn knn - validation rmse: 0.8126037390881704
pca sklearn knn - validation mae: 0.6725658998554526


In [122]:
# pca sklearn knn: cross validation
knn_pca_scores = cross_validate(pca_pipe, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('pca sklearn knn - cross val:', knn_pca_scores['test_r2'].mean())

pca sklearn knn - cross val: -0.15923072426640625


In [127]:
# poly sklearn knn: model
poly_pipe = Pipeline([('poly', PolynomialFeatures()), ('knn', KNeighborsRegressor())])
poly_pipe.fit(X_train, y_train)

In [130]:
# poly sklearn knn: validation error
y_validation_poly_pred = poly_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, y_validation_poly_pred, squared=False)
mae = mean_absolute_error(y_validation, y_validation_poly_pred)
print('poly sklearn knn - validation rmse:', rmse)
print('poly sklearn knn - validation mae:', mae)

poly sklearn knn - validation rmse: 0.7517339572147149
poly sklearn knn - validation mae: 0.6107020927695729


In [132]:
# poly sklearn knn: test error
y_test_poly_pred = poly_pipe.predict(X_test)
rmse = mean_squared_error(y_test, y_test_poly_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_poly_pred)
print('poly sklearn knn - test rmse:', rmse)
print('poly sklearn knn - test mae:', mae)

poly sklearn knn - test rmse: 0.7663638165652618
poly sklearn knn - test mae: 0.6364074364308808


In [135]:
# poly sklearn knn: cross validation
knn_poly_scores = cross_validate(poly_pipe, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('poly sklearn knn - cross val:', knn_poly_scores['test_r2'].mean())

poly sklearn knn - cross val: -0.05303269738774391


In [133]:
# nmf sklearn knn: model
nmf_pipe = Pipeline([('nmf',  NMF(n_components=2)), ('knn', KNeighborsRegressor())])
nmf_pipe.fit(X_train, y_train)


Maximum number of iterations 200 reached. Increase it to improve convergence.



In [136]:
# nmf sklearn knn: validation error
y_validation_nmf_pred = nmf_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, y_validation_nmf_pred, squared=False)
mae = mean_absolute_error(y_validation, y_validation_nmf_pred)
print('nmf sklearn knn - validation rmse:', rmse)
print('nmf sklearn knn - validation mae:', mae)

nmf sklearn knn - validation rmse: 0.8053322833711429
nmf sklearn knn - validation mae: 0.6634551905358342


In [137]:
# nmf sklearn knn: test error
y_test_nmf_pred = nmf_pipe.predict(X_test)
rmse = mean_squared_error(y_test, y_test_nmf_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_nmf_pred)
print('nmf sklearn knn - test rmse:', rmse)
print('nmf sklearn knn - test mae:', mae)

nmf sklearn knn - test rmse: 0.8283467781594301
nmf sklearn knn - test mae: 0.6918294048723355


In [152]:
# nmf sklearn knn: cross validation
knn_nmf_scores = cross_validate(nmf_pipe, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('nmf sklearn knn - cross val:', knn_nmf_scores['test_r2'].mean())


Maximum number of iterations 200 reached. Increase it to improve convergence.


Maximum number of iterations 200 reached. Increase it to improve convergence.


Maximum number of iterations 200 reached. Increase it to improve convergence.


Maximum number of iterations 200 reached. Increase it to improve convergence.


Maximum number of iterations 200 reached. Increase it to improve convergence.


Maximum number of iterations 200 reached. Increase it to improve convergence.



nmf sklearn knn - cross val: -0.16895020044590098



Maximum number of iterations 200 reached. Increase it to improve convergence.



In [143]:
# tsne sklearn knn: model
tsne = TSNE()
X_train_tsne = pd.DataFrame(tsne.fit_transform(X_train))
X_validation_tsne = pd.DataFrame(tsne.fit_transform(X_validation))
X_test_tsne = pd.DataFrame(tsne.fit_transform(X_test))

knn_tsne = KNeighborsRegressor()
knn_tsne.fit(X_train_tsne, y_train)

In [145]:
# tsne sklearn knn: validation error
y_validation_tsne_pred = knn_tsne.predict(X_validation_tsne)
rmse = mean_squared_error(y_validation, y_validation_tsne_pred, squared=False)
mae = mean_absolute_error(y_validation, y_validation_tsne_pred)
print('tsne sklearn knn - validation rmse:', rmse)
print('tsne sklearn knn - validation mae:', mae)

tsne sklearn knn - validation rmse: 0.8328837495385479
tsne sklearn knn - validation mae: 0.6826482186850078


In [147]:
# tsne sklearn knn: test error
y_test_tsne_pred = knn_tsne.predict(X_test_tsne)
rmse = mean_squared_error(y_test, y_test_tsne_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_tsne_pred)
print('tsne sklearn knn - test rmse:', rmse)
print('tsne sklearn knn - test mae:', mae)

tsne sklearn knn - test rmse: 0.8654705244202453
tsne sklearn knn - test mae: 0.7210774044351557


In [148]:
# nmf sklearn knn: cross validation
knn_tsne_scores = cross_validate(knn_tsne, pd.concat([X_train_tsne, X_validation_tsne]), y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('nmf sklearn knn - cross val:', knn_tsne_scores['test_r2'].mean())

nmf sklearn knn - cross val: -0.15232450044813622


In [149]:
# pt sklearn knn: model
pt_pipe = Pipeline([('pt',  PowerTransformer()), ('knn', KNeighborsRegressor())])
pt_pipe.fit(X_train, y_train)

In [150]:
# pt sklearn knn: validation error
y_validation_pt_pred = pt_pipe.predict(X_validation)
rmse = mean_squared_error(y_validation, y_validation_pt_pred, squared=False)
mae = mean_absolute_error(y_validation, y_validation_pt_pred)
print('pt sklearn knn - validation rmse:', rmse)
print('pt sklearn knn - validation mae:', mae)

pt sklearn knn - validation rmse: 0.7316729981444196
pt sklearn knn - validation mae: 0.593990689025661


In [151]:
# pt sklearn knn: test error
y_test_pt_pred = pt_pipe.predict(X_test)
rmse = mean_squared_error(y_test, y_test_pt_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_pt_pred)
print('pt sklearn knn - test rmse:', rmse)
print('pt sklearn knn - test mae:', mae)

pt sklearn knn - test rmse: 0.755002947696682
pt sklearn knn - test mae: 0.6176852843828139


In [154]:
# pt sklearn knn: cross validation
knn_pt_scores = cross_validate(pt_pipe, X_full_train, y_full_train, cv=10, scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'), return_train_score=True)
print('pt sklearn knn - cross val:', knn_pt_scores['test_r2'].mean())

pt sklearn knn - cross val: -0.0018148411094721872


In [155]:
# scratch knn implementation
class KNNRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors: int = 5):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y
        return self
    
    def predict(self, X):
        check_is_fitted(self)
        X = check_array(X)
        
        y = np.zeros(X.shape[0])

        for i, x in enumerate(X):
            dist = np.linalg.norm(self.X_ - x, axis=1)
            # print(dist)
            k_idx = np.argpartition(dist, self.n_neighbors)[: self.n_neighbors]
            # print(self.)
            k_avg = np.mean(self.y_[k_idx])
            y[i] = k_avg

        return y