# Watch Price Valuator

## Imports

In [None]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from lightgbm import early_stopping
from lightgbm import log_evaluation

import optuna
import optuna.integration.lightgbm as lgb_tuner


%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases removed the old names, so use whichever one this
# installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
mpl.rcParams['figure.figsize'] = (12, 6)
# Larger default font sizes for titles, legends, ticks and axis labels.
plt.rc('axes', titlesize=22, labelsize=14)
plt.rc('figure', titlesize=22)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)

## Data

### Extract

In [None]:
sqlite_file_name = "../scraper/database.db"
sqlite_url = f"sqlite:///{sqlite_file_name}"

In [None]:
df_original = pd.read_sql_table('watch', sqlite_url)

In [None]:
df = df_original.copy()

### Transform

In [None]:
df.columns

In [None]:
df.drop(['id', 'model_num', 'model_id', 'product_url', 'image_url', 'image_filename'], axis=1, inplace=True)

In [None]:
df.dtypes

In [None]:
df = df.astype({
    'brand': 'category',
    'series': 'category',
    # 'model_num': 'category'
})

In [None]:
df.shape

In [None]:
df.head()

In [None]:
X, y = df.drop('price', axis=1), df['price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42)

### Inspect

In [None]:
# Heatmap of df.isnull(): shows where values are missing in each column.
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')
plt.show()

In [None]:
# Bar chart of the first 30 entries of value_counts() (the most frequent
# values) for each categorical feature in the training split.
for col in ['brand', 'series']:#, 'model_num']:
    X_train[col].value_counts()[:30].plot(kind='bar')
    plt.title(col)
    plt.show()

In [None]:
# Distribution of the 'year' feature in the training split.
sns.histplot(data=X_train, x='year')
plt.title('year')
plt.show()

In [None]:
# Counts of each 'box' value in the training split.
X_train['box'].value_counts().plot(kind='bar')
plt.title('box')
plt.show()

In [None]:
# Counts of each 'papers' value in the training split.
X_train['papers'].value_counts().plot(kind='bar')
plt.title('papers')
plt.show()

In [None]:
# Distribution of the training target (price).
sns.histplot(y_train)
plt.title('price')
plt.show()

In [None]:
# Same distribution after a natural-log transform of the target.
sns.histplot(np.log(y_train))
plt.title('log price')
plt.show()

### Feature Encoding

In [None]:
X_train_ = X_train.copy()
X_test_ = X_test.copy()

In [None]:
ohe_cols = ['brand', 'series']

In [None]:
ohe = OneHotEncoder()
ohe.fit(X[ohe_cols])

In [None]:
X_train_.reset_index(inplace=True)

In [None]:
X_train_ohe = pd.DataFrame(ohe.transform(X_train_[ohe_cols]).toarray(), columns=ohe.get_feature_names())

In [None]:
X_train_ = X_train_.drop(ohe_cols, axis=1).join(X_train_ohe)

In [None]:
X_train_['box'] = X_train_['box'].map({True: 1, False: 0})

In [None]:
X_train_['papers'] = X_train_['papers'].map({True: 1, False: 0})

In [None]:
X_test_.reset_index(inplace=True)
X_test_ohe = pd.DataFrame(ohe.transform(X_test_[ohe_cols]).toarray(), columns=ohe.get_feature_names())
X_test_ = X_test_.drop(ohe_cols, axis=1).join(X_test_ohe)
X_test_['box'] = X_test_['box'].map({True: 1, False: 0})
X_test_['papers'] = X_test_['papers'].map({True: 1, False: 0})

In [None]:
-cross_val_score(LinearRegression(), X_train_, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(Lasso(), X_train_, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(Ridge(), X_train_, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(DecisionTreeRegressor(), X_train_, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(RandomForestRegressor(), X_train_, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(XGBRegressor(), X_train_, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(LGBMRegressor(), X_train_, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

### Embeddings

In [None]:
X_train_ = X_train.copy()
X_test_ = X_test.copy()

In [None]:
X_train_['box'] = X_train_['box'].map({True: 1, False: 0})
X_train_['papers'] = X_train_['papers'].map({True: 1, False: 0})

In [None]:
def rm_spaces(feature):
    """Return ``feature`` keeping only ASCII letters and digits.

    Despite the name, every character outside ``0-9a-zA-Z`` is removed
    (spaces, punctuation, accented letters), so a multi-word name collapses
    into a single token.
    """
    # Splitting on each unwanted character and gluing the pieces back
    # together leaves exactly the characters that did not match.
    kept_pieces = re.split(r'[^0-9a-zA-Z]', feature)
    return ''.join(kept_pieces)

In [None]:
brand_tokenizer = Tokenizer(oov_token="<OOV>")
brand_tokenizer.fit_on_texts(X['brand'].apply(lambda x: rm_spaces(x)))
brand_tokenizer.word_index

In [None]:
series_tokenizer = Tokenizer(oov_token="<OOV>")
series_tokenizer.fit_on_texts(X['series'].apply(lambda x: rm_spaces(x)))
series_tokenizer.word_index

In [None]:
X_train.reset_index()

In [None]:
brand_train_labels = brand_tokenizer.texts_to_sequences(X_train['brand'].apply(lambda x: rm_spaces(x)))
brand_test_labels = brand_tokenizer.texts_to_sequences(X_test['brand'].apply(lambda x: rm_spaces(x)))

series_train_labels = series_tokenizer.texts_to_sequences(X_train['series'].apply(lambda x: rm_spaces(x)))
series_test_labels = series_tokenizer.texts_to_sequences(X_test['series'].apply(lambda x: rm_spaces(x)))

In [None]:
brand_train_labels = tf.squeeze(tf.constant(brand_train_labels))
brand_test_labels = tf.squeeze(tf.constant(brand_test_labels))

series_train_labels = tf.squeeze(tf.constant(series_train_labels))
series_test_labels = tf.squeeze(tf.constant(series_test_labels))

In [None]:
# Number of entries in each fitted tokenizer's word index.
num_brand = len(brand_tokenizer.word_index)
num_series = len(series_tokenizer.word_index)

In [None]:
num_brand

In [None]:
num_series

In [None]:
# Fourth root of the vocabulary size — presumably a rule-of-thumb starting
# point for the embedding width; the widths actually used are set by hand in
# the model cell.
num_brand**0.25

In [None]:
num_series**0.25

In [None]:
early_stopping_keras = EarlyStopping(patience=30, restore_best_weights=True)

In [None]:
brand_dim = 5
series_dim = 15


## Create the layers
# First input
input_brand = tf.keras.layers.Input(shape=(1,))
# Second input
input_series = tf.keras.layers.Input(shape=(1,))
# First embedding layer
embedding_brand = Embedding(input_dim=num_brand+1, output_dim=brand_dim,
                           name='embedding_brand')(input_brand)
# Second embedding layer
embedding_series = Embedding(input_dim=num_series+1, output_dim=series_dim,
                            name='embedding_series')(input_series)
# Concatenate the output of both embedding layers
concat = tf.keras.layers.Concatenate()([embedding_brand, embedding_series])

#
# concat = tf.keras.layers.Dense(12)(concat)

# A single neuron out with no activation function as this is a regression problem
output = tf.keras.layers.Dense(1)(concat)

# Create the model and feed it the layers to expect as inputs and outputs
model = tf.keras.Model(inputs=[input_brand, input_series],
                       outputs=output)
# Compile the model, with MSE as a loss function and Adam for the optimizer
model.compile(loss='mean_squared_error',
              optimizer=tf.keras.optimizers.Adam(lr=3e-4),
              metrics=[tf.keras.metrics.RootMeanSquaredError()])
# Fit the model!
history = model.fit((brand_train_labels, series_train_labels), y_train.to_numpy(),
                    validation_data=((brand_test_labels, series_test_labels),
                                     y_test.to_numpy()), epochs=3000, callbacks=[early_stopping_keras], verbose=2)
# List of performance histories
# histories.append(history)

In [None]:
# Persist the trained embedding model to an HDF5 file in the working directory.
model.save('trained_embedding_model.h5')

In [None]:
# Training vs. validation RMSE per epoch ('test' in the legend is the
# validation data passed to fit()).
plt.plot(history.history['root_mean_squared_error'])
plt.plot(history.history['val_root_mean_squared_error'])
plt.title('model root_mean_squared_error')
plt.ylabel('root_mean_squared_error')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='best')
plt.show()
# Training vs. validation loss (MSE) per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='best')
plt.show()

In [None]:
# Look up both trained embedding layers by name ...
embedding_layer_brand = model.get_layer(name='embedding_brand')
embedding_layer_series = model.get_layer(name='embedding_series')

# ... and freeze them so their weights can no longer be updated.
embedding_layer_brand.trainable = False
embedding_layer_series.trainable = False

In [None]:
# Create 2 new 'models', which receive the label sequences and pass them
# through the trained embedding layers to generate n and m features per
# sample, for brand and series respectively.
# (These statements must start at column 0: indented top-level code raises
# an IndentationError.)

# Sequential model
feature_generator_brand = tf.keras.models.Sequential([
    # Input layer to establish the array shape to expect.
    tf.keras.layers.InputLayer(input_shape=[1,]),
    # The learned embedding layer!
    embedding_layer_brand
])
# Same for second model
feature_generator_series = tf.keras.models.Sequential([
    tf.keras.layers.InputLayer(input_shape=[1,]),
    embedding_layer_series
])

In [None]:
# Third entry of the layer's output shape — presumably the embedding width
# (brand_dim); confirm against the value displayed.
embedding_layer_brand.output_shape[2]

In [None]:
# Run the training labels through the brand feature generator (predict, not
# fit) to obtain the embedding features.
brand_array = feature_generator_brand.predict(brand_train_labels)
print(brand_array.shape)
# Remove inner dimension, to go from 3D to 2D
brand_array = np.squeeze(brand_array, axis=1)
print(brand_array.shape)

In [None]:
# Same for the series feature generator.
series_array = feature_generator_series.predict(series_train_labels)
print(series_array.shape)
# Remove inner dimension, to go from 3D to 2D
series_array = np.squeeze(series_array, axis=1)
print(series_array.shape)

In [None]:
# Place the brand and series embedding columns side by side.
features_array = np.concatenate((brand_array, series_array), axis=1)

In [None]:
# Column names: brand_embedding_1..brand_dim, then series_embedding_1..series_dim.
embedding_cols = ["brand_embedding_" + str(i+1) for i in range(brand_dim)] + ["series_embedding_" + str(i+1) for i in range(series_dim)]

In [None]:
embedding_df = pd.DataFrame(features_array, columns=embedding_cols)

In [None]:
# NOTE(review): this rebinds brand_train_labels to the token sequences of the
# full X (a plain list rather than the squeezed training tensor). Nothing
# visible below this cell reads it — confirm whether the cell is still needed.
brand_train_labels = brand_tokenizer.texts_to_sequences(X['brand'].apply(lambda x: rm_spaces(x)))

In [None]:
def engineer_features(X: pd.DataFrame) -> pd.DataFrame:
    """Replace the categorical columns of ``X`` with learned embedding features.

    Relies on the notebook globals ``brand_tokenizer``, ``series_tokenizer``,
    ``feature_generator_brand``, ``feature_generator_series``, ``brand_dim``
    and ``series_dim``.

    Args:
        X: Frame with at least the columns 'box', 'papers', 'brand' and
            'series'. It is not modified.

    Returns:
        A copy of ``X`` in which 'box' and 'papers' are mapped to 1/0, the
        'brand' and 'series' columns are removed, and ``brand_dim`` +
        ``series_dim`` embedding columns are appended. The row index of ``X``
        is preserved.
    """
    X_copy = X.copy()
    X_copy['box'] = X_copy['box'].map({True: 1, False: 0})
    X_copy['papers'] = X_copy['papers'].map({True: 1, False: 0})

    # texts_to_sequences yields one sequence per row, so the tensors are kept
    # two-dimensional. Squeezing them would collapse a single-row frame to a
    # scalar, which the generators cannot consume.
    brand_labels = tf.constant(
        brand_tokenizer.texts_to_sequences(X['brand'].apply(rm_spaces)))
    series_labels = tf.constant(
        series_tokenizer.texts_to_sequences(X['series'].apply(rm_spaces)))

    # Embedding lookup, then drop the length-1 sequence axis (3D -> 2D).
    brand_array = np.squeeze(feature_generator_brand.predict(brand_labels), axis=1)
    series_array = np.squeeze(feature_generator_series.predict(series_labels), axis=1)
    features_array = np.concatenate((brand_array, series_array), axis=1)

    embedding_cols = (
        [f"brand_embedding_{i + 1}" for i in range(brand_dim)]
        + [f"series_embedding_{i + 1}" for i in range(series_dim)]
    )
    # Reuse the caller's index: join() aligns on index labels, so a default
    # RangeIndex here would misalign (or NaN-fill) the embeddings whenever X
    # is a shuffled or filtered subset.
    embedding_df = pd.DataFrame(features_array, columns=embedding_cols,
                                index=X_copy.index)

    X_copy = X_copy.join(embedding_df)
    X_copy.drop(['brand', 'series'], axis=1, inplace=True)

    return X_copy

In [None]:
# Preview the engineered feature frame for the full data set.
engineer_features(X)

In [None]:
# Split the engineered features with the same test_size and random_state as
# the first split; y_train / y_test are rebound here as well.
X_train__, X_test__, y_train, y_test = train_test_split(
    engineer_features(X), y, test_size=0.10, random_state=42)

In [None]:
X_train__

In [None]:
-cross_val_score(LinearRegression(), X_train__, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(Lasso(), X_train__, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(Ridge(), X_train__, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(DecisionTreeRegressor(), X_train__, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(RandomForestRegressor(), X_train__, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(XGBRegressor(), X_train__, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
-cross_val_score(LGBMRegressor(), X_train__, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()

In [None]:
dtrain = lgb_tuner.Dataset(X_train__, label=y_train)

In [None]:
optuna.logging.set_verbosity(optuna.logging.FATAL)

In [None]:
optuna.logging.enable_propagation()

In [None]:
optuna.logging.enable_default_handler()

In [None]:
params = {
    "objective": "regression",
    "metric": "mean_squared_error",
    "verbosity": 0,
    "boosting_type": "gbdt",
}

In [None]:
tuner = lgb_tuner.LightGBMTunerCV(params,
                                  dtrain,
                                  num_boost_round=1000,
                                  folds=KFold(n_splits=5),
                                  callbacks=[early_stopping(100),
                                             log_evaluation(100)
                                             ],
                                  )

In [None]:
tuner.run()

In [None]:
print("Best score:", tuner.best_score**0.5)
best_params = tuner.best_params
print("Best params:", best_params)
print("  Params: ")
for key, value in best_params.items():
    print("    {}: {}".format(key, value))

In [None]:
-cross_val_score(LGBMRegressor(**best_params), X_train__, y_train, cv=5, scoring='neg_root_mean_squared_error').mean()