In [17]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [5]:
# Load the StockX data-contest workbook straight from S3.
# (The download takes around 9 seconds to run, so don't panic.)
STOCKX_XLSX_URL = 'https://s3.amazonaws.com/stockx-sneaker-analysis/wp-content/uploads/2019/02/StockX-Data-Contest-2019.xlsx'

# sheet_name=1 picks the workbook's second sheet (integer positions are zero-based).
data = pd.read_excel(STOCKX_XLSX_URL, sheet_name=1)

# Keep the raw download untouched in `data`; all later edits happen on `df`.
df = data.copy()
df.head()

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,2016-09-24,11.0,California
1,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,2016-11-23,11.0,California
2,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,2016-11-23,11.0,California
3,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,2016-11-23,11.5,Kentucky
4,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,2017-02-11,11.0,Rhode Island


First, we want to change our dates into numbers so the model can work with them. Later, we can figure out how seasons affect prices (more sales during holidays, etc.), but for now we're just going to ignore all of that and treat every day of the year as the same. To do this, we'll use ordinal dates, which count the days from January 1 of year 1 (that day has ordinal 1).

We'll turn them into datetime objects then use datetime's toordinal() method.

In [6]:
# Replace both date columns with proleptic Gregorian ordinals (January 1 of
# year 1 has ordinal 1) so the models receive plain integers.
#
# Each column is converted from the raw values in `data`, not from whatever is
# currently stored in `df`. Converting `df` in place made the cell unsafe to
# re-run: on the second pass pd.to_datetime would read the already-converted
# integers as nanoseconds since the Unix epoch, and every date would silently
# become 1970-01-01.
for date_col in ('Order Date', 'Release Date'):
    df[date_col] = pd.to_datetime(data[date_col]).apply(lambda ts: ts.toordinal())

df

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,736573,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,736231,11.0,California
1,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,736291,11.0,California
2,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,736291,11.0,California
3,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,736291,11.5,Kentucky
4,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,736371,11.0,Rhode Island
...,...,...,...,...,...,...,...,...
99951,737103,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,565.0,220,737054,8.0,Oregon
99952,737103,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,598.0,220,737054,8.5,California
99953,737103,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,605.0,220,737054,5.5,New York
99954,737103,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,650.0,220,737054,11.0,California


In [7]:
# Delta is the resale markup relative to retail: 0 means the shoe sold at
# retail, 1.0 means it sold for twice its retail price.
sale_price = df['Sale Price']
retail_price = df['Retail Price']
df['Delta'] = (sale_price - retail_price) / retail_price
df

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region,Delta
0,736573,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,736231,11.0,California,3.986364
1,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,736291,11.0,California,2.113636
2,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,736291,11.0,California,2.136364
3,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,736291,11.5,Kentucky,3.886364
4,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,736371,11.0,Rhode Island,2.763636
...,...,...,...,...,...,...,...,...,...
99951,737103,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,565.0,220,737054,8.0,Oregon,1.568182
99952,737103,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,598.0,220,737054,8.5,California,1.718182
99953,737103,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,605.0,220,737054,5.5,New York,1.750000
99954,737103,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,650.0,220,737054,11.0,California,1.954545


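We want our model to predict how far above (or below) retail a shoe sells, given all the other data, so we're going to split our data into two datasets: X (features: everything except `Sale Price`, `Retail Price` and `Delta`) and Y (target: `Delta`, the sale price's markup over retail as a fraction of the retail price).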

Note: Since our data consists of a bunch of different things, from numbers to dates to strings, I think we'll have to use something like one-hot encoding to convert all the non-numbers into numbers that our model can deal with. We could also use label encoding, where we just assign each label its own number. I'm not sure what the pros and cons of each are, so we'll have to research this further. For now, though, I just use pandas' `get_dummies` function, which does one-hot encoding.

In [8]:
# Target: the relative markup over retail stored in the 'Delta' column.
Y = df['Delta']

# Features: every column except the target and the two price columns the
# target is computed from.
# The reduced frame gets its own name instead of overwriting `df`, so this
# cell can be re-run without a KeyError on an already-dropped 'Delta' column.
features = df.drop(['Sale Price', 'Retail Price', 'Delta'], axis=1)

# One-hot encode the text columns; numeric columns pass through unchanged.
X = pd.get_dummies(features)
X

Unnamed: 0,Order Date,Release Date,Shoe Size,Brand_ Yeezy,Brand_Off-White,Sneaker Name_Adidas-Yeezy-Boost-350-Low-Moonrock,Sneaker Name_Adidas-Yeezy-Boost-350-Low-Oxford-Tan,Sneaker Name_Adidas-Yeezy-Boost-350-Low-Pirate-Black-2015,Sneaker Name_Adidas-Yeezy-Boost-350-Low-Pirate-Black-2016,Sneaker Name_Adidas-Yeezy-Boost-350-Low-Turtledove,...,Buyer Region_South Dakota,Buyer Region_Tennessee,Buyer Region_Texas,Buyer Region_Utah,Buyer Region_Vermont,Buyer Region_Virginia,Buyer Region_Washington,Buyer Region_West Virginia,Buyer Region_Wisconsin,Buyer Region_Wyoming
0,736573,736231,11.0,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,736573,736291,11.0,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,736573,736291,11.0,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,736573,736291,11.5,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,736573,736371,11.0,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99951,737103,737054,8.0,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
99952,737103,737054,8.5,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
99953,737103,737054,5.5,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
99954,737103,737054,11.0,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


We use scikit-learn's `train_test_split` to separate X and Y into data that we will train our model on and data that we will test our model on. We can play around with how much of the dataset we want to use to train and test, but a good starting point is 80/20, so we'll make test_size = 0.2. We then split that held-out 20% in half, which gives us a 10% validation set and a 10% test set.

In [23]:
# Two-stage split, using one fixed seed so the partitions are reproducible:
#   1) hold out 20% of the rows, leaving 80% for training;
#   2) cut the held-out rows in half -> 10% validation, 10% test.
SPLIT_SEED = 42

X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=SPLIT_SEED,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

Now we can actually train the models and have them predict. We'll use the mean squared error (MSE) to look at how far off we are, along with the R^2 score.

In [24]:
# Fixed seed for the estimators that use randomness, so the scores are
# reproducible from one Restart & Run All to the next.
MODEL_SEED = 42

# models we'll use
# NOTE(review): Lasso and ElasticNet run with their default regularisation
# strength on unscaled features; a weak score for them may reflect that
# rather than the model family -- worth tuning before ruling them out.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=MODEL_SEED),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=MODEL_SEED),
    'Decision Tree': DecisionTreeRegressor(random_state=MODEL_SEED)
}

# for each model, we want to train it, predict, then grade it
# NOTE(review): models are scored on the test split; X_val / y_val from the
# split cell are not used here. Consider comparing models on the validation
# split and keeping the test split for a final check.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    results[name] = {'MSE': mse, 'R^2': r2}

# results: one row per model, one column per metric
results_df = pd.DataFrame(results).T

print(results_df)

                        MSE       R^2
Linear Regression  0.291453  0.868781
Ridge Regression   0.291394  0.868808
Lasso Regression   2.135622  0.038494
ElasticNet         2.135648  0.038482
Random Forest      0.031201  0.985953
XGBoost            0.037280  0.983216
Decision Tree      0.050169  0.977413


When we try other models, we can compare the MSE from this run to theirs. It'll also be a good metric for seeing how our tweaks affect the error (for example, changing test_size in the split cell above to 0.1 and checking whether the error increases or decreases).

We can also use other error metrics, such as mean absolute error, but for now this is a fine starting point.