In [None]:
import setup_notebook

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import zscore
import seaborn as sns
from tslearn.clustering import TimeSeriesKMeans

from grandexchange.preprocess import (
    fill_missing_data,
    load_price_data,
    load_preprocessed_data,
    remove_price_outliers
)

In [None]:
# Load the price data through the project's preprocessing helper.
prices = load_price_data()

Select items which, on average, have more than 1m units sold each hour.

In [None]:
# Preview the first rows of the loaded frame.
prices.head()

In [None]:
def plot_single_item_prices(item_id, **kwargs):
    """Plot one item's price history with a shaded band of +/- its margin.

    Reads the notebook-level ``prices`` frame and uses its ``item_id``,
    ``name``, ``datetime``, ``price`` and ``margin`` columns.

    Args:
        item_id: Value matched against ``prices['item_id']``.
        **kwargs: Extra keyword arguments forwarded to ``sns.lineplot``.
            They take precedence over the default ``color`` and ``alpha``.

    Raises:
        ValueError: If ``prices`` contains no rows for ``item_id``.
    """
    # Sort by time: ``fill_between`` connects points in row order, so unsorted
    # rows would draw a garbled band. ``sort_values`` already returns a new frame.
    df = prices[prices['item_id'] == item_id].sort_values('datetime')
    if df.empty:
        raise ValueError(f'No price data found for item_id {item_id!r}')

    item_name = df['name'].unique()[0] + ' prices'

    # NOTE: ``sns.set`` changes the global seaborn/matplotlib settings, so the
    # figure size chosen here also applies to plots drawn after this call.
    sns.set(rc={'figure.figsize':(15, 7)})

    line_kwargs = {'color': 'blue', 'alpha': 0.5, **kwargs}
    ax = sns.lineplot(data=df, x='datetime', y='price', **line_kwargs)
    ax.set_title(item_name)

    # Draw the band on the same Axes as the line instead of relying on
    # pyplot's notion of the "current" Axes.
    ax.fill_between(
        df['datetime'],
        df['price'] - df['margin'],
        df['price'] + df['margin'],
        color='grey',
        alpha=0.5,
    )

In [None]:
# Example: price history for item 21003.
plot_single_item_prices(21003)

### Detect and remove outliers

A number of items seem to have a few outliers in their price data.

In [None]:
# Item 2363 is used below as the worked example for outlier detection.
plot_single_item_prices(2363)

In [None]:
# Work on a private copy of item 2363's rows and add the change in price
# between consecutive observations (ordered by datetime). Observations with no
# computable change, such as the earliest one, get a change of 0.
df = prices.loc[prices['item_id'] == 2363].copy()
price_by_time = df.sort_values('datetime')['price']
df['price_change'] = price_by_time.diff().fillna(0)

In [None]:
# Distribution of the price changes for this item, using 500 bins.
sns.histplot(df['price_change'], bins=500)

The price differences look approximately normally distributed, so z-scores can be used to flag and remove outliers.

In [None]:
# Flag observations whose price change has a z-score above the threshold and
# highlight them on the price series.
Z_THRESHOLD = 5

df['zscore'] = zscore(df['price_change'])

# Write the flag straight onto `df` instead of merging a helper frame back in.
# The merge made this cell non-repeatable: on a second run `df` already has an
# `anomalous` column, so the merge produced `anomalous_x` / `anomalous_y` and
# the lookups below failed.
# NOTE(review): only positive z-scores are flagged, so large price drops are
# not marked here - confirm that this one-sided test is intended.
df['anomalous'] = (df['zscore'] > Z_THRESHOLD).astype(int)

is_anomalous = df['anomalous'] == 1

# Kept for anything that still expects the standalone table of flagged points.
anomalies = df.loc[is_anomalous, ['item_id', 'datetime', 'anomalous']].copy()

plt.plot(df['datetime'], df['price'])
plt.scatter(
    df.loc[is_anomalous, 'datetime'],
    df.loc[is_anomalous, 'price'],
    color='red',
    label='removed points',
)
plt.legend()
plt.show()

In [None]:
# Run the project's outlier removal over the full price frame; the result
# replaces `prices`.
# NOTE(review): presumably this applies the z-score approach illustrated above
# to every item - confirm against grandexchange.preprocess.
prices = remove_price_outliers(prices)

### Impute missing periods

In [None]:
# Fill the missing periods using the project helper; the result replaces `prices`.
prices = fill_missing_data(prices)

### DTW: find clusters of similar items

In [None]:
# Load the preprocessed dataset; this replaces the `prices` frame built in the
# cells above.
# NOTE(review): n_hours=6 presumably selects a 6-hour aggregation - confirm
# against grandexchange.preprocess.
prices = load_preprocessed_data(n_hours=6)

In [None]:
def _standardise(item_prices):
    """Centre one item's prices on their mean and divide by their standard deviation."""
    return (item_prices - item_prices.mean()) / item_prices.std()


# Scale each item's prices independently so items at very different price
# levels become comparable. A per-item min-max scaling,
# (x - min) / (max - min), is the alternative formulation.
prices['price_scaled'] = prices.groupby('item_id')['price'].transform(_standardise)

In [None]:
# Plot the scaled prices of a random sample of distinct items.
N_SAMPLE_ITEMS = 10

# Sample from the distinct item ids. Sampling the raw `item_id` column picks
# rows, not items, so it can return the same item several times (fewer than the
# requested number of items) and favours items that have more rows.
item_ids = prices['item_id'].drop_duplicates()
sampled_ids = item_ids.sample(min(N_SAMPLE_ITEMS, len(item_ids)), random_state=42)

prices_sample = prices[prices['item_id'].isin(sampled_ids)]
sns.set(rc={'figure.figsize':(20, 10)})
sns.lineplot(data=prices_sample, x='datetime', y='price_scaled', hue='name')

In [None]:
# Reshape the time series for DTW clustering: one row per datetime and one
# column per item name, holding the scaled price.
# (TimeSeriesKMeans is imported in the notebook's import cell.)
prices_wide = prices.pivot(index='datetime', columns='name', values='price_scaled')

In [None]:
%%time

# k-means over the item price series using the DTW metric.
kmeans_params = dict(
    n_clusters=5,
    metric="dtw",
    random_state=42,
    max_iter=5,
    max_iter_barycenter=5,
    n_jobs=-1,
)
dba_km = TimeSeriesKMeans(**kmeans_params)

# `prices_wide` has one column per item, so transpose to get one series per row.
series_by_item = prices_wide.to_numpy().T
y_pred = dba_km.fit_predict(series_by_item)

In [None]:
# Lookup table from item name to its predicted cluster label. `y_pred` has one
# entry per column of `prices_wide`, in the same order.
cluster_key = pd.DataFrame({'name': prices_wide.columns}).assign(cluster=y_pred)

In [None]:
# Number of items assigned to each cluster.
cluster_key['cluster'].value_counts()

In [None]:
# Show some from each cluster
SAMPLES_PER_CLUSTER = 7

# Shuffle once, then keep the first rows of every cluster. Unlike
# `groupby('cluster').sample(n)`, this does not raise when a cluster has fewer
# than n members - small clusters simply contribute all of their items.
sampled_names = (
    cluster_key
    .sample(frac=1, random_state=42)
    .groupby('cluster')
    .head(SAMPLES_PER_CLUSTER)
)

# Attach the price history of the sampled items.
cluster_sample = pd.merge(prices, sampled_names, on='name', how='inner')

In [None]:
# One panel per cluster showing the normalised prices of its sampled items.
clusters = cluster_sample['cluster'].sort_values().unique()

# Scale the figure height with the number of panels (12 per panel gives the
# original 30x60 figure for five clusters).
fig, ax = plt.subplots(len(clusters), 1, figsize=(30, 12 * len(clusters)))

# With a single panel `plt.subplots` returns a bare Axes rather than an array;
# normalise to a 1-D array so the indexing below works for any cluster count.
ax = np.atleast_1d(ax)

for i, cluster in enumerate(clusters):
    in_cluster = cluster_sample['cluster'] == cluster
    sns.lineplot(
        data=cluster_sample[in_cluster],
        x='datetime',
        y='price_scaled',
        hue='name',
        alpha=0.5,
        ax=ax[i]
    )
    ax[i].set_title(f'Normalised prices: cluster {cluster}')

plt.show()