In [18]:
import pandas as pd
import numpy as np
from copy import copy, deepcopy
import random
import datetime
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesResampler
from tslearn.clustering import TimeSeriesKMeans
import matplotlib.pyplot as plt
import sklearn as sk


# Reproducibility: seed both the stdlib and the numpy RNG with the same value
seed = 44
random.seed(seed)
np.random.seed(seed)

In [4]:
# Load the raw CSV, then derive the columns and per-group frames used downstream.
print("Reading file...")
all_df = pd.read_csv('./data.csv', low_memory=False)
print("Done reading!")

# 'timestamp' is scaled by 1e9 before conversion -- presumably epoch seconds being
# turned into the nanoseconds pd.to_datetime expects for bare numbers (TODO confirm).
all_df['time'] = pd.to_datetime(all_df['timestamp'] * 10**9)

# 'ds' (timestamp) and 'y' (value) are the column names used from here on.
all_df = all_df.rename(columns={'time': 'ds', 'percentOfMaxPrice': 'y'})

# One DataFrame per distinct 'itadPlain' value; the group key itself is not needed.
dfs = [group for _, group in all_df.groupby('itadPlain')]
print("Everything done!")


Reading file...
Done reading!
Everything done!


In [14]:

# Keep only the 'y' column of every per-group frame.
onlyTrend = [frame['y'] for frame in dfs]

# dtype=object: each element is stored as its own object rather than being
# coerced into one numeric matrix (presumably because the series lengths differ).
trends = np.array(onlyTrend, dtype=object)

In [15]:
# Convert every element of `trends` to a plain numpy array so that it is
# compatible with numpy-based code further down.
# np.asarray is used rather than Series.to_numpy() because it returns elements
# that are already ndarrays unchanged, which keeps this cell safe to re-run
# (calling .to_numpy() on an ndarray raises AttributeError).
for i in range(len(trends)):
    trends[i] = np.asarray(trends[i])

In [16]:
# Train split.
# np.random.shuffle works in place, so `trends` itself is reordered here;
# X_train is then a slice holding its first 2000 entries.
np.random.shuffle(trends)
X_train = trends[:2000]

In [9]:
# Peek at the first five series of the data we're working with.
X_train[:5]

array([array([1.        , 0.79927273, 0.79927273, ..., 0.67054545, 0.67054545,
              0.67054545])                                                    ,
       array([0.89856145, 0.73983752, 0.73983752, ..., 0.2791931 , 0.2791931 ,
              0.2791931 ])                                                    ,
       array([1.        , 0.71779809, 0.21409922, ..., 0.47845953, 0.47845953,
              0.47845953])                                                    ,
       array([-9.57, -6.49, -6.49, ..., -0.  , -0.  , -0.  ]),
       array([1.       , 0.8373583, 0.8373583, ..., 0.8373583, 0.8373583,
              0.8373583])                                                ],
      dtype=object)

In [19]:
def create_training_data(sz=80, n_series=2000):
    """Build the resampled, normalised dataset that the clustering cells fit on.

    Reads the notebook-level ``trends`` collection, resamples every series to a
    common length, keeps the leading ``n_series`` of them and rescales the
    result with ``TimeSeriesScalerMeanVariance``.

    Parameters
    ----------
    sz : int, default 80
        Number of points every series is resampled to.
    n_series : int, default 2000
        How many of the leading resampled series to keep.

    Returns
    -------
    The dataset returned by ``TimeSeriesScalerMeanVariance().fit_transform``.
    """
    # Honour the caller's `sz` so the series length is actually configurable.
    resampled = TimeSeriesResampler(sz=sz).fit_transform(trends)[:n_series]
    # Normalize around 0 (scaler used with its default settings).
    return TimeSeriesScalerMeanVariance().fit_transform(resampled)

We know from [[learning.ipynb]] that:
   1) 16 trend "classes" are not nearly enough
   2) TODO: second finding not yet written down

In [22]:
# Soft-DTW k-means: fit the clustering model, then draw one panel per cluster
# showing its member series (black) and the cluster centre (red).

# Single source of truth for the cluster count; the model, the plotting loop and
# the subplot grid below all derive from it.
n_clusters = 16

sdtw_data = create_training_data()
print("Soft-DTW k-means")
sdtw_km = TimeSeriesKMeans(
    n_clusters=n_clusters,
    n_jobs=-1,
    metric="softdtw",
    metric_params={"gamma": .01},
    verbose=True,
    random_state=seed,
)

# 80/20 split again
# NOTE(review): this splitter is created but never used in this cell -- the model
# below is fit on all of sdtw_data.
kf = sk.model_selection.KFold(shuffle=True, random_state=seed)

y_pred = sdtw_km.fit_predict(sdtw_data)

# Grid just large enough to give every cluster its own panel (ceil division).
n_cols = 4
n_rows = -(-n_clusters // n_cols)
# Length of the resampled series; assumes tslearn's (n_series, length, dim) layout.
series_len = sdtw_data.shape[1]

plt.figure(figsize=(18, 10))
for yi in range(n_clusters):
    plt.subplot(n_rows, n_cols, 1 + yi)
    for xx in sdtw_data[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(sdtw_km.cluster_centers_[yi].ravel(), "r-")
    # Show the whole series rather than a fixed 0..40 window.
    plt.xlim(0, series_len)
    plt.ylim(-4, 4)
    plt.text(0.55, 0.85, 'Cluster %d' % (yi + 1),
             transform=plt.gca().transAxes)
    if yi == 1:
        plt.title("Soft-DTW $k$-means")

plt.tight_layout()
plt.show()

Soft-DTW k-means
557.017 --> 320.800 --> 303.908 --> 

KeyboardInterrupt: 

In [None]:
from tslearn.clustering import TimeSeriesKMeans
import matplotlib.pyplot as plt
import datetime


def train_clustering_model(model):
    """Stub for training a clustering model; not implemented yet.

    ``model`` is accepted but not used, and the function always returns None.
    """
    # TODO: the findings from [[./learning.ipynb]] that should drive this
    # routine have not been written down here yet.
    return None

In [17]:
# Display the first series of the training split.
X_train[0]

array([0.96629213, 0.77212079, 0.77212079, ..., 0.26580056, 0.26580056,
       0.26580056])