In [1]:
import pandas as pd
import numpy as np
import glob
import os
import random
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import datetime
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.layers import LSTM, Dense, LSTM, Flatten, BatchNormalization, Dropout
from keras.utils import to_categorical
import tensorflow as tf
# Load the TensorBoard notebook extension
%load_ext tensorboard

Using TensorFlow backend.


In [2]:
# --- Experiment configuration -------------------------------------------------
# Earlier country set, kept for reference only. It used to be assigned here and
# was immediately overwritten by the list below, so it never had any effect.
# countriesOfInterest = ["HK", "JP", 'ZA', 'TN', 'TR', 'GB', 'MX', 'US', 'CO', 'EC', 'AU', 'NZ']
countriesOfInterest = ["ZA", "EG", "TW", "JP", "DK", "FI", "US", "CA", "AU", "NZ", "BE", "CO"]

# Number of track draws per country for the training / validation split.
# NOTE(review): the training cell further down reassigns both values before
# they are used there — confirm which setting is the intended one.
train_n = 500
val_n = 20

# Name of the label column read by split().
Category = "Country"
# Number of rows (time steps) in every sample window.
w_length = 300

# One-hot encoder over the country codes; the fitted category order defines the
# column order of every label matrix produced later.
enc = OneHotEncoder()
enc.fit(np.array(countriesOfInterest).reshape(-1, 1))

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [3]:
def split(X, cat, window_length=None, label_col=None):
    """Cut a frame of per-track rows into fixed-length windows.

    Rows are grouped into runs of consecutive identical ``track_id`` values and
    every run is chopped into chunks of ``window_length`` rows. Chunks shorter
    than ``window_length`` (the tail of each track) are discarded.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``track_id`` column and the label column.
    cat : any
        Unused; kept so existing callers keep working.
    window_length : int, optional
        Rows per window. Defaults to the notebook-level ``w_length``.
    label_col : str, optional
        Column holding the label. Defaults to the notebook-level ``Category``.

    Returns
    -------
    (numpy.ndarray, list)
        Stacked windows of shape ``(n_windows, window_length, n_columns)`` and
        one label per window (the label of the window's first row).

    Raises
    ------
    ValueError
        If no full-length window exists (``np.stack`` on an empty list) or if
        ``X`` is empty.
    """
    if window_length is None:
        window_length = w_length
    if label_col is None:
        label_col = Category

    X = X.reset_index()
    track_ids = X.track_id

    # Row positions where the track changes, plus a sentinel one past the last row.
    track_starts = list(track_ids.index[track_ids.shift(1) != track_ids])
    track_starts.append(max(track_ids.index) + 1)

    # Cut points every `window_length` rows, restarting at each track boundary.
    split_pos = []
    for start, stop in zip(track_starts[:-1], track_starts[1:]):
        split_pos.extend(range(start, stop, window_length))
    # np.split wants interior cut points only; the leading 0 would yield an empty chunk.
    split_pos = split_pos[1:]

    # NOTE(review): reset_index() inserts the old index as column 0, so this
    # slice contains that column plus the first 23 original columns — confirm
    # this is the intended feature set.
    feature_chunks = np.split(X.iloc[:, :24].to_numpy(), split_pos)
    label_chunks = np.split(X[label_col].to_numpy(), split_pos)

    # Keep only full-length windows, labelled by their first row.
    windows = []
    labels = []
    for chunk, chunk_labels in zip(feature_chunks, label_chunks):
        if chunk.shape[0] == window_length:
            windows.append(chunk)
            labels.append(chunk_labels[0])
    return np.stack(windows), labels

In [4]:
def splitSeconds(n, country, t, data_dir="Raw Track Data", window_length=None):
    """Draw random fixed-length windows from one country's pickled track table.

    Loads ``<data_dir>/<country>_<t>.p``, samples ``n`` track ids with
    replacement and, for each, takes a random starting row, repeats every row
    ``dur * 10`` times and keeps the last ``window_length`` rows of the result.
    Draws that raise or that do not yield exactly ``window_length`` rows are
    skipped, so fewer than ``n`` samples may be returned.

    Parameters
    ----------
    n : int
        Number of track draws (with replacement).
    country : str
        Country code; part of the file name and used as the label.
    t : str
        Split name used in the file name (the notebook passes "train"/"val").
    data_dir : str, optional
        Directory holding the pickle files.
    window_length : int, optional
        Rows per sample. Defaults to the notebook-level ``w_length``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Samples of shape ``(k, window_length, 24)`` with ``k <= n`` (shape
        ``(0,)`` when every draw was skipped) and ``k`` copies of ``country``.

    Notes
    -----
    Sampling uses the global ``numpy.random`` and ``random`` state, so results
    are only reproducible if both are seeded by the caller.
    """
    if window_length is None:
        window_length = w_length

    # os.path.join keeps the original Windows path and also works elsewhere.
    path = os.path.join(data_dir, country + "_" + t + ".p")
    # SECURITY: pickle.load can execute arbitrary code — only load trusted files.
    with open(path, "rb") as handle:
        data = pickle.load(handle)

    track_ids = np.random.choice(data.track_id.unique(), size=n, replace=True)

    samples = []
    for track in track_ids:
        try:
            trackFeats = data[data.track_id == track]
            # Raises ValueError for tracks with too few rows; such draws are skipped.
            start = random.randrange(1, trackFeats.shape[0] - 10)
            stop = start + window_length * 10
            feats = trackFeats.iloc[start:stop, 6:30]
            # Column 1 drives the row repetition.
            # NOTE(review): presumably a duration in seconds expanded to
            # 10 rows per second — confirm against the data schema.
            dur = trackFeats.iloc[start:stop, 1]
            expanded = feats.loc[feats.index.repeat(dur * 10)]
            example = np.array(expanded[-window_length:])
        except Exception:
            # Best-effort sampling: a failed draw is dropped, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            continue
        if example.shape[0] == window_length:
            samples.append(example)

    samples = np.array(samples)
    return samples, np.repeat(np.array([country]), samples.shape[0])

In [5]:
def getSamples(train_n, val_n):
    """Build training and validation arrays for every country of interest.

    Calls ``splitSeconds`` once per country and split, stacks the windows,
    computes balanced class weights from the training labels and one-hot
    encodes the labels with the notebook-level encoder ``enc``.

    Parameters
    ----------
    train_n : int
        Track draws per country for the "train" split.
    val_n : int
        Track draws per country for the "val" split.

    Returns
    -------
    tuple
        ``(train_x, train_labels, val_x, val_labels, class_weights)`` where the
        ``*_x`` arrays have shape ``(n_samples, window, n_features)``, the label
        arrays are dense one-hot matrices, and ``class_weights`` is an array
        with one weight per class present in the training labels, in
        ``np.unique`` (sorted) order.

    Raises
    ------
    ValueError
        If a split ends up with no samples at all.
    """
    train_chunks, train_label_chunks = [], []
    val_chunks, val_label_chunks = [], []
    for country in countriesOfInterest:
        print("getting",country)
        x1, y1 = splitSeconds(train_n, country, "train")
        x2, y2 = splitSeconds(val_n, country, "val")
        # splitSeconds returns a shape-(0,) array when every draw was rejected;
        # that cannot be concatenated with 3-D batches, so skip it.
        if x1.shape[0]:
            train_chunks.append(x1)
        if x2.shape[0]:
            val_chunks.append(x2)
        train_label_chunks.append(y1)
        val_label_chunks.append(y2)

    # Concatenate the arrays directly; converting them to nested Python lists
    # first costs a very large amount of memory for big sample counts.
    train_x = np.concatenate(train_chunks)
    train_labels = np.concatenate(train_label_chunks)
    val_x = np.concatenate(val_chunks)
    val_labels = np.concatenate(val_label_chunks)

    # Keyword arguments are required by recent scikit-learn releases.
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels,
    )

    train_labels = enc.transform(train_labels.reshape(-1, 1)).toarray()
    val_labels = enc.transform(val_labels.reshape(-1, 1)).toarray()
    return train_x, train_labels, val_x, val_labels, class_weights

In [6]:
# Small run (one track draw per country and split) so the resulting arrays can
# be inspected; draws that getSamples/splitSeconds reject are simply missing.
train_x, train_labels, val_x, val_labels, class_weights = getSamples(1, 1)

getting ZA
getting EG
getting TW
getting JP
getting DK
getting FI
getting US
getting CA
getting AU
getting NZ
getting BE
getting CO


### Fit model

In [7]:
# Category order learned by the encoder; it defines the one-hot column order.
enc.categories_

[array(['AU', 'BE', 'CA', 'CO', 'DK', 'EG', 'FI', 'JP', 'NZ', 'TW', 'US',
        'ZA'], dtype='<U2')]

In [8]:
# (n_samples, rows per window, n_features); a first value below the number of
# countries means some of the single draws above were rejected.
train_x.shape

(8, 300, 24)

In [9]:

# Three stacked LSTM layers (64 -> 128 -> 256 units), each followed by batch
# normalisation, then dropout and a softmax over the encoder's categories.
model = keras.Sequential()

# Only the first layer needs input_shape; later layers infer it.
model.add(LSTM(64,
               input_shape=(train_x.shape[1], train_x.shape[2]),
               return_sequences=True))
model.add(BatchNormalization())

# NOTE(review): go_backwards=True makes this layer read the sequence in reverse
# (and, per the Keras docs, return the reversed sequence) — confirm intended.
model.add(LSTM(128,
               return_sequences=True,
               dropout=.5,
               go_backwards=True))
model.add(BatchNormalization())

# Last recurrent layer returns only its final output; L2 penalty on activations.
model.add(LSTM(256,
               dropout=.5,
               activity_regularizer=regularizers.l2(0.01)))
model.add(BatchNormalization())
model.add(Dropout(.5))
#model.add(Dense(64, activation = "relu"))
#model.add(BatchNormalization())
#model.add(Dropout(.5))
model.add(Dense(len(enc.categories_[0]), activation="softmax"))

# `learning_rate` replaces the deprecated `lr` argument.
adam = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=["acc"])
# summary() prints by itself; wrapping it in print() only adds a stray "None".
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 300, 64)           22784     
_________________________________________________________________
batch_normalization (BatchNo (None, 300, 64)           256       
_________________________________________________________________
lstm_1 (LSTM)                (None, 300, 128)          98816     
_________________________________________________________________
batch_normalization_1 (Batch (None, 300, 128)          512       
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               394240    
_________________________________________________________________
batch_normalization_2 (Batch (None, 256)               1024      
_________________________________________________________________
dropout (Dropout)            (None, 256)               0

In [None]:
# --- Training run configuration ----------------------------------------------
desc = "64_128_256_Dropout_Singlel2regularization_10000sample"
log_dir = os.path.join("logs", "fit", desc)
# Despite the name, this value is passed to save_weights() as the target path.
model_dir = os.path.join("pickle", "save")

train_n = 10000   # track draws per country per round (training split)
val_n = 2000      # track draws per country per round (validation split)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
epochs = 25       # epochs per round
iterations = 10   # rounds; every round draws a fresh sample
learn_rate = 0.001


def _class_weight_dict(weights, onehot_labels):
    """Map getSamples' weight array onto one-hot column indices.

    Keras expects ``class_weight`` as ``{class_index: weight}``. The weights are
    ordered like the sorted labels present in the training data, which matches
    the ascending order of the non-empty one-hot columns. Classes without any
    training sample get a neutral weight of 1.0.
    """
    present = np.flatnonzero(onehot_labels.sum(axis=0))
    mapping = {idx: 1.0 for idx in range(onehot_labels.shape[1])}
    mapping.update({int(idx): float(w) for idx, w in zip(present, weights)})
    return mapping


def _plot_confusion(features, onehot_labels, title):
    """Predict on `features` and show the confusion matrix (rows = true class)."""
    class_names = enc.categories_[0]
    probs = model.predict(features, batch_size=1024, verbose=1)
    matrix = confusion_matrix(
        class_names[onehot_labels.argmax(axis=1)],   # y_true
        class_names[probs.argmax(axis=1)],           # y_pred
        labels=class_names,                          # fixed size and order
    )
    fig, ax = plt.subplots()
    ax.imshow(matrix)
    ax.set(title=title, xlabel="predicted", ylabel="true")
    ax.set_xticks(range(len(class_names)))
    ax.set_xticklabels(class_names, rotation=90)
    ax.set_yticks(range(len(class_names)))
    ax.set_yticklabels(class_names)
    plt.show()
    plt.close(fig)


for i in range(iterations):
    # Recompiling installs a new optimizer with the current learning rate.
    adam = keras.optimizers.Adam(learning_rate=learn_rate)
    model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=["acc"])

    train_x, train_labels, val_x, val_labels, class_weights = getSamples(train_n, val_n)
    print(np.sum(train_labels, axis = 0))   # samples per class in this round

    model.fit(train_x, train_labels,
              epochs=(i + 1) * epochs,
              initial_epoch=i * epochs,   # keeps the epoch axis continuous across rounds
              shuffle=True,
              validation_data=(val_x, val_labels),
              batch_size=1024,
              class_weight=_class_weight_dict(class_weights, train_labels),
              callbacks=[tensorboard_callback],
              verbose=1)
    model.save_weights(model_dir)

    # Halve the learning rate after every even-numbered round (0, 2, 4, ...).
    if i % 2 == 0:
        learn_rate = learn_rate / 2

    _plot_confusion(val_x, val_labels, "Validation confusion matrix (round %d)" % i)
    _plot_confusion(train_x, train_labels, "Training confusion matrix (round %d)" % i)

getting ZA
getting EG
getting TW
