In [1]:
import pandas as pd
import numpy as np
import glob
import os
import random
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import datetime
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, LSTM, Flatten, BatchNormalization, Dropout
from keras.utils import to_categorical
import tensorflow as tf
# Load the TensorBoard notebook extension
%load_ext tensorboard

Using TensorFlow backend.


In [2]:
# --- Experiment configuration -------------------------------------------
# Class labels for the classifier (presumably ISO 3166-1 alpha-2 country
# codes -- TODO confirm). The same strings are used to build the pickle
# file names read in getSamples().
countriesOfInterest = ["HK", "JP", 'ZA', 'TN', 'TR', 'GB', 'MX', 'US', 'CO', 'EC', 'AU', 'NZ']
train_n = 50          # tracks drawn per country for each training sample
test_n = 10           # tracks drawn per country for each validation sample
Category = "Country"  # name of the label column handed to split()
w_length = 200        # rows per fixed-length window produced by split()

# Seed NumPy's global generator so that the np.random.choice track sampling
# in getSamples() is repeatable under Restart & Run All. Only NumPy is
# seeded here; TensorFlow weight initialisation is not affected.
SEED = 42
np.random.seed(SEED)

# One-hot encoder over the fixed label set; categories_ defines the column
# order of every label matrix and of the model's output layer.
enc = OneHotEncoder()
enc.fit(np.array(countriesOfInterest).reshape(-1, 1))

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [3]:
def split(X, cat, window=None):
    """Cut a frame of per-track rows into fixed-length windows.

    Rows are grouped into runs of identical, consecutive ``track_id`` values;
    each run is chopped into chunks of ``window`` rows, and chunks shorter
    than ``window`` (the tail of each track) are discarded, so no window
    ever spans two tracks.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``track_id`` column and the ``cat`` column. Rows of
        the same track are expected to be contiguous, because a new track is
        detected by comparing each row with the previous one.
    cat : str
        Name of the label column. The label of a window is the value of this
        column in the window's first row.
    window : int, optional
        Rows per window. Defaults to the module-level ``w_length``.

    Returns
    -------
    (numpy.ndarray, list)
        Array of shape ``(n_windows, window, n_columns)`` and the list of
        one label per window.

    Raises
    ------
    ValueError
        From ``np.stack`` when no full-length window exists.
    """
    if window is None:
        window = w_length

    # reset_index() gives a 0..n-1 row index so positions and labels agree.
    # NOTE(review): it also inserts the previous index as column 0, which
    # therefore lands inside the 27-column feature slice below -- confirm
    # that this is intended.
    X = X.reset_index()

    track_ids = X.track_id
    # Row positions where the track changes (the first row always counts).
    starts = list(track_ids.index[track_ids.shift(1) != track_ids])
    starts.append(len(X))  # sentinel marking the end of the last track

    cut_points = []
    for begin, end in zip(starts[:-1], starts[1:]):
        cut_points.extend(range(begin, end, window))
    # np.split expects interior cut points only, so drop the leading 0.
    cut_points = cut_points[1:]

    feature_chunks = np.split(X.iloc[:, :27].to_numpy(), cut_points)
    label_chunks = np.split(X[cat].to_numpy(), cut_points)

    # Keep only full-length windows, together with their first-row label.
    windows = []
    labels = []
    for chunk, chunk_labels in zip(feature_chunks, label_chunks):
        if chunk.shape[0] == window:
            windows.append(chunk)
            labels.append(chunk_labels[0])
    return np.stack(windows), labels

In [4]:
def _sample_tracks(country, suffix, n_tracks):
    """Load one country's pickled frame and keep ``n_tracks`` random tracks.

    Reads ``Raw Track Data/<country><suffix>``, drops the ``confidence``,
    ``start`` and ``duration`` columns, then keeps only the rows whose
    ``track_id`` is among ``n_tracks`` ids drawn without replacement.
    ``np.random.choice`` raises ``ValueError`` when fewer than ``n_tracks``
    distinct tracks are available.
    """
    path = os.path.join("Raw Track Data", country + suffix)
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file -- only load pickles from a trusted source.
    with open(path, "rb") as handle:
        frame = pickle.load(handle)
    frame = frame.drop(["confidence", "start", "duration"], axis = 1)
    chosen = np.random.choice(frame.track_id.unique(), size=n_tracks, replace=False)
    return frame[frame.track_id.isin(chosen)]


def getSamples():
    """Draw a fresh random train/validation sample and window it.

    For every code in ``countriesOfInterest`` this samples ``train_n`` tracks
    from ``<country>_train.p`` and ``test_n`` tracks from ``<country>_val.p``,
    concatenates the per-country frames, cuts them into windows with
    ``split`` and one-hot encodes the window labels with ``enc``.

    Returns
    -------
    tuple
        ``(train_x, train_labels, val_x, val_labels)`` where the ``*_x``
        values are the stacked window arrays returned by ``split`` and the
        ``*_labels`` values are dense one-hot arrays.
    """
    train_parts = []
    val_parts = []
    for country in countriesOfInterest:
        # Train is sampled before validation for each country, so the
        # sequence of random draws is fixed for a given seed.
        train_parts.append(_sample_tracks(country, "_train.p", train_n))
        val_parts.append(_sample_tracks(country, "_val.p", test_n))

    # Concatenate once instead of growing a frame inside the loop
    # (DataFrame.append no longer exists in pandas 2.x).
    train = pd.concat(train_parts)
    val = pd.concat(val_parts)

    train_x, train_labels = split(train, Category)
    val_x, val_labels = split(val, Category)
    train_labels = enc.transform(np.array(train_labels).reshape(-1,1)).toarray()
    val_labels = enc.transform(np.array(val_labels).reshape(-1,1)).toarray()
    return train_x, train_labels, val_x, val_labels

In [5]:
# Draw an initial random sample so that the array shapes are available to
# the cells below (the model definition reads train_x.shape).
train_x, train_labels, val_x, val_labels = getSamples()

In [6]:
# Share of training windows that belong to the most frequent class, i.e. the
# accuracy of always predicting that class.
train_labels.sum(axis=0).max() / train_labels.sum()

0.1100086281276963

In [7]:
# Sequence classifier: two stacked LSTM layers followed by a dense head.
# The input is one window of shape (timesteps, features) taken from train_x;
# the output is a probability per category known to the encoder.
n_timesteps, n_features = train_x.shape[1], train_x.shape[2]
n_classes = len(enc.categories_[0])

model = keras.Sequential()
model.add(LSTM(16,
               input_shape=(n_timesteps, n_features),
               return_sequences = True))
# Only the first layer needs input_shape; later layers infer their input.
model.add(LSTM(32, return_sequences = False))
# Hidden layer uses ReLU: a softmax here would force the 200 activations to
# sum to 1, which is only meaningful for the final class-probability layer.
model.add(Dense(200, activation= "relu"))
model.add(BatchNormalization())
model.add(Dropout(.25))
model.add(Dense(n_classes, activation= "softmax"))
# `learning_rate` is the current keyword; `lr` is a deprecated alias.
adam = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(loss = "categorical_crossentropy", optimizer= adam, metrics=["acc"])
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 200, 16)           2816      
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                6272      
_________________________________________________________________
dense (Dense)                (None, 200)               6600      
_________________________________________________________________
batch_normalization (BatchNo (None, 200)               800       
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 12)                2412      
Total params: 18,900
Trainable params: 18,500
Non-trainable params: 400
__________________________________________________

In [None]:
# Train on a freshly drawn sample every iteration and log to TensorBoard.
log_dir = os.path.join(
    "logs",
    "fit",
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
epochs = 10        # epochs trained on each sample
iterations = 100   # number of samples drawn
for i in range(iterations):
    # Resample tracks so the model sees different songs each iteration.
    train_x, train_labels, val_x, val_labels = getSamples()
    # Keras treats `epochs` as the index of the final epoch, so together with
    # `initial_epoch` the epoch counter (and the TensorBoard curves) continue
    # across iterations instead of restarting at 0.
    model.fit(train_x, train_labels,
              epochs = i * epochs + epochs,
              initial_epoch = i * epochs,
              shuffle = True,
              validation_data = (val_x, val_labels),
              batch_size = 2048,
              callbacks=[tensorboard_callback],
              verbose = 0)
    if i % 10 == 0:
        preds = model.predict(val_x, batch_size = 256, verbose = 1)
        print(np.sum(preds, axis = 0))       # predicted probability mass per class
        print(np.sum(val_labels, axis = 0))  # true window count per class
        country_order = enc.categories_[0]
        plt.imshow(
            confusion_matrix(
                # confusion_matrix expects (y_true, y_pred): rows are true labels.
                enc.inverse_transform(val_labels).ravel(),
                enc.inverse_transform(preds).ravel(),
                # Fix the axes so the matrix keeps its shape and ordering even
                # when a class is missing from this sample.
                labels = country_order,
                normalize = "all"
            )
        )
        plt.xticks(range(len(country_order)), country_order, rotation = 90)
        plt.yticks(range(len(country_order)), country_order)
        plt.xlabel("Predicted country")
        plt.ylabel("True country")
        plt.title("Validation confusion matrix, iteration " + str(i))
        plt.pause(.5)
        plt.show()

In [None]:
# Confusion matrix on the most recent validation sample, normalised per true
# class so that each row sums to 1 and the diagonal is the per-class recall.
preds = model.predict(val_x, batch_size = 256, verbose = 1)
print(np.sum(preds, axis = 0))       # predicted probability mass per class
print(np.sum(val_labels, axis = 0))  # true window count per class
country_order = enc.categories_[0]
plt.imshow(
    confusion_matrix(
        # confusion_matrix expects (y_true, y_pred); with the arguments
        # swapped, normalize="true" would normalise over predictions instead.
        enc.inverse_transform(val_labels).ravel(),
        enc.inverse_transform(preds).ravel(),
        labels = country_order,
        normalize = "true"
    )
)
plt.xticks(range(len(country_order)), country_order, rotation = 90)
plt.yticks(range(len(country_order)), country_order)
plt.xlabel("Predicted country")
plt.ylabel("True country")
plt.title("Validation confusion matrix (row-normalised)")
plt.show()

In [None]:
# Confusion matrix on the most recent training sample, normalised per true
# class so that each row sums to 1 and the diagonal is the per-class recall.
preds = model.predict(train_x, batch_size = 256, verbose = 1)
print(np.sum(preds, axis = 0))         # predicted probability mass per class
print(np.sum(train_labels, axis = 0))  # true window count per class
country_order = enc.categories_[0]
plt.imshow(
    confusion_matrix(
        # confusion_matrix expects (y_true, y_pred); with the arguments
        # swapped, normalize="true" would normalise over predictions instead.
        enc.inverse_transform(train_labels).ravel(),
        enc.inverse_transform(preds).ravel(),
        labels = country_order,
        normalize = "true"
    )
)
plt.xticks(range(len(country_order)), country_order, rotation = 90)
plt.yticks(range(len(country_order)), country_order)
plt.xlabel("Predicted country")
plt.ylabel("True country")
plt.title("Training confusion matrix (row-normalised)")
plt.show()