In [1]:
import pandas as pd
import numpy as np
import glob
import os
import random
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import datetime
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.layers import LSTM, Dense, LSTM, Flatten, BatchNormalization, Dropout
from keras.utils import to_categorical
import tensorflow as tf
# Load the TensorBoard notebook extension
%load_ext tensorboard

Using TensorFlow backend.


In [2]:
countriesOfInterest = ["HK", "JP", 'ZA', 'TN', 'TR', 'GB', 'MX', 'US', 'CO', 'EC', 'AU', 'NZ']
countriesOfInterest = ["ZA", "EG", "TW", "JP", "DK", "FI", "US", "CA", "AU", "NZ", "BR", "CO"]
train_n = 500
val_n = 20
Category = "Country"
w_length = 1000
enc = OneHotEncoder()
enc.fit(np.array(countriesOfInterest).reshape(-1, 1))

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [3]:
def split(X, cat):
    X = X.reset_index()
    new_pos = list(X.track_id.index[X.track_id.shift(1) != X.track_id]) # indices where the song changes
    new_pos.append(max(X.track_id.index) + 1) # add a new index to know where the last song ends
    split_pos = []
    for i in range(len(new_pos)-1):
        split_pos = split_pos + list(range(new_pos[i], new_pos[i+1], w_length))
    split_pos = split_pos[1:]
    us_train = np.split(X.iloc[:,:24].to_numpy(), split_pos)
    labs = np.split(X[Category].to_numpy(), split_pos)
    # drop the short sequences
    short_seqs = []
    temp = [] 
    labels = []
    for i, value in enumerate(us_train):
        if value.shape[0] == w_length:
            temp.append(value)
            labels.append(labs[i][0])
    us_train = temp
    return np.stack(us_train), labels

In [4]:
def splitSeconds(n, country, t, seconds, samplerate):
    length = seconds * samplerate
    data = pickle.load( open( "Raw Track Data\\" + country + "_" + t + ".p", "rb" ) )
    tracks = data.track_id.unique()
    tracks = np.random.choice(tracks, size=n, replace=False)
    trackFeats = data[data.track_id.isin(tracks)]
    dur = trackFeats.iloc[:,1]
    long = trackFeats.loc[trackFeats.index.repeat(dur * samplerate)].reset_index(drop = True)
    long['change'] = long.track_id.eq(long.track_id.shift())
    change = long[long.change == False].index
    long = long.iloc[:, 5:30]
    indices = np.concatenate((np.arange(0, long.shape[0], length), change))
    indices = np.sort(indices)
    indices = np.unique(indices)
    partition = np.split(np.array(long), indices)
    samples = []
    for i in partition:
        if i.shape[0] == length:
            samples = samples + [i]
    samples = np.stack(samples)
    return samples, np.repeat(np.array([country]), samples.shape[0])

In [5]:
def getSamples(train_n, val_n, seconds, samplerate):
    train = pd.DataFrame()
    train_labels = pd.DataFrame()
    val = pd.DataFrame()
    val_labels = pd.DataFrame()
    train_x = []
    train_labels = []
    val_x = []
    val_labels = []
    for country in countriesOfInterest:
        print("getting",country)
        x1, y1 = splitSeconds(train_n, country, "train", seconds, samplerate)
        x2, y2 = splitSeconds(val_n, country, "val", seconds, samplerate)
        train_x = train_x + x1.tolist()
        train_labels = train_labels + y1.tolist()
        val_x = val_x + x2.tolist()
        val_labels = val_labels + y2.tolist()
    #train_x = np.array(train_x)
    y = np.dstack(train_x)
    train_x = np.rollaxis(y,-1)
    train_labels = np.array(train_labels)
    #val_x = np.array(val_x)
    y = np.dstack(val_x)
    val_x = np.rollaxis(y,-1)
    val_labels = np.array(val_labels)
    class_weights = class_weight.compute_class_weight('balanced',
                                                     np.unique(train_labels),
                                                     list(train_labels))
    train_labels = enc.transform(np.array(train_labels).reshape(-1,1)).toarray()
    val_labels = enc.transform(np.array(val_labels).reshape(-1,1)).toarray()
    return train_x, train_labels, val_x, val_labels, class_weights

In [6]:
train_x, train_labels, val_x, val_labels, class_weights = getSamples(1, 1, 10, 100)

getting ZA
getting EG
getting TW
getting JP
getting DK
getting FI
getting US
getting CA
getting AU
getting NZ
getting BR
getting CO


### Fit model

In [7]:
enc.categories_

[array(['AU', 'BR', 'CA', 'CO', 'DK', 'EG', 'FI', 'JP', 'NZ', 'TW', 'US',
        'ZA'], dtype='<U2')]

In [8]:
train_x.shape

(253, 1000, 25)

In [9]:
model = keras.Sequential()
model.add(LSTM(16, 
               input_shape=(train_x.shape[1], train_x.shape[2]), 
               return_sequences = False,
              # recurrent_dropout = .5,
               kernel_regularizer=regularizers.l2(0.01),
              ))
model.add(Dropout(.5))
model.add(BatchNormalization())
model.add(Dense(len(enc.categories_[0]), activation= "softmax", kernel_regularizer=regularizers.l2(0.01)))
adam = keras.optimizers.Adam(lr=0.001)
model.compile(loss = "categorical_crossentropy", optimizer= adam, metrics=["acc"])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 16)                2688      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
batch_normalization (BatchNo (None, 16)                64        
_________________________________________________________________
dense (Dense)                (None, 12)                204       
Total params: 2,956
Trainable params: 2,924
Non-trainable params: 32
_________________________________________________________________
None


In [11]:
#add recurrent dropout to add noise to duration
desc = "16LSTM"
log_dir = os.path.join(
    "logs",
    "betterSampling",
    desc
)

train_n = 200
val_n  = 50
seconds = 10
samplerate = 100
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
epochs = 5
iterations = 10
learn_rate = 0.001
for i in range(iterations):
    adam = keras.optimizers.Adam(lr=learn_rate)
    model.compile(loss = "categorical_crossentropy", optimizer= adam, metrics=["acc"])
    train_x, train_labels, val_x, val_labels, class_weights = getSamples(train_n, val_n, seconds, samplerate)
    print(np.sum(train_labels, axis = 0))
    model.fit(train_x, train_labels,
              epochs = i * epochs + epochs, 
              initial_epoch = i * epochs,
              shuffle = True,
              validation_data = (val_x, val_labels),
              batch_size = 1024,
              class_weight = class_weights,
             callbacks=[tensorboard_callback],
             verbose = 1)
    model.save_weights(model_dir)
    if i%2 == 0:
        learn_rate = learn_rate/2
    if i % 1 == 0:
        preds = model.predict(val_x, batch_size = 1024, verbose = 1)
     #   print(np.sum(train_labels, axis = 0))
        plt.imshow(
            confusion_matrix(
                enc.inverse_transform(preds), 
                enc.inverse_transform(val_labels), 
               # normalize = "all"
            )
        )
        plt.pause(.5)
        plt.show()
        preds = model.predict(train_x, batch_size = 1024, verbose = 1)
        plt.imshow(
            confusion_matrix(
                enc.inverse_transform(preds), 
                enc.inverse_transform(train_labels), 
            #    normalize = "all"
            )
        )
        plt.pause(.5)
        plt.show()

getting ZA
getting EG
getting TW
getting JP
getting DK
getting FI
getting US
getting CA
getting AU
getting NZ
getting BR
getting CO
[524. 496. 504. 409. 592. 508. 430. 641. 498. 547. 799. 560.]
Train on 6508 samples, validate on 6879 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

KeyboardInterrupt: 

In [15]:
sum(train_labels)

array([524., 496., 504., 409., 592., 508., 430., 641., 498., 547., 799.,
       560.])

In [22]:
n  = 200
seconds = 10
samplerate = 100
country = "NZ"
t = "train"

In [38]:
length = seconds * samplerate
data = pickle.load( open( "Raw Track Data\\" + country + "_" + t + ".p", "rb" ) )
tracks = data.track_id.unique()
tracks = np.random.choice(tracks, size=n, replace=False)
trackFeats = data[data.track_id.isin(tracks)]
trackFeats.sort_values(["track_id", "start"])
dur = trackFeats.iloc[:,1]
long = trackFeats.loc[trackFeats.index.repeat(dur * samplerate)].reset_index(drop = True)
long['change'] = long.track_id.eq(long.track_id.shift())
change = long[long.change == False].index
long = long.iloc[:, 5:30]
indices = np.concatenate((np.arange(0, long.shape[0], length), change))
indices = np.sort(indices)
indices = np.unique(indices)
partition = np.split(np.array(long), indices)
samples = []
for i in partition:
    if i.shape[0] == length:
        samples = samples + [i]
samples = np.stack(samples)

In [40]:
indices

array([       0,        1,        2, ..., 14636973, 14636974, 14636975],
      dtype=int64)