In [1]:
import pandas as pd
import numpy as np
import glob
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical

In [2]:
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

In [3]:
np.random.seed(1234) ### DOES NOT WORK???
w_length = 100
n_countries = 2

In [26]:
mx = pd.DataFrame()
us = pd.DataFrame()
for file in glob.glob("Raw Track Data\\*.csv"):
    name = file[15:-4]
    new = pd.read_csv(file)
    if name[:2] == "MX":
        mx = mx.append(new)
    if name[:2] == "US":
        us = us.append(new)

In [27]:
mx = mx.drop(["confidence", "loudness_start", "loudness_max_time", "loudness_max"], axis = 1)
us = us.drop(["confidence", "loudness_start", "loudness_max_time", "loudness_max"], axis = 1)

In [28]:
print(mx.shape)
mx = mx.drop_duplicates(["track_id", "start"])
print(mx.shape)

(2057082, 27)
(1183336, 27)


In [29]:
print(us.shape)
us = us.drop_duplicates(["track_id", "start"])
print(us.shape)

(1294236, 27)
(1272411, 27)


In [30]:
print("MX", mx.shape)
print("US", us.shape)

MX (1183336, 27)
US (1272411, 27)


### Check if the playlists have common songs

In [31]:
sum(us.track_id.isin(mx.track_id))

42593

In [32]:
dupes = us["track_id"].loc[us.track_id.isin(mx.track_id)]
mx = mx.loc[~(mx["track_id"].isin(dupes))]
us = us.loc[~(us["track_id"].isin(dupes))]
print("MX", mx.shape)
print("US", us.shape)

MX (1140743, 27)
US (1229818, 27)


### Input array for US

In [33]:
new_pos = list(us.track_id.index[us.track_id.shift(1) != us.track_id]) # indices where the song changes
new_pos.append(max(us.track_id.index) + 1) # add a new index to know where the last song ends
split_pos = []
for i in range(len(new_pos)-1):
    split_pos = split_pos + list(range(new_pos[i], new_pos[i+1], w_length))
split_pos = split_pos[1:]
us_train = np.split(us.iloc[:,:27].to_numpy(), split_pos)
# drop the short sequences
short_seqs = []
temp = [] 
for i, value in enumerate(us_train):
    if value.shape[0] == w_length:
        temp.append(value)
us_train = temp
us_train = np.stack(us_train)
val_index = np.isin(us_train[:,:,26],
                    np.random.choice(us.track_id.unique(), np.int(len(us.track_id.unique())/10)))
val_index = val_index.sum(1) != 0
us_val = us_train[val_index,:,:25] # drop track id
us_train = us_train[np.logical_not(val_index),:,:25] # drop track id
us_train = us_train.astype("float64")
us_val = us_val.astype("float64")

If we want songs to have the same number of observations, we can determine which element in the list returned by np.slit corresponds to which song (using new_pos) and randomly overpopulate accordingly. An alternative is to do this with an online batching if we have memory problems.

### Input array for MX

In [34]:
new_pos = list(mx.track_id.index[mx.track_id.shift(1) != mx.track_id]) # indices where the song changes
new_pos.append(max(mx.track_id.index) + 1) # add a new index to know where the last song ends
split_pos = []
for i in range(len(new_pos)-1):
    split_pos = split_pos + list(range(new_pos[i], new_pos[i+1], w_length))
split_pos = split_pos[1:]
mx_train = np.split(mx.iloc[:,:27].to_numpy(), split_pos)
# drop the short sequences
short_seqs = []
temp = [] 
for i, value in enumerate(mx_train):
    if value.shape[0] == w_length:
        temp.append(value)
mx_train = temp
mx_train = np.stack(mx_train)
val_index = np.isin(mx_train[:,:,26],
                    np.random.choice(mx.track_id.unique(), np.int(len(mx.track_id.unique())/10)))
val_index = val_index.sum(1) != 0
mx_val = mx_train[val_index,:,:25] # drop track id
mx_train = mx_train[np.logical_not(val_index),:,:25] # drop track id
mx_train = mx_train.astype("float64")
mx_val = mx_val.astype("float64")

### Input and Output arrays

In [35]:
train_input = np.concatenate([us_train, mx_train])
val_input = np.concatenate([us_val, mx_val])

train_output = np.ones((us_train.shape[0], 1))
train_output = np.concatenate([train_output, np.zeros((mx_train.shape[0], 1))])
train_output = keras.utils.to_categorical(train_output)
val_output = np.ones((us_val.shape[0], 1))
val_output = np.concatenate([val_output, np.zeros((mx_val.shape[0], 1))])
val_output = keras.utils.to_categorical(val_output)


print(train_input.shape, "\n", val_input.shape, "\n", train_output.shape, "\n", val_output.shape)
print(train_input.dtype, "\n", val_input.dtype, "\n", train_output.dtype, "\n", val_output.dtype)

(22925, 100, 25) 
 (3124, 100, 25) 
 (22925, 2) 
 (3124, 2)
float64 
 float64 
 float32 
 float32


### Define Model

In [36]:
out_index = 2
model = keras.Sequential()
model.add(LSTM(25, input_shape=(w_length, train_input.shape[2]), dropout = .5, recurrent_dropout = .5))
model.add(Dense(n_countries, activation= "softmax"))
model.compile(loss = "categorical_crossentropy", optimizer= "adam", metrics=["acc"])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 25)                5100      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 52        
Total params: 5,152
Trainable params: 5,152
Non-trainable params: 0
_________________________________________________________________
None


In [37]:
type(train_input)
train_input.dtype

dtype('float64')

In [38]:
model.fit(train_input, train_output,
          epochs = 50, shuffle = True,
          validation_data = (val_input, val_output),
          batch_size = 128,
         use_multiprocessing = True)

Train on 22925 samples, validate on 3124 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x23c000d6dd8>