In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical

In [2]:
# Seed both random number generators the notebook relies on.
# Seeding NumPy alone pins the validation-track draw (np.random.choice) but not
# the Keras weight initialisation, which uses TensorFlow's own generator.
# NOTE(review): some GPU kernels may still be non-deterministic -- confirm if
# exact run-to-run equality is required.
np.random.seed(1234)
tf.random.set_seed(1234)
w_length = 100   # rows per input window (sequence length fed to the LSTM)
n_countries = 2  # number of output classes of the classifier

In [3]:
# One playlist per country: the first file is read into `us`, the second into `mx`.
us, mx = (
    pd.read_csv(csv_name)
    for csv_name in (
        "US_37i9dQZF1DX6bBjHfdRnza.csv",
        "MX_37i9dQZF1DX0yN5997BIDH.csv",
    )
)

### Check if the playlists have common songs

In [4]:
# Number of US rows whose track id also occurs in the MX playlist (0 means no overlap).
sum(us["track_id"].isin(mx["track_id"]))

0

### Input array for US

In [5]:
# Build fixed-length windows for the US playlist and hold out roughly 10% of its tracks.

# Row indices where a new track starts (track_id differs from the previous row),
# plus one past-the-end index so the last track is closed as well.
new_pos = list(us.track_id.index[us.track_id.shift(1) != us.track_id])
new_pos.append(max(us.track_id.index) + 1)

# Cut every track into consecutive chunks of w_length rows. The very first cut
# point is dropped because np.split only wants interior split positions.
split_pos = [
    cut
    for start, stop in zip(new_pos[:-1], new_pos[1:])
    for cut in range(start, stop, w_length)
][1:]
us_windows = np.split(us.iloc[:, :31].to_numpy(), split_pos)

# Keep only the full windows; shorter leftovers at the end of a track are discarded.
us_train = np.stack([window for window in us_windows if window.shape[0] == w_length])

# Draw a tenth of the track ids for validation (column 30 is matched against track_id).
# Builtin int replaces np.int, which was deprecated in NumPy 1.20 and removed in 1.24.
# NOTE(review): np.random.choice samples with replacement by default, so the
# held-out set can contain fewer distinct tracks than requested.
us_track_ids = us.track_id.unique()
us_val_tracks = np.random.choice(us_track_ids, int(len(us_track_ids) / 10))
val_index = np.isin(us_train[:, :, 30], us_val_tracks).any(axis=1)

# Keep the first 29 columns only (this drops column 29 and the track id in column 30).
us_val = us_train[val_index, :, :29].astype("float64")
us_train = us_train[np.logical_not(val_index), :, :29].astype("float64")

If we want every song to contribute the same number of observations, we can determine which elements of the list returned by `np.split` belong to which song (using `new_pos`) and randomly oversample accordingly. An alternative, if we run into memory problems, is to do this with online batching.

### Input array for MX

In [6]:
# Build fixed-length windows for the MX playlist and hold out roughly 10% of its tracks.

# Row indices where a new track starts (track_id differs from the previous row),
# plus one past-the-end index so the last track is closed as well.
new_pos = list(mx.track_id.index[mx.track_id.shift(1) != mx.track_id])
new_pos.append(max(mx.track_id.index) + 1)

# Cut every track into consecutive chunks of w_length rows. The very first cut
# point is dropped because np.split only wants interior split positions.
split_pos = [
    cut
    for start, stop in zip(new_pos[:-1], new_pos[1:])
    for cut in range(start, stop, w_length)
][1:]
mx_windows = np.split(mx.iloc[:, :31].to_numpy(), split_pos)

# Keep only the full windows; shorter leftovers at the end of a track are discarded.
mx_train = np.stack([window for window in mx_windows if window.shape[0] == w_length])

# Draw a tenth of the track ids for validation (column 30 is matched against track_id).
# Builtin int replaces np.int, which was deprecated in NumPy 1.20 and removed in 1.24.
# NOTE(review): np.random.choice samples with replacement by default, so the
# held-out set can contain fewer distinct tracks than requested.
mx_track_ids = mx.track_id.unique()
mx_val_tracks = np.random.choice(mx_track_ids, int(len(mx_track_ids) / 10))
val_index = np.isin(mx_train[:, :, 30], mx_val_tracks).any(axis=1)

# Keep the first 29 columns only (this drops column 29 and the track id in column 30).
mx_val = mx_train[val_index, :, :29].astype("float64")
mx_train = mx_train[np.logical_not(val_index), :, :29].astype("float64")

### Input and Output arrays

In [7]:
# Stack both countries into one tensor per split: US windows first, then MX.
train_input = np.concatenate([us_train, mx_train])
val_input = np.concatenate([us_val, mx_val])

# Labels follow the same order as the inputs: US -> 1, MX -> 0.
# num_classes is passed explicitly so the one-hot width always equals the
# model's output size; left to itself, to_categorical infers the width from the
# largest label present and would emit a single column for a split that
# happens to contain no US window.
train_labels = np.concatenate(
    [np.ones((us_train.shape[0], 1)), np.zeros((mx_train.shape[0], 1))]
)
train_output = keras.utils.to_categorical(train_labels, num_classes=n_countries)
val_labels = np.concatenate(
    [np.ones((us_val.shape[0], 1)), np.zeros((mx_val.shape[0], 1))]
)
val_output = keras.utils.to_categorical(val_labels, num_classes=n_countries)

# Sanity check: shapes and dtypes of everything handed to model.fit.
print(train_input.shape, "\n", val_input.shape, "\n", train_output.shape, "\n", val_output.shape)
print(train_input.dtype, "\n", val_input.dtype, "\n", train_output.dtype, "\n", val_output.dtype)

(2098, 100, 29) 
 (190, 100, 29) 
 (2098, 2) 
 (190, 2)
float64 
 float64 
 float32 
 float32


### Define Model

In [8]:
out_index = 2  # NOTE(review): not referenced by any cell visible here -- confirm before removing

# Sequence classifier: one LSTM layer with 25 units reading windows of
# w_length time steps, followed by a softmax over the n_countries classes.
model = Sequential()
model.add(LSTM(25, input_shape=(w_length, train_input.shape[2])))
model.add(Dense(n_countries, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["acc"])

# summary() prints the table itself and returns None, so it is called bare;
# wrapping it in print() appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 25)                5500      
_________________________________________________________________
dense (Dense)                (None, 2)                 52        
Total params: 5,552
Trainable params: 5,552
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
# Show the dtype of the training tensor. The former `type(train_input)` line
# was dropped: only a cell's last expression is displayed, so it had no effect.
train_input.dtype

dtype('float64')

In [11]:
# Train for 50 epochs on shuffled mini-batches of 128 windows, evaluating on
# the validation windows passed via validation_data.
model.fit(
    x=train_input,
    y=train_output,
    batch_size=128,
    epochs=50,
    shuffle=True,
    validation_data=(val_input, val_output),
)

Train on 2098 samples, validate on 190 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x25f2fccbdc8>