In [1]:
import pandas as pd
import numpy as np
import glob
import os


In [2]:
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
import keras as keras
from keras import Sequential
from keras.layers import LSTM, Dense
from keras.utils import to_categorical

os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

Using plaidml.keras.backend backend.


In [3]:
os.environ["KERAS_BACKEND"]

'plaidml.keras.backend'

In [4]:
np.random.seed(1234) ### DOES NOT WORK???
w_length = 200
n_countries = 2

In [5]:
len(glob.glob("Raw Track Data\\*.csv"))

1726

### Import Data

In [6]:
import random
allTracks = pd.DataFrame()
for country in ["HK", "JP", 'ZA', 'TN', 'TR', 'GB', 'MX', 'US', 'CO', 'EC', 'AU', 'NZ']:
    for file in glob.glob("Raw Track Data\\*" + country + "*.csv"):
        if random.random() < .75:
            new = pd.read_csv(file)
            new["Country"] = file[15:17]
            new["Year"] = file[18:22]
            new["Playlist"] = file[23:-4]
            allTracks = allTracks.append(new)
allTracks = allTracks.drop(["confidence", "loudness_start", "loudness_max_time"], axis = 1)
print("Unique tracks:\t", len(pd.unique(allTracks.track_id)))

Unique tracks:	 9306


### Define category

In [7]:
Category = "Country"

In [8]:
cats = pd.unique(allTracks[Category])
print(cats)

['AT' 'AU' 'CY' 'CZ' 'EE' 'GR' 'HK' 'HU' 'IN' 'LT' 'LU' 'LV' 'MT' 'NO'
 'EC' 'FI' 'JP' 'MY' 'PH' 'SG' 'TW' 'AE' 'JO' 'MX' 'PL' 'PS' 'SA' 'AD'
 'CO' 'IT' 'LI' 'PE' 'VN' 'DK' 'ID' 'NZ' 'TR' 'GB' 'QA' 'AR' 'CL' 'NI'
 'PA' 'CA' 'FR' 'IE' 'IS' 'MC' 'NL' 'OM' 'RO' 'SK' 'US' 'BH' 'DZ' 'KW'
 'LB' 'MA' 'CH' 'CR' 'ES' 'GT' 'HN' 'IL' 'PT' 'SV' 'SE' 'BR' 'DE']


### Drop within category duplicates

In [9]:
print(allTracks.shape)
allTracks = allTracks.drop_duplicates(["track_id", "start", Category])
print(allTracks.shape)

(17350890, 31)
(14052462, 31)


### Create train (60%), test (20%), and validation (20%) datasets at track level

In [10]:
UniqueTracks = pd.unique(allTracks.track_id)
testTracks = np.random.choice(UniqueTracks, int(len(UniqueTracks) * .1), replace = False)
test = allTracks.loc[allTracks.track_id.isin(testTracks)]
allTracks = allTracks.loc[~allTracks.track_id.isin(testTracks)]
UniqueTracks = pd.unique(allTracks.track_id)
testTracks = np.random.choice(UniqueTracks, int(len(UniqueTracks) * .2), replace = False)
val = allTracks.loc[allTracks.track_id.isin(testTracks)]
train = allTracks = allTracks.loc[~allTracks.track_id.isin(testTracks)]


In [11]:
train[Category]

0        AT
1        AT
2        AT
3        AT
4        AT
         ..
23992    SG
23993    SG
23994    SG
23995    SG
23996    SG
Name: Country, Length: 10100920, dtype: object

In [12]:

#del(UniqueTracks, testTracks, allTracks)

In [13]:
print(test.shape)
print(val.shape)
print(train.shape)

(1402222, 31)
(2549320, 31)
(10100920, 31)


### Split tracks into subsegments

In [14]:
def split(X, cat):
    X = X.reset_index()
    new_pos = list(X.track_id.index[X.track_id.shift(1) != X.track_id]) # indices where the song changes
    new_pos.append(max(X.track_id.index) + 1) # add a new index to know where the last song ends
    split_pos = []
    for i in range(len(new_pos)-1):
        split_pos = split_pos + list(range(new_pos[i], new_pos[i+1], w_length))
    split_pos = split_pos[1:]
    us_train = np.split(X.iloc[:,:27].to_numpy(), split_pos)
    labs = np.split(X[Category].to_numpy(), split_pos)
    # drop the short sequences
    short_seqs = []
    temp = [] 
    labels = []
    for i, value in enumerate(us_train):
        if value.shape[0] == w_length:
            temp.append(value)
            labels.append(labs[i][0])
    us_train = temp
    return np.stack(us_train), labels

In [15]:
test_x, test_labels= split(test, Category)
#del(test)
val_x, val_labels = split(val, Category)
#del val
train_x, train_labels = split(train, Category)
#del train

In [16]:
print(test_x.shape)
print(len(test_labels))

(6190, 200, 27)
6190


### One-hot encode labels

In [17]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(cats.reshape(-1, 1))

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [18]:
cats

array(['AT', 'AU', 'CY', 'CZ', 'EE', 'GR', 'HK', 'HU', 'IN', 'LT', 'LU',
       'LV', 'MT', 'NO', 'EC', 'FI', 'JP', 'MY', 'PH', 'SG', 'TW', 'AE',
       'JO', 'MX', 'PL', 'PS', 'SA', 'AD', 'CO', 'IT', 'LI', 'PE', 'VN',
       'DK', 'ID', 'NZ', 'TR', 'GB', 'QA', 'AR', 'CL', 'NI', 'PA', 'CA',
       'FR', 'IE', 'IS', 'MC', 'NL', 'OM', 'RO', 'SK', 'US', 'BH', 'DZ',
       'KW', 'LB', 'MA', 'CH', 'CR', 'ES', 'GT', 'HN', 'IL', 'PT', 'SV',
       'SE', 'BR', 'DE'], dtype=object)

In [19]:
np.unique(train_labels)

array(['AD', 'AE', 'AR', 'AT', 'AU', 'BH', 'BR', 'CA', 'CH', 'CL', 'CO',
       'CR', 'CY', 'CZ', 'DE', 'DK', 'DZ', 'EC', 'EE', 'ES', 'FI', 'FR',
       'GB', 'GR', 'GT', 'HK', 'HN', 'HU', 'ID', 'IE', 'IL', 'IN', 'IS',
       'IT', 'JO', 'JP', 'KW', 'LB', 'LI', 'LT', 'LU', 'LV', 'MA', 'MC',
       'MT', 'MX', 'MY', 'NI', 'NL', 'NO', 'NZ', 'OM', 'PA', 'PE', 'PH',
       'PL', 'PS', 'PT', 'QA', 'RO', 'SA', 'SE', 'SG', 'SK', 'SV', 'TR',
       'TW', 'US', 'VN'], dtype='<U2')

In [20]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(train_labels),
                                                 list(train_labels))
class_weights

array([1.11387882, 0.47808808, 2.36750628, 1.7200917 , 0.19820643,
       0.78530502, 3.7964154 , 4.36458641, 2.31622456, 2.36750628,
       0.27810759, 4.33509597, 3.20797101, 0.86351844, 9.16563147,
       2.27515675, 3.27343981, 0.49890685, 0.86351844, 3.04073082,
       2.2996208 , 9.16563147, 0.45730164, 0.87649481, 4.33509597,
       0.20024788, 4.08658728, 2.13864734, 2.2123938 , 2.82640618,
       3.0994889 , 1.00249094, 1.90384036, 2.82640618, 1.65359331,
       0.23605379, 2.47719769, 2.47719769, 1.90950656, 0.86701919,
       0.93254971, 0.87054844, 3.27343981, 4.93534002, 1.59204517,
       0.70894387, 1.10429295, 2.36750628, 1.08194638, 1.15394641,
       0.23112183, 1.35930975, 1.30140812, 1.56105645, 1.54601013,
       0.92050818, 1.65359331, 3.05521049, 0.76929761, 4.93534002,
       0.47915923, 3.14506962, 0.60074364, 4.93534002, 4.33509597,
       0.42574267, 1.54974445, 0.87769385, 2.50622736])

In [21]:
test_labels = enc.transform(np.array(test_labels).reshape(-1, 1)).toarray()
val_labels = enc.transform(np.array(val_labels).reshape(-1, 1)).toarray()
train_labels = enc.transform(np.array(train_labels).reshape(-1, 1)).toarray()

### Define Model

In [22]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM

In [23]:
train_x.shape[2]

27

In [24]:
out_index = 2
model = keras.Sequential()
model.add(LSTM(50, input_shape=(w_length, train_x.shape[2]), return_sequences = True))
model.add(LSTM(50, input_shape=(w_length, train_x.shape[2]), dropout=.5))
#model.add(Dense(200, activation= "relu"))
#model.add(keras.layers.Dropout(.25))
model.add(Dense(len(enc.categories_[0]), activation= "softmax"))
model.compile(loss = "categorical_crossentropy", optimizer= "adam", metrics=["acc"])
print(model.summary())

INFO:plaidml:Opening device "opencl_amd_gfx804.0"


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 200, 50)           15600     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 69)                3519      
Total params: 39,319
Trainable params: 39,319
Non-trainable params: 0
_________________________________________________________________
None


In [25]:
type(train_x)
train_x.dtype

dtype('float64')

In [26]:
train_labels.shape

(44270, 69)

In [27]:
max(np.sum(train_labels, axis = 0))/sum(np.sum(train_labels, axis = 0))

0.07311949401400497

In [28]:
max(np.sum(test_labels, axis = 0))/sum(np.sum(test_labels, axis = 0))

0.0825525040387722

In [29]:
enc.categories_[0]

array(['AD', 'AE', 'AR', 'AT', 'AU', 'BH', 'BR', 'CA', 'CH', 'CL', 'CO',
       'CR', 'CY', 'CZ', 'DE', 'DK', 'DZ', 'EC', 'EE', 'ES', 'FI', 'FR',
       'GB', 'GR', 'GT', 'HK', 'HN', 'HU', 'ID', 'IE', 'IL', 'IN', 'IS',
       'IT', 'JO', 'JP', 'KW', 'LB', 'LI', 'LT', 'LU', 'LV', 'MA', 'MC',
       'MT', 'MX', 'MY', 'NI', 'NL', 'NO', 'NZ', 'OM', 'PA', 'PE', 'PH',
       'PL', 'PS', 'PT', 'QA', 'RO', 'SA', 'SE', 'SG', 'SK', 'SV', 'TR',
       'TW', 'US', 'VN'], dtype=object)

In [None]:
model.fit(train_x, train_labels,
          epochs = 50, 
          shuffle = True,
          validation_data = (val_x, val_labels),
          batch_size = 512,
          class_weight=class_weights)

Train on 44270 samples, validate on 11162 samples
Epoch 1/50


INFO:plaidml:Analyzing Ops: 1798 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 4321 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 8380 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 10505 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 13592 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 17907 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 22343 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 26926 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 32711 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 36295 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 39168 of 40247 operations complete




INFO:plaidml:Analyzing Ops: 1435 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 4294 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 8436 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 10886 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 14749 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 20019 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 24659 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 30946 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 34435 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 38166 of 40247 operations complete
INFO:plaidml:Analyzing Ops: 5147 of 16887 operations complete
INFO:plaidml:Analyzing Ops: 9439 of 16887 operations complete
INFO:plaidml:Analyzing Ops: 15709 of 16887 operations complete
INFO:plaidml:Analyzing Ops: 4391 of 16887 operations complete
INFO:plaidml:Analyzing Ops: 8870 of 16887 operations complete
INFO:plaidml:Analyzing Ops: 14501 of 16887 operations complete

Epoch 2/50
 4608/44270 [==>...........................] - ETA: 12:55 - loss: 3.8059 - acc: 0.0762

In [None]:
model.get_layer(index = 0)