In [1]:
import pandas as pd
import numpy as np
import glob
import os


In [3]:
# The backend has to be chosen before Keras is imported for the first time:
# Keras reads KERAS_BACKEND while it is being imported, so assigning the
# variable again afterwards has no effect and is not repeated here.
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
import keras as keras
from keras import Sequential
from keras.layers import LSTM, Dense
from keras.utils import to_categorical

Using plaidml.keras.backend backend.


In [4]:
# Display the backend name currently stored in the environment.
os.environ["KERAS_BACKEND"]

'plaidml.keras.backend'

In [5]:
import random

# Seed every random number generator the notebook draws from. Seeding NumPy
# alone is not enough: the data-loading step samples files with
# `random.random()` from the standard library, which has its own generator
# and is unaffected by `np.random.seed`.
SEED = 1234
np.random.seed(SEED)
random.seed(SEED)

w_length = 200   # rows per window when tracks are cut into subsegments
n_countries = 2  # not referenced in the visible cells

In [6]:
# Number of raw CSV files available. The pattern is built with os.path.join so
# the path separator is correct on every operating system.
len(glob.glob(os.path.join("Raw Track Data", "*.csv")))

1800

### Import Data

In [7]:
import random

# Load a random ~20% sample of the CSV files into a single frame.
# NOTE: `random.random()` uses the standard-library generator, which is NOT
# affected by `np.random.seed`; `random.seed(...)` must be called beforehand
# if the sample has to be reproducible.
# The file list is sorted because glob returns files in an OS-dependent order.
track_files = sorted(glob.glob(os.path.join("Raw Track Data", "*.csv")))

frames = []
for file in track_files:
    if random.random() < .20:
        new = pd.read_csv(file)
        # Metadata is encoded by fixed position in the file name: characters
        # 0-1 are the country, 3-6 the year, and 8 up to the ".csv" suffix the
        # playlist. Slicing the base name keeps this independent of the
        # directory prefix.
        name = os.path.basename(file)
        new["Country"] = name[0:2]
        new["Year"] = name[3:7]
        new["Playlist"] = name[8:-4]
        frames.append(new)

if not frames:
    raise RuntimeError("No track files were sampled from 'Raw Track Data'")

# Concatenate once at the end: growing a DataFrame inside the loop re-copies
# all previously loaded rows on every iteration, and DataFrame.append was
# removed in pandas 2.0.
allTracks = pd.concat(frames)
allTracks = allTracks.drop(["confidence", "loudness_start", "loudness_max_time"], axis = 1)
print("Unique tracks:\t", len(pd.unique(allTracks.track_id)))

Unique tracks:	 14229


### Define category

In [8]:
# Name of the column of `allTracks` whose values are used as the labels.
Category = "Country"

In [9]:
# Distinct values of the label column, in order of first appearance.
cats = allTracks[Category].unique()
print(cats)

['AD' 'AE' 'AR' 'AT' 'AU' 'BE' 'BG' 'BH' 'BO' 'BR' 'CA' 'CH' 'CL' 'CO'
 'CR' 'CY' 'CZ' 'DE' 'DK' 'DO' 'DZ' 'EC' 'EE' 'EG' 'ES' 'FI' 'FR' 'GB'
 'GR' 'GT' 'HK' 'HN' 'HU' 'ID' 'IE' 'IL' 'IN' 'IS' 'IT' 'JO' 'JP' 'KW'
 'LB' 'LI' 'LT' 'LU' 'LV' 'MA' 'MC' 'MT' 'MX' 'MY' 'NL' 'NO' 'NZ' 'OM'
 'PA' 'PE' 'PH' 'PL' 'PS' 'PT' 'PY' 'QA' 'RO' 'SA' 'SE' 'SG' 'SK' 'SV'
 'TH' 'TN' 'TR' 'TW' 'US' 'UY' 'VN' 'ZA']


### Drop within-category duplicates

In [10]:
# Keep one row per (track, segment start) within each category value and
# report the frame's shape before and after.
shape_before = allTracks.shape
allTracks = allTracks.drop_duplicates(subset=["track_id", "start", Category])
for shape in (shape_before, allTracks.shape):
    print(shape)

(20317978, 31)
(19043994, 31)


### Create train (~72%), test (10%), and validation (~18%) datasets at track level

In [11]:
# Split by track id so that all rows of a track land in exactly one subset.
# Shares of the unique tracks: 10% test, then 20% of the remainder (~18% of the
# total) for validation, which leaves ~72% for training.

# --- hold out the test tracks ---
UniqueTracks = pd.unique(allTracks.track_id)
n_test = int(len(UniqueTracks) * .1)
testTracks = np.random.choice(UniqueTracks, n_test, replace = False)
is_test = allTracks.track_id.isin(testTracks)
test = allTracks.loc[is_test]
allTracks = allTracks.loc[~is_test]

# --- hold out the validation tracks from what is left ---
UniqueTracks = pd.unique(allTracks.track_id)
n_val = int(len(UniqueTracks) * .2)
# The name `testTracks` is reused here: from now on it holds the validation tracks.
testTracks = np.random.choice(UniqueTracks, n_val, replace = False)
is_val = allTracks.track_id.isin(testTracks)
val = allTracks.loc[is_val]
# `allTracks` is rebound as well, so after this cell it is the training subset.
train = allTracks = allTracks.loc[~is_val]


In [12]:
# Peek at the label column of the training rows.
train[Category]

0        AD
1        AD
2        AD
3        AD
4        AD
         ..
89612    ZA
89613    ZA
89614    ZA
89615    ZA
89616    ZA
Name: Country, Length: 13728459, dtype: object

In [13]:

#del(UniqueTracks, testTracks, allTracks)

In [14]:
# Row/column counts of the three subsets, in the order test, validation, train.
for subset in (test, val, train):
    print(subset.shape)

(1909240, 31)
(3406295, 31)
(13728459, 31)


### Split tracks into subsegments

In [15]:
def split(X, cat, window=None, n_features=27):
    """Cut every track in ``X`` into fixed-length windows of consecutive rows.

    A track is a maximal run of consecutive rows sharing the same ``track_id``.
    Each run is cut into non-overlapping windows of ``window`` rows, starting
    at the run's first row; a trailing remainder shorter than ``window`` is
    discarded.

    Parameters
    ----------
    X : pandas.DataFrame
        Rows ordered track by track, containing a ``track_id`` column and the
        ``cat`` column.
    cat : str
        Name of the label column. Each window is labelled with the value found
        in its first row.
    window : int, optional
        Rows per window. Defaults to the notebook-level ``w_length``.
    n_features : int, optional
        Number of leading columns, counted after ``reset_index()``, that are
        used as features. Defaults to 27.

    Returns
    -------
    tuple of (numpy.ndarray, list)
        An array of shape ``(n_windows, window, k)``, where ``k`` is the number
        of selected feature columns, and the list of ``n_windows`` labels.
        When no track is long enough, the array has zero windows and the list
        is empty.

    Raises
    ------
    ValueError
        If ``window`` is not positive.
    """
    if window is None:
        window = w_length
    if window <= 0:
        raise ValueError("window must be a positive integer")

    # NOTE(review): reset_index() keeps the previous index as the first column,
    # so it is counted among the first `n_features` columns; confirm that this
    # is intended.
    X = X.reset_index()
    features = X.iloc[:, :n_features].to_numpy()
    label_values = X[cat].to_numpy()
    track_ids = X["track_id"]

    # Row positions where a new track starts, plus a sentinel one past the end.
    is_start = (track_ids.shift(1) != track_ids).to_numpy()
    bounds = np.flatnonzero(is_start).tolist()
    bounds.append(len(X))

    windows = []
    labels = []
    for begin, end in zip(bounds[:-1], bounds[1:]):
        # Only full windows: stop as soon as fewer than `window` rows remain.
        for pos in range(begin, end - window + 1, window):
            windows.append(features[pos:pos + window])
            labels.append(label_values[pos])

    if not windows:
        # np.stack cannot handle an empty list; return a correctly shaped empty array.
        return np.empty((0, window, features.shape[1]), dtype=features.dtype), labels
    return np.stack(windows), labels

In [47]:
# Cut each subset into windows; every call returns (array of windows, list of labels).
test_x, test_labels= split(test, Category)
#del(test)
val_x, val_labels = split(val, Category)
#del val
train_x, train_labels = split(train, Category)
#del train

In [48]:
# Sanity check: the number of labels should equal the number of windows.
print(test_x.shape)
print(len(test_labels))

(8378, 200, 27)
8378


### One-hot encode labels

In [49]:
from sklearn.preprocessing import OneHotEncoder
# Fit on `cats`, which was computed from the full frame before it was split, so
# the encoder is not limited to the values present in the training subset.
enc = OneHotEncoder()
# reshape(-1, 1) turns the 1-D array of values into a single-column 2-D array.
enc.fit(cats.reshape(-1, 1))

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [50]:
# Values the encoder was fitted on.
cats

array(['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH', 'BO', 'BR', 'CA',
       'CH', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DE', 'DK', 'DO', 'DZ', 'EC',
       'EE', 'EG', 'ES', 'FI', 'FR', 'GB', 'GR', 'GT', 'HK', 'HN', 'HU',
       'ID', 'IE', 'IL', 'IN', 'IS', 'IT', 'JO', 'JP', 'KW', 'LB', 'LI',
       'LT', 'LU', 'LV', 'MA', 'MC', 'MT', 'MX', 'MY', 'NL', 'NO', 'NZ',
       'OM', 'PA', 'PE', 'PH', 'PL', 'PS', 'PT', 'PY', 'QA', 'RO', 'SA',
       'SE', 'SG', 'SK', 'SV', 'TH', 'TN', 'TR', 'TW', 'US', 'UY', 'VN',
       'ZA'], dtype=object)

In [51]:
# Sorted distinct labels that actually occur among the training windows.
np.unique(train_labels)

array(['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH', 'BO', 'BR', 'CA',
       'CH', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DE', 'DK', 'DO', 'DZ', 'EC',
       'EE', 'EG', 'ES', 'FI', 'FR', 'GB', 'GR', 'GT', 'HK', 'HN', 'HU',
       'ID', 'IE', 'IL', 'IN', 'IS', 'IT', 'JO', 'JP', 'KW', 'LB', 'LI',
       'LT', 'LU', 'LV', 'MA', 'MC', 'MT', 'MX', 'MY', 'NL', 'NO', 'NZ',
       'OM', 'PA', 'PE', 'PH', 'PL', 'PS', 'PT', 'PY', 'QA', 'RO', 'SA',
       'SE', 'SG', 'SK', 'SV', 'TH', 'TN', 'TR', 'TW', 'US', 'UY', 'VN',
       'ZA'], dtype='<U2')

In [59]:
from sklearn.utils import class_weight
# 'balanced' weights are inversely proportional to how often each class occurs.
# `classes` and `y` are passed by keyword because recent scikit-learn releases
# no longer accept them positionally. The result is an array ordered like
# `classes`, i.e. like the sorted distinct training labels.
class_weights = class_weight.compute_class_weight('balanced',
                                                 classes=np.unique(train_labels),
                                                 y=np.asarray(train_labels))
class_weights

array([0.92897867, 0.45788522, 1.15705128, 0.98964181, 1.21910135,
       1.74078887, 0.97960742, 0.55605054, 1.92266233, 4.68430458,
       0.92122796, 1.49788809, 1.46107799, 5.56050544, 4.20059922,
       0.99345791, 2.08331606, 0.64248567, 1.69126971, 1.5335521 ,
       1.47783988, 4.80068482, 1.51848773, 0.75701298, 1.01166264,
       0.77757571, 1.39514487, 1.98182117, 0.61390807, 1.12341607,
       0.48610708, 0.8319809 , 1.17463565, 0.70649932, 0.57295052,
       1.20391006, 1.39514487, 2.37089036, 1.02507992, 0.42189424,
       0.56499288, 1.88056023, 0.82224495, 1.4078511 , 2.08894664,
       1.20017121, 1.08250736, 1.88514697, 1.71376997, 1.26086502,
       5.60079896, 0.71765112, 4.15543149, 0.74246903, 0.79928672,
       0.3988185 , 1.83154089, 0.9565721 , 4.60065629, 0.98711399,
       1.03468575, 1.10415751, 2.59365858, 0.45599425, 1.66935261,
       0.43107097, 2.6111833 , 0.54163298, 0.6994663 , 1.1890927 ,
       0.77213812, 1.74866574, 2.18336231, 0.45252357, 1.83154

In [60]:
def _one_hot(labels):
    """Encode a sequence of labels as a dense one-hot matrix using `enc`."""
    column = np.array(labels).reshape(-1, 1)
    return enc.transform(column).toarray()


# NOTE: the label variables are overwritten, so re-running this cell would feed
# already-encoded arrays back into the encoder.
test_labels = _one_hot(test_labels)
val_labels = _one_hot(val_labels)
train_labels = _one_hot(train_labels)

### Define Model

In [61]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM



In [62]:
# Size of the last axis of the training windows (features per row).
train_x.shape[2]

27

In [63]:
out_index = 2  # not referenced in the visible cells

# Two stacked LSTM layers followed by a softmax over the categories.
model = keras.Sequential()
# The first layer declares the input shape and returns the whole sequence so
# that the second LSTM receives one vector per time step.
model.add(LSTM(50, input_shape=(w_length, train_x.shape[2]), return_sequences = True))
# input_shape is only meaningful on the first layer of a Sequential model, so
# it is not repeated here.
model.add(LSTM(50, dropout=.5))
#model.add(Dense(200, activation= "relu"))
#model.add(keras.layers.Dropout(.25))
# One output unit per category known to the one-hot encoder.
model.add(Dense(len(enc.categories_[0]), activation= "softmax"))
model.compile(loss = "categorical_crossentropy", optimizer= "adam", metrics=["acc"])
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7 (LSTM)                (None, 200, 50)           15600     
_________________________________________________________________
lstm_8 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_4 (Dense)              (None, 78)                3978      
Total params: 39,778
Trainable params: 39,778
Non-trainable params: 0
_________________________________________________________________
None


In [64]:
# Only the last expression of a cell is displayed, so just the dtype is shown.
type(train_x)
train_x.dtype

dtype('float64')

In [65]:
# (number of training windows, number of one-hot columns)
train_labels.shape

(60287, 78)

In [66]:
# Share of the most frequent class among the training windows: the column sums
# of the one-hot matrix are the per-class counts.
train_labels.sum(axis = 0).max() / train_labels.sum()

0.032146233848093286

In [67]:
# Share of the most frequent class among the test windows: the column sums of
# the one-hot matrix are the per-class counts.
test_labels.sum(axis = 0).max() / test_labels.sum()

0.033540224397230846

In [68]:
# Categories learned by the encoder; their order is the one-hot column order.
enc.categories_[0]

array(['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH', 'BO', 'BR', 'CA',
       'CH', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DE', 'DK', 'DO', 'DZ', 'EC',
       'EE', 'EG', 'ES', 'FI', 'FR', 'GB', 'GR', 'GT', 'HK', 'HN', 'HU',
       'ID', 'IE', 'IL', 'IN', 'IS', 'IT', 'JO', 'JP', 'KW', 'LB', 'LI',
       'LT', 'LU', 'LV', 'MA', 'MC', 'MT', 'MX', 'MY', 'NL', 'NO', 'NZ',
       'OM', 'PA', 'PE', 'PH', 'PL', 'PS', 'PT', 'PY', 'QA', 'RO', 'SA',
       'SE', 'SG', 'SK', 'SV', 'TH', 'TN', 'TR', 'TW', 'US', 'UY', 'VN',
       'ZA'], dtype=object)

In [None]:
# Keras expects `class_weight` as a dict {class index: weight}, so the weight
# array is converted with enumerate(). The array is ordered like the sorted
# distinct training labels, which matches the one-hot column order as long as
# every category occurs in the training set.
model.fit(train_x, train_labels,
          epochs = 50,
          shuffle = True,
          validation_data = (val_x, val_labels),
          batch_size = 512,
          class_weight=dict(enumerate(class_weights)))

Train on 60287 samples, validate on 15011 samples
Epoch 1/50

In [None]:
# Retrieve the layer at position 0 of the model.
model.get_layer(index = 0)