### Import libraries and set parameters

In [1]:
import pandas as pd
import numpy as np
import glob
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import class_weight
import pickle
import random

# Name of the track-table column that is used as the classification label.
Category = "Country"

# Two-letter country codes; each one is used as a file-name prefix when the
# raw playlist CSVs are globbed.
# NOTE(review): this list used to be assigned twice in a row; the first
# version also contained 'NZ' but was immediately overwritten, so 'NZ' has
# never been part of the effective list. Add it here if it should be loaded.
countriesOfInterest = ["HK", "JP", 'ZA', 'TN', 'TR', 'GB', 'MX', 'US', 'CO', 'EC', 'AU']

# Number of consecutive rows that make up one window produced by split().
w_length = 200

### Import track data

In [None]:
# Load every playlist CSV of the selected countries into one DataFrame.
# Country, year and playlist name are sliced out of the file's base name at
# fixed positions: chars 0-1 = country, 3-6 = year, 8.. = playlist
# (presumably "<CC>_<YYYY>_<playlist>.csv" -- TODO confirm the separators).
frames = []
for country in countriesOfInterest:
    csv_paths = glob.glob(os.path.join("Raw Track Data", country + "*.csv"))
    print(country, "has", str(len(csv_paths)), "playlists")
    for csv_path in csv_paths:
        # random.random() is in [0, 1), so this is always true; lower the
        # threshold to load only a random subset of the playlists.
        if random.random() < 1:
            base_name = os.path.basename(csv_path)
            new = pd.read_csv(csv_path)
            new["Country"] = base_name[0:2]
            new["Year"] = base_name[3:7]
            new["Playlist"] = base_name[8:-4]
            frames.append(new)

if not frames:
    raise FileNotFoundError("No playlist CSV files were loaded from 'Raw Track Data'")

# One concat instead of DataFrame.append inside the loop: append copied the
# whole accumulated frame on every call and no longer exists in pandas >= 2.0.
allTracks = pd.concat(frames)
del frames, new
allTracks = allTracks.drop(["confidence", "loudness_start", "loudness_max_time"], axis = 1)
print("Unique tracks:\t", len(pd.unique(allTracks.track_id)))

HK has 92 playlists
JP has 66 playlists
ZA has 119 playlists
TN has 107 playlists
TR has 82 playlists


### Remove tracks in multiple countries

In [None]:
# Count, per track_id, how many distinct countries the track appears in.
# Only the Country column is needed, so nunique() is restricted to it instead
# of being computed for every column; `hold` stays a DataFrame with a
# `Country` column, indexed by track_id.
hold = allTracks.groupby("track_id")[["Country"]].nunique()
# Keep only the tracks that occur in exactly one country.
keep = hold[hold.Country==1].index
allTracks = allTracks[allTracks.track_id.isin(keep)]

In [None]:
# Show the table size before and after removing rows that repeat the same
# (track_id, start) pair; the first occurrence of each pair is kept.
print(allTracks.shape)
allTracks = allTracks.drop_duplicates(subset=["track_id", "start"])
print(allTracks.shape)

In [None]:
# Average number of distinct countries per track_id, taken from `hold`
# (which was computed before the multi-country tracks were filtered out).
hold.Country.sum() / len(hold)

### Fit one-hot encoder

In [None]:
# Distinct label values, in order of first appearance in the table.
cats = allTracks[Category].unique()
print(cats)

In [None]:
# Fit the one-hot encoder on the distinct label values (one value per row,
# single feature column) and persist it for later use.
enc = OneHotEncoder()
enc.fit(cats.reshape(-1, 1))
# `with` guarantees the file is flushed and closed once the dump is done.
with open("pickle\\enc.p", "wb") as fh:
    pickle.dump(enc, fh)

### Separate training, testing, and validation datasets

In [None]:
# Split by track, not by row, so every row of a track lands in one set only.

# Test set: a random 10% of the distinct tracks.
UniqueTracks = allTracks.track_id.unique()
testTracks = np.random.choice(UniqueTracks, int(len(UniqueTracks) * .1), replace=False)
in_test = allTracks.track_id.isin(testTracks)
test = allTracks.loc[in_test]
allTracks = allTracks.loc[~in_test]

# Validation set: a random 20% of the tracks that remain after the test split.
UniqueTracks = allTracks.track_id.unique()
testTracks = np.random.choice(UniqueTracks, int(len(UniqueTracks) * .2), replace=False)
in_val = allTracks.track_id.isin(testTracks)
val = allTracks.loc[in_val]

# Training set: everything that is left.
train = allTracks.loc[~in_val]

# Drop the intermediates so only test / val / train stay in memory.
del UniqueTracks, testTracks, allTracks, in_test, in_val

### Structure datasets

In [None]:
def split(X, cat, window_length=None):
    """Cut a track table into fixed-length windows that never span two tracks.

    Rows are grouped into runs of consecutive equal ``track_id``; each run is
    chopped into chunks of ``window_length`` rows and chunks shorter than
    that (the tail of each run) are discarded.

    Parameters
    ----------
    X : pandas.DataFrame
        Table with a ``track_id`` column and a ``cat`` column. Rows that
        belong to one track are expected to be contiguous; a track_id that
        re-appears later is treated as a new run.
    cat : str
        Name of the column that holds the label. (Previously this argument
        was ignored and the global ``Category`` was read instead.)
    window_length : int, optional
        Rows per window. Defaults to the module-level ``w_length``.

    Returns
    -------
    tuple
        ``(windows, labels)``: ``windows`` is an array of shape
        ``(n_windows, window_length, n_cols)`` built from the first 27
        columns of the re-indexed table, and ``labels`` is a list with the
        ``cat`` value of the first row of each window.

    Raises
    ------
    ValueError
        If ``X`` is empty or no run has at least ``window_length`` rows.
    """
    if window_length is None:
        window_length = w_length

    # NOTE(review): reset_index() without drop=True inserts the old index as
    # column 0, so it is one of the 27 columns taken below -- confirm intended.
    X = X.reset_index()
    track_ids = X.track_id

    # Row positions where a new track starts, plus one-past-the-end sentinel.
    run_starts = list(track_ids.index[track_ids.shift(1) != track_ids])
    run_starts.append(max(track_ids.index) + 1)

    split_pos = []
    for begin, end in zip(run_starts, run_starts[1:]):
        split_pos.extend(range(begin, end, window_length))
    # np.split expects interior cut points only; the first entry is row 0.
    split_pos = split_pos[1:]

    chunks = np.split(X.iloc[:, :27].to_numpy(), split_pos)
    chunk_labels = np.split(X[cat].to_numpy(), split_pos)

    # Keep only the full-length windows together with their label.
    windows = []
    labels = []
    for chunk, labs in zip(chunks, chunk_labels):
        if chunk.shape[0] == window_length:
            windows.append(chunk)
            labels.append(labs[0])
    return np.stack(windows), labels

In [None]:
# Window each subset in turn and free its source table right away so the
# raw rows and the windowed copy are not both kept in memory.
test_x, test_labels = split(test, Category)
del test

val_x, val_labels = split(val, Category)
del val

train_x, train_labels = split(train, Category)
del train

In [None]:
# Persist the windowed feature arrays; `with` closes each file after writing
# instead of leaving the handle open until garbage collection.
with open("pickle\\test_x.p", "wb") as fh:
    pickle.dump(test_x, fh)
with open("pickle\\val_x.p", "wb") as fh:
    pickle.dump(val_x, fh)
with open("pickle\\train_x.p", "wb") as fh:
    pickle.dump(train_x, fh)

### Generate class weights for unbalanced classes

In [None]:
# 'balanced' weights for the training labels, one weight per distinct label
# in the order given by `classes`, i.e. the sorted order from np.unique.
# Arguments are passed by keyword: recent scikit-learn releases reject the
# positional form, while the keyword form is accepted by older ones as well.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
class_weights

In [None]:
# Persist the class weights; `with` closes the file once the dump is done.
with open("pickle\\class_weights.p", "wb") as fh:
    pickle.dump(class_weights, fh)

### One-hot encode labels

In [None]:
# Replace each label list by its dense one-hot matrix (one row per window),
# using the encoder fitted earlier. The labels are shaped into a single
# column because the encoder expects a 2-D input.
test_labels = enc.transform(np.reshape(test_labels, (-1, 1))).toarray()
val_labels = enc.transform(np.reshape(val_labels, (-1, 1))).toarray()
train_labels = enc.transform(np.reshape(train_labels, (-1, 1))).toarray()

In [None]:
# Persist the one-hot encoded labels; `with` closes each file after writing
# instead of leaving the handle open until garbage collection.
with open("pickle\\test_labels.p", "wb") as fh:
    pickle.dump(test_labels, fh)
with open("pickle\\val_labels.p", "wb") as fh:
    pickle.dump(val_labels, fh)
with open("pickle\\train_labels.p", "wb") as fh:
    pickle.dump(train_labels, fh)