In [1]:
import numpy as np
import pandas as pd

In [7]:
data = pd.read_csv('data/fma/pca95_data.csv')

print(data.shape)
data.tail(10)
print(data.iloc[1:8001, 0])

data_ = data.set_index(data['track_id']).drop(data_.columns[0], axis=1)
data_.tail()

(8000, 209)
1            5
2           10
3          140
4          141
5          148
         ...  
7995    154308
7996    154309
7997    154413
7998    154414
7999    155066
Name: track_id, Length: 7999, dtype: int64


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,198,199,200,201,202,203,204,205,206,genre
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
154308,-1.835371,-3.867849,-6.521848,-2.411572,3.434481,2.289318,-0.829162,-0.129816,-3.757467,0.445772,...,-0.419148,-0.558558,0.445757,0.76863,-1.068514,-0.262254,-0.255049,-0.360921,0.713248,3
154309,-11.417151,-1.690663,3.894931,5.543107,-3.539808,-0.751535,1.902261,-8.342069,-0.799709,-5.173376,...,-0.846237,0.63126,-0.050538,1.153211,-0.327087,0.745656,-0.017804,0.305331,0.027885,3
154413,-7.985121,-8.34207,-8.434832,5.441379,0.428404,3.982633,3.205439,2.140876,2.614021,0.73776,...,-0.141369,0.835807,-0.682346,-0.724363,0.220534,-0.193568,0.935017,0.227057,-0.888828,6
154414,-8.804382,-3.467991,-0.408808,2.326901,-4.537887,-4.941451,3.604733,1.655934,-0.287049,-0.54133,...,0.542697,-0.50865,-0.3938,-0.133884,-0.449462,0.234049,0.427487,-0.354786,-0.171535,6
155066,-6.918844,-5.785399,-4.790543,-2.403361,0.468361,-0.12557,1.921701,0.25092,-4.934083,-2.80338,...,-0.060823,-0.310459,-0.691978,-1.086134,-0.970882,0.498464,-0.510837,-0.547809,0.651497,3


In [13]:
X = data.iloc[1:8001, 1:208].values
y = data.iloc[1:8001, 208].values

In [17]:
def standardise(X):
    mu = np.mean(X, 0)
    sigma = np.std(X, 0)
    X_std = (X - mu) / sigma
    return X_std

X_std = pd.DataFrame(standardise(X))

In [18]:
# insert 1 in every row for intercept b
X_std.insert(loc=len(X_std.columns), column='intercept', value=1)

# stacking data X and labels y into one matrix
data_split = np.hstack((X_std, y[:, np.newaxis]))
       
np.random.shuffle(data_split)

split_rate = 0.7
train, test = np.split(data_split, [int(split_rate*(data_split.shape[0]))])

X_train = train[:,:-1]
y_train = train[:, -1]

X_test = test[:,:-1]
y_test = test[:, -1]

y_train = y_train.astype(float)
y_test = y_test.astype(float)

In [19]:
def compute_cost(w, X, y, regul_strength=1e5):
    n = X.shape[0]
    distances = 1 - y * (X @ w)
    distances[distances < 0] = 0
    hinge = regul_strength * distances.mean()

    # calculate cost
    return 0.5 * np.dot(w, w) + hinge

In [20]:
def calculate_cost_gradient(w, X_batch, y_batch, regul_strength=1e5):
    # if only one example is passed
    if type(y_batch) == np.float64:
        y_batch = np.asarray([y_batch])
        X_batch = np.asarray([X_batch])

    distance = 1 - (y_batch * (X_batch @ w))
    dw = np.zeros(len(w))

    for ind, d in enumerate(distance):
        if max(0, d)==0:
            di = w
        else:
            di = w - (regul_strength * y_batch[ind] * X_batch[ind])
        dw += di

    return dw/len(y_batch)

In [21]:
def sgd(X, y, batch_size=16, max_iterations=2000, stop_criterion=0.01, learning_rate=1e-5, regul_strength=1e5, print_outcome=False):
    # initialise zero weights
    weights = np.zeros(X.shape[1])
    nth = 0
    # initialise starting cost as infinity
    prev_cost = np.inf
    
    # stochastic gradient descent
    indices = np.arange(len(y))
    for iteration in range(1, max_iterations):
        # shuffle to prevent repeating update cycles
        np.random.shuffle(indices)
        batch_idx = indices[:batch_size]
        X_b, y_b = X[batch_idx], y[batch_idx]
        for xi, yi in zip(X_b, y_b):
            ascent = calculate_cost_gradient(weights, xi, yi, regul_strength)
            weights = weights - (learning_rate * ascent)

        # convergence check on 2^n'th iteration
        if iteration==2**nth or iteration==max_iterations-1:
            # compute cost
            cost = compute_cost(weights, X, y, regul_strength)
            if print_outcome:
                print("Iteration is: {}, Cost is: {}".format(iteration, cost))
            # stop criterion
            if abs(prev_cost - cost) < stop_criterion * prev_cost:
                return weights
              
            prev_cost = cost
            nth += 1
    
    return weights

In [22]:
w = sgd(X_train, y_train, batch_size=32, max_iterations=2000, stop_criterion=0.01, learning_rate=1e-5, regul_strength=1e3, print_outcome=True)
print("Training finished.")

Iteration is: 1, Cost is: 1708.065518639834
Iteration is: 2, Cost is: 1690.6992477254603
Iteration is: 4, Cost is: 1390.4088558767921
Iteration is: 8, Cost is: 822.0919930067532
Iteration is: 16, Cost is: 554.7778174932002
Iteration is: 32, Cost is: 470.5072800348603
Iteration is: 64, Cost is: 358.47450929804074
Iteration is: 128, Cost is: 263.1960896356067
Iteration is: 256, Cost is: 196.97363092501456
Iteration is: 512, Cost is: 173.32167398166843
Iteration is: 1024, Cost is: 160.66406664816338
Iteration is: 1999, Cost is: 146.9612783354271
Training finished.


In [23]:
def score(w, X, y):
    y_preds = np.sign(X @ w)
    return np.mean(y_preds == y)

In [24]:
print("Accuracy on train set: {}".format(score(w, X_train, y_train)))
print("Accuracy on test set: {}".format(score(w, X_test, y_test)))

Accuracy on train set: 0.12627254866940524
Accuracy on test set: 0.12208333333333334


In [25]:
def cross_val_split(N, num_folds):
    fold_size = N // num_folds
    index_perm = np.random.permutation(np.arange(N))
    folds = []
    for k in range(num_folds):
      folds.append(index_perm[k*fold_size:(k+1)*fold_size])
    return folds

In [26]:
folds = cross_val_split(train.shape[0], 5)
folds

[array([ 124, 1197, 1632, ..., 2746, 2332, 5372]),
 array([ 895, 4504, 3256, ..., 4943,  388, 1858]),
 array([ 139, 3623, 5587, ..., 2718, 4274, 1528]),
 array([1757, 3917, 2595, ..., 2870, 5268, 1870]),
 array([4547, 4780,   70, ..., 1229,    6, 2757])]

In [27]:
def cross_val_evaluate(data, num_folds):
    folds = cross_val_split(data.shape[0], num_folds)

    train_scores = []
    val_scores = []
    
    for i in range(len(folds)):
        print('Fold', i+1)

        val_indices = folds[i]
        train_indices = list(set(range(data.shape[0])) - set(val_indices))

        X_train = data[train_indices,  :-1]
        y_train = data[train_indices, -1]
        
        X_val = data[val_indices,  :-1]
        y_val = data[val_indices, -1]

        w = sgd(X_train, y_train, max_iterations=1025, stop_criterion=0.01, learning_rate=1e-5, regul_strength=1e3)
        print("Training finished.")

        train_score = score(w, X_train, y_train)
        val_score = score(w, X_val, y_val)
        print("Accuracy on train set #{}: {}".format(i+1, train_score))
        print("Accuracy on validation set #{}: {}".format(i+1, val_score))

        train_scores.append(train_score)
        val_scores.append(val_score)

    return train_scores, val_scores

In [28]:
train_scores, val_scores = cross_val_evaluate(train, 5)

Fold 1
Training finished.
Accuracy on train set #1: 0.1279017857142857
Accuracy on validation set #1: 0.11796246648793565
Fold 2
Training finished.
Accuracy on train set #2: 0.12165178571428571
Accuracy on validation set #2: 0.14119749776586238
Fold 3
Training finished.
Accuracy on train set #3: 0.07008928571428572
Accuracy on validation set #3: 0.06613047363717604
Fold 4
Training finished.
Accuracy on train set #4: 0.1279017857142857
Accuracy on validation set #4: 0.11885612153708669
Fold 5
Training finished.
Accuracy on train set #5: 0.12410714285714286
Accuracy on validation set #5: 0.13315460232350312


In [29]:
print(np.mean(train_scores), np.mean(val_scores))

0.11433035714285714 0.1154602323503128
