In [1]:
from models import *
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, accuracy_score
from scoop import futures
import argparse
import os
import idx2numpy
from itertools import chain
import pickle

In [2]:
def data_processing(data):
    data.dropna(inplace=True)
    data.reset_index(drop=True, inplace=True)
    categorical_features = list(data.select_dtypes(include='object').columns)
    categorical_features = list(set(categorical_features))
    numerical_features = [c for c in data.columns if c not in categorical_features]
    scaler = MinMaxScaler()
    data[numerical_features] = scaler.fit_transform(data[numerical_features])

    for _c in categorical_features:
        data[_c] = pd.Categorical(data[_c])
    df_transformed = pd.get_dummies(data, drop_first=False)
    return df_transformed, scaler

def add_violations(X, y, count=40):
    indices = list(np.where((X.T[12] == 1))[0])
    instances_to_change = random.sample(indices, count)
    for i in instances_to_change:
        new_positive = random.randint(0, 2)
        new_y = [0]*5
        new_y[new_positive] = 1
        y[i] = np.array(new_y)

        
def get_violations(pred, X):
    indices = np.where((X.T[12] == 1))[0]
    pred_pop = pred[indices].T[3]
    pred_rock = pred[indices].T[4]
    pred_pop_or_rock = pred_pop + pred_rock
    return len(indices) - sum(pred_pop_or_rock)

In [3]:
dataset = 'music_data_24122020.csv'
dataset = pd.read_csv(dataset).iloc[:, 1:]
np.unique(dataset['Music Style'].tolist())

array(['Classical', 'Electronic', 'Metal', 'Pop', 'Rock'], dtype='<U10')

In [4]:
seed = 5
OUTER_FOLDS = 5

random.seed(100 + seed)
torch.manual_seed(100 + seed)
np.random.seed(100 + seed)
    
    
dataset = 'music_data_24122020.csv'
dataset = pd.read_csv(dataset).iloc[:, 1:]
dataset, scaler = data_processing(dataset)
y = np.array(dataset.iloc[:, -5:], dtype=np.float32)
X = np.array(dataset.iloc[:, :-5], dtype=np.float32)
add_violations(X, y, 60)
y = np.array(y, dtype=np.float32)

# number of folds for cross-validation
kf = KFold(n_splits=OUTER_FOLDS, shuffle=True, random_state=random.randint(0, 1000))
split_data = list(kf.split(X))

In [5]:
import torch.nn.functional as F

def input_last_layer(model, X):
    out = X
    for i in range(len(model.Layers) - 1):
        out = model.Layers[i](out)
        out = F.relu(out)
    return out.detach().numpy()

In [16]:
i = 1
for train_index, test_index in split_data:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(get_violations(y_train, X_train))
    print(get_violations(y_test, X_test))
    if i == 1:
        sys.exit(0)
    i += 1

51.0
10.0


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [17]:
# unconstrained model
unconstrained_model = MusicRegularizedModel(sizes=[13, 50, 30, 10, 5])
unconstrained_model.learn(X_train, y_train, learning_rate=0.1, batch_size=5, epochs=300,
                          momentum=0, K=K, alpha=0, validation_set_ratio=0.1, loss_type='srb')

epoch 1, validation loss: 0.5758939981460571; training loss: 0.5739101767539978; best loss: 0.5758939981460571
epoch 2, validation loss: 0.5062366127967834; training loss: 0.506457507610321; best loss: 0.5062366127967834
epoch 3, validation loss: 0.4920218884944916; training loss: 0.4898495674133301; best loss: 0.4920218884944916
epoch 4, validation loss: 0.49027854204177856; training loss: 0.4868735373020172; best loss: 0.49027854204177856
epoch 5, validation loss: 0.48521509766578674; training loss: 0.483405202627182; best loss: 0.48521509766578674
epoch 6, validation loss: 0.4788568317890167; training loss: 0.4772658944129944; best loss: 0.4788568317890167
epoch 7, validation loss: 0.46362558007240295; training loss: 0.4627443552017212; best loss: 0.46362558007240295
epoch 8, validation loss: 0.42948293685913086; training loss: 0.43277424573898315; best loss: 0.42948293685913086
epoch 9, validation loss: 0.3894917070865631; training loss: 0.39699873328208923; best loss: 0.3894917070

In [18]:
deepsade_model = torch.load('{}/deepsade_model_fold_{}.pt'.format(seed, i))

In [19]:
from sklearn.manifold import TSNE

def find_relevant_instances(X):
    indices = list(np.where((X.T[12] == 1))[0])
    return indices

def get_tsne_data(model, X_test):
    X_l = input_last_layer(model, torch.tensor(X_test))
    X_embedded = TSNE(n_components=2, init='random', perplexity=5).fit_transform(X_l)
    X_embedded = pd.DataFrame(X_embedded)
    X_embedded.columns = ['dim1', 'dim2']
    
    score = model(torch.tensor(X_test)).detach().numpy()
    pred = []
    for p in score:
        pred.append(np.argmax(p))
        pred_d = np.zeros((len(pred), len(score[0])))
        pred_d[np.arange(len(pred)), pred] = 1
    pred_d = np.array(pred_d)
    print(pred_d.sum(axis=0))
    print(get_violations(pred_d, X_test))
    
    predicted_genre = []
    for i in range(len(pred_d)):
        if pred_d[i][0] == 1:
            predicted_genre.append('classical')
        elif pred_d[i][1] == 1:
            predicted_genre.append('electronic')
        elif pred_d[i][2] == 1:
            predicted_genre.append('metal')
        elif pred_d[i][3] == 1:
            predicted_genre.append('pop')
        else:
            predicted_genre.append('rock')
    
    pop_rock_prediction_idx = [i for i in range(len(pred_d)) if (pred_d[i][3] == 1) or (pred_d[i][4] == 1)]
    beatles_idx = find_relevant_instances(X_test)
    X_embedded['beatles'] = 'no'
    X_embedded['pop-or-rock'] = 'no'
    X_embedded.loc[beatles_idx, 'beatles'] = 'yes'
    X_embedded.loc[pop_rock_prediction_idx, 'pop-or-rock'] = 'yes'
    X_embedded['predicted-genre'] = predicted_genre
    return X_embedded

In [20]:
X_test_tsne_deepsade = get_tsne_data(deepsade_model, X_test)
X_test_tsne_unconstrained = get_tsne_data(unconstrained_model, X_test)

[20. 26. 38. 36. 39.]
0.0
[22. 25. 37. 41. 34.]
2.0


In [21]:
def big_chart(chart, fontsize = 20): 
    return chart.configure_axis(
        grid = True, 
    labelFontSize = fontsize,
    titleFontSize = fontsize
).configure_title(
    fontSize = fontsize
    ).configure_legend(
titleFontSize=fontsize,
labelFontSize=fontsize
).configure_view(
    strokeWidth=0
)

def small_chart(chart, fontsize=None): 
    return big_chart(chart.properties(width=150,
                             height=150
                            ), fontsize)

In [22]:
import altair as alt

c1 = alt.Chart(X_test_tsne_deepsade).mark_point().encode(
    x='dim1',
    y='dim2',
    color='beatles'
)

c2 = alt.Chart(X_test_tsne_deepsade).mark_point().encode(
    x='dim1',
    y='dim2',
    color='pop-or-rock'
)

c3 = alt.Chart(X_test_tsne_deepsade).mark_point().encode(
    x='dim1',
    y='dim2',
    color='predicted-genre'
)

big_chart(c1, fontsize=25)

In [23]:
big_chart(c2, fontsize=25)

In [26]:
c1_u = alt.Chart(X_test_tsne_unconstrained).mark_point().encode(
    x='dim1',
    y='dim2',
    color='beatles'
)

c2_u = alt.Chart(X_test_tsne_unconstrained).mark_point().encode(
    x='dim1',
    y='dim2',
    color='pop-or-rock'
)

c3_u = alt.Chart(X_test_tsne_unconstrained).mark_point().encode(
    x='dim1',
    y='dim2',
    color='predicted-genre'
)

big_chart(c1_u, fontsize=25)

In [27]:
big_chart(c2_u, fontsize=25)

In [183]:
X_embedded_2 = TSNE(n_components=2, init='random', perplexity=3).fit_transform(X_l)
X_embedded_2 = pd.DataFrame(X_embedded_2)
X_embedded_2.columns = ['dim1', 'dim2']

beatles_idx = find_relevant_instances(X_test)
X_embedded_2['beatles'] = 'no'
X_embedded_2['pop-or-rock'] = 'no'
X_embedded_2.loc[beatles_idx, 'beatles'] = 'yes'
X_embedded_2.loc[pop_rock_prediction_idx, 'pop-or-rock'] = 'yes'
X_embedded_2['predicted-genre'] = predicted_genre
X_embedded_2

Unnamed: 0,dim1,dim2,beatles,pop-or-rock,predicted-genre
0,-14.509541,-58.773964,no,yes,rock
1,-10.274336,-22.682421,yes,yes,pop
2,-28.612734,1.138877,yes,yes,pop
3,1.365832,-21.182203,yes,yes,pop
4,-19.620451,-7.197937,yes,yes,pop
...,...,...,...,...,...
154,51.846794,-13.995423,no,no,electronic
155,-24.754915,-3.534508,yes,yes,pop
156,-29.966299,1.877599,yes,yes,pop
157,41.557880,37.750122,no,no,metal


In [184]:
c1 = alt.Chart(X_embedded_2).mark_point().encode(
    x='dim1',
    y='dim2',
    color='beatles'
)
big_chart(c1, fontsize=25)