In [26]:
import os
import csv
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.metrics import accuracy_score

### Data Loading

In [27]:
# df = pd.read_csv(os.path.join('testset', 'task_4_all_generic.csv'))
# pd.set_option('display.max_columns', None)

# test_set = df.loc[df['segment_id'].isin([2, 6, 8, 11, 12, 20])]
# train_set = df.loc[~df['segment_id'].isin([2, 6, 8, 11, 12, 20])]
# train_set
# test_set.to_csv(os.path.join('testset', 'task_4_test_strat.csv'), index=False)
# train_set.to_csv(os.path.join('testset', 'task_4_train_strat.csv'), index=False)

In [210]:
# Alternative evaluation sets; uncomment exactly one loader to switch.
# df = pd.read_csv(os.path.join('testset', 'task_4_test_strat.csv'))
# df = pd.read_csv(os.path.join('testset', 'task_4_test_generic.csv'))
# NOTE(review): read_pickle runs whatever code the pickle contains while
# loading it -- only use files from a trusted source.
df = pd.read_pickle(os.path.join('testset', 'old', 'task_3_test_pandas_v1.pkl')) #testset erich; groups with group_ids
# Lift the column limit so wide frames are displayed in full.
pd.set_option('display.max_columns', None)   

In [211]:
def load_data(df, data_components=(), target_components=()):
    """Assemble the feature frame, the target frame and the group ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Snippet table. ``pianist_id`` and ``segment_id`` are always read;
        the other named / positional columns are only needed for the
        components that are actually requested.
    data_components : iterable of str
        Column blocks to put into ``X``, in the given order. Accepted names:
        ``"emotions"`` (positional columns 175-187), ``"emotions_binary"``
        (positional columns 188 onward), ``"tech_features"`` (positional
        columns 3-171), ``"valence_arousal"``, ``"valence"``, ``"arousal"``.
    target_components : iterable of str
        Column blocks to put into ``y``, in the given order. Accepted names:
        ``"quadrant"``, ``"valence_arousal"``, ``"valence"``, ``"arousal"``,
        ``"emotions"``, ``"emotions_binary"``, ``"group_idx"`` (the computed
        ``groups`` series).

    Returns
    -------
    tuple
        ``(X, y, groups)`` where ``X`` and ``y`` are DataFrames sharing
        ``df.index`` and ``groups`` is the Series
        ``pianist_id * 100 + segment_id`` named ``"groups"``.

    Raises
    ------
    ValueError
        If a component name is not one of the accepted names. Previously
        such names were skipped silently, so a typo produced an empty frame.
    """
    groups = pd.Series(df["pianist_id"] * 100 + df["segment_id"], name='groups')

    # Lazy selectors: a column block is only touched when it is requested.
    # NOTE(review): the positional ranges are tied to the column layout of
    # the input file -- confirm them whenever the schema changes.
    shared_selectors = {
        "emotions": lambda: df.iloc[:, 175:188],
        "emotions_binary": lambda: df.iloc[:, 188:],
        "valence_arousal": lambda: df[['valence', 'arousal']],
        "valence": lambda: df['valence'],
        "arousal": lambda: df['arousal'],
    }
    data_selectors = dict(shared_selectors)
    data_selectors["tech_features"] = lambda: df.iloc[:, 3:172]
    target_selectors = dict(shared_selectors)
    target_selectors["quadrant"] = lambda: df['quadrant']
    target_selectors["group_idx"] = lambda: groups

    def _assemble(components, selectors, role):
        """Concatenate the requested column blocks side by side."""
        parts = []
        for component in components:
            if component not in selectors:
                raise ValueError(
                    f"Unknown {role} component {component!r}; "
                    f"expected one of {sorted(selectors)}"
                )
            parts.append(selectors[component]())
        frame = pd.DataFrame(index=df.index)
        if parts:
            # One concat instead of growing the frame inside the loop.
            frame = pd.concat([frame] + parts, axis=1)
        return frame

    X = _assemble(data_components, data_selectors, "data")
    y = _assemble(target_components, target_selectors, "target")

    return (X, y, groups)

In [212]:
# Technical features as inputs; valence, arousal and the group id as targets.
X, y, groups = load_data(
    df,
    data_components=['tech_features'],
    target_components=['valence', 'arousal', 'group_idx'],
)
y.head()

Unnamed: 0,valence,arousal,groups
0,-1.0,3.272727,113
1,-1.0,3.272727,113
2,-1.0,3.272727,113
3,-1.0,3.272727,113
4,-1.0,3.272727,113


### Load Models

In [213]:
# Load the persisted XGBoost regressors that are compared below.
# NOTE(review): joblib files are pickle-based; only load trusted artefacts.
# Earlier single-model variants, kept for reference:
#   xgb = load(os.path.join('best_XGBRe.joblib'))
#   knn = load(os.path.join('best_KNeig.joblib'))
#   knn1 = load(os.path.join('trained_models', 'best_KNeig_score_5_24_v_0.4_a_3.2_generic.joblib'))
xgb1, xgb2, xgb3, xgb4, xgb5 = (
    load(os.path.join('trained_models', file_name))
    for file_name in (
        'best_XGBRe_3_72_v_0_a_3_generic.joblib',
        'best_XGBRe_score_3_11.joblib',
        'best_XGBRe_score_3_15.joblib',
        'best_XGBRe_score_3_17.joblib',
        'best_XGBRe_score_4_05.joblib',
    )
)

# Models evaluated by the inference and scoring cells.
models = [xgb1, xgb2, xgb3, xgb4, xgb5]
# models= [knn1]

### Calculate quadrant from prediction

In [214]:
def calc_quadrant(y_hat):
    """Give every snippet the quadrant of its group's mean prediction.

    Parameters
    ----------
    y_hat : numpy.ndarray
        2-D array with valence in column 0, arousal in column 1 and the
        group id in column 2.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(len(y_hat), 1)``. Row ``i`` holds the
        quadrant computed by ``v_a_to_quadrant_skewed`` from the mean valence
        and mean arousal of the group row ``i`` belongs to. The dtype is the
        promotion of ``int`` with ``y_hat.dtype`` (float for float input),
        as before, so callers that cast with ``astype(int)`` keep working.
    """
    group_ids = y_hat[:, 2]
    # `inverse[i]` is the position of row i's group inside `unique_ids`.
    unique_ids, inverse = np.unique(group_ids, return_inverse=True)
    inverse = inverse.reshape(-1)

    group_quadrants = np.empty(
        len(unique_ids), dtype=np.result_type(np.int_, y_hat.dtype)
    )
    for slot in range(len(unique_ids)):
        members = inverse == slot
        mean_valence, mean_arousal = y_hat[members, :2].mean(axis=0)
        group_quadrants[slot] = v_a_to_quadrant_skewed(mean_valence, mean_arousal)

    # Broadcast each group's quadrant back to its snippets in one indexing
    # step instead of comparing every group against every row.
    return group_quadrants[inverse].reshape(-1, 1)
    

### Inference

In [215]:
# One prediction array per loaded model, in the same order as `models`.
v_a_predictions = [model.predict(X) for model in models]

v_a_predictions

[array([[-0.98236424,  3.31119156, 13.88181973],
        [-1.10189056,  3.18560362, 13.86159801],
        [-1.00606406,  3.24825549, 13.68513489],
        ...,
        [-0.35645688,  2.00673795,  5.55353689],
        [-0.21374188,  2.15671206,  6.49888515],
        [ 0.25534758,  2.19753742,  6.06625509]]),
 array([[-7.88428545e-01,  3.04954100e+00,  3.27322937e+02],
        [-5.85221946e-01,  3.31266809e+00,  4.57993927e+02],
        [-9.86335337e-01,  3.29500675e+00,  2.95689819e+02],
        ...,
        [-1.87183633e-01,  2.15968251e+00,  6.96101807e+02],
        [-3.28343958e-01,  2.42437506e+00,  6.29583374e+02],
        [-7.57906064e-02,  1.95280421e+00,  6.75077759e+02]]),
 array([[-9.49520350e-01,  2.93816161e+00,  2.43601929e+02],
        [-8.67586136e-01,  3.01529169e+00,  5.97901794e+02],
        [-8.80964279e-01,  3.32978916e+00,  3.35234955e+02],
        ...,
        [-1.56794786e-01,  2.13418937e+00,  8.45257141e+02],
        [-1.96188778e-01,  2.27112770e+00,  6.5021435

### Calculate Score

In [216]:
# Grouped score of every model's predictions against the true targets.
costs = [custom_error_grouped(y.to_numpy(), pred) for pred in v_a_predictions]
costs


[6.611111111111111,
 3.3703703703703702,
 3.5555555555555554,
 3.2222222222222223,
 4.111111111111111]

In [196]:
# NOTE(review): the output stored below this cell comes from an earlier
# execution (In [196]) and does not match the scores printed by the cell
# above (In [216]) -- re-run or remove this cell before sharing.
costs

[2.4545454545454546,
 5.681818181818182,
 6.2272727272727275,
 4.7272727272727275,
 5.75]

### Helpers

In [204]:
def v_a_to_quadrant(valence, arousal):
    """Return the quadrant (1-4) for one valence / arousal pair.

    Arousal is split at 3 and valence at 0; a value exactly on a boundary
    counts as the high / positive side.
    """
    quadrant_by_region = {
        # (arousal below 3, valence below 0) -> quadrant
        (True, True): 3,    # sad
        (True, False): 4,   # relaxed
        (False, True): 2,   # angry
        (False, False): 1,  # happy
    }
    return quadrant_by_region[(bool(arousal < 3), bool(valence < 0))]

In [208]:
def v_a_to_quadrant_skewed(valence, arousal, valence_threshold=0, arousal_threshold=3):
    """Return the quadrant (1-4) using adjustable decision boundaries.

    With the default thresholds this is identical to ``v_a_to_quadrant``,
    which is what the function did before the thresholds became parameters.
    Passing other thresholds shifts ("skews") the boundaries applied to
    predictions without touching any caller that relies on the defaults.

    NOTE(review): model file names in this notebook such as
    ``..._v_0.4_a_3.2_...`` look like they record such thresholds -- confirm
    before relying on that reading.

    Parameters
    ----------
    valence, arousal : float
        Predicted valence and arousal.
    valence_threshold : float, default 0
        Values strictly below it count as negative valence.
    arousal_threshold : float, default 3
        Values strictly below it count as low arousal.

    Returns
    -------
    int
        1 (happy), 2 (angry), 3 (sad) or 4 (relaxed).
    """
    if arousal < arousal_threshold:
        if valence < valence_threshold:
            return 3 # sad
        return 4 # relaxed
    if valence < valence_threshold:
        return 2 # angry
    return 1 # happy

In [130]:
def custom_error_grouped(y_true, y_hat):
    """Average per-group gain of predicted versus true quadrants.

    Rows are grouped by the id in the last column of ``y_true``. For each
    group the mean of columns 0 and 1 (read as valence and arousal) is turned
    into a quadrant -- ``v_a_to_quadrant`` for the truth,
    ``v_a_to_quadrant_skewed`` for the prediction -- and the matching
    ``gain_matrix`` entry is added up.

    Parameters
    ----------
    y_true : numpy.ndarray
        2-D array whose first two columns are valence and arousal and whose
        last column is the group id.
    y_hat : numpy.ndarray
        2-D predictions, row-aligned with ``y_true``. If it has three columns
        the last one is ignored and replaced by the true group id; otherwise
        the true group id is appended as an extra column. The caller's array
        is never modified.

    Returns
    -------
    float
        Sum of the per-group gains divided by the number of groups.
    """
    group_ids = y_true[:, -1]

    if len(y_hat[0]) == 3:
        # Copy first: writing into the argument would clobber the caller's
        # prediction array with group ids.
        y_hat = np.array(y_hat)
        y_hat[:, -1] = group_ids
    else:
        y_hat = np.hstack((y_hat, group_ids.reshape(-1, 1)))

    # Rows index the true quadrant, columns the predicted quadrant (both 1-4,
    # shifted to 0-3).
    # NOTE(review): entry [3][2] is -2 while its mirror [2][3] is -5 --
    # confirm against the task definition that this asymmetry is intended.
    gain_matrix = np.array([
    [5, -5, -5, 2],
    [-5, 10, 2, -5],
    [-5, 2, 10, -5],
    [2, -5, -2, 5]
    ])

    unique_ids = np.unique(group_ids)
    n_groups = len(unique_ids)

    total_gain = 0
    for group_id in unique_ids:
        members = y_hat[:, -1] == group_id
        true_mean = y_true[members].mean(axis=0)
        pred_mean = y_hat[members].mean(axis=0)
        true_quadrant = v_a_to_quadrant(true_mean[0], true_mean[1])
        pred_quadrant = v_a_to_quadrant_skewed(pred_mean[0], pred_mean[1])
        total_gain += gain_matrix[true_quadrant - 1][pred_quadrant - 1]

    return total_gain / n_groups

## Challenge Test Set Inference

In [131]:
# Unlabelled challenge data, read from the working directory.
challenge_test_set_df = pd.read_csv(r'challenge_test_set.csv')
# Everything after the first three columns is used as model input; those
# three columns are labelled pianist_id / segment_id / snippet_id when the
# submission frame is built further down.
X_challenge = challenge_test_set_df.iloc[:, 3:]

In [132]:
# Only the first entry of `models` is used for the submission.
pred_challenge = models[0].predict(X_challenge)
# No column names are given, so the columns get default integer labels.
pred_challenge_df = pd.DataFrame(pred_challenge)


In [133]:
# Attach the group id (pianist_id * 100 + segment_id) to every prediction row.
pred_challenge_df["group_idx"] = challenge_test_set_df["pianist_id"] * 100 + challenge_test_set_df["segment_id"]
# Column 2 exists only when the model was trained with group_idx as a third
# target. Dropping with errors="ignore" and rebinding (instead of
# inplace=True) keeps this cell re-runnable and lets it work for
# two-output models as well.
pred_challenge_df = pred_challenge_df.drop(columns=[2], errors="ignore")
preds_with_group_idx = pred_challenge_df.to_numpy()
preds_with_group_idx

array([[1.68882143e-02, 2.88751078e+00, 1.27000000e+02],
       [2.53189802e-01, 2.81201744e+00, 1.27000000e+02],
       [4.49354798e-01, 3.00151968e+00, 1.27000000e+02],
       ...,
       [1.48730993e-01, 3.65526724e+00, 1.16100000e+03],
       [7.29179978e-01, 4.00092125e+00, 1.16100000e+03],
       [6.54234141e-02, 4.02586555e+00, 1.16100000e+03]])

In [134]:
# One quadrant per snippet, derived from the mean prediction of its group.
preds_quadrants_challenge = calc_quadrant(preds_with_group_idx)
preds_quadrants_challenge

array([[4.],
       [4.],
       [4.],
       ...,
       [1.],
       [1.],
       [1.]])

In [135]:
# Put the first three columns of the challenge file next to the predicted
# quadrant and cast the whole array to int.
final_preds_challenge = np.hstack((challenge_test_set_df.iloc[:, :3].to_numpy(), preds_quadrants_challenge)).astype(int)
# Column labels for the submission table.
final_challenge_preds_df = pd.DataFrame(final_preds_challenge, columns=["pianist_id", "segment_id", "snippet_id", "quadrant"])
final_challenge_preds_df.head(50)

Unnamed: 0,pianist_id,segment_id,snippet_id,quadrant
0,1,27,0,4
1,1,27,1,4
2,1,27,2,4
3,1,27,3,4
4,1,28,0,1
5,1,28,1,1
6,1,28,2,1
7,1,28,3,1
8,1,28,4,1
9,1,29,0,1


In [136]:
# Write the submission without the index column. QUOTE_NONNUMERIC quotes
# every non-numeric field; the data were cast to int above, so in practice
# that means the header names.
final_challenge_preds_df.to_csv(r'MLPC_submission_4.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)