In [1]:
import os
from tempfile import mkdtemp

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, KFold, GroupShuffleSplit
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor, StackingClassifier, StackingRegressor
from xgboost.sklearn import XGBRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.decomposition import PCA, KernelPCA, FastICA
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.multioutput import RegressorChain, ClassifierChain
from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import ClassifierMixin, BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel, f_classif
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, LogisticRegression
# from imblearn.over_sampling import SMOTE

import seaborn as sns
import matplotlib.pyplot as plt

from joblib import dump, load

sns.set_theme()

### Data Loading

In [2]:
# Show every column when a frame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Raw training table; load_data() slices it by column position and by column name.
df = pd.read_csv(
    os.path.join('trainset', 'task_4_train_generic.csv')
)

In [3]:
def load_data(df, data_components=[], target_components=[]):
    """Assemble feature and target frames from named column groups of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. Must contain a ``segment_id`` column; the ``valence`` and
        ``arousal`` columns and the positional slices below are only touched
        when the matching component is requested.
    data_components : list of str
        Feature groups, concatenated in the order given. Recognised names:
        ``"emotions"`` (columns 175:188), ``"emotions_binary"`` (188:),
        ``"tech_features"`` (3:172) and ``"valence_arousal"``.
        Any other name is silently skipped.
    target_components : list of str
        Target groups, concatenated in the order given. A name that is a column
        of ``df`` selects that column; otherwise ``"valence_arousal"``,
        ``"emotions"`` (175:189), ``"emotions_binary"`` (189:) and
        ``"group_idx"`` (the ``groups`` series) are recognised.
        Any other name is silently skipped.

    Returns
    -------
    tuple
        ``(X, y, groups)`` where ``groups`` is ``df["segment_id"]`` renamed to
        ``'groups'``. ``X`` and ``y`` share ``df``'s index and have no columns
        when nothing was selected.
    """
    # Alternative grouping key kept for reference:
    # pd.Series(df["pianist_id"]*100 + df["segment_id"], name='groups')
    groups = pd.Series(df["segment_id"], name='groups')

    # NOTE(review): the emotions / emotions_binary boundary is column 188 for
    # features but 189 for targets -- confirm which split is intended.
    feature_pickers = {
        "emotions": lambda frame: frame.iloc[:, 175:188],
        "emotions_binary": lambda frame: frame.iloc[:, 188:],
        "tech_features": lambda frame: frame.iloc[:, 3:172],
        "valence_arousal": lambda frame: frame[['valence', 'arousal']],
    }
    target_pickers = {
        'valence_arousal': lambda frame: frame[['valence', 'arousal']],
        'emotions': lambda frame: frame.iloc[:, 175:189],
        'emotions_binary': lambda frame: frame.iloc[:, 189:],
        'group_idx': lambda frame: groups,
    }

    feature_parts = [
        feature_pickers[name](df)
        for name in data_components
        if name in feature_pickers
    ]

    target_parts = []
    for name in target_components:
        # A literal column name wins over the named groups.
        if name in df.columns:
            target_parts.append(df[name])
        elif name in target_pickers:
            target_parts.append(target_pickers[name](df))

    X = pd.DataFrame(index=df.index)
    if feature_parts:
        X = pd.concat([X, *feature_parts], axis=1)

    y = pd.DataFrame(index=df.index)
    if target_parts:
        y = pd.concat([y, *target_parts], axis=1)

    return (X, y, groups)

In [4]:
# Features: emotion ratings + technical features + valence/arousal.
# Targets: valence/arousal plus the group-ID column.
X, y, groups = load_data(
    df,
    data_components=['emotions', 'tech_features', 'valence_arousal'],
    target_components=['valence_arousal', 'group_idx'],
)
y

Unnamed: 0,valence,arousal,groups
0,-0.363636,2.818182,0
1,-0.363636,2.818182,0
2,-0.363636,2.818182,0
3,-0.363636,2.818182,0
4,-0.363636,2.818182,0
...,...,...,...
1922,0.363636,2.000000,22
1923,0.363636,2.000000,22
1924,0.363636,2.000000,22
1925,0.363636,2.000000,22


### Custom Scoring/Loss Functions

In [5]:
# 4x4 quadrant gain table. custom_error_grouped scores with these same values,
# indexed [true_quadrant - 1][predicted_quadrant - 1].
# NOTE(review): entry [3][2] is -2 while its mirror [2][3] is -5; every other
# off-diagonal pair is symmetric -- confirm the -2 is intentional.
GAIN_MATRIX = np.array((
    (5, -5, -5, 2),
    (-5, 10, 2, -5),
    (-5, 2, 10, -5),
    (2, -5, -2, 5),
))

In [6]:
# Technical features only; targets keep the group-ID column next to valence/arousal.
X, y, _ = load_data(
    df,
    data_components=['tech_features'],
    target_components=['valence_arousal', 'group_idx'],
)
y

Unnamed: 0,valence,arousal,groups
0,-0.363636,2.818182,0
1,-0.363636,2.818182,0
2,-0.363636,2.818182,0
3,-0.363636,2.818182,0
4,-0.363636,2.818182,0
...,...,...,...
1922,0.363636,2.000000,22
1923,0.363636,2.000000,22
1924,0.363636,2.000000,22
1925,0.363636,2.000000,22


In [7]:
def custom_error_grouped(y_true, y_hat):
    """Average quadrant gain per group (higher is better).

    Rows are grouped by the ID stored in the last column of ``y_true``. Within
    each group the rows of ``y_true`` and of ``y_hat`` are averaged; columns 0
    and 1 of each mean are passed as (valence, arousal) to ``v_a_to_quadrant``
    for the truth and to ``v_a_to_quadrant_skewed`` for the prediction. The
    ``GAIN_MATRIX`` entry ``[true_quadrant - 1][predicted_quadrant - 1]`` is
    summed over the groups and divided by the number of groups.

    Parameters
    ----------
    y_true : array-like, shape (n_samples, n_columns)
        Ground truth; the last column holds the group ID of each row.
    y_hat : array-like, shape (n_samples, n_columns)
        Predictions. Its last column is ignored and replaced by the group IDs
        from ``y_true``. The caller's array is left untouched.

    Returns
    -------
    float
        Mean gain over the unique groups.
    """
    y_true = np.asarray(y_true)
    # Copy before overwriting the group column so the caller's predictions are
    # not modified as a side effect of scoring.
    y_hat = np.array(y_hat, copy=True)

    group_ids = y_true[:, -1]
    y_hat[:, -1] = group_ids

    unique_ids = np.unique(group_ids)

    total_gain = 0
    for group_id in unique_ids:
        rows = group_ids == group_id
        true_mean = y_true[rows].mean(axis=0)
        predicted_mean = y_hat[rows].mean(axis=0)

        # Truth uses the plain thresholds, predictions use the skewed ones.
        true_quadrant = v_a_to_quadrant(true_mean[0], true_mean[1])
        predicted_quadrant = v_a_to_quadrant_skewed(predicted_mean[0], predicted_mean[1])

        total_gain += GAIN_MATRIX[true_quadrant - 1][predicted_quadrant - 1]

    return total_gain / len(unique_ids)

# Scorer wrapper around custom_error_grouped for use as GridSearchCV's `scoring`;
# the value is a gain, so it is declared as greater-is-better.
custom_grouped_scorer = make_scorer(
    custom_error_grouped,
    greater_is_better=True,
)


### Training Function

#### Classification

#### Regression

In [8]:
def train_regressor(model_params, data_components=['tech_features'], target_components=['valence_arousal'], top_X=[10], scalers=[], regressors=[], feature_selectors=[], embedders=[],
                    n_jobs=-1, cv=5, scoring='neg_mean_squared_error', memory=None):
    """Grid-search every pipeline combination and keep the best-scoring one.

    For each combination of regressor, feature selector, embedder, scaler and
    ``top_X`` value a four-step pipeline (scaler -> embedder -> feature
    selector -> ``RegressorChain(regressor)``) is tuned with ``GridSearchCV``
    on group-aware shuffle splits. Data comes from the notebook-level ``df``
    via ``load_data``.

    Parameters
    ----------
    model_params : sequence of dict
        One parameter grid per entry of ``regressors`` (same order, same length).
    data_components, target_components : list of str
        Forwarded to ``load_data``.
    top_X : list of int
        Values tried for ``SelectFromModel(max_features=...)``.
    scalers, regressors, feature_selectors, embedders : list
        Candidates for the respective pipeline step. A feature selector equal
        to ``'passthrough'`` is used as-is; anything else is wrapped in
        ``SelectFromModel``.
    n_jobs, scoring : forwarded to ``GridSearchCV``.
    cv : int
        Number of ``GroupShuffleSplit`` splits (``random_state=42``).
    memory : forwarded to ``Pipeline``.

    Returns
    -------
    tuple
        ``(best_estimator, best_params, best_score, grid_results)``. The first
        three all belong to the same, best-scoring grid search;
        ``grid_results`` holds one ``cv_results_`` DataFrame per search. If no
        search ran (one of the candidate lists is empty) the result is
        ``(None, None, -inf, [])``.

    Side effects
    ------------
    Each time a new best is found, the estimator is written to
    ``best_<first five characters of the regressor's repr>.joblib`` in the
    working directory.
    """
    X, y, groups = load_data(df, data_components=data_components, target_components=target_components)

    # Own name for the splitter so the integer ``cv`` argument is not shadowed.
    splitter = GroupShuffleSplit(n_splits=cv, random_state=42)

    # -inf rather than a fixed floor, so arbitrarily low scores (e.g. a large
    # negative MSE) can still become the running best.
    best_score = -np.inf
    best_regressor = None
    best_params = None
    grid_results = []

    for i, regr_model_class in enumerate(regressors):
        print(f'Regressor model class: {regr_model_class}')
        for feat_model_class in feature_selectors:
            print(f'Feature selector model class: {feat_model_class}')
            for emb_model_class in embedders:
                print(f'Embedding model class: {emb_model_class}')
                for scaler in scalers:
                    for num in top_X:

                        pipe = Pipeline(steps=[
                            ('scaler', scaler),
                            ('embedder', emb_model_class),
                            ('feature_selector', 'passthrough' if feat_model_class == 'passthrough' else SelectFromModel(feat_model_class, max_features=num)),
                            ('estimator', RegressorChain(regr_model_class))
                        ], verbose=True, memory=memory)

                        print('Possbile params: ')
                        print(pipe.get_params().keys())
                        gs = GridSearchCV(pipe, model_params[i], n_jobs=n_jobs, cv=splitter, scoring=scoring, verbose=10, error_score="raise")
                        gs.fit(X, y.to_numpy(), groups=groups)

                        if gs.best_score_ > best_score:
                            # Keep estimator, params and score together so the
                            # returned params always describe the returned model.
                            best_regressor = gs.best_estimator_
                            best_params = gs.best_params_
                            best_score = gs.best_score_
                            dump(best_regressor, f'best_{str(regr_model_class)[:5]}.joblib')

                        grid_results.append(pd.DataFrame.from_dict(gs.cv_results_))
                        print(f'Num max_features: {num}')
                        print(gs.best_score_)
                        print('_______________')

    return (best_regressor, best_params, best_score, grid_results)


### Training

In [29]:
# ---------------------------------------------------------------------------
# Candidate pipeline steps. Active entries are searched; commented entries are
# alternatives that can be switched on.
# ---------------------------------------------------------------------------
scalers = [
    QuantileTransformer(),
    # 'passthrough',
    # StandardScaler(),
    # MinMaxScaler(),
    # MaxAbsScaler(),
]

regressors = [
    KNeighborsRegressor(),
    # RandomForestRegressor(),
    # SVR(),
    # XGBRegressor(),
    # AdaBoostRegressor(),
    # StackingRegressor(estimators=[
    #     ('Ada', AdaBoostRegressor()),
    #     ('RF', RandomForestRegressor()),
    #     ('KNN', KNeighborsRegressor())
    # ]),
]

feature_selectors = [
    'passthrough',
    # RandomForestRegressor(n_estimators=50),
]

embedders = [
    'passthrough',
    # PCA(),
    # KernelPCA(),
]

# Temporary directory handed to the pipeline as its `memory` argument.
cachedir = mkdtemp()

# One parameter grid per regressor, in the same order as `regressors`.
model_params = [{} for _ in regressors]

# kNN (active grid)
model_params[0] = {
    'estimator__base_estimator__n_neighbors': [2, 5, 10, 20, 25, 30],
    'estimator__base_estimator__weights': ['uniform', 'distance'],
}

# --- Alternative grids, kept for reference --------------------------------
# Random Forest
# model_params[0] = {
#     'estimator__base_estimator__n_estimators': [10, 50, 200, 500, 1000],
#     'estimator__base_estimator__max_features': ['auto', 'sqrt', 'log2'],
#     'estimator__base_estimator__max_depth': [4,6,8],
#     'feature_selector__estimator__n_estimators': [50]
# }
#
# Support Vector Machine
# model_params[0] = {
#     'estimator__base_estimator__kernel': ['rbf'],
#     'estimator__base_estimator__gamma': [1, 0.1, 0.01, 'scale', 'auto'],
#     'estimator__base_estimator__C': [1, 10, 100],
#     'feature_selector__estimator__n_estimators': [10, 25, 100]
# }
#
# XGBoost
# model_params[0] = {
#     'estimator__base_estimator__objective': ['reg:squarederror'],
#     'estimator__base_estimator__learning_rate': [0.2],
#     'estimator__base_estimator__n_estimators': [100],
#     'estimator__base_estimator__max_depth': [4],
#     'estimator__base_estimator__min_child_weight': [2],
#     'estimator__base_estimator__gamma': [0.1],
#     'estimator__base_estimator__subsample': [0.9],
#     'estimator__base_estimator__colsample_bytree': [0.8],
# }
#
# AdaBoost
# model_params[0] = {
#     'estimator__base_estimator__base_estimator': [
#         KNeighborsRegressor(n_neighbors=10),
#         KNeighborsRegressor(n_neighbors=5)
#     ],
#     'estimator__base_estimator__n_estimators':[25, 50, 100],
#     'estimator__base_estimator__learning_rate':[0.1],
#     # 'estimator__base_estimator__learning_rate':[0.01, 0.05, 0.1],
#     'estimator__base_estimator__loss': ['linear', 'square'],
# }

# max_features values tried by the feature selector (unused while it is 'passthrough').
top_X = [150]

# Run the search, scored with the grouped quadrant gain
# (alternative scoring: 'neg_mean_squared_error').
best_regressor, best_params, best_score, grid_result = train_regressor(
    model_params,
    data_components=['tech_features'],
    target_components=['valence_arousal', 'group_idx'],
    regressors=regressors,
    feature_selectors=feature_selectors,
    embedders=embedders,
    scalers=scalers,
    top_X=top_X,
    memory=cachedir,
    n_jobs=16,
    cv=5,
    scoring=custom_grouped_scorer,
)

Regressor model class: KNeighborsRegressor()
Feature selector model class: passthrough
Embedding model class: passthrough
Possbile params: 
dict_keys(['memory', 'steps', 'verbose', 'scaler', 'embedder', 'feature_selector', 'estimator', 'scaler__copy', 'scaler__ignore_implicit_zeros', 'scaler__n_quantiles', 'scaler__output_distribution', 'scaler__random_state', 'scaler__subsample', 'estimator__base_estimator__algorithm', 'estimator__base_estimator__leaf_size', 'estimator__base_estimator__metric', 'estimator__base_estimator__metric_params', 'estimator__base_estimator__n_jobs', 'estimator__base_estimator__n_neighbors', 'estimator__base_estimator__p', 'estimator__base_estimator__weights', 'estimator__base_estimator', 'estimator__cv', 'estimator__order', 'estimator__random_state'])
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Pipeline] ............ (step 1 of 4) Processing scaler, total=   0.9s
[Pipeline] .......... (step 2 of 4) Processing embedder, total=   0.0s
[Pipeline

In [30]:
# Display the hyper-parameter dict that train_regressor returned as the second
# element of its result tuple in the training cell.
best_params

{'estimator__base_estimator__n_neighbors': 20,
 'estimator__base_estimator__weights': 'uniform'}

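### Helpers

The quadrant helpers below are called by `custom_error_grouped`, so these cells must be run before the Training section.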

In [16]:
def v_a_to_quadrant(valence, arousal, valence_threshold=0, arousal_threshold=3):
    """Map a (valence, arousal) pair to a quadrant label.

    Parameters
    ----------
    valence, arousal : number
        Coordinates of the point to classify.
    valence_threshold : number, default 0
        Values strictly below this count as low valence.
    arousal_threshold : number, default 3
        Values strictly below this count as low arousal.

    Returns
    -------
    int
        1 = happy   (high arousal, high valence)
        2 = angry   (high arousal, low valence)
        3 = sad     (low arousal, low valence)
        4 = relaxed (low arousal, high valence)
        A value exactly on a threshold falls on the "high" side.
    """
    low_valence = valence < valence_threshold
    if arousal < arousal_threshold:
        return 3 if low_valence else 4
    return 2 if low_valence else 1

In [28]:
def v_a_to_quadrant_skewed(valence, arousal):
    """Map a (valence, arousal) pair to a quadrant using shifted thresholds.

    Arousal below 3.2 counts as low arousal and valence below 0.4 counts as
    low valence; a value exactly on a threshold falls on the "high" side.

    Returns
    -------
    int
        1 = happy, 2 = angry, 3 = sad, 4 = relaxed.
    """
    calm = arousal < 3.2
    negative = valence < 0.4

    if calm and negative:
        return 3  # sad
    if calm:
        return 4  # relaxed
    if negative:
        return 2  # angry
    return 1  # happy