In [15]:
from pathlib import Path
import numpy as np
import yaml
from copy import deepcopy
import sklearn.preprocessing
from scipy.special import logit, expit
from sklearn.linear_model import Ridge#, Lasso
from sklearn.multioutput import MultiOutputRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
# from sklearn.metrics import mean_absolute_error
from datetime import datetime

In [2]:
def _load_npz_array(path, key='arr_0'):
    """Return a single array from an ``.npz`` archive, closing the archive afterwards.

    ``np.load`` on an ``.npz`` file returns an ``NpzFile`` object that keeps the
    underlying zip file open until it is closed. Using it as a context manager
    releases the file handle as soon as the requested array has been read.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the ``.npz`` archive.
    key : str, optional
        Name of the array inside the archive (``'arr_0'`` by default, the key
        every load in this cell uses).

    Returns
    -------
    numpy.ndarray
        The array stored under ``key``.
    """
    with np.load(path) as archive:
        return archive[key]


# All inputs live under a directory relative to the notebook's working directory.
data_dir = Path('data')
X_train = _load_npz_array(data_dir / 'X_train.npz')
X_test = _load_npz_array(data_dir / 'X_test.npz')
Y_train = _load_npz_array(data_dir / 'Y_train.npz')

In [3]:
# Seed constant for the experiment.
# NOTE(review): no other visible cell reads `seed`; presumably the model's
# random_state comes from config.yml instead — confirm before relying on it.
seed = 1126

# Number of regression targets = number of columns in the training target matrix.
n_targets = Y_train.shape[1]

In [25]:
# Load the experiment configuration. Opening the file in a `with` block closes
# the handle deterministically instead of leaving it to the garbage collector.
with open('config.yml') as config_file:
    config = yaml.safe_load(config_file)

In [6]:
# Feature scaling. config['scaler_X'] names a class looked up on
# sklearn.preprocessing; a falsy value disables scaling.
if not config['scaler_X']:
    # No scaler configured: continue with independent copies of the raw arrays.
    X_train_scaled, X_test_scaled = deepcopy(X_train), deepcopy(X_test)
else:
    scaler_X = getattr(sklearn.preprocessing, config['scaler_X'])()
    # Fit on the training features only, then apply the fitted scaler to the
    # test features.
    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)

In [7]:
# Element-wise transforms applied to individual target columns, keyed by the
# names listed in config['transforms_Y'].
transform_dict = dict(logit=logit, log=np.log)
# Matching inverse transforms, keyed by the same names.
inv_transform_dict = dict(logit=expit, log=np.exp)

# Buffer with the same shape and dtype as Y_train; any column without an entry
# in config['transforms_Y'] is left at zero.
Y_train_transformed = np.zeros_like(Y_train)
# The i-th configured transform is applied to the i-th target column.
for i, transform in enumerate(config['transforms_Y']):
    Y_train_transformed[:, i] = transform_dict[transform](Y_train[:, i])

# Optional target scaling. config['scaler_Y'] names a class looked up on
# sklearn.preprocessing; a falsy value disables scaling.
if not config['scaler_Y']:
    Y_train_scaled = deepcopy(Y_train_transformed)
else:
    scaler_Y = getattr(sklearn.preprocessing, config['scaler_Y'])()
    Y_train_scaled = scaler_Y.fit_transform(Y_train_transformed)

In [8]:
# def WMAE(y_true, y_pred):
#     return mean_absolute_error(y_true, y_pred, multioutput=[200, 1, 300])

In [16]:
# Build and fit the regressor described by config.yml.
#   separate_targets : if truthy, wrap the estimator in MultiOutputRegressor so
#                      each target column gets its own fitted copy.
#   model_name       : name of the estimator class to instantiate.
#   model_configs    : keyword arguments passed to the estimator constructor.
#   sample_weight    : optional; 'reciprocal' weights each sample by
#                      1 / (raw target value), separately for every target.
separate_targets = config['separate_targets']
model_name = config['model_name']
model_configs = config['model_configs']
# SECURITY NOTE(review): eval() executes whatever string config.yml provides.
# Only run this notebook with a trusted config, or replace the eval with an
# explicit name -> class mapping.
model = eval(model_name)(**model_configs)
sample_weight = None
if separate_targets:
    model = MultiOutputRegressor(model)
    if config.get('sample_weight') == 'reciprocal':
        # One weight column per target: entry (r, t) is 1 / Y_train[r, t].
        # The weights are computed here explicitly rather than through a loop
        # index left over from another cell.
        # NOTE(review): per-target weighting is inferred from the
        # 'separate_targets' option; confirm it is the intended scheme.
        sample_weight = 1 / Y_train
        # MultiOutputRegressor.fit() accepts only a single weight vector shared
        # by all targets, so the per-target estimators are fitted here and
        # attached as `estimators_`, the attribute its predict() reads.
        model.estimators_ = [
            deepcopy(model.estimator).fit(
                X_train_scaled,
                Y_train_scaled[:, target],
                sample_weight=sample_weight[:, target],
            )
            for target in range(n_targets)
        ]
if sample_weight is None:
    # Unweighted fit. The sample_weight keyword is not passed at all, so
    # estimators whose fit() has no such parameter still work.
    model.fit(X_train_scaled, Y_train_scaled)
# Display the fitted model as the cell output.
model

MultiOutputRegressor(estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=False, random_state=1126, solver='auto', tol=0.001),
           n_jobs=1)

In [17]:
# Predict all targets at once. `model` is either a natively multi-output
# estimator or a MultiOutputRegressor, whose predict() already returns one
# column per target, so no per-target loop (and no separate list of models)
# is needed.
Y_pred_scaled = model.predict(X_test_scaled)

# Undo the target scaling, if any was applied during training.
if config['scaler_Y']:
    Y_pred_transformed = scaler_Y.inverse_transform(Y_pred_scaled)
else:
    # No target scaler: the raw predictions are already in transformed space.
    Y_pred_transformed = Y_pred_scaled

# Undo the per-column target transforms; the i-th configured transform maps to
# the i-th target column.
Y_pred = np.zeros_like(Y_pred_transformed)
for i, transform in enumerate(config['transforms_Y']):
    Y_pred[:, i] = inv_transform_dict[transform](Y_pred_transformed[:, i])

# Clamp each target column to its own [lower, upper] interval. The bounds are
# hard-coded for exactly three target columns.
Y_pred = np.clip(Y_pred, [0, 0, 0.5], [1, float('inf'), 1])

In [18]:
# Write the predictions to a timestamped CSV so earlier submissions are never
# overwritten.
# NOTE(review): the timestamp contains ':' characters, which are not valid in
# file names on Windows — confirm the target platform before changing the
# format, since existing submission files use it.
now = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
submission_filename = 'submissions/submission_{}.csv'.format(now)
# np.savetxt does not create missing directories; make sure the output folder
# exists so a run from a clean checkout does not fail at the very last step.
Path(submission_filename).parent.mkdir(parents=True, exist_ok=True)
np.savetxt(submission_filename, Y_pred, delimiter=',', fmt='%.18f')