### Parameters

In [1]:
SEED = 123      # modifiable seed
CLF_SS = 1      # sub-sample model types for faster run
TARGETS = -1    # which target (0-4) to predict; -1 for all
n_jobs = 8

### Imports

In [8]:
import numpy as np  
import pandas as pd 
import pickle
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA

In [3]:
import multiprocessing
from joblib import Parallel, delayed

In [4]:
from collections import Counter
import datetime as datetime

In [5]:
import gc
import psutil
import sys

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
rcParams['figure.figsize'] = (15,5.5)

pd.options.display.max_rows = 150

In [10]:
start = datetime.datetime.now()

if SEED < 0:
    np.random.seed(datetime.datetime.now().microsecond)
else:
    np.random.seed(SEED)

### Data Loading

In [53]:
path = '../input/trends-assessment-prediction'

loading =  pd.read_csv(path+ '/' + 'loading.csv').set_index('Id')
loading_cols = loading.columns
loading_index = loading.index
fnc =  pd.read_csv(path+ '/' + 'fnc.csv').set_index('Id')
fnc_cols = fnc.columns
fnc_index = fnc.index

assert len(loading) == len(fnc)


In [54]:
fnc

Unnamed: 0_level_0,SCN(53)_vs_SCN(69),SCN(98)_vs_SCN(69),SCN(99)_vs_SCN(69),SCN(45)_vs_SCN(69),ADN(21)_vs_SCN(69),ADN(56)_vs_SCN(69),SMN(3)_vs_SCN(69),SMN(9)_vs_SCN(69),SMN(2)_vs_SCN(69),SMN(11)_vs_SCN(69),...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0.368580,0.166876,0.438148,0.341007,-0.186251,0.049096,0.121417,-0.174268,-0.231578,0.000947,...,-0.149279,0.552841,0.131046,0.335446,0.394867,-0.042853,0.124627,-0.060712,0.515964,0.290488
10002,0.151696,-0.024819,0.217504,0.418072,-0.227234,-0.064052,-0.143832,-0.118116,-0.054825,0.038732,...,-0.214216,-0.039792,0.143014,-0.189962,0.498373,0.444231,0.592438,0.028649,0.705524,0.248327
10003,0.343415,0.109974,0.741641,0.578558,-0.676446,-0.436960,-0.295663,-0.377790,-0.344963,-0.294511,...,-0.154941,0.136850,-0.022361,0.137625,0.677972,0.409412,0.563892,0.438684,0.618204,0.284474
10004,0.132793,0.258255,0.490769,0.342717,0.091112,0.107969,0.029220,-0.026237,0.094742,0.098802,...,-0.130339,0.309540,0.141469,0.030853,0.344394,0.214097,0.317556,0.012435,0.665937,0.081358
10005,0.291921,0.251254,0.416470,0.511719,-0.362626,-0.164710,-0.289059,-0.015537,-0.087316,-0.142736,...,-0.139525,0.394932,0.040443,0.428334,0.498837,0.266755,0.227379,0.028984,0.752343,0.087898
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21750,0.227970,-0.222489,0.250417,0.442642,-0.221094,-0.234935,-0.296614,-0.273216,-0.106481,-0.157194,...,-0.103786,0.375065,0.104857,0.262614,0.502715,0.322353,0.458041,0.343754,0.705207,0.341224
21751,0.455052,0.483856,0.589565,0.633691,0.161995,-0.175318,0.015480,0.135612,0.172296,0.129111,...,-0.087604,0.131902,-0.047932,0.022317,0.583869,0.596734,0.515209,0.379589,0.568422,0.439016
21752,0.118257,0.452123,0.608328,0.422485,-0.106427,-0.008484,-0.010050,-0.122015,0.092994,-0.071942,...,0.229712,0.431489,0.039062,0.119474,0.523894,0.445209,0.332011,0.228977,0.560968,0.263504
21753,0.051042,0.088581,0.551354,0.305542,-0.034378,-0.094778,-0.089954,-0.222876,-0.197503,-0.192861,...,-0.147730,0.206375,0.096594,0.317651,0.545878,0.411197,0.230889,0.167354,0.221668,0.371357


In [55]:
n_components=200
pca = PCA(n_components=n_components)
tmp_pca = pca.fit_transform(fnc)
fnc = pd.DataFrame(tmp_pca, columns=fnc_cols[:n_components], index=fnc_index).iloc[:, :n_components]


loading = pd.read_csv("../input/trends-assessment-prediction/loading.csv")
# ICAのためのコード(変更点はここだけ)
ica = FastICA(n_components=26, random_state=SEED)#26個の基底（コンポネント）を作る
tmp_ica = ica.fit_transform(loading)
loading = pd.DataFrame(tmp_ica, columns=loading_cols, index=loading_index)

train_prob_site2 = pd.read_csv('adversal/train_prob_site2_icapca.csv')
test_prob_site2 = pd.read_csv('adversal/test_prob_site2_icapca.csv')
train_is_site2_id = train_prob_site2[train_prob_site2["is_site2"]==True]["Id"]
test_is_site2_id = test_prob_site2[test_prob_site2["is_site2"]==True]["Id"]

In [56]:
fnc

Unnamed: 0_level_0,SCN(53)_vs_SCN(69),SCN(98)_vs_SCN(69),SCN(99)_vs_SCN(69),SCN(45)_vs_SCN(69),ADN(21)_vs_SCN(69),ADN(56)_vs_SCN(69),SMN(3)_vs_SCN(69),SMN(9)_vs_SCN(69),SMN(2)_vs_SCN(69),SMN(11)_vs_SCN(69),...,CON(83)_vs_SCN(99),DMN(32)_vs_SCN(99),DMN(40)_vs_SCN(99),DMN(23)_vs_SCN(99),DMN(71)_vs_SCN(99),DMN(17)_vs_SCN(99),DMN(51)_vs_SCN(99),DMN(94)_vs_SCN(99),CBN(13)_vs_SCN(99),CBN(18)_vs_SCN(99)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,-1.961551,-0.485225,-0.670020,1.212889,0.409934,0.487359,1.189887,-0.261127,-0.383694,-0.268147,...,0.301713,-0.016864,0.046586,-0.186597,0.073977,-0.193430,-0.019634,0.079329,0.022645,0.139086
10002,-1.448180,-0.842191,1.347716,0.998952,-0.676541,1.896924,-1.724111,1.363868,-0.607254,0.710026,...,-0.292938,0.325759,0.111012,0.490560,0.377998,0.180009,0.108390,0.058020,0.297257,-0.134917
10003,2.171379,2.326819,0.225514,-0.331407,-0.357246,0.942099,0.798029,0.832019,-0.016345,-0.593752,...,0.197093,-0.182063,0.026890,0.278423,-0.168400,-0.080544,0.251683,0.220050,-0.185138,-0.280320
10004,-1.329180,-1.249583,-1.083893,0.502805,0.342003,0.647636,0.278334,-0.210236,-1.070481,-0.412703,...,-0.072906,0.424118,-0.109941,-0.047129,-0.154152,-0.146845,-0.146471,-0.022817,0.143836,0.132913
10005,-0.529954,-1.398189,0.681371,-1.682728,-0.303317,0.195571,0.066609,-0.242811,0.184963,-0.458346,...,-0.135485,-0.050978,0.357401,0.230802,0.045713,-0.277359,-0.059157,-0.081371,0.185007,-0.161853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21750,-2.350324,1.647999,-0.927615,0.117542,1.559609,-2.032102,1.383177,0.035700,0.617110,-0.168874,...,-0.225659,0.347276,0.353132,0.047607,-0.059646,0.031666,0.112908,0.002602,0.384550,-0.013686
21751,-2.329617,1.181427,0.937928,0.491918,-1.279863,0.613415,-0.602450,1.252947,0.537347,-0.898049,...,0.496028,-0.198042,-0.262437,-0.115600,-0.255559,0.088098,-0.206227,-0.426627,0.051799,0.166602
21752,0.510884,-0.057276,0.287525,-0.521297,0.757172,-1.407868,0.819167,1.451423,-1.359516,-0.215035,...,-0.227712,-0.427407,0.163810,-0.317071,-0.146499,-0.222819,0.112347,0.092158,-0.195796,-0.076038
21753,-1.011531,0.018191,1.721427,0.169492,-0.582768,2.965148,-0.309081,-0.524228,-0.979982,0.313568,...,0.212312,0.166819,0.020278,0.346048,0.247089,0.175153,-0.011613,-0.286663,0.244189,0.042045


In [57]:
prob_site2 = pd.concat([train_prob_site2, test_prob_site2], axis=0).sort_values("Id").set_index('Id')["prob_site2"]
prob_site2

Id
10001    0.234300
10002    0.256662
10003    0.359684
10004    0.340693
10005    0.334773
           ...   
21750    0.340606
21751    0.352036
21752    0.326198
21753    0.351039
21754    0.485402
Name: prob_site2, Length: 11754, dtype: float64

In [58]:
y_data =  pd.read_csv(path+ '/' + 'train_scores.csv').set_index('Id')

data = pd.concat((loading, fnc, prob_site2 ), axis = 'columns')  
test_data = data[~data.index.isin(y_data.index)]

X = data.loc[y_data.index] 
y = y_data 


In [59]:
train_is_site2_id

10      10025
16      10041
18      10044
19      10046
30      10071
        ...  
5860    21726
5863    21734
5867    21739
5869    21741
5876    21754
Name: Id, Length: 1462, dtype: int64

In [60]:
X = X.iloc[list(train_is_site2_id.index)]
y = y.iloc[list(train_is_site2_id.index)]
groups = np.random.randint(0, 5, len(y))
print(X.shape, y.shape)

(1462, 227) (1462, 5)


### Model Setup

In [61]:
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold, KFold, ShuffleSplit
from sklearn.svm import SVR, NuSVR
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler, FunctionTransformer 

In [62]:
nusvr_params = {
    'kernel': [  'rbf',  ] , 
    'C': [ 1, 2, 3, 5, 7, 10, 15, 20, 30, 50, 70, 100, 140, 200, 300  ],
    'gamma': [ 'scale'], 
    'nu': [   0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1] }

def trainNuSVR(x, y, groups, cv = 0, n_jobs = 8, **kwargs):
    clf = NuSVR(cache_size=1000, tol = 1e-5)
    params = nusvr_params        
    return trainModel(x, y, groups, clf, params, cv, n_jobs,  **kwargs)

In [63]:
enet_params = { 'alpha': [  1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 3e-2, 0.1, 0.3,   ],
                'l1_ratio': [ 0, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.95, 0.97, 0.98, 0.99, 1,   ]}

def trainENet(x, y, groups, cv = 0, n_jobs = 8, **kwargs):
    clf = ElasticNet(normalize = True, selection = 'random', max_iter = 10000, tol = 1e-5 )
    return trainModel(x, y, groups, clf, enet_params, cv, n_jobs, **kwargs)

In [64]:
def fnae(y_true, y_pred):
    valid = ~np.isnan(y_true)
    y_true = y_true[valid]
    y_pred = y_pred[valid]
    return np.sum(np.abs(y_true - y_pred))/np.sum(y_true)

fnae_scorer = make_scorer(fnae, greater_is_better = False)

In [65]:
def trainModel(x, y, groups, clf, params, cv = 0, n_jobs = 8, 
                   verbose=0, splits=None, **kwargs):
    if n_jobs is None:
        n_jobs = 8    

    n_iter = 30    
        
    folds = ShuffleSplit(n_splits = 10, train_size = 0.75, test_size = 0.20)
    clf = RandomizedSearchCV(clf, params, cv = folds, n_iter = n_iter, 
                            verbose = 1, n_jobs = n_jobs, scoring = fnae_scorer)
    
    f = clf.fit(x, y, groups)
    
    print(pd.DataFrame(clf.cv_results_['mean_test_score'])); print();  
    best = clf.best_estimator_;  print(best)
    print("Best Score: {}".format(np.round(clf.best_score_,4)))
    
    return best

In [66]:
def cleanX(X, target):
    X = X.copy()
    
    for col in fnc.columns:
        X[col] = X[col] / 300
       
    return X

In [67]:
def runBag(n = 3, model_type = trainENet, data = None, **kwargs):
    start_time = datetime.datetime.now(); 
    
    X, y, groups = data

    valid = ~y.isnull()
    X = X[valid]; y = y[valid]; groups = groups[valid]
    
    if 'target' in kwargs:
        X = cleanX(X, kwargs['target'])
    
    group_list = [*dict.fromkeys(groups)]   
    group_list.sort()
    
    clfs = []; preds = []; ys=[]; datestack = []
    for group in group_list:
        g = gc.collect()
        x_holdout = X[groups == group]
        y_holdout = y[groups == group]
        x_train = X[groups != group]
        y_train = y[groups != group]
        
        groups_train = groups[groups != group]

        model = model_type 
        clf = model(x_train, y_train, groups_train, **kwargs) 
        clfs.append(clf)

        predicted = clf.predict(x_holdout)
        print("{}: {:.4f}".format(group,
              fnae(y_holdout, predicted)  ) )
        
        preds.append(predicted)
        ys.append(y_holdout)
    
    y_pred = np.concatenate(preds)
    y_ho = np.concatenate(ys) 

    end_time = datetime.datetime.now(); 
    print("\nModel Bag Time: {}\n".format(str(end_time - start_time).split('.', 2)[0] ))
    return clfs

In [68]:
def trainBaseClfs(clfs, clf_names, data, target = None, **kwargs):
    start_time = datetime.datetime.now(); 
    
    X, y, groups = data
    
    X = cleanX(X, target)
    
    group_list = [*dict.fromkeys(groups)]   
    group_list.sort()
    
    X_ordered = []; y_ordered = []; groups_ordered =[]  
    all_base_clfs = []; base_preds = [[] for i in range(0, 5 * len(clfs))]; 
    for group in group_list:
        print("Training Fold {} of {}:".format(group, len(group_list)))
        np.random.seed(SEED)
        
        x_holdout = X[groups == group]
        y_holdout = y[groups == group]
        x_train = X[groups != group]
        y_train = y[groups != group]

        y_idx = ALL_TARGETS.index(target)
        
        X_ordered.append(x_holdout)
        y_ordered.append(y_holdout)
        groups_ordered.append(groups[groups == group])
        
        base_clfs = []
        for idx, clf in enumerate(clfs):
            base_clfs.append(clone(clf))
        
        def train_model(model, X, y):
            ss = (~pd.DataFrame(y).isnull().any(axis=1))
            model.fit(X[ss], y[ss]); return model
        
        base_clfs = Parallel(n_jobs=4)(delayed(train_model)(model, x_train, y_train[y_var]) for model in base_clfs)
        all_base_clfs.append(base_clfs)
        
        def predict_model(model, X):
            o = model.predict(X); return o    
        preds = Parallel(n_jobs=4)(delayed(predict_model)(model, x_holdout) for model in base_clfs)
        
        
        pidx = 0; clf_pred_names = []
        for idx, clf in enumerate(base_clfs):   
            print("{:.4f} for {}".format( 
                      fnae(y_holdout[target], preds[idx]), clf_names[idx]  ) )
            base_preds[pidx].append(preds[idx]); pidx+=1;
            clf_pred_names.append(clf_names[idx])
            
        print("\nTime Elapsed: {}\n".format(str(datetime.datetime.now() - start_time).split('.', 2)[0] ))

    base_preds = base_preds[:len(clf_pred_names)]
    for idx in range(0, len(base_preds)):
        base_preds[idx] = np.concatenate(base_preds[idx])

    
    print("\Base Classifier Train Time: {}\n".format(str(datetime.datetime.now() - start_time).split('.', 2)[0] ))
    return (all_base_clfs, base_preds, clf_pred_names, 
        pd.concat(X_ordered), pd.concat(y_ordered), np.concatenate(groups_ordered))

In [69]:
def Lassos():
    clfs = []; clf_names = []
    lassos =  [1e-5, 3e-5, 1e-4,  3e-4,  0.001, 0.003,  0.01,  0.03,  0.1,  0.3,  1, ]
    for l in lassos:
        clfs.append(Lasso(alpha = l,  selection = 'random', max_iter = 5000, tol = 1e-5))
        clf_names.append('Lasso alpha={}'.format(l))
        if CLF_SS > 1:
            clfs.append(clfs[-1]); clf_names.append(clf_names[-1])
 
    return clfs, clf_names

In [70]:
def Ridges():
    clfs = []; clf_names = []
    ridges =  [3e-5,  1e-4,  2e-4, 5e-4, 0.001, 0.002, 0.005,  0.01,  0.03,  0.1,  0.3,  1,  3,  10,    ]
    for r in ridges:
        clfs.append(Ridge(alpha = r, max_iter = 5000, tol = 1e-5))
        clf_names.append('Ridge alpha={}'.format(r))
        if CLF_SS > 1:
            clfs.append(clfs[-1]); clf_names.append(clf_names[-1])

    return clfs, clf_names

In [71]:
def SVRs():
    clfs = []; clf_names = []
    svrs =  ([0.2, 1, 7, 50], [1, 3, 7]) 
    for c in svrs[0]:
        for e in svrs[1]:
            clfs.append(SVR(C = c, epsilon = e, cache_size=1000, max_iter = 5000, tol = 1e-5))
            clf_names.append('SVR C={}, epsilon={}'.format(c,e))
            
    return clfs, clf_names

In [72]:
def ENets():
    clfs = []; clf_names = []
    enets = ([3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 1e-2  ], [ 0, 0.05, 0.1, 0.5, 0.9, 0.95, 0.98, 1]) 
    for a in enets[0]:
        for l in enets[1]:
            clfs.append(ElasticNet(alpha = a, l1_ratio = l,
                         normalize = False, selection = 'random', 
                         max_iter = 5000, tol = 1e-5))
            clf_names.append('Enet alpha={}, l1_ratio={}'.format(a,l))
 
    for a in enets[0]:
        for l in enets[1]:
            clfs.append(ElasticNet(alpha = a, l1_ratio = l,
                         normalize = True, selection = 'random', 
                         max_iter = 5000, tol = 1e-5))
            clf_names.append('Enet-n alpha={}, l1_ratio={}'.format(a,l))
            
    return clfs, clf_names

In [73]:
def getBaseClfs(y_var):
    idx = ALL_TARGETS.index(y_var)

    clfs = []
    clf_names = []
    
    model_sets =  [SVRs(), ENets(), Lassos(), Ridges()]
    for model_set in model_sets:
        clfs.extend(model_set[0])
        clf_names.extend(model_set[1])
   

    return clfs[::CLF_SS], clf_names[::CLF_SS];


In [74]:
ALL_TARGETS = y.columns.to_list()  
if isinstance(TARGETS, list):
    targets = [ALL_TARGETS[i] for i in TARGETS]
elif TARGETS is not None and TARGETS >= 0:
    targets = ALL_TARGETS[TARGETS: TARGETS + 1]
else:
    targets = ALL_TARGETS
# print(targets)

In [75]:
def metaFilter(X):
    return X[[c for c in X.columns if c not in data.columns or c in loading.columns ]] 

### Train Models

In [76]:
all_clfs = []; all_raw_base_clfs = []; all_base_clfs = []; scalers = []
for idx, y_var in enumerate(targets):
    print('---Training Models for {}---\n'.format(y_var))
       
    
    # train base classifiers
    raw_base_clfs, base_clf_names = getBaseClfs(y_var)
    all_raw_base_clfs.append((raw_base_clfs, base_clf_names))
    
    base_clfs, base_clf_preds, base_clf_names, Xe, ye, ge = \
                    trainBaseClfs(raw_base_clfs, base_clf_names, 
                                  data = (X, y, groups), 
                                  target=y_var, )
    Xe = pd.concat( (Xe, pd.DataFrame( dict(zip(base_clf_names, base_clf_preds)), index=Xe.index) ),
                     axis = 'columns')
    
    all_base_clfs.append((base_clfs, base_clf_preds, base_clf_names, Xe, ye, ge ))
    
    
    # train meta model
    
    if y_var == 'age':
        s = FunctionTransformer()
        meta_model = trainNuSVR
    else:
        s = StandardScaler()
        meta_model = trainENet
     
    s.fit(metaFilter(Xe))
    scalers.append(s)
    
    all_clfs.append( runBag(data = (s.transform(metaFilter(Xe)), ye[y_var], ge), # target=y_var,
                                   model_type = meta_model) )
    # run

ha=0.0001, l1_ratio=0
0.1806 for Enet alpha=0.0001, l1_ratio=0.05
0.1806 for Enet alpha=0.0001, l1_ratio=0.1
0.1804 for Enet alpha=0.0001, l1_ratio=0.5
0.1796 for Enet alpha=0.0001, l1_ratio=0.9
0.1790 for Enet alpha=0.0001, l1_ratio=0.95
0.1792 for Enet alpha=0.0001, l1_ratio=0.98
0.1869 for Enet alpha=0.0001, l1_ratio=1
0.1812 for Enet alpha=0.0003, l1_ratio=0
0.1812 for Enet alpha=0.0003, l1_ratio=0.05
0.1812 for Enet alpha=0.0003, l1_ratio=0.1
0.1809 for Enet alpha=0.0003, l1_ratio=0.5
0.1803 for Enet alpha=0.0003, l1_ratio=0.9
0.1801 for Enet alpha=0.0003, l1_ratio=0.95
0.1795 for Enet alpha=0.0003, l1_ratio=0.98
0.1800 for Enet alpha=0.0003, l1_ratio=1
0.1818 for Enet alpha=0.001, l1_ratio=0
0.1818 for Enet alpha=0.001, l1_ratio=0.05
0.1818 for Enet alpha=0.001, l1_ratio=0.1
0.1816 for Enet alpha=0.001, l1_ratio=0.5
0.1808 for Enet alpha=0.001, l1_ratio=0.9
0.1807 for Enet alpha=0.001, l1_ratio=0.95
0.1806 for Enet alpha=0.001, l1_ratio=0.98
0.1800 for Enet alpha=0.001, l1_ratio=

### Prediction Code

In [77]:
def predictBag(X, y, groups, clfs, target = None):
    start_time = datetime.datetime.now(); 

    valid = ~y.isnull()
    X = X[valid]; y = y[valid]; groups = groups[valid]
    
    if target is not None:
        X = cleanX(X, target)
    
    group_list = [*dict.fromkeys(groups)]   
    group_list.sort()

    preds = []; ys=[]; datestack = []
    for idx, group in enumerate(group_list):
        g = gc.collect()
        x_holdout = X[groups == group]
        y_holdout = y[groups == group]
  
        y_pred = clfs[idx].predict(x_holdout)    
        preds.append(y_pred)
        ys.append(y_holdout)
    
        print("{}: {:.4f}".format(group,
              fnae(y_holdout, y_pred) ) )
        
    y_pred = np.concatenate(preds)
    y_true = np.concatenate(ys) 
    
    print("\Bag Prediction Time: {}\n".format(str(datetime.datetime.now() - start_time).split('.', 2)[0] ))
    return y_pred, y_true

In [78]:
def predictAll(X_test, all_base_clfs, all_clfs):
    start_time = datetime.datetime.now(); 
        
    def predict_model(model, X):
        o = model.predict(X)
        return o    
    
    all_preds = pd.DataFrame(columns = targets, index=X_test.index)
    for tidx, y_var in enumerate(targets): # loop over targets
        print(y_var)
        Xi = cleanX(X_test, y_var)
        base_clfs = all_base_clfs[tidx][0]
         

        preds = []; 
        for g_idx, g_clfs in enumerate(base_clfs): # loop over groups
            print(g_idx)
            preds.append(Parallel(n_jobs=4)(delayed(predict_model)(model, Xi) for model in g_clfs))
        print("\Base Classifier Prediction Time: {}\n".format(str(datetime.datetime.now() - start_time).split('.', 2)[0] ))


        c_preds = []; sub_preds = np.zeros((len(preds), len(Xi)))
        for c_idx in range(0, len(preds[0])):  
            if len(preds[0][c_idx].shape) > 1: 
                for t_idx in range(0, preds[0][c_idx].shape[1]):
                    for g_idx, this_pred_group in enumerate(preds):  
                        sub_preds[g_idx, :] = this_pred_group[c_idx][:, t_idx]
                    c_preds.append(np.mean( sub_preds, axis = 0))  
            else:
                for g_idx, this_pred_group in enumerate(preds): 
                    sub_preds[g_idx, :] = this_pred_group[c_idx]
                c_preds.append(np.mean( sub_preds, axis = 0)) 

        Xf = pd.concat( (Xi, pd.DataFrame( dict(zip(all_base_clfs[tidx][2], c_preds)), index=Xi.index) ),
                     axis = 'columns')
        print("\nTime Elapsed: {}\n".format(str(datetime.datetime.now() - start_time).split('.', 2)[0] ))
 

        s = scalers[tidx]
        print('\nrunning stacker')
        pred = Parallel(n_jobs=4)(delayed(predict_model)(model, s.transform(metaFilter(Xf))) 
                                                       for model in all_clfs[tidx])
        sub_preds = np.zeros((len(all_clfs[tidx]), len(Xi)))
        for g_idx, clf in enumerate(all_clfs[tidx]):
            sub_preds[g_idx, :] = pred[g_idx]
        all_preds[y_var] = np.mean(sub_preds, axis = 0)


    end_time = datetime.datetime.now(); 
    print("\Prediction Time: {}\n".format(str(end_time - start_time).split('.', 2)[0] ))
    return all_preds, Xf

### Show Scores by Fold

In [79]:
y_preds = pd.DataFrame(index = X.index)
y_trues = y_preds.copy()
scores = pd.DataFrame(index = targets, columns = ['score'])
for idx, y_var in enumerate(targets):
    print(y_var)
    s = scalers[idx]
    y_pred, y_true =  predictBag(s.transform(metaFilter(all_base_clfs[idx][3])), 
                                 all_base_clfs[idx][4][y_var], all_base_clfs[idx][5], all_clfs[idx] ) 
    score = fnae(y_true, y_pred)
    print('{}: {:.4f}\n\n'.format(y_var, score))
    scores.loc[y_var] = score

scores.round(4) # MSCORE

age
0: 0.1565
1: 0.1501
2: 0.1711
3: 0.1535
4: 0.1517
\Bag Prediction Time: 0:00:02

age: 0.1565


domain1_var1
0: 0.1532
1: 0.1439
2: 0.1405
3: 0.1519
4: 0.1444
\Bag Prediction Time: 0:00:02

domain1_var1: 0.1467


domain1_var2
0: 0.1440
1: 0.1545
2: 0.1570
3: 0.1377
4: 0.1452
\Bag Prediction Time: 0:00:03

domain1_var2: 0.1475


domain2_var1
0: 0.1808
1: 0.1718
2: 0.1847
3: 0.1645
4: 0.1728
\Bag Prediction Time: 0:00:04

domain2_var1: 0.1744


domain2_var2
0: 0.1692
1: 0.1655
2: 0.1726
3: 0.1725
4: 0.1636
\Bag Prediction Time: 0:00:03

domain2_var2: 0.1686




Unnamed: 0,score
age,0.156465
domain1_var1,0.146692
domain1_var2,0.147526
domain2_var1,0.174442
domain2_var2,0.168608


### Show Overall Score

In [80]:
try:
    weights = pd.DataFrame( index = ALL_TARGETS, data = [.3, .175, .175, .175, .175] )
    overall_score = np.sum(scores * weights.values).iloc[0]
    age_score = np.mean(scores.iloc[:1]).iloc[0]
    other_scores = np.mean(scores.iloc[1:]).iloc[0]

    print(np.round(scores,4))
    print("\nOverall Score: {:.4f}".format(overall_score))

    print("   {:.4f}:  {:.4f} / {:.4f}   {}".format(overall_score, age_score, other_scores, 
                          [ np.round(s, 4) for s in scores.score] ))

except:
    pass

score
age           0.156465
domain1_var1  0.146692
domain1_var2  0.147526
domain2_var1  0.174442
domain2_var2  0.168608

Overall Score: 0.1585
   0.1585:  0.1565 / 0.1593   [0.1565, 0.1467, 0.1475, 0.1744, 0.1686]


score  
age           0.155798  
domain1_var1  0.148843  
domain1_var2  0.147619  
domain2_var1  0.172886  
domain2_var2  0.169579  

Overall Score: 0.1586  
   0.1586:  0.1558 / 0.1597   [0.1558, 0.1488, 0.1476, 0.1729, 0.1696]  

### Build Submission

In [81]:
y_oos, Xf = predictAll(test_data, all_base_clfs, all_clfs) 

y_oos = y_oos.reset_index().melt(id_vars = 'Id', value_name = 'Predicted')
y_oos.Id = y_oos.Id.astype(str) + '_' + y_oos.variable
y_oos.drop(columns = 'variable', inplace=True)

y_oos.to_csv('submission/submission_icapca1.csv', index=False)

age
0
1
2
3
4
\Base Classifier Prediction Time: 0:00:41


Time Elapsed: 0:00:41


running stacker
domain1_var1
0
1
2
3
4
\Base Classifier Prediction Time: 0:01:19


Time Elapsed: 0:01:19


running stacker
domain1_var2
0
1
2
3
4
\Base Classifier Prediction Time: 0:01:54


Time Elapsed: 0:01:54


running stacker
domain2_var1
0
1
2
3
4
\Base Classifier Prediction Time: 0:02:31


Time Elapsed: 0:02:31


running stacker
domain2_var2
0
1
2
3
4
\Base Classifier Prediction Time: 0:03:08


Time Elapsed: 0:03:08


running stacker
\Prediction Time: 0:03:09



### Show Final Submission

In [82]:
y_oos

Unnamed: 0,Id,Predicted
0,10003_age,53.841504
1,10006_age,58.846360
2,10010_age,39.233802
3,10011_age,47.207781
4,10012_age,45.490055
...,...,...
29380,21745_domain2_var2,49.982932
29381,21748_domain2_var2,51.348587
29382,21749_domain2_var2,52.587803
29383,21751_domain2_var2,54.722831
