#### First, import libraries needed to run this notebook

In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
#import seaborn as sns


#### Have a glance of the list of files stored in a folder

In [3]:
os.listdir('data')

['fnc.csv',
 'loading.csv',
 'ICN_numbers.csv',
 'trends_neuroimaging-master',
 'sample_submission.csv',
 '10003.mat.zip',
 'train_scores.csv',
 '10001.mat.zip',
 'reveal_ID_site2.csv',
 'fMRI_mask.nii']

#### Read into the data files

In [4]:
train_scores = pd.read_csv("data/train_scores.csv")
fnc = pd.read_csv("data/fnc.csv")
loading = pd.read_csv("data/loading.csv")
icn_no = pd.read_csv("data/ICN_numbers.csv")
id_site2 = pd.read_csv("data/reveal_ID_site2.csv")
submission = pd.read_csv("data/sample_submission.csv")


#### Just keeping a note of the no. of columns/variables in each file
length of train_scores.columns (inclu. id): 6  
length of loading.columns (inclu. id) : 27  
length of fnc.columns (inclu. id) : 1379


#### Dealing with missing data
There are missing values in the 'train_scores.csv' file (i.e. in some of the 'domain_' variables). 
Depending on your strategy, the missing data need to be treated. Here, we just substitute the missing data with mean values

In [7]:
#train_scores.isnull().sum()
train_scores.fillna(train_scores.mean(),inplace=True)

Now, combine and restrict to only train_data (the 'train_scores.csv' file contains only ID for 'train' cases)

In [8]:
#There are a lot of features in the fnc file. 'Rescale' them to prevent potential overfitting
FNC_SCALE = 1/500
fnc_features = fnc.columns[1:]
fnc[fnc_features] *= FNC_SCALE


train_data = train_scores.merge(loading, on='Id', how='left')
train_data = train_data.merge(fnc, on='Id', how='left')
train_data.head()

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2,IC_01,IC_07,IC_05,IC_16,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10001,57.436077,30.571975,62.553736,53.32513,51.427998,0.00607,0.014466,0.004136,0.000658,...,-5.971166e-07,2.211365e-06,5.241842e-07,1.341784e-06,2e-06,-1.714134e-07,4.985075e-07,-2.428485e-07,2e-06,1.161953e-06
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361,0.009087,0.009291,0.007049,-0.002076,...,-8.568648e-07,-1.591679e-07,5.720555e-07,-7.598477e-07,2e-06,1.776925e-06,2.36975e-06,1.14595e-07,3e-06,9.933081e-07
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641,0.004675,0.000957,0.006154,-0.000429,...,-5.213564e-07,1.238158e-06,5.65874e-07,1.234127e-07,1e-06,8.563871e-07,1.270224e-06,4.974028e-08,3e-06,3.254311e-07
3,10005,66.53263,51.474692,59.244132,52.108977,69.993075,-0.000398,0.006878,0.009051,0.000369,...,-5.580981e-07,1.579727e-06,1.617702e-07,1.713334e-06,2e-06,1.067021e-06,9.095169e-07,1.159346e-07,3e-06,3.515938e-07
4,10007,38.617381,49.197021,65.674285,40.151376,34.096421,0.005192,0.010585,0.01216,-0.00092,...,-6.008716e-07,1.635702e-06,2.880146e-07,6.303288e-07,2e-06,1.42179e-06,1.850702e-06,6.440192e-07,3e-06,1.17443e-06


### Feature Selection

#### Variance Threshold FS Attempt #not too helpful

In [None]:
#from sklearn.feature_selection import VarianceThreshold
#X_train_clone = X_train.copy()
#selector=VarianceThreshold()
#X_train_new = selector.fit_transform(X_train_clone)
#
#X_train_new.shape

### Try PCA for data reduction
##### on loading features

In [9]:
from sklearn.decomposition import PCA

loading_feat = loading.drop('Id', axis=1)

load_pca5 = PCA(n_components = 5)
load_pca5_sco = load_pca5.fit_transform(loading_feat)

print(load_pca5.explained_variance_ratio_)

load_pca_df = pd.DataFrame(data = load_pca5_sco, columns = ['loadpc1', 'loadpc2', 'loadpc3', 'loadpc4', 'loadpc5'])

[0.18168026 0.15362637 0.13054658 0.0755652  0.06002084]


In [10]:
load_pca_df = pd.concat([loading[['Id']], load_pca_df], axis=1)
load_pca_df

Unnamed: 0,Id,loadpc1,loadpc2,loadpc3,loadpc4,loadpc5
0,10001,0.008631,0.005304,0.010155,0.011989,-0.009915
1,10002,0.002058,-0.004325,0.006301,0.005274,-0.008553
2,10003,0.024332,0.008228,-0.009372,0.008338,-0.014058
3,10004,-0.000176,-0.005532,0.006451,-0.009241,0.003857
4,10005,-0.006259,-0.002157,0.016069,-0.000303,-0.000221
...,...,...,...,...,...,...
11749,21750,0.011279,0.009309,-0.003347,-0.005168,0.002109
11750,21751,-0.008887,0.008090,-0.005971,-0.001368,0.001193
11751,21752,-0.011322,0.008178,0.013650,-0.000309,-0.002126
11752,21753,-0.002964,-0.008021,-0.024727,0.002965,-0.019007


#### Also tried n_components = 8 or 10

In [None]:
#load_pca7 = PCA(n_components = 8)
#load_pca7_sco = load_pca7.fit_transform(loading_feat)
#
#load_pca7.explained_variance_ratio_

##### on fnc features

In [11]:
fnc2 = pd.read_csv("data/fnc.csv")
fnc2

fnc2_feat = fnc2.drop('Id', axis=1)

fnc2_pca5 = PCA(n_components = 5)
fnc2_pca5_sco = fnc2_pca5.fit_transform(fnc2_feat)

print(fnc2_pca5.explained_variance_ratio_)

fnc2_pca_df = pd.DataFrame(data = fnc2_pca5_sco, columns = ['fncpc1', 'fncpc2', 'fncpc3', 'fncpc4', 'fncpc5'])

[0.13839599 0.03939586 0.0355435  0.0310792  0.02873212]


In [12]:
fnc2_pca_df = pd.concat([fnc2[['Id']], fnc2_pca_df], axis=1)
fnc2_pca_df

Unnamed: 0,Id,fncpc1,fncpc2,fncpc3,fncpc4,fncpc5
0,10001,-1.961551,-0.485216,-0.670196,1.212302,0.409607
1,10002,-1.448180,-0.842196,1.347581,0.998600,-0.676815
2,10003,2.171379,2.326829,0.225485,-0.331557,-0.357406
3,10004,-1.329180,-1.249633,-1.083624,0.503536,0.342165
4,10005,-0.529954,-1.398175,0.681247,-1.683166,-0.303521
...,...,...,...,...,...,...
11749,21750,-2.350324,1.647932,-0.927377,0.118520,1.560186
11750,21751,-2.329617,1.181409,0.938004,0.491912,-1.280158
11751,21752,0.510884,-0.057311,0.287507,-0.521226,0.757112
11752,21753,-1.011531,0.018133,1.721649,0.170403,-0.582430


#### Also tried n_components = 8 or 10

In [None]:
#fnc2_pca10 = PCA(n_components = 10)
#fnc2_pca10_sco = fnc2_pca10.fit_transform(fnc2_feat)
#
#print(fnc2_pca10.explained_variance_ratio_)

### Combining pca-ed feature train data

In [13]:
pca_train_data = train_scores.merge(load_pca_df, on='Id', how='left')
pca_train_data = pca_train_data.merge(fnc2_pca_df, on='Id', how='left')
pca_train_data.head()

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2,loadpc1,loadpc2,loadpc3,loadpc4,loadpc5,fncpc1,fncpc2,fncpc3,fncpc4,fncpc5
0,10001,57.436077,30.571975,62.553736,53.32513,51.427998,0.008631,0.005304,0.010155,0.011989,-0.009915,-1.961551,-0.485216,-0.670196,1.212302,0.409607
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361,0.002058,-0.004325,0.006301,0.005274,-0.008553,-1.44818,-0.842196,1.347581,0.9986,-0.676815
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641,-0.000176,-0.005532,0.006451,-0.009241,0.003857,-1.32918,-1.249633,-1.083624,0.503536,0.342165
3,10005,66.53263,51.474692,59.244132,52.108977,69.993075,-0.006259,-0.002157,0.016069,-0.000303,-0.000221,-0.529954,-1.398175,0.681247,-1.683166,-0.303521
4,10007,38.617381,49.197021,65.674285,40.151376,34.096421,-0.004811,-0.003609,-0.005884,0.001857,0.001724,-0.021605,-0.985573,2.629944,-1.358889,0.961028


### Split data into X_train, y_train for train data, and only X_test for test set


In [14]:
targets = ('age', 'domain1_var1', 'domain1_var2','domain2_var1','domain2_var2')

#prepare the train data (X_train: features; y_train: outcomes)
Xpca_train = pca_train_data.drop(list(targets), axis=1).drop('Id', axis=1)
ypca_train = pca_train_data[list(targets)]

## prepare the test data
## Here, the 'fnc' features have already been 'rescaled' with a multiplication of 1/500
#Id_no = submission['Id'].apply(lambda x: int(x.split('_')[0])).unique()
#test = pd.DataFrame({'Id' : Id_no})
#test_pred_df = test.copy()
##submission.head()
#test_data = test.merge(loading, on='Id', how='left')
#test_data = test_data.merge(fnc, on='Id', how='left')
#
#X_test = test_data.drop('Id', axis=1)



### Using cross_val_predict to train/test model


In [45]:
from sklearn.model_selection import cross_val_predict #, #GridSearchCV, KFold, 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


model = RandomForestRegressor(random_state = 29, 
                      n_estimators=20
                             )



In [46]:
#To do cross_val_predict here

def eval_metric(y_true, y_pred):
    return np.mean(np.sum(np.abs(y_true - y_pred), axis=0)/np.sum(y_true, axis=0))


weights = (.3, .175, .175, .175, .175)
target_scores = []

overall_score=0

for col, weight in zip(targets, weights): 
    ypca_pred_train = cross_val_predict(model, Xpca_train, ypca_train[col], cv=5, n_jobs=-1, verbose=2)
    
    #print(y_pred_train)
    
    score = eval_metric(ypca_train[col], ypca_pred_train)
    overall_score += score*weight
    
    target_scores.append((col, score))
    print("{}: {}".format(col,score))

target_scores.append(('overall-score', overall_score))
print("Overall-score: {}".format(overall_score))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


age: 0.1750480103248077


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


domain1_var1: 0.15128620496999926


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


domain1_var2: 0.14825131714917636


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


domain2_var1: 0.19001609942364484
domain2_var2: 0.1853737764169167
Overall-score: 0.17062669774039632


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.3s finished


In [47]:
model_name = model.__class__.__name__
#report.write("---" + model_name +  "---" +"\n")


report = open('try_pca_outfile.txt', 'w')

report.write("---" + model_name +  "---" +"\n")
report.write(str(model.get_params()))


#report.write("-" * 45 + '\n')get
#report.write("!train feature data are pca scores from loading & fnc seperately (5pcs in each case!)" + "\n")
#report.write("---gridsearch: " + "-"*30 + "\n")
#report.write(str(gs) + "\n" + "\n")

#report.write("---best model params for each target :" + "-"*20 +"\n")
#report.write(str(best_models10) + "\n" +"\n")

#report.write("---model CV info: 'param_n_estimators','mean_test_score'" + "-"* 20 + '\n')
#report.write(str(models_cv_results10) + "\n" +"\n")



report.write("-" * 45 + '\n')
for target in target_scores: 
    report.write(str(target) + "\n")
#report.write("Overall-score: " + str(overall_score))
report.close()

### Try GridSeachCV on pca-ed data  (scoring='neg_mean_absolute_error')

In [48]:
from sklearn.model_selection import GridSearchCV, KFold  #cross_val_predict 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


def eval_metric(y_true, y_pred):
    return np.sum(np.abs(y_true - y_pred), axis=0)/np.sum(y_true, axis=0)




model = RandomForestRegressor(random_state = 29 #, 
                      #n_estimators=20
                             )



cv = KFold(n_splits = 5, shuffle=True, random_state=29)
grid_params = {
    'n_estimators':[5, 10, 20] #,100
}

gs = GridSearchCV(model, grid_params, n_jobs=-1, cv=cv, verbose=2, scoring='neg_mean_absolute_error')


In [49]:
#%%timeit

best_models10 = {}
models_cv_results10 = {}
cv_info = ['param_n_estimators','mean_test_score']


for col in targets:
    gs.fit(Xpca_train, ypca_train[col])   
    best_models10[col] = gs.best_estimator_  
    models_cv_results10[col] = [gs.cv_results_.get(info) for info in cv_info]

    print(gs.best_score_)

best_models10

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   11.7s finished


-8.764955562353606
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   14.2s finished


-7.74704257873497
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   14.3s finished


-8.800539113930762
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   11.8s finished


-8.941972946579803
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   12.8s finished


-9.523295151269147


{'age': RandomForestRegressor(n_estimators=20, random_state=29),
 'domain1_var1': RandomForestRegressor(n_estimators=20, random_state=29),
 'domain1_var2': RandomForestRegressor(n_estimators=20, random_state=29),
 'domain2_var1': RandomForestRegressor(n_estimators=20, random_state=29),
 'domain2_var2': RandomForestRegressor(n_estimators=20, random_state=29)}

In [50]:
weights = (.3, .175, .175, .175, .175)

overall_score=0
target_scores = []

for col, weight in zip(targets, weights):
    ypca_pred_train = best_models10[col].predict(Xpca_train)
    score = eval_metric(ypca_train[col], ypca_pred_train)
    overall_score += score*weight

    target_scores.append((col, score))
    print("{}: {}".format(col,score))

target_scores.append(('overall-score', overall_score))
print("Overall-score: {}".format(overall_score))

age: 0.06797016180464376
domain1_var1: 0.05779629635483965
domain1_var2: 0.05760204162047615
domain2_var1: 0.07255068741893919
domain2_var2: 0.07106840672618339
Overall-score: 0.06571909916246983


### Write the output to text file

In [51]:
#model_name = model.__class__.__name__
#report.write("---" + model_name +  "---" +"\n")


report = open('gscv_pca_outfile.txt', 'w')

#report.write("-" * 45 + '\n')
report.write("!train feature data are pca scores from loading & fnc seperately (5pcs in each case!)" + "\n")
report.write("---gridsearch: " + "-"*30 + "\n")
report.write(str(gs) + "\n" + "\n")

report.write("---best model params for each target :" + "-"*20 +"\n")
report.write(str(best_models10) + "\n" +"\n")

report.write("---model CV info: 'param_n_estimators','mean_test_score'" + "-"* 20 + '\n')
report.write(str(models_cv_results10) + "\n" +"\n")



report.write("-" * 45 + '\n')
for target in target_scores: 
    report.write(str(target) + "\n")
#report.write("Overall-score: " + str(overall_score))
report.close()

# YOU CAN IGNORE THE FOLLOWING FOR NOW
## No PCA treatment - TRAIN on raw data points
### Split data into X_train, y_train for train data, and only X_test for test set


In [69]:
targets = ('age', 'domain1_var1', 'domain1_var2','domain2_var1','domain2_var2')

#prepare the train data (X_train: features; y_train: outcomes)
X_train = train_data.drop(list(targets), axis=1).drop('Id', axis=1)
y_train = train_data[list(targets)]

# prepare the test data
# Here, the 'fnc' features have already been 'rescaled' with a multiplication of 1/500
Id_no = submission['Id'].apply(lambda x: int(x.split('_')[0])).unique()
test = pd.DataFrame({'Id' : Id_no})
test_pred_df = test.copy()
#submission.head()
test_data = test.merge(loading, on='Id', how='left')
test_data = test_data.merge(fnc, on='Id', how='left')

X_test = test_data.drop('Id', axis=1)



### GridSearchCV on RAW data

In [70]:
## GridSearchCV, KFold
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor #, GradientBoostingRegressor


def eval_metric(y_true, y_pred):
    return np.sum(np.abs(y_true - y_pred), axis=0)/np.sum(y_true, axis=0)


model = RandomForestRegressor(random_state = 29)
#model = GradientBoostingRegressor()


cv = KFold(n_splits = 5, shuffle=True, random_state=29)
grid_params = {
    'n_estimators':[5,10, 20] #,100
}
gs = GridSearchCV(model, grid_params, n_jobs=-1, cv=cv, verbose=2, scoring='neg_mean_absolute_error')


In [71]:

best_models01 = {}
models_cv_results01 = {}
cv_info = ['param_n_estimators','mean_test_score']


for col in targets:
    gs.fit(X_train, y_train[col])   
    best_models01[col] = gs.best_estimator_  
    models_cv_results01[col] = [gs.cv_results_.get(info) for info in cv_info]

    print(gs.best_score_)

best_models01

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 16.0min finished


-8.472785717020276
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 19.7min finished


-7.630123752488837
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 27.8min finished


-8.698082516972553
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 23.9min finished


-8.8968683566741
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 23.8min finished


-9.446079120387248


{'age': RandomForestRegressor(n_estimators=20, random_state=29),
 'domain1_var1': RandomForestRegressor(n_estimators=20, random_state=29),
 'domain1_var2': RandomForestRegressor(n_estimators=20, random_state=29),
 'domain2_var1': RandomForestRegressor(n_estimators=20, random_state=29),
 'domain2_var2': RandomForestRegressor(n_estimators=20, random_state=29)}

In [72]:
weights = (.3, .175, .175, .175, .175)

overall_score01=0
target_scores01 = []

for col, weight in zip(targets, weights):
    y_pred_train = best_models01[col].predict(X_train)
    score = eval_metric(y_train[col], y_pred_train)
    overall_score01 += score*weight

    target_scores01.append((col, score))
    print("{}: {}".format(col,score))

target_scores01.append(('overall-score', overall_score))
print("Overall-score: {}".format(overall_score))

age: 0.06531893579826167
domain1_var1: 0.0573875404435151
domain1_var2: 0.05694669917183827
domain2_var1: 0.07169501263787025
domain2_var2: 0.06906957637857804
Overall-score: 0.06575444676198482


### More model testing will be incorporated here

#### Evaluation metrics to assess the performances of the models for this challenge

In [None]:
def eval_metric(y_true, y_pred):
    return np.mean(np.sum(np.abs(y_true - y_pred), axis=0)/np.sum(y_true, axis=0))


In [None]:
from sklearn.model_selection import cross_val_predict  #GridSearchCV, KFold

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5
)


#cv = KFold(n_splits = 5, shuffle=True, random_state=29)
#grid = {
#    'n_estimators':[5,10]  #20,100
#}
#gs = GridSearchCV(model, grid, n_jobs=-1, cv=cv, verbose=5, scoring='neg_mean_absolute_error')

In [None]:
#best_models = {}
#for col in features:
#    gs.fit(X_train, y_train[col])   
#    best_models[col] = gs.best_estimator_  
#    print(gs.best_score_)
#
#best_models

targets = ('age', 'domain1_var1', 'domain1_var2','domain2_var1','domain2_var2')


In [None]:
weights = (.3, .175, .175, .175, .175)
overall_score=0


for col, weight in zip(targets, weights): 
    y_pred_train = cross_val_predict(model, X_train, y_train[col], cv=3, n_jobs=-1, verbose=10)
    
    #print(y_pred_train)
    
    score = eval_metric(y_train[col], y_pred_train)
    overall_score += score*weight
    
    print("{}: {}".format(col,score))
print("Overall-score: {}".format(overall_score))


In [None]:
#weights = (.3, .175, .175, .175, .175)
#
#overall_score=0
#
#for col, weight in zip(features, weights):
#    y_pred_train = best_models[col].predict(X_train)
#    score = eval_metric(y_train[col], y_pred_train)
#    overall_score += score*weight
#
#    print("{}: {}".format(col,score))
#print("Overall-score: {}".format(overall_score))

## Ignore them from now...
### Preparing the 'test' predicted outcomes in the format required for submission

In [None]:
for col in targets:
    test_pred_df[col] = best_models[col].predict(X_test)


In [None]:
submit_df = pd.melt(test_pred_df, id_vars=["Id"], value_name='Predicted')
submit_df["Id"]=submit_df["Id"].astype("str") + "_" + submit_df["variable"].astype("str")

submit_df = submit_df.drop("variable", axis=1).sort_values("Id")

#check if the submit_df has the correct number of entries
if not submit_df.shape[0] == test_pred_df.shape[0]*5:
    raise AssertionError()


In [None]:
submit_df