In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline                                           
from sklearn.preprocessing import StandardScaler                                     
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score as AUC
from sklearn.base import clone
from sklearn import tree
from sklearn.model_selection import ParameterGrid, GridSearchCV
from joblib import Parallel, delayed
from multiprocessing import Pool
from tqdm import tqdm
from scipy.spatial import distance
from skbio.stats.ordination import pcoa
import umap
from sklearn.metrics import make_scorer
from sklearn import linear_model
import warnings
warnings.filterwarnings("ignore")

# Get data

In [2]:
# read meta data
df_meta = pd.read_csv('../../../../../../our_data/meta_data.csv', index_col=0)
df_meta = df_meta[df_meta.Diet=='Inulin'] # only for inulin group
df_meta = df_meta[df_meta.Day != 0] # remove day 0

# read SCFA data
df_scfa = pd.read_csv('../../../../../../our_data/SCFA.csv', index_col=0)

# read bacterial abundance (family level)
df_bac = pd.read_csv('../../../../../../our_data/16S_absolute_abundance_family.csv', index_col=0)

# find common samples
common_samples = list(set(df_meta.index).intersection(df_scfa.index).intersection(df_bac.index))
df_meta = df_meta.loc[common_samples]
df_scfa = df_scfa.loc[common_samples]
df_bac = df_bac.loc[common_samples]

# Run random forest model

## self-defined function

In [3]:
def get_weights(X, Z, method=None): # X is test and Z is train
    X.loc[:, "is_z"] = 0
    Z.loc[:, "is_z"] = 1
    XZ = pd.concat([X, Z],ignore_index=True) # keep index
    labels = XZ['is_z'].values
    XZ_mat = XZ.drop('is_z', axis=1).values
     
    # test if X and Z can be distinguished
    clf = RandomForestClassifier(n_estimators=100, max_depth=3,random_state=0)
    predictions = np.zeros(labels.shape)
    skf = SKF(n_splits=20, shuffle=True, random_state=0)
    for fold, (train_idx, test_idx) in enumerate(skf.split(XZ_mat, labels)):
        X_train, X_test = XZ_mat[train_idx,:], XZ_mat[test_idx,:]
        y_train, y_test = labels[train_idx], labels[test_idx]

        clf.fit(X_train, y_train)
        probs = clf.predict_proba(X_test)[:, 1] # probability that each data point is a sample from training set
        predictions[test_idx] = probs
    roc_auc = AUC(labels, predictions)
    
    # weight_i = p_i(X|D)/p_i(Z|D)
    predictions_Z = predictions[len(X):] # p(Z/D)
    weights = (1./predictions_Z) - 1. # p(X|D)/p(Z/D)
    weights /= np.mean(weights) # we do this to re-normalize the computed log-loss
    XZ['size'] = 4
    XZ.iloc[len(X):, XZ.columns.get_loc('size')] = 0.1 + weights*15
    
    # perform decomposition
    if method is not None:
        XZ = XZ.sort_index()
        XZ_mat = XZ.drop(['is_z','size'], axis=1).values

        # dimensionality reduction
        if method=='UMAP':
            fit = umap.UMAP(random_state=0)
            u = fit.fit_transform(XZ_mat)
            XZ_dec = pd.DataFrame(u, index=XZ.index, columns=['Axis1','Axis2'])
        elif method=='PCoA':
            dist_relab = distance.squareform(distance.pdist(XZ_mat, metric="braycurtis"))
            OrdinationResults = pcoa(dist_relab, number_of_dimensions=2)
            XZ_dec = pd.DataFrame(OrdinationResults.samples.values, index=XZ.index, columns=['Axis1','Axis2'])
        else:
            print('uknown method: %s'%(method))
            raise

        XZ_dec['is_z'] = XZ['is_z']
        XZ_dec['size'] = XZ['size']        
        return roc_auc, weights, XZ_dec
    else:
        return roc_auc, weights, XZ

## initialization

In [4]:
roc_auc = {}
results = []
use_weights=False
plot_weights=False

## intrapolation

In [5]:
if use_weights and plot_weights:
    fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(20,4), sharex=True, sharey=True)

for k,group_to_exclude in enumerate(['A','B','C','D']):

    # split train/test data
    mice_to_keep = list(set(df_meta[df_meta.RandomizedGroup!=group_to_exclude].MiceID))
    samples_to_keep = list(set(df_meta[df_meta.MiceID.isin(mice_to_keep)].index))
    mice_to_exclude = list(set(df_meta[df_meta.RandomizedGroup==group_to_exclude].MiceID))
    samples_to_exclude = list(set(df_meta[df_meta.MiceID.isin(mice_to_exclude)].index))

    # get weights of training sets
    xdata_train = df_bac.loc[samples_to_keep]
    xdata_test = df_bac.loc[samples_to_exclude]
    if use_weights:        
        roc_auc[group_to_exclude], weights, df_ord = get_weights(X=xdata_test, Z=xdata_train, method='PCoA')
        assert len(xdata_train)==len(weights)
        print ("ROC-AUC (%s): %2.2f" % (group_to_exclude, roc_auc[group_to_exclude]))
     
        # show weights on train and test
        if plot_weights:
            # _ = sns.scatterplot(x='Axis1', y='Axis2', hue='is_z', size='size', data=df_ord, ax=ax[k])
            df_ord_0 = df_ord.loc[df_ord.is_z==0]
            _ = ax[k].scatter(df_ord_0['Axis1'], df_ord_0['Axis2'], marker='o', s=df_ord_0['size'], c='r', label='test')
            df_ord_1 = df_ord.loc[df_ord.is_z==1]
            _ = ax[k].scatter(df_ord_1['Axis1'], df_ord_1['Axis2'], marker='o', s=df_ord_1['size'], c='b', label='train')

    xdata_train = np.asarray(xdata_train.values)
    xdata_test = np.asarray(xdata_test.values)
    
    # run random forest regression with weights
    for scfa in ['Acetate','Propionate','Butyrate']:                
        ydata_train = np.asarray(df_scfa.loc[samples_to_keep, scfa])
        ydata_test = np.asarray(df_scfa.loc[samples_to_exclude, scfa])

        # make pipeline
        # use standardscaler for transformation
        # use lasso for feature selection
        # use random forest for prediction
        param_grid = {
            'selectfrommodel__estimator__alpha':[10**v for v in [-4,-3,-2,-1,0]], # too large alpha will produce a null model (all features are 0)
            'randomforestregressor__max_features':['auto','sqrt','log2',0.16,0.32,0.64],
            'randomforestregressor__max_depth':[2,4,8,16],
            'randomforestregressor__min_samples_split':[2,4,8,16],
            'randomforestregressor__min_samples_leaf':[1,2,4]
        }
        
        clf1 = linear_model.Lasso(tol=1e-5,positive=True,random_state=0,max_iter=1000000)
        clf2 = RandomForestRegressor(n_estimators=2000,random_state=0,oob_score=True)
        pipe = make_pipeline(StandardScaler(), SelectFromModel(clf1, threshold=1e-5), clone(clf2))  
        CV = GridSearchCV(pipe, param_grid, scoring='r2', cv=5, n_jobs=-1, verbose=2)
        if use_weights:
            CV.fit(xdata_train, ydata_train, selectfrommodel__sample_weight=weights, randomforestregressor__sample_weight=weights)   
        else:
            CV.fit(xdata_train, ydata_train)

        print('Intrapolation, group %s, %s, best score and parameter combination = '%(group_to_exclude, scfa))
        print(CV.best_score_)    
        print(CV.best_params_)    
        print('\n')

        # predict training set
        ydata_train_predicted = CV.predict(xdata_train)
        ydata_test_predicted = CV.predict(xdata_test)

        for sample_, obs_, pred_ in zip(samples_to_keep, ydata_train, ydata_train_predicted):
            day_ = df_meta.loc[sample_,'Day']
            results.append(['intrapolation', scfa, group_to_exclude, 'train', sample_, day_, obs_, pred_])
        for sample_, obs_, pred_ in zip(samples_to_exclude, ydata_test, ydata_test_predicted):
            day_ = df_meta.loc[sample_,'Day']
            results.append(['intrapolation', scfa, group_to_exclude, 'test', sample_, day_, obs_, pred_])

if use_weights and plot_weights:
    plt.tight_layout()

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 21.0min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 26.1min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.7min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.9min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 44.5min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 47.1min finished


Intrapolation, group A, Acetate, best score and parameter combination = 
0.10216139254528415
{'randomforestregressor__max_depth': 16, 'randomforestregressor__max_features': 'log2', 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.01}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   47.7s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 21.2min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 26.3min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.9min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 38.1min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 44.7min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 47.4min finished


Intrapolation, group A, Propionate, best score and parameter combination = 
0.38381973008110465
{'randomforestregressor__max_depth': 8, 'randomforestregressor__max_features': 0.64, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.01}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   47.0s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.8min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.2min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.6min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.2min finished


Intrapolation, group A, Butyrate, best score and parameter combination = 
0.41925416841134633
{'randomforestregressor__max_depth': 4, 'randomforestregressor__max_features': 0.64, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 4, 'selectfrommodel__estimator__alpha': 0.01}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.6s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.1min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.7min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.2min finished


Intrapolation, group B, Acetate, best score and parameter combination = 
0.017802155025078648
{'randomforestregressor__max_depth': 4, 'randomforestregressor__max_features': 0.16, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 16, 'selectfrommodel__estimator__alpha': 1}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.8s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.2min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.8min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.3min finished


Intrapolation, group B, Propionate, best score and parameter combination = 
0.42754789353460065
{'randomforestregressor__max_depth': 4, 'randomforestregressor__max_features': 0.32, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.1}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.4s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.4min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.6min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 44.4min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 47.1min finished


Intrapolation, group B, Butyrate, best score and parameter combination = 
0.44742304544871925
{'randomforestregressor__max_depth': 4, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.01}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   48.8s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 26.9min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 38.5min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 45.0min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 47.6min finished


Intrapolation, group C, Acetate, best score and parameter combination = 
0.019145242871262534
{'randomforestregressor__max_depth': 16, 'randomforestregressor__max_features': 0.16, 'randomforestregressor__min_samples_leaf': 2, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 1}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   47.0s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.8min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.8min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.3min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.9min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.4min finished


Intrapolation, group C, Propionate, best score and parameter combination = 
0.4944354838856893
{'randomforestregressor__max_depth': 4, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 4, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.01}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.4s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.0min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.4min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.0min finished


Intrapolation, group C, Butyrate, best score and parameter combination = 
0.35012523443810045
{'randomforestregressor__max_depth': 16, 'randomforestregressor__max_features': 0.64, 'randomforestregressor__min_samples_leaf': 4, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.001}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.4s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.6min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.1min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.1min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.6min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.2min finished


Intrapolation, group D, Acetate, best score and parameter combination = 
0.041222273360252235
{'randomforestregressor__max_depth': 4, 'randomforestregressor__max_features': 0.64, 'randomforestregressor__min_samples_leaf': 2, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.1}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.8s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.2min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.7min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.3min finished


Intrapolation, group D, Propionate, best score and parameter combination = 
0.45690150761701914
{'randomforestregressor__max_depth': 16, 'randomforestregressor__max_features': 0.64, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 8, 'selectfrommodel__estimator__alpha': 0.1}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.6min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.0min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.4min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.0min finished


Intrapolation, group D, Butyrate, best score and parameter combination = 
0.3248508369174573
{'randomforestregressor__max_depth': 2, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 4, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.0001}




## extrapolation

In [6]:
if use_weights and plot_weights:
    fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(20,4), sharex=True, sharey=True)

for k,vendor_to_exclude in enumerate(['Beijing','Guangdong','Hunan','Shanghai']):
        
    # split train/test data
    mice_to_keep = list(set(df_meta[df_meta.Vendor!=vendor_to_exclude].MiceID))
    samples_to_keep = list(set(df_meta[df_meta.MiceID.isin(mice_to_keep)].index))
    mice_to_exclude = list(set(df_meta[df_meta.Vendor==vendor_to_exclude].MiceID))
    samples_to_exclude = list(set(df_meta[df_meta.MiceID.isin(mice_to_exclude)].index))

    # get weights of training sets
    xdata_train = df_bac.loc[samples_to_keep]
    xdata_test = df_bac.loc[samples_to_exclude]
    if use_weights:
        roc_auc[vendor_to_exclude], weights, df_ord = get_weights(X=xdata_test, Z=xdata_train, method='PCoA')
        assert len(xdata_train)==len(weights)
        print ("ROC-AUC (%s): %2.2f" % (vendor_to_exclude, roc_auc[vendor_to_exclude]))
    
        # show weights on train and test
        if plot_weights:
            # _ = sns.scatterplot(x='Axis1', y='Axis2', hue='is_z', size='size', data=df_ord, ax=ax[k])
            df_ord_0 = df_ord.loc[df_ord.is_z==0]
            _ = ax[k].scatter(df_ord_0['Axis1'], df_ord_0['Axis2'], marker='o', s=df_ord_0['size'], c='r', label='test')
            df_ord_1 = df_ord.loc[df_ord.is_z==1]
            _ = ax[k].scatter(df_ord_1['Axis1'], df_ord_1['Axis2'], marker='o', s=df_ord_1['size'], c='b', label='train')

    xdata_train = np.asarray(xdata_train.values)
    xdata_test = np.asarray(xdata_test.values)
    
    # run random forest regression with weights
    for scfa in ['Acetate','Propionate','Butyrate']:                
        ydata_train = np.asarray(df_scfa.loc[samples_to_keep, scfa])
        ydata_test = np.asarray(df_scfa.loc[samples_to_exclude, scfa])

        # make pipeline
        # use lasso for feature selection
        param_grid = {
            'selectfrommodel__estimator__alpha':[10**v for v in [-4,-3,-2,-1,0]], # too large alpha will produce a null model (all features are 0)
            'randomforestregressor__max_features':['auto','sqrt','log2',0.16,0.32,0.64],
            'randomforestregressor__max_depth':[2,4,8,16],
            'randomforestregressor__min_samples_split':[2,4,8,16],
            'randomforestregressor__min_samples_leaf':[1,2,4]
        }
        
        clf1 = linear_model.Lasso(tol=1e-5,positive=True,random_state=0,max_iter=1000000)
        clf2 = RandomForestRegressor(n_estimators=2000,random_state=0,oob_score=True)
        pipe = make_pipeline(StandardScaler(), SelectFromModel(clf1, threshold=1e-5), clone(clf2))  
        CV = GridSearchCV(pipe, param_grid, scoring='r2', cv=5, n_jobs=-1, verbose=2)
        if use_weights:
            CV.fit(xdata_train, ydata_train, selectfrommodel__sample_weight=weights, randomforestregressor__sample_weight=weights)
        else:
            CV.fit(xdata_train, ydata_train)

        print('Extrapolation, vendor %s, %s, best score and parameter combination = '%(vendor_to_exclude, scfa))
        print(CV.best_score_)    
        print(CV.best_params_)    
        print('\n')   

        # predict training set
        ydata_train_predicted = CV.predict(xdata_train)
        ydata_test_predicted = CV.predict(xdata_test)

        for sample_, obs_, pred_ in zip(samples_to_keep, ydata_train, ydata_train_predicted):
            day_ = df_meta.loc[sample_,'Day']
            results.append(['extrapolation', scfa, vendor_to_exclude, 'train', sample_, day_, obs_, pred_])
        for sample_, obs_, pred_ in zip(samples_to_exclude, ydata_test, ydata_test_predicted):
            day_ = df_meta.loc[sample_,'Day']
            results.append(['extrapolation', scfa, vendor_to_exclude, 'test', sample_, day_, obs_, pred_])

if use_weights and plot_weights:
    plt.tight_layout()

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.6min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.0min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.5min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.0min finished


Extrapolation, vendor Beijing, Acetate, best score and parameter combination = 
0.19645703127339798
{'randomforestregressor__max_depth': 16, 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__min_samples_leaf': 2, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.0001}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.1min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.1min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.6min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.2min finished


Extrapolation, vendor Beijing, Propionate, best score and parameter combination = 
0.5371959474771982
{'randomforestregressor__max_depth': 16, 'randomforestregressor__max_features': 0.64, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 8, 'selectfrommodel__estimator__alpha': 0.1}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 30.9min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 36.9min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.3min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 45.9min finished


Extrapolation, vendor Beijing, Butyrate, best score and parameter combination = 
0.3610997654755839
{'randomforestregressor__max_depth': 4, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 4, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.0001}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.5s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.6min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.1min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.5min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.1min finished


Extrapolation, vendor Guangdong, Acetate, best score and parameter combination = 
-0.017702489490492867
{'randomforestregressor__max_depth': 16, 'randomforestregressor__max_features': 0.64, 'randomforestregressor__min_samples_leaf': 4, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.0001}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.6s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.1min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.7min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.2min finished


Extrapolation, vendor Guangdong, Propionate, best score and parameter combination = 
0.4549194025458405
{'randomforestregressor__max_depth': 4, 'randomforestregressor__max_features': 0.64, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.0001}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 30.9min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 36.9min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.4min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 45.9min finished


Extrapolation, vendor Guangdong, Butyrate, best score and parameter combination = 
0.3968641107217252
{'randomforestregressor__max_depth': 2, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 4, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.01}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.1min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.7min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.2min finished


Extrapolation, vendor Hunan, Acetate, best score and parameter combination = 
0.002081231344603851
{'randomforestregressor__max_depth': 16, 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__min_samples_leaf': 4, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.01}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.8min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.2min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.3min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.8min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.4min finished


Extrapolation, vendor Hunan, Propionate, best score and parameter combination = 
0.43933551505228674
{'randomforestregressor__max_depth': 16, 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.01}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 36.9min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.4min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.0min finished


Extrapolation, vendor Hunan, Butyrate, best score and parameter combination = 
0.5181045870000514
{'randomforestregressor__max_depth': 4, 'randomforestregressor__max_features': 0.64, 'randomforestregressor__min_samples_leaf': 2, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.1}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.1min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 546.4min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 552.8min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 555.4min finished


Extrapolation, vendor Shanghai, Acetate, best score and parameter combination = 
0.051387654133858526
{'randomforestregressor__max_depth': 4, 'randomforestregressor__max_features': 0.16, 'randomforestregressor__min_samples_leaf': 2, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 1}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.4s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 25.6min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 31.1min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 37.1min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 43.6min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 46.1min finished


Extrapolation, vendor Shanghai, Propionate, best score and parameter combination = 
0.1961007483722696
{'randomforestregressor__max_depth': 8, 'randomforestregressor__max_features': 0.64, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'selectfrommodel__estimator__alpha': 0.01}


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   46.5s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 965 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 1410 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 1937 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 2544 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 3233 tasks      | elapsed: 19.9min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 24.7min
[Parallel(n_jobs=-1)]: Done 4853 tasks      | elapsed: 29.9min
[Parallel(n_jobs=-1)]: Done 5784 tasks      | elapsed: 35.7min
[Parallel(n_jobs=-1)]: Done 6797 tasks      | elapsed: 41.9min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 44.4min finished


Extrapolation, vendor Shanghai, Butyrate, best score and parameter combination = 
0.17680667885560034
{'randomforestregressor__max_depth': 16, 'randomforestregressor__max_features': 0.64, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 4, 'selectfrommodel__estimator__alpha': 0.01}




## save results to file

In [8]:
df_roc_w_weights = pd.DataFrame.from_dict(roc_auc, orient='index').rename({0:'AUC'}, axis=1)
df_roc_w_weights

Unnamed: 0,AUC
A,0.432822
B,0.475415
C,0.529167
D,0.58503
Beijing,0.997114
Guangdong,0.989583
Hunan,1.0
Shanghai,0.999711


In [7]:
df_prediction = pd.DataFrame(results, columns=['PerturbationType','SCFA','Permutation','PredictionType','SampleID','Day','ObservedValue','PredictedValue'])
if use_weights:
    df_prediction.to_csv('rf_prediction_predictor_familyonly_w_weights.csv')
else:
    df_prediction.to_csv('rf_prediction_predictor_familyonly_wo_weights.csv')