In [41]:
from multiprocessing import Pool

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
from sklearn import tree
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
%run -i '../../../../../utils.py'

# Get data

In [4]:
# read meta data
df_meta = pd.read_csv('../../../../../our_data/meta_data.csv', index_col=0)
df_meta = df_meta[df_meta.Diet=='Inulin'] # only for inulin group
df_meta = df_meta[df_meta.Day != 0] # remove day 0

# read SCFA data
df_scfa = pd.read_csv('../../../../../our_data/SCFA.csv', index_col=0)

# read bacterial abundance
df_bac = pd.read_csv('../../../../../our_data/16S_absolute_abundance_species.csv', index_col=0)

# find common samples
# A bare set has no stable iteration order (for string labels it can change
# between kernel sessions because of hash randomisation). Row order feeds the
# bootstrap sampling of the random forests fitted later, so sort the labels to
# keep the scan reproducible across Restart & Run All.
common_samples = sorted(set(df_meta.index).intersection(df_scfa.index).intersection(df_bac.index))

# restrict all three tables to the shared samples, in the same row order
df_meta = df_meta.loc[common_samples]
df_scfa = df_scfa.loc[common_samples]
df_bac = df_bac.loc[common_samples]

# Scan parameters

In [54]:
def run_random_forest(params):
    """Fit one scale -> select -> random-forest pipeline and report its OOB score.

    Reads the notebook-level frames `df_bac` (features) and `df_scfa`
    (targets), which the data-loading cell indexes by the same sample list.

    Parameters
    ----------
    params : dict
        One hyper-parameter combination with the keys 'scfa' (target column
        of `df_scfa`), 'n_features_to_select', 'n_trees', 'max_features',
        'max_depth', 'min_samples_split' and 'min_samples_leaf'.

    Returns
    -------
    list
        [scfa, n_features_to_select, n_trees, max_features, max_depth,
        min_samples_split, min_samples_leaf, oob_score], where the first
        seven entries echo `params` unchanged and the last is the
        `oob_score_` of the final forest in the pipeline.
    """
    # get data
    xdata_train = np.asarray(df_bac.values)
    ydata_train = np.asarray(df_scfa[params['scfa']])

    print(params)

    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For a
    # regressor it meant "use all features", i.e. exactly max_features=1.0,
    # so translate it here to keep the scan runnable on any version. The
    # original grid value is still what gets reported in the result row.
    max_features = params['max_features']
    if max_features == 'auto':
        max_features = 1.0

    # make pipeline
    clf = RandomForestRegressor(n_estimators=params['n_trees'],
                                max_features=max_features,
                                random_state=0,
                                oob_score=True,
                                max_depth=params['max_depth'],
                                min_samples_split=params['min_samples_split'],
                                min_samples_leaf=params['min_samples_leaf']
                               )
    # threshold=-np.inf disables the importance cut-off, so the selector keeps
    # the `n_features_to_select` most important features. The final step is a
    # fresh clone of the same forest configuration.
    pipe = make_pipeline(StandardScaler(),
                         SelectFromModel(clf,
                                         max_features=params['n_features_to_select'],
                                         threshold=-np.inf),
                         clone(clf))
    pipe.fit(xdata_train, ydata_train)

    # pipe[-1] is the final forest (the clone), fitted on the selected features
    return list([params['scfa'],
                 params['n_features_to_select'],
                 params['n_trees'],
                 params['max_features'],
                 params['max_depth'],
                 params['min_samples_split'],
                 params['min_samples_leaf'],
                 pipe[-1].oob_score_])
       
#     # convert to panda
#     df_tmp = pd.DataFrame.from_dict(params_exp, orient='index').T
#     if df_oob is None:
#         df_oob = deepcopy(df_tmp)
#     else:
#         df_oob = pd.concat([df_oob, df_tmp], ignore_index=True)


In [None]:
# Hyper-parameter values to scan. The grid below expands to
# 3 * 5 * 4 * 6 * 4 * 4 * 3 = 17,280 combinations, each passed to
# run_random_forest as one dict.
param_grid = dict(
    scfa=['Acetate', 'Butyrate', 'Propionate'],
    n_features_to_select=[2, 4, 8, 16, 32],
    n_trees=[1024, 2048, 4096, 8192],
    # 'auto' is the legacy alias for "all features" in forest regressors
    max_features=['auto', 'sqrt', 'log2', 0.16, 0.32, 0.64],
    max_depth=[2, 4, 8, 16],
    min_samples_split=[2, 4, 8, 16],
    min_samples_leaf=[1, 2, 4],
)

# Evaluate every combination on 12 joblib workers; one result row per combination.
param_scores = Parallel(n_jobs=12)(
    delayed(run_random_forest)(combo)
    for combo in ParameterGrid(param_grid)
)