In [1]:
import pandas as pd
import boto3
import botocore
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

## Define functions

In [2]:
def final_cleaning(ids, target, train, test=None, random_state=None):
    """Shuffle, split, impute and standardise a training frame (and optionally a test frame).

    Parameters
    ----------
    ids : str or list/tuple of str
        Identifier column(s) to drop from the features. The argument is
        never modified.
    target : str
        Name of the label column; dropped from the features and returned
        separately as a NumPy array.
    train : pandas.DataFrame
        Training data containing the id column(s), the target and the features.
    test : pandas.DataFrame, optional
        Held-out data with the same columns. It is transformed with the
        imputer and scaler fitted on ``train`` only.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` for the row shuffles. The
        default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` when ``test`` is None, otherwise
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Build a fresh list of columns to drop; appending to `ids` directly would
    # leak `target` into the caller's list and grow it on every call.
    if isinstance(ids, (list, tuple)):
        drop_cols = [*ids, target]
    else:
        drop_cols = [ids, target]

    # Shuffle
    train = train.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Split features and labels
    X_train = train.drop(columns=drop_cols)
    y_train = np.array(train[target].tolist())

    # Impute missing values with the per-column median learnt from train
    imputer = SimpleImputer(strategy='median')
    X_train = imputer.fit_transform(X_train)

    # Scale each feature to have mean 0 and std dev of 1
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    # `if test:` would raise "truth value of a DataFrame is ambiguous", so
    # compare against None explicitly.
    if test is None:
        return X_train, y_train

    test = test.sample(frac=1, random_state=random_state).reset_index(drop=True)
    X_test = test.drop(columns=drop_cols)
    y_test = np.array(test[target].tolist())

    # Re-use the statistics fitted on train; never re-fit on test data.
    X_test = scaler.transform(imputer.transform(X_test))
    return X_train, X_test, y_train, y_test

In [3]:
def run_random_search(model, random_grid, scoring, cv, n_iter, X_train, y_train):
    """Fit a RandomizedSearchCV over ``random_grid`` and return the fitted search.

    ``model`` is the estimator to tune, ``random_grid`` the parameter
    distributions to sample from, ``scoring`` and ``cv`` are forwarded
    unchanged, and ``n_iter`` is the number of sampled candidates.
    Progress messages and the best cross-validated score are printed.
    The sampler seed is fixed at 8 and all cores are used (``n_jobs=-1``).
    """
    print('--> Model defined')

    search_options = dict(
        estimator=model,
        scoring=scoring,
        param_distributions=random_grid,
        n_iter=n_iter,
        cv=cv,
        verbose=0,
        random_state=8,
        n_jobs=-1,
        return_train_score=True,
    )
    searcher = RandomizedSearchCV(**search_options)
    print('--> Random search defined')

    # Run the cross-validated search on the training data
    searcher.fit(X_train, y_train)
    print('--> Fitting done')

    # Report the best mean cross-validation score found
    print('--> Best CV Score: ', searcher.best_score_)

    return searcher

## Import Data & Clean

In [4]:
# Fetch the training extract from S3 into the working directory, then load it.
train_csv = 'acute_respiratory_failure_train.csv'
s3 = boto3.resource('s3')
s3.Object('mimic-jamesi', train_csv).download_file(train_csv)
# The first CSV column is used as the DataFrame index.
train = pd.read_csv(train_csv, index_col=0)

In [5]:
# Drop the identifier columns, split off the label, then impute and scale the features.
X_train, y_train = final_cleaning(
    ids=['subject_id', 'hadm_id'],
    target='target',
    train=train,
)
print('--> Cleaning done')

--> Cleaning done


## Run RF test

In [6]:
# Number of random hyperparameter combinations to sample; passed as `n_iter`
# to run_random_search in the search cell below.
iterations = 2

In [7]:
# Candidate values for each random-forest hyperparameter
n_estimators = [*np.arange(20, 3000, 5)]
# From 2 up to (but not including) the number of feature columns
max_features = [*np.arange(2, X_train.shape[1])]
# Depths 1..99, plus None as an extra candidate
max_depth = [*np.arange(1, 100), None]
min_samples_split = [*np.arange(2, 250)]
min_samples_leaf = [*np.arange(1, 250)]
bootstrap = [True, False]

# Assemble the distributions sampled by the randomized search
rf_random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

print('--> Grid defined')

--> Grid defined


In [8]:
%timeit
# Run the random search model
rf_random_search_model = run_random_search(model=RandomForestRegressor(), random_grid=rf_random_grid,
                                        scoring='roc_auc', cv=4, n_iter=iterations, 
                                        X_train=X_train, y_train=y_train)

--> Model defined
--> Random search defined
--> Fitting done
--> Best CV Score:  0.7997943444619462


In [10]:
# Inspect the raw per-candidate results recorded by the search
# (fit/score timings and the sampled parameter values).
rf_random_search_model.cv_results_

{'mean_fit_time': array([433.10917705, 113.91217208]),
 'std_fit_time': array([5.1546215 , 0.79263555]),
 'mean_score_time': array([1.46421957, 0.64447302]),
 'std_score_time': array([0.11543866, 0.00889725]),
 'param_n_estimators': masked_array(data=[2880, 1010],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[232, 131],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[12, 136],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(data=[22, 29],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[96, 16],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_bootstrap': masked_array(data=[False, True],
              mask=[False, False],

In [18]:
%timeit run_random_search(model=RandomForestRegressor(), random_grid=rf_random_grid, scoring='roc_auc', cv=4, n_iter=1, X_train=X_train, y_train=y_train)

--> Model defined
--> Random search defined


KeyboardInterrupt: 