In [1]:
# Importing libraries
from data_driven.modeling.scripts.evaluation import performing_cross_validation, y_randomization
from data_driven.modeling.scripts.tuning import parameter_tuning

import numpy as np
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import os
import pandas as pd
import re

# 1. Accessing the data 

If you use Google Drive, you have to enable the Google Drive API and obtain the client secrets file `client_secrets.json` (see the following [link](https://developers.google.com/drive/api/v3/enable-drive-api) or [this other one](https://www.geeksforgeeks.org/get-list-of-files-and-folders-in-google-drive-storage-using-python/)).

In [2]:
# Connecting to Google Drive

## The client secrets file is expected at <current working directory>/../client_secrets.json
credentials_file = os.path.join(os.getcwd(), os.pardir, 'client_secrets.json')

## Registering the credentials file and the requested scope in the pydrive settings
GoogleAuth.DEFAULT_SETTINGS.update({
    'client_config_file': credentials_file,
    'oauth_scope': ['https://www.googleapis.com/auth/drive'],
})


# Authenticating
gauth = GoogleAuth()
gauth.LocalWebserverAuth()  # opens the browser to complete the authentication
drive = GoogleDrive(gauth)

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=128273080156-gak8id4v61mj7jjdqqucv74kc5volhj0.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


If you need to check the folder where you have stored the data for running the multi-class classification algorithm, use the folder ID and the function just below.

In [3]:
def ListFolder(parent_id):
    '''
    Function to recursively list the content of a Google Drive folder

    Input:

        - parent_id: string = id of the Google Drive folder to list

    Output:

        - filelist: Python list of dictionaries. A file is described by its
          "title" and its link (stored under the key "title1"); a sub-folder
          is described by its "id", its "title" and the "list" with its own
          content

    '''

    def describe(entry):
        # Files keep title and link; sub-folders are expanded recursively
        if entry['mimeType'] != 'application/vnd.google-apps.folder':
            return {"title": entry['title'], "title1": entry['alternateLink']}
        return {"id": entry['id'], "title": entry['title'], "list": ListFolder(entry['id'])}

    # Only the non-trashed elements whose parent is the requested folder
    query = {'q': "'%s' in parents and trashed=false" % parent_id}

    return [describe(entry) for entry in drive.ListFile(query).GetList()]

In [4]:
# When you share a folder/file from Google Drive by using its shareable link you will see something like
# https://drive.google.com/drive/folders/{ID_element}?usp=sharing. You can take the ID_element
# and use it as the folder id below

multi_class_data_folder_id = '1hoUT3WQxblQJWm-8v-9EKLJZ4614JVtI'
ListFolder(multi_class_data_folder_id)

[{'title': '3.npz',
  'title1': 'https://drive.google.com/file/d/18KrIiGmRn90uCNchktAWDPRa8wl2qKyO/view?usp=drivesdk'},
 {'title': '2.npz',
  'title1': 'https://drive.google.com/file/d/1pAgEm3u5_spuniqBBfFta3EK2xPuLQc3/view?usp=drivesdk'},
 {'title': '1.npz',
  'title1': 'https://drive.google.com/file/d/1bGA6Zuvez9weG0sZglM6nFpphSMjdl8D/view?usp=drivesdk'},
 {'title': '.gitkeep',
  'title1': 'https://drive.google.com/file/d/1S2h6RPn7t_iIrcq-9KbaYWRfhpJuIW88/view?usp=drivesdk'}]

In [5]:
# Mapping {data preparation id: Google Drive file id} for every <number>.npz file in the folder
ids = {}
for file in ListFolder(multi_class_data_folder_id):
    # The dot is escaped and every digit is accepted, so names such as 10.npz are parsed too
    title_match = re.search(r'(\d+)\.npz$', file['title'])
    # Sub-folder entries carry no link ('title1'); the empty default makes them non-matching
    link_match = re.search(r'https://.*/d/([^/]+)/view', file.get('title1', ''))
    # Entries that do not follow both patterns are skipped instead of raising AttributeError
    if title_match and link_match:
        ids[title_match.group(1)] = link_match.group(1)

In [6]:
ids

{'3': '18KrIiGmRn90uCNchktAWDPRa8wl2qKyO',
 '2': '1pAgEm3u5_spuniqBBfFta3EK2xPuLQc3',
 '1': '1bGA6Zuvez9weG0sZglM6nFpphSMjdl8D'}

In [7]:
def open_dataset(id, key, test=False):
    '''
    Function to download a data set from Google Drive and load it:

    Input:
        - id: string = data processing id (also names the temporary local file {id}.npz)
        - key: string = key or id for the file in Google Drive
        - test: boolean = if the test set is required

    Output:
        - X_train, Y_train: numpy arrays (when test is False)
        - X_train, Y_train, X_test, Y_test: numpy arrays (when test is True)
    '''

    local_file = f'{id}.npz'

    # Getting the file from Google Drive
    f_data = drive.CreateFile({'id': key})

    try:
        f_data.GetContentFile(local_file)

        # Loading the .npz; the arrays are read when the archive is indexed
        with np.load(local_file) as data:

            X_train = data['X_train']
            Y_train = data['Y_train']
            # Checking the dimensions
            print(f'X train has the following dimensions: {X_train.shape}')
            print(f'Y train has the following dimensions: {Y_train.shape}')

            if test:

                X_test = data['X_test']
                Y_test = data['Y_test']

                # Checking the dimensions
                print(f'X test has the following dimensions: {X_test.shape}')
                print(f'Y test has the following dimensions: {Y_test.shape}')
    finally:
        # The temporary copy is removed once the archive is closed (an open file
        # cannot be deleted on Windows) and also when downloading/loading fails
        if os.path.exists(local_file):
            os.remove(local_file)

    if test:
        return X_train, Y_train, X_test, Y_test

    return X_train, Y_train

# 2. Building RFC with default parameters

In [None]:
def build_base_model(id, X_train, Y_train):
    '''
    Function to evaluate the RFC with baseline (untuned) parameters

    Input:
        - id: string = data processing id
        - X_train, Y_train: numpy arrays

    Output:
        - df_result: dataframe = one-row model evaluation under cross-validation
                                 and Y-randomization, labelled with the id
    '''

    def as_single_row(scores):
        # Every entry of the dictionary becomes a one-element column
        return pd.DataFrame({name: [score] for name, score in scores.items()})

    task = 'multi-class classification'

    # Baseline parameters
    baseline_params = dict(
        bootstrap=True,
        ccp_alpha=0.0,
        class_weight='balanced',
        criterion='gini',
        max_depth=None,
        max_features='sqrt',
        max_leaf_nodes=None,
        max_samples=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1,
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        n_estimators=100,
        n_jobs=4,
        oob_score=False,
        random_state=0,
        verbose=0,
        warm_start=False,
    )

    # Cross-validation (5-fold according to the original note; the number of
    # folds is not set here). The labels are passed as a 1-D array
    cv_scores = performing_cross_validation(
        'RFC', baseline_params, X_train,
        Y_train.reshape(Y_train.shape[0],), task)
    df_cv = as_single_row(cv_scores)

    # Y-randomization, with the same parameters and data
    yr_scores = y_randomization(
        'RFC', baseline_params, X_train,
        Y_train.reshape(Y_train.shape[0],), task)
    df_yr = as_single_row(yr_scores)

    # Both evaluations side by side, identified by the data processing id
    df_result = pd.concat([df_cv, df_yr], axis=1)
    df_result['id'] = id

    return df_result

In [None]:
# One result frame per data preparation id; they are concatenated once at the end
# instead of growing a DataFrame (starting from an empty one) inside the loop
base_model_results = []

for id, key in ids.items():

    print(f'Evaluating base model for data preparation id {id}\n')

    # Opening the dataset
    X_train, Y_train = open_dataset(id, key, test=False)

    # Y-randomization and cross-validation
    base_model_results.append(build_base_model(id, X_train, Y_train))

# pd.concat raises on an empty list, so an empty frame is kept when there are no ids
df_results = (pd.concat(base_model_results, axis=0, ignore_index=True)
              if base_model_results else pd.DataFrame())

In [None]:
df_results

# 3. Selection of best data preprocessing

As shown by the above results, the default RFC overfits for the three preprocessing pipelines. In addition, their performances are similar, with pipeline id 1 being a little worse than the others. Nonetheless, for Y-randomization and cross-validation, pipeline id 1 required around 3 minutes and 19 seconds. That means that this pipeline would require less tuning and prediction time for future serving models.

# 4. Tuning the model

In [8]:
# Opening the dataset for data preparation id '1', including the test set.
# The key is the Google Drive file id listed for 1.npz in the data folder
X_train, Y_train, X_test, Y_test = open_dataset('1', '1bGA6Zuvez9weG0sZglM6nFpphSMjdl8D', test=True)

X train has the following dimensions: (59070, 62)
Y train has the following dimensions: (59070, 1)
X test has the following dimensions: (8465, 62)
Y test has the following dimensions: (8465, 1)


In [9]:
# Parameters kept fixed during the tuning
model_params = dict(
    ccp_alpha=0.0,
    class_weight='balanced',
    criterion='gini',
    max_leaf_nodes=None,
    max_samples=None,
    min_impurity_decrease=0.0,
    min_weight_fraction_leaf=0.0,
    n_jobs=1,
    oob_score=False,
    random_state=0,
    verbose=0,
    warm_start=False,
)

# Search space for tuning: each fraction grid joins a coarse range and a fine one
params_for_tuning = dict(
    bootstrap=[True, False],
    min_samples_leaf=[*np.linspace(0.1, 0.5, 5), *np.linspace(0.0001, 0.05, 5)],
    min_samples_split=[*np.linspace(0.1, 1.0, 5), *np.linspace(0.0001, 0.05, 5)],
    max_features=['log2', 'sqrt', None],
    # n_estimators is currently left out of the search space:
    # 'n_estimators': list(np.linspace(50, 1000, 5, dtype=int)) + [100],
    max_depth=[*np.linspace(5, 50, 5, dtype=int), None],
)

In [10]:
# Tuning
# Y_train is reshaped from (n_samples, 1) to (n_samples,) before being passed on.
# NOTE(review): the output recorded for this cell ends with
# "ValueError: n_estimators must be greater than zero, got 0" -- confirm that the
# cell runs cleanly on a fresh kernel before relying on tuning_result
tuning_result = parameter_tuning(X_train, Y_train.reshape(Y_train.shape[0],),
                                             'RFC', model_params,
                                             params_for_tuning,
                                             'multi-class classification')

[2m[36m(_Trainable pid=425089)[0m 2022-02-05 14:06:28.138033: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
[2m[36m(_Trainable pid=425089)[0m 2022-02-05 14:06:28.138067: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[2m[36m(_Trainable pid=425088)[0m 2022-02-05 14:06:28.273566: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
[2m[36m(_Trainable pid=425088)[0m 2022-02-05 14:06:28.273598: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[2m[36m(_Trainable pid=425087)[0m 2022-02-05 14:06:28.334221: W tensorflo

RayTaskError(ValueError): [36mray::_Trainable.train_buffered()[39m (pid=425089, ip=212.128.174.62, repr=<tune_sklearn._trainable._Trainable object at 0x7f4bda324f10>)
  File "/home/jodhernandezbe/anaconda3/envs/PRTR/lib/python3.9/site-packages/ray/tune/trainable.py", line 255, in train_buffered
    result = self.train()
  File "/home/jodhernandezbe/anaconda3/envs/PRTR/lib/python3.9/site-packages/ray/tune/trainable.py", line 314, in train
    result = self.step()
  File "/home/jodhernandezbe/anaconda3/envs/PRTR/lib/python3.9/site-packages/tune_sklearn/_trainable.py", line 106, in step
    return self._train()
  File "/home/jodhernandezbe/anaconda3/envs/PRTR/lib/python3.9/site-packages/tune_sklearn/_trainable.py", line 196, in _train
    self._early_stopping_ensemble(i, estimator, X_train,
  File "/home/jodhernandezbe/anaconda3/envs/PRTR/lib/python3.9/site-packages/tune_sklearn/_trainable.py", line 153, in _early_stopping_ensemble
    estimator.fit(X_train, y_train)
  File "/home/jodhernandezbe/anaconda3/envs/PRTR/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 384, in fit
    self._validate_estimator()
  File "/home/jodhernandezbe/anaconda3/envs/PRTR/lib/python3.9/site-packages/sklearn/ensemble/_base.py", line 138, in _validate_estimator
    raise ValueError(
ValueError: n_estimators must be greater than zero, got 0.

In [20]:
# NOTE(review): the execution count jumps from 10 to 20 and the tuning cell above shows an
# error, so this output presumably comes from a later run -- re-run the notebook top to bottom
tuning_result

{'best_params': {'bootstrap': False,
  'min_samples_leaf': 0.0001,
  'min_samples_split': 0.012575,
  'max_features': None,
  'max_depth': None,
  'n_estimators': 100},
 'best_estimator': RandomForestClassifier(bootstrap=False, class_weight='balanced',
                        max_features=None, min_samples_leaf=0.0001,
                        min_samples_split=0.012575, n_jobs=1, random_state=0),
 'best_score': 0.34259353309632645}