In [1]:
# Importing libraries
import numpy as np
from data_driven.modeling.scripts.evaluation import performing_cross_validation, y_randomization
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import os
import pandas as pd

# 1. Accessing the data 

If you use Google Drive, you have to enable the Google Drive API and obtain the client secrets file `client_secrets.json` (see [this guide](https://developers.google.com/drive/api/v3/enable-drive-api) or [this other one](https://www.geeksforgeeks.org/get-list-of-files-and-folders-in-google-drive-storage-using-python/)).

In [2]:
# Connecting to Google Drive

## Locating the credentials: client_secrets.json is expected one level above the working directory
credentials_file = os.path.join(os.getcwd(), os.pardir, 'client_secrets.json')

## Registering the credentials file and the requested OAuth scope as the GoogleAuth defaults
GoogleAuth.DEFAULT_SETTINGS.update(
    client_config_file=credentials_file,
    oauth_scope=['https://www.googleapis.com/auth/drive'],
)


# Authenticating
gauth = GoogleAuth()
gauth.LocalWebserverAuth()  # runs the OAuth flow through the local browser
drive = GoogleDrive(gauth)

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=128273080156-gak8id4v61mj7jjdqqucv74kc5volhj0.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


If you need to check the contents of the folder where you stored the data for the multi-class classification algorithm, pass the folder ID to the function just below.

In [3]:
def ListFolder(parent_id):
    '''
    Function to recursively describe the content of a Google Drive folder

    Input:

        - parent_id: string = ID of the Google Drive folder to inspect

    Output:

        - Python list with one dictionary per non-trashed entry:
            * file: {"title": file title, "title1": file alternate link}
            * folder: {"id": folder ID, "title": folder title,
                       "list": listing of the sub-folder (same structure)}

    '''

    folder_mime_type = 'application/vnd.google-apps.folder'

    def describe(entry):
        # Plain files only expose their title and link
        if entry['mimeType'] != folder_mime_type:
            return {"title": entry['title'], "title1": entry['alternateLink']}
        # Sub-folders are expanded by listing them in turn
        return {"id": entry['id'], "title": entry['title'], "list": ListFolder(entry['id'])}

    # Direct children of the folder, leaving out anything in the trash
    query = {'q': "'%s' in parents and trashed=false" % parent_id}
    children = drive.ListFile(query).GetList()

    return [describe(entry) for entry in children]

In [4]:
# Google Drive ID of the folder that stores the multi-class classification data
multi_class_data_folder_id = '1hoUT3WQxblQJWm-8v-9EKLJZ4614JVtI'
# Listing the folder content (shown as the cell output)
ListFolder(multi_class_data_folder_id)

[{'title': '3.npz',
  'title1': 'https://drive.google.com/file/d/18KrIiGmRn90uCNchktAWDPRa8wl2qKyO/view?usp=drivesdk'},
 {'title': '2.npz',
  'title1': 'https://drive.google.com/file/d/1pAgEm3u5_spuniqBBfFta3EK2xPuLQc3/view?usp=drivesdk'},
 {'title': '1.npz',
  'title1': 'https://drive.google.com/file/d/1bGA6Zuvez9weG0sZglM6nFpphSMjdl8D/view?usp=drivesdk'},
 {'title': '.gitkeep',
  'title1': 'https://drive.google.com/file/d/1S2h6RPn7t_iIrcq-9KbaYWRfhpJuIW88/view?usp=drivesdk'}]

# 2. Building RFC with default parameters

In [5]:
# Data processing id -> Google Drive file ID of the matching .npz file
# (1.npz, 2.npz and 3.npz in the folder listing shown above)
ids = {1: '1bGA6Zuvez9weG0sZglM6nFpphSMjdl8D',
       2: '1pAgEm3u5_spuniqBBfFta3EK2xPuLQc3',
       3: '18KrIiGmRn90uCNchktAWDPRa8wl2qKyO'}

In [19]:
def build_base_model(id, key):
    '''
    Function to evaluate the RFC with its default parameters on one dataset

    The .npz file is fetched from Google Drive, saved as '<id>.npz' and its
    training arrays are passed to the cross-validation and Y-randomization
    routines.

    Input:
        - id: int = data processing id (also used to name the downloaded file)
        - key: string = key or id for the file in Google Drive

    Output:
        - df_result: dataframe = one row with the cross-validation results,
                                 the Y-randomization results and the id
    '''

    # Fetching the data from Google Drive
    npz_file = f'{id}.npz'
    drive.CreateFile({'id': key}).GetContentFile(npz_file)

    # Reading the training arrays out of the .npz
    with np.load(npz_file) as archive:
        X_train, Y_train = archive['X_train'], archive['Y_train']

    # Reporting the dimensions
    print(f'X train has the following dimensions: {X_train.shape}')
    print(f'Y train has the following dimensions: {Y_train.shape}')

    # Default parameters
    model_params = dict(
        bootstrap=True,
        ccp_alpha=0.0,
        class_weight='balanced',
        criterion='gini',
        max_depth=None,
        max_features='sqrt',
        max_leaf_nodes=None,
        max_samples=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1,
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        n_estimators=100,
        n_jobs=4,
        oob_score=False,
        random_state=0,
        verbose=0,
        warm_start=False,
    )

    n_samples = Y_train.shape[0]

    # Cross-validation first, then Y-randomization; each returned mapping
    # becomes a single-row dataframe with one column per reported entry
    evaluation_frames = []
    for evaluate in (performing_cross_validation, y_randomization):
        outcome = evaluate('RFC',
                           model_params,
                           X_train,
                           Y_train.reshape(n_samples,),  # labels handed over as a 1-D array
                           'multi-class classification')
        evaluation_frames.append(
            pd.DataFrame({name: [value] for name, value in outcome.items()})
        )

    # Side-by-side results, tagged with the data processing id
    df_result = pd.concat(evaluation_frames, axis=1)
    df_result['id'] = id

    return df_result

In [20]:
# Collecting one result frame per data preparation and concatenating once at the end
# (avoids re-copying the accumulated frame on every iteration and concatenating
# onto an empty DataFrame)
base_model_frames = []

for data_id, file_key in ids.items():

    print(f'Evaluating base model for data preparation id {data_id}')

    base_model_frames.append(build_base_model(data_id, file_key))

# pd.concat raises on an empty list, so fall back to an empty frame when ids is empty
df_results = (pd.concat(base_model_frames, axis=0, ignore_index=True)
              if base_model_frames else pd.DataFrame())

df_results

Evaluating base model for data preparation id 1
X train has the following dimensions: (486440, 82)
Y train has the following dimensions: (486440, 1)


5-fold cross validation: 100%|██████████| 5/5 [10:39<00:00, 127.97s/it]
Y-Randomization: 100%|██████████| 10/10 [59:38<00:00, 357.87s/it]


Evaluating base model for data preparation id 2
X train has the following dimensions: (486440, 83)
Y train has the following dimensions: (486440, 1)


5-fold cross validation: 100%|██████████| 5/5 [02:28<00:00, 29.70s/it]
Y-Randomization: 100%|██████████| 10/10 [07:30<00:00, 45.09s/it]


Evaluating base model for data preparation id 3
X train has the following dimensions: (21530, 451)
Y train has the following dimensions: (21530, 1)


5-fold cross validation: 100%|██████████| 5/5 [00:08<00:00,  1.70s/it]
Y-Randomization: 100%|██████████| 10/10 [00:16<00:00,  1.68s/it]


Unnamed: 0,mean_validation_accuracy,mean_train_accuracy,accuracy_analysis,mean_validation_f1,mean_train_f1,mean_validation_0_1_loss_or_error,std_validation_0_1_loss_or_error,y_randomization_mean_0_1_loss_or_error,y_randomization_std_0_1_loss_or_error,id
0,0.43,0.8,over-fitting (high variance),0.43,0.8,0.57,0.004,0.9,0.0,1
1,0.45,0.8,over-fitting (high variance),0.45,0.8,0.55,0.004,0.9,0.0,2
2,0.42,0.8,over-fitting (high variance),0.42,0.8,0.58,0.007483,0.9,0.005385,3
