#### Summer of Reproducibility - noWorkflow base experiment

This notebook implements an experimental setup modeling a Credit Fraud problem.

In [1]:
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import numpy as np
#np.set_printoptions(threshold=np.inf)
np.set_printoptions(precision=2)


from noworkflow.now.tagging.var_tagging import *

#### Reading the dataset

In [2]:
now_tag('dataset_reading')
df = pd.read_csv('dataset/creditcard.csv', encoding='utf-8')

### Feature engineering stage

Separate the features and target variable. First step in feature treatment.

In [3]:
#now_tag('feature_eng')
X = df.drop('Class', axis=1)
y = df['Class']

#### Feature engineering: Apply PCA for feature extraction.

Here we define hyperparam_def tag given that n_components argument in PCA is required

In [4]:
pca_components = now_variable('pca_components', 6)
pca = PCA(n_components=pca_components)  # Adjust the number of components as needed
X_pca = pca.fit_transform(X)

Evaluation(id=40, checkpoint=38.49172881, code_component_id=1171, activation_id=37, repr=6)


#### Feature engineering: Apply random undersampling over the extracted features

Another case of feature engineering operation with hyperparameter definition. Here is random_state value for RandmUnderSampler


In [5]:
random_seed = now_variable('random_seed', 123456)
rus = RandomUnderSampler(random_state=random_seed)
X_resampled, y_resampled = rus.fit_resample(X_pca, y)

Evaluation(id=58, checkpoint=39.666666252, code_component_id=1204, activation_id=55, repr=123456)


#### Feature engineering: Spliting dataset into train and test

Here we have two hyperparameters assignments: the proportion of the test_size and the random_state. A guess here would be implement some logic to take all scalar values in hyperparam_def in cells. Not sure at the moment if there are any corner case where a hyperparameter could be vectorial or an object.

In [6]:
now_tag('feature_eng')
test_dim = now_variable('test_dim', 0.2)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=test_dim, random_state=random_seed)

Evaluation(id=80, checkpoint=39.965805788, code_component_id=1246, activation_id=74, repr=0.2)


#### Scoring: model training and transforming features into predictions
##### RandomForest

Train and evaluate Random Forest Classifier. Unsure now if adding a model_training tag would be redundant here. Scoring is enough at first sight.

In [7]:
#now_tag('scoring')
now_tag('model_training')
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

#### Evaluating: evaluating the performance of models
##### RandomForest
Computing performance metrics 

In [8]:
now_tag('evaluating')
y_pred_rf = rf.predict(X_test)

roc_rf = now_variable('roc_rf', roc_auc_score(y_test, y_pred_rf))
#roc_rf = roc_auc_score(y_test, y_pred_rf)
f1_rf = now_variable('f1_rf', f1_score(y_test, y_pred_rf))
#f1_rf = f1_score(y_test, y_pred_rf)

print("Random Forest - ROC = %f, F1 = %f" % (roc_rf, f1_rf))

Evaluation(id=124, checkpoint=40.567982439, code_component_id=1323, activation_id=110, repr=0.9447428217310474)
Evaluation(id=133, checkpoint=40.570411600999996, code_component_id=1339, activation_id=110, repr=0.9424083769633509)
Random Forest - ROC = 0.944743, F1 = 0.942408


### Experiment comparision

The steps are:
1. calls get_pre for a given tagged variable and keeps the operations_dictionary output
2. calls store operations() to store the dict into a shelve object with current trial_id key
3. load the shelve object to retrieve other stored experiment as well the current one
4. calls exp_compare passing two trial ids as argumens to make a comparision



In [9]:
ops_dict, dep_list = get_pre('roc_rf')

In [10]:
id_1 = __noworkflow__.trial_id
store_operations(id_1, ops_dict)

Dictionary stored in shelve.


In [11]:
import shelve
shelf = shelve.open('ops')
list_id = list(shelf.keys())
list_id

['fa369ec4-b11e-43b2-ac8a-6eb9b6eec13c',
 'adf942da-daa2-4899-97e3-e94122d03f6a',
 '8bfca706-7c71-45d8-8f5b-6506a06927cc',
 '7a7b9e5b-ad79-44fb-9b33-b818aecc9407',
 '595dc841-cc3c-45d4-87ba-a63b94243706',
 '1e09f81d-af9a-4e5b-bf2e-9b7177eb6b63',
 'baefc258-9e43-4583-a52e-9aa87ffa50c0',
 '2248d786-62d7-4137-99a3-c30fdb72ac82']

In [12]:
exp_compare(list_id[-1], list_id[0], html=True)

0,1,2,3,4,5
f,1,"y_test, matrix dim(137,)",f,1,"y_test, matrix dim(137,)"
,2,"RandomForestClassifier(), matrix dim(24,)",,2,"RandomForestClassifier(), matrix dim(24,)"
,3,"rf, matrix dim(24,)",,3,"rf, matrix dim(24,)"
n,4,"X_resampled, matrix dim(264,)",n,4,"X_resampled, matrix dim(462,)"
,5,"RandomUnderSampler(random_state=random_seed), matrix dim(39,)",,5,"RandomUnderSampler(random_state=random_seed), matrix dim(39,)"
,6,"rus, matrix dim(39,)",,6,"rus, matrix dim(39,)"
n,7,"now_variable('pca_components', 3), 3",n,7,"now_variable('pca_components',6), 6"
,8,"pca_components, 3",,8,"pca_components, 6"
,9,"PCA(n_components=pca_components), PCA(n_components=3)",,9,"PCA(n_components=pca_components), PCA(n_components=6)"
,10,"pca, PCA(n_components=3)",,10,"pca, PCA(n_components=6)"


In [18]:
def dict_compare(trial_a, trial_b):
    import shelve
    import numpy as np
    from IPython.display import HTML
    
    comp_dict = {}
    # Retrieve the ops dictionary from the shelve file
    with shelve.open('ops') as shelf:
        dict1 = shelf[trial_a]
        dict2 = shelf[trial_b]
    
    if len(dict1) == len(dict2):

        for key in dict1:
            value1 = dict1[key]
            value2 = dict2[key]

            if isinstance(value1, np.ndarray) and isinstance(value2, np.ndarray):
                # If both values are NumPy arrays, compare if they are equal
                if np.array_equal(value1, value2):
                    comp_dict[value1[0]] = 'equal matrices'
                else:
                    comp_dict[value1[0]] = 'different matrices'
                
            elif value1 != value2:
                # If one or both values are scalars, compare their equality
                comp_dict[value1[0]] = 'different values'
            else:
                comp_dict[value1[0]] = 'equal values'
    
    return comp_dict

In [19]:
dict_compare(list_id[-1], list_id[0])

{'y_test': 'equal values',
 'RandomForestClassifier()': 'equal values',
 'rf': 'equal values',
 'X_resampled': 'different values',
 'RandomUnderSampler(random_state=random_seed)': 'equal values',
 'rus': 'equal values',
 "now_variable('pca_components', 3)": 'different values',
 'pca_components': 'different values',
 'PCA(n_components=pca_components)': 'different values',
 'pca': 'different values',
 'X': 'equal values',
 'X_pca': 'different values',
 'df': 'equal values',
 "df['Class']": 'equal values',
 'y': 'equal values',
 'y_resampled': 'equal values',
 "now_variable('test_dim', 0.2)": 'equal values',
 'test_dim': 'equal values',
 "now_variable('random_seed', 123456)": 'equal values',
 'random_seed': 'equal values',
 'train_test_split(X_resampled, y_resampled, test_size=test_dim, random_state=random_seed)': 'different values',
 'X_test': 'different values',
 'y_pred_rf': 'equal values',
 'roc_auc_score(y_test, y_pred_rf)': 'different values'}