#### Summer of Reproducibility - noWorkflow experiment

In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

Read the dataset

In [2]:
#now_tag('dataset_reading')
df = pd.read_csv('dataset/creditcard.csv', encoding='utf-8')

#### Separate the features and target variable

In [5]:
#now_tag('feature_eng')
X = df.drop('Class', axis=1)
y = df['Class']

#### Feature engineering: Apply random undersampling over the extracted features

Another case of feature engineering operation with hyperparameter definition. Here is random_state value for RandmUnderSampler


In [6]:
#now_tag('feature_eng')
#now_tag('hyperparam_def')
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

#### Feature engineering: Apply PCA for feature extraction

Here we define hyperparam_def tag given that n_components argument in PCA is required

In [7]:
#now_tag('feature_eng')
#now_tag('hyperparam_def')
pca = PCA(n_components=5)  # Adjust the number of components as needed
X_pca = pca.fit_transform(X_resampled)

#### Feature engineering: Spliting dataset into train and test

Here we have two hyperparameters assignments: the proportion of the test_size and the random_state. A guess here would be implement some logic to take all scalar values in hyperparam_def in cells. Not sure at the moment if there are any corner case where a hyperparameter could be vectorial or an object.

In [8]:
#now_tag('feature_eng')
#now_tag('hyperparam_def')
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_resampled, test_size=0.2, random_state=42)


#### Scoring: model training and transforming features into predictions
##### RandomForest

Train and evaluate Random Forest Classifier. Unsure now if adding a model_training tag would be redundant here. Scoring is enough at first sight.

In [30]:
#now_tag('scoring')
#now_tag('model_training')
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_resampled)

#### Evaluating: evaluating the performance of models
##### RandomForest
Computing performance metrics 

In [31]:
#now_tag('evaluating')
roc_rf = roc_auc_score(y_resampled, y_pred_rf)
f1_rf = f1_score(y_resampled, y_pred_rf)

print("Random Forest - ROC = %f, F1 = %f" % (roc_rf, f1_rf))

Random Forest - ROC = 0.981707, F1 = 0.981633


#### Scoring: model training and transforming features into predictions
##### XGBoost

In [32]:
#now_tag('scoring')
#now_tag('model_training')
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_resampled)


#### Evaluating: evaluating the performance of models
##### XGBoost
Computing performance metrics

In [33]:
#now_tag('evaluating')
roc_xgb = roc_auc_score(y_resampled, y_pred_xgb)
f1_xgb = f1_score(y_resampled, y_pred_xgb)

print("XGBoost - ROC = %f, F1 = %f" % (roc_xgb, f1_xgb))

XGBoost - ROC = 0.974593, F1 = 0.974411


#### Scoring: model training and transforming features into predictions
##### LightGBM

In [34]:
#now_tag('scoring')
#now_tag('model_training')
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_resampled)

#### Evaluating: evaluating the performance of models
##### LightGBM

In [35]:
#now_tag('evaluating')
roc_lgb = roc_auc_score(y_resampled, y_pred_lgb)
f1_lgb = f1_score(y_resampled, y_pred_lgb)

print("LightGBM - ROC = %f, F1 = %f" % (roc_lgb, f1_lgb))

LightGBM - ROC = 0.973577, F1 = 0.973306


#### Scoring: model training and transforming features into predictions
##### CatBoost

In [9]:
#now_tag('scoring')
#now_tag('model_training')
catboost_model = cat.CatBoostClassifier(logging_level='Silent')
catboost_model.fit(X_train, y_train)
y_pred_cbt = catboost_model.predict(X_resampled)

#### Evaluating: evaluating the performance of models
##### CatBoost

In [10]:
#now_tag('evaluating')
roc_cbt = roc_auc_score(y_resampled, y_pred_cbt)
f1_cbt = f1_score(y_resampled, y_pred_cbt)

print("CatBoost - ROC = %f, F1 = %f" % (roc_cbt, f1_cbt))

CatBoost - ROC = 0.502033, F1 = 0.667571
