In [None]:
%pip install comet_ml --quiet
%pip install scikit-learn==0.24.2 --quiet 
%pip install imbalanced-learn --upgrade --quiet

In [None]:
import pandas as pd 
import numpy as np
import comet_ml

from comet_ml import Experiment, Artifact
import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

from sklearn.metrics import fbeta_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier, RandomForestClassifier

comet_ml.init()

COMET INFO: Comet API key is valid


In [None]:
# Open a short-lived experiment whose only job is to fetch the
# `full_data_model` artifact into the current directory.
experiment = Experiment(workspace='team-comet-ml', project_name='fraud-detection-demo')
try:
    logged_artifact = experiment.get_artifact("full_data_model")
    logged_artifact.download(path = './')
finally:
    # Close the experiment even when the lookup or download raises,
    # so a failed cell does not leave a live experiment behind.
    experiment.end()

COMET INFO: Couldn't find a Git repository in '/content' and lookings in parents. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/team-comet-ml/fraud-detection-demo/3383fb36d98646df8828d40c20bd96e2

COMET INFO: Artifact 'team-comet-ml/full_data_model:1.0.0' download has been started asynchronously
COMET INFO: Still downloading 1 file(s), remaining 75.17 MB/75.17 MB
COMET INFO: Artifact 'team-comet-ml/full_data_model:1.0.0' has been successfully downloaded
COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/team-comet-ml/fraud-detection-demo/3383fb36d98646df8828d40c20bd96e2
COMET INFO:   Downloads:
COMET INFO:     artifact assets : 1 (75.17 MB)
COMET INFO:     artifacts       : 

In [None]:
# Read the full dataset from the local parquet file
full_data = pd.read_parquet('full_data_model.parquet.gzip')

# Features: everything except the target column and `nameDest`
X = full_data.drop(columns=['isFraud', 'nameDest'])

# Target: kept as a one-column DataFrame
y = pd.DataFrame(full_data['isFraud'])

# 75/25 split of the unbalanced data, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [None]:
# Each split paired with the name it is saved under
datas = [
    (X_train, 'X_train'),
    (X_test, 'X_test'),
    (y_train, 'y_train'),
    (y_test, 'y_test'),
]

# Specification of every artifact this notebook logs to Comet.
# Outer key: artifact name. Inner keys, as read by `artifact_logger`:
#   'df'       - path of the parquet file to upload
#   'type'     - artifact type
#   'alias'    - list of aliases attached to the artifact
#   'metadata' - free-form metadata stored with the artifact
data_artifacts = {
    'X_train':{'df':'X_train.parquet.gzip',
               'type':'training data',
               'alias':['x-train-unbalanced'],
               'metadata':{'filetype':'parquet', 'notes':'75/25 split, unbalanced, stratified'}
    },
    'X_test':{'df':'X_test.parquet.gzip',
               'type':'testing data',
               'alias':['x-test'],
               'metadata':{'filetype':'parquet', 'notes':'features for test data'}
    },
    'y_train':{'df':'y_train.parquet.gzip',
               'type':'training data',
               'alias':['y-train-unbalanced'],
               'metadata':{'filetype':'parquet', 'notes':'75/25 split, unbalanced, stratified'}
    },
    'y_test':{'df':'y_test.parquet.gzip',
               'type':'testing data',
               'alias':['y-test'],
               # spelling fixed: this note is stored with the artifact on Comet
               'metadata':{'filetype':'parquet', 'notes':'testing labels'}
    },
    'X_train_smt':{'df':'X_train_smt.parquet.gzip',
               'type':'training data sampled using SMOTETomek',
               'alias':['x-train-smt'],
               'metadata':{'filetype':'parquet', 'notes':'resampled using SMOTETomek'}
    },
    'y_train_smt':{'df':'y_train_smt.parquet.gzip',
               'type':'upsampled training data',
               'alias':['y-train-smt'],
               'metadata':{'filetype':'parquet', 'notes':'resampled using SMOTETomek'}
    }
}

def compress_df(df, filename):
    """Write a DataFrame to disk as a gzip-compressed parquet file.

    Args:
        df (pd.DataFrame): the DataFrame to write out
        filename (str): file name stem; the suffix `.parquet.gzip` is
                        appended to it to form the output path.
    """
    suffix = '.parquet.gzip'
    target_path = filename + suffix
    df.to_parquet(target_path, compression='gzip')


def artifact_logger(artifact_dict: dict, key: str, ws: str, exp_name: str, exp_tag: str):
    """Log one file to Comet as an Artifact.

    A new Experiment named ``log_artifact_<key>`` is created, tagged with
    `exp_tag`, used to log the artifact, and then ended.

    Args:
        artifact_dict (dict): maps artifact names to spec dictionaries with
            the keys 'df' (path of the file to upload), 'type', 'alias'
            and 'metadata'
        key (str): name of the artifact; selects the entry of
            `artifact_dict` to log
        ws (str): Workspace name
        exp_name (str): Comet project the experiment is created in
        exp_tag (str): Experiment tag

    Raises:
        KeyError: if `key` is not in `artifact_dict`, or its spec lacks one
            of the required fields.
    """
    # Resolve the spec first so a bad key fails before an experiment is created
    spec = artifact_dict[key]

    experiment = Experiment(workspace=ws, project_name=exp_name)
    try:
        experiment.add_tag(exp_tag)
        experiment.set_name('log_artifact_' + key)
        artifact = Artifact(name=key,
                            artifact_type=spec['type'],
                            aliases=spec['alias'],
                            metadata=spec['metadata'])
        artifact.add(spec['df'])
        experiment.log_artifact(artifact)
    finally:
        # Always close the experiment, even when logging fails part-way
        experiment.end()


# Write every train/test split to disk as a parquet file
for df, filename in datas:
    compress_df(df, filename)


# Log the train/test files to Comet as artifacts; the SMOTETomek
# entries are skipped by this loop
resampled_keys = ['X_train_smt', 'y_train_smt']
for key in data_artifacts:
    if key in resampled_keys:
        continue
    artifact_logger(
        data_artifacts,
        key,
        ws='team-comet-ml',
        exp_name='fraud-detection-demo',
        exp_tag="train-test-split",
    )

COMET INFO: Couldn't find a Git repository in '/content' and lookings in parents. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/team-comet-ml/fraud-detection-demo/a5c4cd6986be41f0accadedfbd22a0db

COMET INFO: Artifact 'X_train' version 3.0.0 created (previous was: 2.0.0)
COMET INFO: Scheduling the upload of 1 assets for a size of 107.00 MB, this can take some time
COMET INFO: Artifact 'team-comet-ml/X_train:3.0.0' has started uploading asynchronously
COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/team-comet-ml/fraud-detection-demo/a5c4cd6986be41f0accadedfbd22a0db
COMET INFO:   Others:
COMET INFO:     Name : log_artifact_X_train
COMET INFO:   Uploads:
COMET INFO:     arti

# Sampling data

Imbalanced data is a pain to work with.

`Why?`

Because machine learning techniques typically fail in these scenarios, and if they don't fail... it's possible that you may observe misleadingly optimistic performance with your classification model. 

`Why is that?`

Well, many classification algorithms are designed for situations where you have an equal number of observations for each class. 

If this is not the case, then algorithms behave in a way where the few minority class examples are not considered "important" and are ignored in order to achieve good performance.

`When this happens, the result is a model that has poor predictive performance, specifically for the minority class.` 

And in most cases the minority class is one you care about the most! Because the majority class often reflects a normal case, whereas the minority class represents a positive case for a diagnostic, fault, fraud, or other types of exceptional circumstance.

A popular solution to the problem of class imbalance is to change the composition of the training dataset. Instead of banging our head against the wall and trying to build a model to deal with the imbalance, we can balance the class frequencies. 

There are a number of sampling methods available, I'll list just a few of them for you here.

### Oversampling

Oversampling methods basically create "fake" (maybe synthetic is a better word?) examples of the minority class from actual examples of the minority class present in your data.

Some of the more widely used and implemented oversampling methods include:

 - Random Oversampling
 - Synthetic Minority Oversampling Technique (SMOTE)
 - Borderline-SMOTE
 - Borderline Oversampling with SVM
 - Adaptive Synthetic Sampling (ADASYN)

One of the most widely used oversampling methods is called `SMOTE` (Synthetic Minority Oversampling Technique). 

At a high-level, this is what SMOTE does: 


 - Selects a minority class instance `a` at random and finds its k nearest minority class neighbors. 
 
 - Chooses one of those k nearest neighbors, `b`, at random.
 
 - Connects `a` and `b` to form a line segment in the feature space. 

 - Creates the synthetic instance as a convex combination of the two chosen instances `a` and `b`, i.e. a point on that line segment.

Basically: This works by examining examples which are close in the feature space, drawing a line between the examples in the feature space, and creating a new sample as a point along that line.

### Undersampling

Undersampling methods pretty much do the opposite of oversampling. They delete or select a subset of examples from the majority class.

Some widely used undersampling methods include:

 - Random Undersampling
 - Condensed Nearest Neighbor Rule (CNN)
 - Near Miss Undersampling
 - Tomek Links Undersampling
 - Edited Nearest Neighbors Rule (ENN)
 - One-Sided Selection (OSS)
 - Neighborhood Cleaning Rule (NCR)

The most widely used deletion undersampling approach is called `Tomek Links`, which was originally developed as part of an extension to the Condensed Nearest Neighbors rule. 

A `Tomek Link` refers to a pair of examples in the training dataset that are each other's nearest neighbors - that is, they have the minimum distance in feature space - and belong to different classes. Tomek Links are often misclassified examples found along the class boundary, and the examples in the majority class are deleted.

### Combining techniques

Used individually, oversampling or undersampling methods are pretty effective. But [combining them together](https://imbalanced-learn.org/stable/combine.html#combine) can often result in better overall model performance.

We're going to sample our data using `SMOTE + Tomek Links`.

`Why this combination?`

`SMOTE` works by synthesizing new plausible examples from the minority class.  `Tomek Links` identifies pairs of nearest neighbors in a dataset that have different classes. Removing one or both of the examples in these pairs - such as the examples in the majority class -  has the effect of creating a less noisy or ambiguous decision boundary.

We'll use the [`imblearn`](https://imbalanced-learn.org/stable/references/index.html) library in python to perform this sampling technique, specifically the
[`SMOTETomek`](https://imbalanced-learn.org/stable/references/generated/imblearn.combine.SMOTETomek.html) method. We'll then log the resulting data to Comet as an Artifact.

Keep in mind that we'll only apply the sampling methods on the training data.




In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as imbpipe

# Combined sampler: SMOTE for oversampling, Tomek-link removal
# restricted to the majority class
smote_step = SMOTE(sampling_strategy=.35, random_state=42, n_jobs=-1)
tomek_step = TomekLinks(sampling_strategy='majority', n_jobs=-1)
smt = SMOTETomek(smote=smote_step, tomek=tomek_step, random_state=42, n_jobs=-1)

# Only the training split is resampled
X_smt, y_smt = smt.fit_resample(X_train, y_train)

sampled_data = [
    (X_smt, 'X_train_smt'),
    (y_smt, 'y_train_smt'),
]

# Save the resampled training sets to disk as parquet files
for df, filename in sampled_data:
    compress_df(df, filename)

# Log the resampled training sets to Comet as artifacts
smt_keys = [name for name in data_artifacts if name in ['X_train_smt', 'y_train_smt']]
for key in smt_keys:
    artifact_logger(
        data_artifacts,
        key,
        ws='team-comet-ml',
        exp_name='fraud-detection-demo',
        exp_tag="SMOTETomek-sampling",
    )

COMET INFO: Couldn't find a Git repository in '/content' and lookings in parents. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/team-comet-ml/fraud-detection-demo/f97ecff305a24c5e8e6c4b42bf8b5cc9

COMET INFO: Artifact 'X_train_smt' version 1.0.0 created
COMET INFO: Scheduling the upload of 1 assets for a size of 138.10 MB, this can take some time
COMET INFO: Artifact 'team-comet-ml/X_train_smt:1.0.0' has started uploading asynchronously
COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/team-comet-ml/fraud-detection-demo/f97ecff305a24c5e8e6c4b42bf8b5cc9
COMET INFO:   Others:
COMET INFO:     Name : log_artifact_X_train_smt
COMET INFO:   Uploads:
COMET INFO:     artifact asset

# Spot checking and baselining

The reason we spot check a suite of algorithms for our problem is primarily to determine whether we can even solve this problem using machine learning. Whatever results we obtain here will end up serving as a basis for comparison for any more complex model we build.

But how will we know if our results are any good or if machine learning is even appropriate for the problem?

Well, we need a meaningful reference point for comparison.  

We call this a baseline. The baseline model is the simplest possible model. In some cases it can be a random result, and in others the most common prediction. It serves the purpose of providing a point of comparison for any advanced methods that we test out later on in our process.

It's a simple, yet powerful idea. 

Once we've established a baseline model, we can add or change the features, test various algorithms, experiment with the parameters of the algorithms, and through this process we can determine whether our efforts are getting us any closer to an improved solution to our problem.

This experimental part of machine learning is, in my opinion, the most fun and creative aspect of it all. And it's where a tool like [Comet](https://tinyurl.com/trycomet) becomes invaluable, because it helps you keep track of all the experiments you're running so you can focus on building the best model for your use case.

## Evaluation metric

We're gonna use Average Precision as the primary evaluation metric for our model (with F2 score as a secondary metric).

Why?

Because average precision will tell us whether our model correctly identifies all the true positive examples without accidentally marking too many negative examples as positive (ie, committing too many false positives).

Average precision will be higher when our model can correctly handle positives - which is what we care most about. It's calculated as the area under the precision-recall curve (AUPRC) and measures the trade off between precision and recall at different decision thresholds.

Here's an illustrative example to help us understand what the number means: A random classifier will have an average precision equal to the fraction of positives in the dataset - that is, (# positive examples / total # examples).

If a dataset consists of 8% cancer examples and 92% healthy examples, the baseline AUPRC is 0.08, so obtaining an AUPRC of 0.40 in this scenario is good! A perfect classifier has an average precision of 1.0. 

Because we also care about not having too many False Negatives, we will use the `F2 score` as a secondary evaluation metric. F2 places lower importance on precision and more importance on recall. Since maximizing precision implies that we have minimized false positives, and maximizing recall implies that we have minimized false negatives, then the F2 puts more attention on minimizing false negatives than minimizing false positives.


## Dummy Classifier

Our baseline here will be the `DummyClassifier`.

We'll also test out a few other algorithms, and pick the one which performs best out-of-the-box as the starting point for further optimization.

The `scikit-learn` implementation of the `DummyClassifier` is a classifier which makes predictions using simple rules. This classifier is useful as a simple baseline to compare with other (real, more complex) classifiers. 

We'll then compare the `DummyClassifier` against some more complex models, selecting the model with the best initial performance for further optimization.

I encourage you to take the template below and experiment with some other models on your own. Some things I would try if I were you:

 - XGBoost
 - AdaBoost with base classifier as a decision tree with depth 2
 - AdaBoost with a base classifier as LinearSVC
 - ExtraTrees classifier

I'd love to see what you come up with, so [swing by our community Slack channel](https://bit.ly/comet-community) and show off your work!

In [None]:
# Baseline: predictions drawn with the 'stratified' strategy
dummy = DummyClassifier(strategy='stratified', random_state=42)

# Each candidate model is a two-step pipeline: StandardScaler, then the estimator
lr_pipe = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('lr', LogisticRegression(random_state=42, solver='saga', penalty='l1', n_jobs=-1)),
])

ada_pipe = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('ada', AdaBoostClassifier(random_state=42)),
])

hist_pipe = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('hist', HistGradientBoostingClassifier(random_state=42)),
])

rf_pipe = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42, max_depth=4, n_jobs=-1)),
])

# Experiment name -> estimator to spot check
classifier_dict = dict(
    DummyClassifier=dummy,
    LogisticRegression=lr_pipe,
    AdaBoostClassifier=ada_pipe,
    HistGradientBoostingClassifier=hist_pipe,
    RandomForestClassifier=rf_pipe,
)


def baseline_classifier(classifier, algo_name, X_train, y_train, X_test, y_test, ws, exp_name, exp_tag, threshold=0.80):
    """Fit a classifier, score it on the test set and log the run to Comet.

    Args:
        classifier: estimator exposing `fit` and `predict_proba`
        algo_name (str): used as the experiment name and logged as the
            `algorithm` parameter
        X_train: training features
        y_train: training labels; flattened to 1-D before fitting
        X_test: test features
        y_test: test labels; flattened to 1-D before scoring
        ws (str): Comet workspace name
        exp_name (str): Comet project the experiment is created in
        exp_tag (str): tag added to the experiment
        threshold (float): business rule - a case is classified as fraud
            when the predicted probability of class 1 is greater than or
            equal to this value. Defaults to 0.80.
    """
    #Comet experiment code
    experiment = Experiment(workspace=ws, project_name=exp_name)
    try:
        experiment.add_tag(exp_tag)
        experiment.set_name(algo_name)

        #Flatten labels to 1-D arrays. Single-column DataFrames trigger
        #sklearn's DataConversionWarning, and appear to be what caused
        #"Error creating confusion matrix: 0" in earlier runs.
        y_train_1d = np.ravel(y_train)
        y_test_1d = np.ravel(y_test)

        #Fit classifier to training data
        clf = classifier
        clf.fit(X_train, y_train_1d)

        #Predict class probabilities and grab probability
        #of Class 1, which is fraud class
        y_proba = clf.predict_proba(X_test)[:, 1]

        #Apply business rule: classify as fraud when the probability
        #of Class 1 reaches the threshold
        y_pred = np.where(y_proba >= threshold, 1, 0)

        #F2 is a thresholded metric, so it is computed from the hard labels
        f_beta = fbeta_score(y_test_1d, y_pred, beta=2)

        #Average precision summarises the precision-recall curve over all
        #thresholds, so it must be given the probabilities rather than
        #the already-thresholded labels
        avg_precision = average_precision_score(y_test_1d, y_proba)

        params = {"algorithm": algo_name}

        #Comet experiment metadata.
        #NOTE: the key spelling "percision" is kept on purpose so new runs
        #stay comparable with the metric name of runs already logged.
        metrics = {"f_beta": f_beta,
                "average_percision_score": avg_precision,
                }

        #Comet experiment code
        experiment.log_parameters(params)
        experiment.log_metrics(metrics)
        experiment.log_confusion_matrix(y_test_1d, y_pred)
    finally:
        #Close the experiment even if fitting or scoring fails
        experiment.end()

In [None]:
# Run one Comet experiment per candidate, trained on the SMOTETomek
# resampled data and scored on the untouched test split
for key in classifier_dict:
    baseline_classifier(
        classifier_dict[key],
        key,
        X_smt,
        y_smt,
        X_test,
        y_test,
        'team-comet-ml',
        'fraud-detection-demo',
        'baseline-spot-checks',
    )

COMET INFO: Couldn't find a Git repository in '/content' and lookings in parents. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/team-comet-ml/fraud-detection-demo/3b847117adb640bf97454d9cd4762e68

COMET ERROR: Error creating confusion matrix: 0; ignoring
COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/team-comet-ml/fraud-detection-demo/3b847117adb640bf97454d9cd4762e68
COMET INFO:   Metrics:
COMET INFO:     average_percision_score : 0.0029510427678951516
COMET INFO:     f_beta                  : 0.013926167348329925
COMET INFO:   Others:
COMET INFO:     Name : DummyClassifier
COMET INFO:   Parameters:
COMET INFO:     algorithm    : DummyClassifier
COMET INFO:     constant 

In [None]:
# Open an experiment only to render the Comet project view inline
experiment = Experiment(workspace='team-comet-ml',project_name='fraud-detection-demo')
try:
    experiment.display()
finally:
    # Close the experiment even if rendering fails
    experiment.end()

COMET INFO: Couldn't find a Git repository in '/content' and lookings in parents. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/team-comet-ml/fraud-detection-demo/7ba6a67ec0b947648fea3cdd6f1af9ad



COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/team-comet-ml/fraud-detection-demo/7ba6a67ec0b947648fea3cdd6f1af9ad
COMET INFO:   Uploads:
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     installed packages  : 1
COMET INFO:     notebook            : 1
COMET INFO:     os packages         : 1
COMET INFO:     source_code         : 1
COMET INFO: ---------------------------
COMET INFO: Uploading 1 metrics, params and output messages
