In [1]:
import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_csv("train.csv", index_col=False)
df_valid = pd.read_csv("valid.csv", index_col = False)
df_test = pd.read_csv("test.csv", index_col = False)

In [4]:
# df_train = pd.concat([df_train, df_valid])

In [3]:
df_train['Age'] = (df_train['Age']-df_train['Age'].mean())/df_train['Age'].std()
df_valid['Age'] = (df_valid['Age']-df_valid['Age'].mean())/df_valid['Age'].std()
df_test['Age'] = (df_test['Age']-df_test['Age'].mean())/df_test['Age'].std()

In [4]:
y_train = df_train['FraudFound_P']
x_train = df_train.drop('FraudFound_P',axis = 1)
y_valid = df_valid['FraudFound_P']
x_valid = df_valid.drop('FraudFound_P',axis = 1)
y_test = df_test['FraudFound_P']
x_test = df_test.drop('FraudFound_P',axis = 1)

In [5]:
from collections import Counter
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0, sampling_strategy=0.6)

x_train, y_train = smote_enn.fit_resample(x_train, y_train)

In [6]:
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [16]:
def create_model(trial):
    max_depth = trial.suggest_int('max_depth', 5, x_train.shape[1])
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 20)
    zero_weight = trial.suggest_float('zero_weight', 0, 1)
    class_weight = {0: zero_weight, 1: 1-zero_weight}
    model = DecisionTreeClassifier(
        max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state = 4012,
        class_weight = class_weight
        )
    return model
def model_performance(model, X = x_valid, y = y_valid):
    y_pred = model.predict(X)
    return round(metrics.recall_score(y, y_pred), 3)
def objective(trial):
    model = create_model(trial)
    model.fit(x_train, y_train)
    return model_performance(model)

In [17]:
# %pip install optuna
import optuna
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 200, timeout = 600)


[32m[I 2022-11-04 15:04:37,593][0m A new study created in memory with name: no-name-97f6a883-09d3-492c-a91c-b1031cf41f1b[0m
[32m[I 2022-11-04 15:04:37,765][0m Trial 0 finished with value: 0.276 and parameters: {'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 9, 'zero_weight': 0.7526705303943864}. Best is trial 0 with value: 0.276.[0m
[32m[I 2022-11-04 15:04:37,901][0m Trial 1 finished with value: 0.709 and parameters: {'max_depth': 8, 'min_samples_split': 9, 'min_samples_leaf': 14, 'zero_weight': 0.34753643046433436}. Best is trial 1 with value: 0.709.[0m
[32m[I 2022-11-04 15:04:38,074][0m Trial 2 finished with value: 0.172 and parameters: {'max_depth': 46, 'min_samples_split': 17, 'min_samples_leaf': 19, 'zero_weight': 0.9255276096416117}. Best is trial 1 with value: 0.709.[0m
[32m[I 2022-11-04 15:04:38,226][0m Trial 3 finished with value: 0.239 and parameters: {'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 15, 'zero_weight': 0.733204432090444

In [18]:
best_model = create_model(study.best_trial)
best_model.fit(x_train, y_train)
print("Best Recall Score: ", model_performance(best_model))

Best Recall Score:  0.993


In [19]:
y_pred = best_model.predict(x_valid)
print(metrics.classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.37      0.54      2131
           1       0.09      0.99      0.16       134

    accuracy                           0.41      2265
   macro avg       0.54      0.68      0.35      2265
weighted avg       0.94      0.41      0.52      2265



In [20]:
study.best_trial.params

{'max_depth': 6,
 'min_samples_split': 8,
 'min_samples_leaf': 18,
 'zero_weight': 0.0011031255546806902}

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
def create_model(trial):
    leaf_size = trial.suggest_int("leaf_size", 1, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 1, 30)
    p = trial.suggest_int("p",1,2)
    model = KNeighborsClassifier(leaf_size=leaf_size, n_neighbors=n_neighbors, p = p)
    return model
def model_performance(model, X = x_valid, y = y_valid):
    y_pred = model.predict(X)
    return round(metrics.recall_score(y, y_pred), 3)
def objective(trial):
    model = create_model(trial)
    model.fit(x_train, y_train)
    return model_performance(model)

# %pip install optuna
import optuna
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 200, timeout = 600)



[32m[I 2022-11-12 20:43:50,097][0m A new study created in memory with name: no-name-5d7f0359-e616-400f-bb78-06fa89646c28[0m
[32m[I 2022-11-12 20:43:50,504][0m Trial 0 finished with value: 0.642 and parameters: {'leaf_size': 31, 'n_neighbors': 12, 'p': 2}. Best is trial 0 with value: 0.642.[0m
[32m[I 2022-11-12 20:43:50,870][0m Trial 1 finished with value: 0.552 and parameters: {'leaf_size': 44, 'n_neighbors': 6, 'p': 2}. Best is trial 0 with value: 0.642.[0m
[32m[I 2022-11-12 20:43:51,670][0m Trial 2 finished with value: 0.597 and parameters: {'leaf_size': 2, 'n_neighbors': 20, 'p': 1}. Best is trial 0 with value: 0.642.[0m
[32m[I 2022-11-12 20:43:52,031][0m Trial 3 finished with value: 0.634 and parameters: {'leaf_size': 13, 'n_neighbors': 14, 'p': 2}. Best is trial 0 with value: 0.642.[0m
[32m[I 2022-11-12 20:43:52,412][0m Trial 4 finished with value: 0.716 and parameters: {'leaf_size': 46, 'n_neighbors': 22, 'p': 2}. Best is trial 4 with value: 0.716.[0m
[32m[I 20

In [12]:

best_model = create_model(study.best_trial)
best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)
print(metrics.classification_report(y_test,y_pred))
print(metrics.recall_score(y_test,y_pred))
print(study.best_trial.params)

              precision    recall  f1-score   support

           0       0.96      0.51      0.67      2131
           1       0.08      0.69      0.15       134

    accuracy                           0.52      2265
   macro avg       0.52      0.60      0.41      2265
weighted avg       0.91      0.52      0.64      2265

0.6940298507462687
{'leaf_size': 36, 'n_neighbors': 29, 'p': 2}
