In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('./creditcard.csv')

In [3]:
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [4]:
cols = df.columns.tolist()
cols.remove('Class')
X = df[cols]
y = df['Class']

In [30]:
def result_table(y_test, y_pred):
    score = accuracy_score(y_test, y_pred)
    result = confusion_matrix(y_test, y_pred)
    table = pd.DataFrame(
        {
            'normal': {'normal': result[0][0], 'fraud': result[1][0]},
            'fraud': {'normal': result[0][1], 'fraud': result[1][1]}
        }
    )
    table.columns.name = '{0:.5f}%'.format(score)
    recall = result[1][1] / (result[1][0] + result[1][1])
    precision = result[1][1] / (result[0][1] + result[1][1])
    f_measure = 2 * recall * precision / (recall + precision)
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")
    print(f"F-measure: {f_measure}")
    return table

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 71)

In [6]:
# clf = xgb.XGBClassifier(max_depth=6, n_estimators=100, objective='binary:logistic', random_state=42)
reg = xgb.XGBRegressor(max_depth=6, n_estimators=100, random_state=42)

In [11]:
reg.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [12]:
y_pred = reg.predict(X_test)

In [13]:
y_pred = np.where(y_pred <= 0.5, 0, 1)

In [14]:
print(f"{accuracy_score(y_test, y_pred) * 100} %")
print( confusion_matrix(y_test, y_pred))

99.94616293903537 %
[[85285     8]
 [   38   112]]


| 99.94616% | normal | fraud |
|:--:|:--:|:--:|
| normal | 85285 | 8 |
| fraud | 38 | 112 |


## Under Sampling

In [16]:
from imblearn.under_sampling import RandomUnderSampler

In [17]:
y_train.value_counts()

0    199022
1       342
Name: Class, dtype: int64

In [18]:
positive_count_train = len(y_train[y_train == 1])
print (positive_count_train)

342


In [19]:
rus = RandomUnderSampler(ratio={0:positive_count_train*20, 1:positive_count_train}, random_state=71)

In [20]:
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)

In [21]:
X_train_under = pd.DataFrame(X_train_under)
X_train_under.columns = cols

In [22]:
# clf.fit(X_train_under, y_train_under)
reg.fit(X_train_under, y_train_under)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [23]:
y_pred = reg.predict(X_test)

In [24]:
y_pred = np.where(y_pred <= 0.5, 0, 1)

In [31]:
result_table(y_test, y_pred)

Recall: 0.8266666666666667
Precision: 0.36470588235294116
F-measure: 0.5061224489795918


0.99717%,normal,fraud
normal,85077,216
fraud,26,124


## Over Sampling

In [54]:
# from imblearn.over_sampling import RandomOverSampler

In [55]:
# ros = RandomOverSampler(ratio = {0:X_train.shape[0], 1:X_train.shape[0]//9}, random_state = 71)

In [56]:
# X_train_over, y_train_over = ros.fit_sample(X_train, y_train)

In [57]:
# X_train_over = pd.DataFrame(X_train_over)
# X_train_over.columns = cols

In [58]:
# reg.fit(X_train_over, y_train_over)

In [59]:
# y_pred = reg.predict(X_test)

In [60]:
# y_pred = np.where(y_pred <= 0.5, 0, 1)

In [61]:
# result_table(y_test, y_pred)

## SMOTE

In [261]:
from imblearn.over_sampling import SMOTE

In [262]:
smote = SMOTE(ratio={0:X_train.shape[0], 1:X_train.shape[0]//20}, random_state=71)

In [263]:
#列ごとに正規化
# normalized_X_train = (X_train - X_train.min()) / (X_train.max() - X_train.min())

In [264]:
# X_train_smote, y_train_smote = smote.fit_sample(normalized_X_train, y_train)
X_train_smote, y_train_smote = smote.fit_sample(X_train, y_train)

  n_samples_majority))


In [265]:
X_train_smote = pd.DataFrame(X_train_smote)
X_train_smote.columns = X_train.columns.tolist()

In [266]:
reg.fit(X_train_smote, y_train_smote)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [267]:
y_pred = reg.predict(X_test)

In [268]:
y_pred = np.where(y_pred <= 0.5, 0, 1)

In [269]:
result_table(y_test, y_pred)

Recall: 0.7933333333333333
Precision: 0.8095238095238095
F-measure: 0.8013468013468015


0.99931%,normal,fraud
normal,85265,28
fraud,31,119


# MixUp( 0と1 or 1と1 のMix )

In [8]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [78]:
# def under_and_get_mixup_df(X_train, y_train, alpha=0.6, iteration=5):
#     #  一回undersamplingで減らす
#     X_cols = X_train.columns.tolist()
#     positive_count = int(y_train.sum())
#     rus = RandomUnderSampler(ratio={0:positive_count*20, 1:positive_count}, random_state=71)
#     X_res, y_res = rus.fit_sample(X_train, y_train)
#     X_res, y_res = (pd.DataFrame(X_res), pd.DataFrame(y_res))
#     X_res.columns, y_res.columns = (X_cols, ["Class"])

#     #下準備(全体の不正データの件数と同量の正常データとMixUpする)
#     sampling_data = pd.concat([X_res, y_res], axis=1)
#     cols = sampling_data.columns.tolist()
#     fraud_data = sampling_data[sampling_data["Class"]==1].sample(frac=1, random_state=1)
#     for _ in range(iteration):
#         normal_data = sampling_data[sampling_data["Class"]==0].sample(n=positive_count).sample(frac=1, random_state=0)
#         assert normal_data.shape == fraud_data.shape # 342件

#         lmd = np.random.beta(alpha, alpha, positive_count).reshape(positive_count, 1) # positive_countがbatch_sizeみたいなもの
#         mixup_new_data = pd.DataFrame(lmd * normal_data.values + (1-lmd) * fraud_data.values)
#         mixup_new_data.columns = cols
#         sampling_data = pd.concat([sampling_data, mixup_new_data])

#     sampling_data.reset_index(drop=True, inplace=True)
    
#     return np.array(sampling_data[X_cols]), np.array(sampling_data['Class'])

In [121]:
# X_mixup_train, y_mixup_train = under_and_get_mixup_df(X_train, y_train, 0.7, 9)

In [122]:
# X_mixup_train.shape

(10260, 30)

In [123]:
# reg.fit(X_mixup_train, y_mixup_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [124]:
# y_pred = reg.predict(np.array(X_test))

In [125]:
# y_pred = np.where(y_pred <= 0.5, 0, 1)

In [126]:
# y_pred = y_pred.astype(int)

In [53]:
# print(f"{accuracy_score(y_test, y_pred) * 100} %")
# print(confusion_matrix(y_test, y_pred))

72.92229907657737 %
[[62266 23027]
 [  109    41]]


### サンプリングしていないデータとmixしてやってみる

In [219]:
def all_and_get_mixup_df(X_train, y_train, alpha=0.4, generation_size=2): #generation_size >= 2
    all_data = pd.concat([X_train, y_train], axis=1)
    normal_data = all_data[all_data["Class"]==0].sample(n= int(y_train.sum()) * (generation_size - 1)).sample(frac=1, random_state=0)
    fraud_data = all_data[all_data["Class"]==1].sample(frac=1, random_state=1)
    generate_material_data = pd.concat([normal_data, fraud_data]).sample(frac=1)
    fraud_material_data = fraud_data.copy()
    for i in range(generation_size - 1):
        fraud_material_data = pd.concat([fraud_material_data, fraud_data]).sample(frac=1)

    assert generate_material_data.shape[0] == fraud_data.shape[0] * generation_size

    lmd = np.random.beta(alpha, alpha, generate_material_data.shape[0]).reshape(generate_material_data.shape[0], 1)
    mixup_new_data = pd.DataFrame(lmd * generate_material_data.values + (1-lmd) * fraud_material_data.values)
    mixup_new_data.columns = all_data.columns.tolist()
    data = pd.concat([all_data, mixup_new_data])
    data.reset_index(drop=True, inplace=True)
    return np.array(data[X_train.columns.tolist()]), np.array(data['Class'])

In [226]:
X_mixup_train, y_mixup_train = all_and_get_mixup_df(X_train, y_train, 1.0, 100)

In [227]:
reg.fit(X_mixup_train, y_mixup_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [228]:
y_pred = reg.predict(np.array(X_test))

In [229]:
y_pred = np.where(y_pred <= 0.5, 0, 1)

In [230]:
y_pred = y_pred.astype(int)

In [231]:
result_table(y_test, y_pred)

Recall: 0.7866666666666666
Precision: 0.9147286821705426
F-measure: 0.8458781362007168


0.99950%,normal,fraud
normal,85282,11
fraud,32,118
