# IEEE-CIS Fraud Detection
## Model: XGBoost
*Visit [Kaggle](https://www.kaggle.com/c/ieee-fraud-detection) for competition details*

**Authored by Soyoung Kang**

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import roc_auc_score

from sklearn.datasets import make_classification

from imblearn.over_sampling import SMOTE

In [None]:
# Load the train and test tables.
# NOTE(review): both paths are empty strings in this copy of the notebook;
# real file paths must be supplied before this cell can run.
train = pd.read_csv(filepath_or_buffer='', engine='python')
test = pd.read_csv(filepath_or_buffer='', engine='python')

In [None]:
# Remove the 'Unnamed: 0' column from both tables.
# The drop happens in place, so re-running this cell used to raise KeyError
# (the column is already gone on the second run). errors='ignore' makes the
# cell idempotent.
train.drop(columns='Unnamed: 0', inplace=True, errors='ignore')
test.drop(columns='Unnamed: 0', inplace=True, errors='ignore')

In [None]:
# Raw test_transaction table; its TransactionID column is read later when the
# result file is assembled.

original_test = pd.read_csv(
    filepath_or_buffer='/ieee-fraud-detection/test_transaction.csv',
    engine='python',
)

In [None]:
# Separate the feature matrix from the target column.
# The boolean mask is True for every column except 'isFraud'.
feature_mask = train.columns != 'isFraud'
X = train.loc[:, feature_mask]
y = train['isFraud']

In [None]:
# # Robust Scaling

# robustScaler = RobustScaler()
# robustScaler.fit(X)
# X_train = robustScaler.transform(X)

# X_test = test.copy()
# scaler_ = RobustScaler()
# scaler_.fit(X_test)
# X_test = scaler_.transform(X_test)

In [None]:
# # MinMax Scaling

# scaler = MinMaxScaler(feature_range=(0, 1))
# X_train = scaler.fit_transform(X)

# X_test = test.copy()
# scaler_ = MinMaxScaler(feature_range=(0, 1))
# X_test = scaler_.fit_transform(X_test)

In [None]:
# Split validation set
#
# 30% of the rows are held out. stratify=y keeps the isFraud class proportions
# the same in the training and validation parts, and the fixed random_state
# makes the split (and therefore any score computed on it) reproducible
# between runs.

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# # Configure the SMOTE model
# sm = SMOTE(ratio='auto', kind='regular')

# # Feed in the training data to oversample (replicate) it
# X_resampled, Y_resampled = sm.fit_sample(X_train,list(y))

In [None]:
# Hyper-parameters are gathered in one dict so they can be inspected or tweaked
# separately from the estimator construction.
xgb_params = {
    'n_estimators': 2000,
    'n_jobs': 4,
    'max_depth': 15,
    'learning_rate': 0.001,
    'gamma': 0.02,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    # Feature value that XGBoost should treat as "missing".
    # NOTE(review): presumably NaNs were filled with -999 upstream; confirm.
    'missing': -999,
    'tree_method': 'gpu_hist',
}

xgb_model = xgb.XGBClassifier(**xgb_params)

# Echo the configured estimator so the settings are visible in the cell output.
print(xgb_model)

In [None]:
# Train on the full labelled table.
# NOTE: X contains every row, including those that train_test_split placed in
# X_valid, so scoring this fitted model on X_valid is not an out-of-sample
# estimate.
xgb_model.fit(X, y)

In [None]:
# Keep only column 1 of predict_proba for each test row
# (the isFraud == 1 probability, assuming the labels are 0/1).
preds = xgb_model.predict_proba(test)[:, 1]

In [None]:
# Make test result file

# Named transaction_ids rather than `id` so the builtin id() is not shadowed
# for the rest of the session.
transaction_ids = original_test['TransactionID'].tolist()

# zip() silently stops at the shorter input; fail loudly instead of writing a
# result file with missing rows.
if len(transaction_ids) != len(preds):
    raise ValueError(
        "Got {} predictions for {} TransactionIDs".format(
            len(preds), len(transaction_ids)
        )
    )

result = list(zip(transaction_ids, preds))

# One row per transaction, indexed by TransactionID, with the predicted
# probability in the isFraud column.
df = pd.DataFrame(result, columns=['TransactionID', 'isFraud'])
df.set_index('TransactionID', inplace=True)
df.to_csv('result_drop28.csv', encoding='utf8')

In [None]:
# p = preds > np.percentile(preds, 80)

In [None]:
# Validation score
#
# `preds` holds predictions for the unlabelled test table, so it cannot be
# compared with y_valid (different rows, different length). xgb_model itself
# was fitted on all of X, which includes the X_valid rows, so scoring it on
# X_valid would not be a hold-out estimate either. Instead, an unfitted copy
# with the same hyper-parameters is trained on X_train only and scored on
# X_valid.
from sklearn.base import clone

valid_model = clone(xgb_model)
valid_model.fit(X_train, y_train)
valid_preds = valid_model.predict_proba(X_valid)[:, 1]

auc = roc_auc_score(y_valid, valid_preds)
print("roc-auc score: ", auc)

In [None]:
# CV validation score
#
# X_resampled / Y_resampled are only created by the SMOTE cell, which is
# commented out, so referencing them raised NameError. Cross-validate on the
# original X, y instead. cross_val_score fits a fresh clone of the estimator
# per fold, so the already-fitted xgb_model is not reused. Scoring on the
# original rows also avoids oversampled copies ending up in the validation
# folds.

auc_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='roc_auc')
print("Mean roc-auc score: ", np.mean(auc_scores))

## Feature importance

In [None]:
from xgboost import plot_importance

# xgb_model = xgb.XGBClassifier()
# xgb_model.fit(X, y)

# Read the importance vector once, then list features from most to least
# important as [column name, importance] pairs.
importances = xgb_model.feature_importances_
sorted_idx = np.argsort(importances)[::-1]
for column_name, score in zip(X.columns[sorted_idx], importances[sorted_idx]):
    print([column_name, score])