# 신용카드 사기 거래 탐지 AI 경진대회 💳

In [None]:
# Make Google Drive visible inside the Colab runtime; the data folder used
# later in this notebook lives under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
# Third-party packages installed at the start of the session.
# NOTE(review): imblearn and lightgbm are imported later but not installed
# here - presumably they are preinstalled on the runtime; verify.
!pip install catboost
!pip install pandas-profiling==3.1.0
!pip install pycaret
!pip install optuna
# NOTE(review): scikit-learn is pinned after the installs above, presumably so
# that missingpy (installed next) finds a compatible version - confirm.
!pip install scikit-learn==0.23.2
!pip install missingpy

In [None]:
import pandas as pd
import numpy as np
import warnings

from missingpy import MissForest
from sklearn.metrics import f1_score, recall_score, precision_score
from lightgbm import LGBMClassifier, Dataset
import optuna
from optuna.samplers import TPESampler
from sklearn.neighbors import KNeighborsClassifier

In [None]:
import os
import sys
import platform
import random
import math
from typing import List ,Dict, Tuple

import pandas as pd
import numpy as np
import optuna 
from sklearn.model_selection import StratifiedKFold , KFold
import sklearn 
from sklearn.model_selection import StratifiedKFold , KFold
from sklearn.metrics import f1_score 
from sklearn.model_selection import train_test_split

from catboost import Pool,CatBoostClassifier
# Install a global "ignore" filter so warnings are suppressed from here on.
warnings.filterwarnings("ignore")

## Data Load

In [None]:
# Google Drive folder holding the competition CSV files. The trailing slash
# matters: later cells build file paths with plain string concatenation.
path = "/content/drive/MyDrive/DACON/credit_fraud/"

In [None]:
# Load the two provided splits from the Drive folder, in the same order as
# before (train.csv first, then val.csv).
train, valid = (
    pd.read_csv(path + file_name) for file_name in ('train.csv', 'val.csv')
)

In [None]:
# Keep only the ID and the six V-features used by the rest of the notebook.
# `valid` additionally keeps its `Class` label; `train` is selected without one.
_kept = ['ID', 'V3', 'V7', 'V10', 'V14', 'V17', 'V16']
train = train[_kept]
valid = valid[_kept + ['Class']]

## MissForest

In [None]:
# Tag every row with the split it came from, so the two frames can be told
# apart again after they are stacked into one.
for _frame, _split in ((train, 'train'), (valid, 'valid')):
    _frame['data_cd'] = _split

In [None]:
# Stack both splits (without their ID columns) into one frame and index it by
# the split tag, so a single imputer can be run over all rows.
df = pd.concat([frame.drop(columns=['ID']) for frame in (train, valid)])
df = df.set_index('data_cd', drop=True)

# `Class` is marked categorical. Rows that came from `train` carry no `Class`
# value here, because that column was only selected for `valid`.
df['Class'] = df['Class'].astype("category")

# Positional indices of the categorical columns, as passed to the imputer.
_categorical_names = df.select_dtypes(['category']).columns
cat_cols = [df.columns.get_loc(name) for name in _categorical_names]

In [None]:
# Fit MissForest on the combined frame and fill in its missing entries; the
# categorical column positions are passed through `cat_vars`.
# NOTE(review): n_estimators=28462 is an unusually large forest size - confirm
# it is intentional and not, e.g., a row count pasted in by mistake.
imputer = MissForest(max_iter=10, n_estimators=28462, random_state=42)
_imputed_values = imputer.fit_transform(df, cat_vars=cat_cols)

# Wrap the imputer's output back into a DataFrame with the original labels.
df_imputed = pd.DataFrame(data=_imputed_values, index=df.index, columns=df.columns)

Iteration: 0
Iteration: 1
Iteration: 2


In [None]:
# Pull out the imputed rows that came from the training split.
# The index is reset *before* the IDs are attached: while the frame is still
# indexed by the repeated 'train' tag, assigning `train['ID']` (integer index)
# aligns on labels, matches nothing, and fills the column with NaN.
train2 = df_imputed[df_imputed.index == 'train'].reset_index(drop=True)

# Attach the IDs by position. Row order is unchanged since `train` was placed
# first in the concat and the boolean mask keeps rows in order.
train2['ID'] = train['ID'].to_numpy()

In [None]:
# Split the imputed training rows into the feature matrix and the target.
_features = ['V3', 'V7', 'V10', 'V14', 'V17', 'V16']
X, y = train2[_features], train2['Class']

## OverSampling

In [None]:
# Resample (X, y) with SMOTETomek using a fixed seed.
from imblearn.combine import SMOTETomek

_resampler = SMOTETomek(random_state=42)
X_resampled, y_resampled = _resampler.fit_resample(X, y)

In [None]:
# Store the resampled target next to the resampled features, so later cells
# can read both from `X_resampled`.
X_resampled['Class'] = y_resampled

## KNN + Optuna

In [None]:
_features = ['V3', 'V7', 'V10', 'V14', 'V17', 'V16']

# Training data for the hyper-parameter search: the resampled features and
# their labels (selecting the six features already leaves `Class` out).
X_train = X_resampled[_features]
y_train = X_resampled['Class']

# `valid2` was used here without being defined anywhere in the notebook, so a
# fresh kernel failed with NameError. It is rebuilt from the imputed frame in
# the same way `train2` is derived from it.
# NOTE(review): presumably the cell that created `valid2` was lost - confirm
# that the imputed validation rows are what was originally intended.
valid2 = df_imputed[df_imputed.index == 'valid'].reset_index(drop=True)
valid2['ID'] = valid['ID'].to_numpy()

# Validation data scored inside the Optuna objective.
X_valid = valid2[_features]
y_valid = valid2['Class']

In [None]:
# Seeded sampler so repeated runs of the study request the same suggestions.
sampler = TPESampler(seed=42)


def objective(trial):
    """Fit one KNN configuration and return its macro F1 on the validation set.

    The hyper-parameters are drawn from ``trial``. The model is trained on the
    notebook-level ``X_train`` / ``y_train`` and scored on ``X_valid`` /
    ``y_valid``.
    """
    # The suggest calls are kept in their original order, so the sampler is
    # asked for the parameters in the same sequence as before.
    n_neighbors = trial.suggest_int("n_neighbors", 1, 60)
    weights = trial.suggest_categorical('weights', ["uniform", "distance"])
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski'])
    algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
    leaf_size = trial.suggest_int("leaf_size", 30, 60)
    p = trial.suggest_categorical('p', [1, 2])

    knn = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        metric=metric,
        algorithm=algorithm,
        leaf_size=leaf_size,
        p=p,
        n_jobs=-1,
    )
    knn.fit(X_train, y_train)

    return f1_score(y_valid, knn.predict(X_valid), average="macro")

In [None]:
# Maximise the objective's return value over 50 trials with the seeded sampler.
_N_TRIALS = 50
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(objective, n_trials=_N_TRIALS)

In [None]:
# Fetch the best trial of the study and keep its parameters for the final fit.
best_model = study.best_trial
best_params = best_model.params

# Report the best score (rounded to 4 decimals) and the winning parameters.
print(
    "Best model:",
    f"  F1-score: {round(best_model.value, 4)}",
    f"  params  : {best_params}",
    sep="\n",
)

'n_neighbors': 6, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'brute', 'leaf_size': 33, 'p': 2

In [None]:
# Notebook-level constants; no later cell shown in this notebook reads them.
SEED, patience = 42, 50

In [None]:
# Final training set: all resampled rows, split into features and target.
_features = ['V3', 'V7', 'V10', 'V14', 'V17', 'V16']
X, y = X_resampled[_features], X_resampled['Class']

In [None]:
# Refit a KNN with the best parameters found by the study on the full
# resampled training set. `best_params` holds only the searched parameters,
# so `n_jobs` falls back to the estimator's default here.
model = KNeighborsClassifier(**best_params)
model.fit(X, y)

## Prediction

In [None]:
# Score the test file with the fitted KNN, using the same six features (in
# the same column order) that the model was trained on.
test = pd.read_csv(path + "test.csv")
_features = ['V3', 'V7', 'V10', 'V14', 'V17', 'V16']
X_test = test.drop(columns=['ID'])[_features]
y_test_pred = model.predict(X_test)

In [None]:
# Put the predictions into the sample submission's `Class` column and save
# the result next to the input data, without writing the index.
submit = pd.read_csv(path + "sample_submission.csv").assign(Class=y_test_pred)
submit.to_csv(path + "knn_smote.csv", index=False)