In [0]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

## Model

In [0]:
from sklearn.tree import DecisionTreeClassifier

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

# from imblearn.over_sampling import SMOTE

In [0]:
# Load the first-round transaction table from the notebook's working directory.
# NOTE(review): the saved output of this cell is a FileNotFoundError, so
# '1st_round.csv' must be placed next to the notebook before a fresh run.
trans_k = pd.read_csv('1st_round.csv')

FileNotFoundError: ignored

In [0]:
# Preview the first three rows to sanity-check the loaded columns.
trans_k.head(3)

In [0]:
# Remove the 'Unnamed: 0' column in place. errors='ignore' makes the cell
# idempotent: re-running it after the column is gone is a no-op instead of a
# KeyError (which is what `del trans_k[...]` raises on a second run).
trans_k.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

In [0]:
# Remove the 'TransactionID' column in place so it is not used as a feature.
# errors='ignore' keeps the cell safe to re-run (a plain `del` raises KeyError
# once the column has already been removed).
trans_k.drop(columns=['TransactionID'], errors='ignore', inplace=True)

In [0]:
# Positional split: every column except the last becomes the feature matrix,
# and the last column becomes the target.
X, y = trans_k.iloc[:, :-1], trans_k.iloc[:, -1]

In [0]:
# Confirm that features and target have the same number of rows.
print(X.shape, y.shape)

In [0]:
from sklearn.preprocessing import RobustScaler

# Scale every feature with RobustScaler; fitting and transforming are done in
# a single call on the full feature matrix.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed in a later cell, so test rows influence the scaling statistics.
robustScaler = RobustScaler()
X_robust = robustScaler.fit_transform(X)

In [0]:
# Hold out 30% of the rows for testing.
# random_state makes the split reproducible across kernel restarts, and
# stratify=y keeps the class proportions of y the same in both partitions
# (consistent with the StratifiedKFold used for cross-validation below).
X_train, X_test, y_train, y_test = train_test_split(
    X_robust, y, test_size=0.3, random_state=0, stratify=y)

In [0]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyper-parameters.
# random_state is fixed (same seed as the StratifiedKFold below) because the
# tree permutes features at each split, so unseeded runs can give different
# trees and different scores on every execution.
tree_clf = DecisionTreeClassifier(random_state=0)

In [0]:
# 10-fold stratified cross-validation (shuffled, seeded) of the baseline tree
# on the training partition; the mean of the per-fold scores is printed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
score = cross_val_score(estimator=tree_clf, X=X_train, y=y_train, cv=skf)
print(score.mean())

# Logistic regression and decision tree (로지스틱, 트리)

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [0]:
# Hyper-parameter grid for LogisticRegression, expressed as two sub-grids so
# that each penalty is only combined with the solvers listed alongside it:
#   - l1 penalty with the 'liblinear' and 'saga' solvers
#   - l2 penalty with the 'newton-cg' solver
# Both sub-grids search the same inverse-regularisation strengths C.
c_grid = [0.01, 0.1, 1, 10]
log_reg_params = [
    {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': list(c_grid)},
    {'penalty': ['l2'], 'solver': ['newton-cg'], 'C': list(c_grid)},
]

In [0]:
# Exhaustive search over log_reg_params using GridSearchCV's default
# cross-validation and scoring.
# random_state is fixed because the 'liblinear' and 'saga' solvers in the grid
# shuffle the data internally; without a seed the selected parameters can
# change between runs. It has no effect on the 'newton-cg' candidates.
grid_log_reg = GridSearchCV(LogisticRegression(random_state=0), log_reg_params)
grid_log_reg.fit(X_train, y_train)

In [0]:
# Keep the estimator exposed by the finished grid search as the logistic model
# used in every later cell.
log_reg = grid_log_reg.best_estimator_

In [0]:
# 5-fold cross-validated score of the tuned logistic model on the training
# partition, reported as a percentage rounded to two decimals.
log_reg_score = cross_val_score(log_reg, X_train, y_train, cv=5)
log_reg_mean_pct = round(log_reg_score.mean() * 100, 2)
print('Logistic Regression Cross Validation Score: ', log_reg_mean_pct.astype(str) + '%')

In [0]:
# Small grid for the decision tree: split criterion, depth 2-3 and a minimum
# of 5-6 samples per leaf.
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2, 4)),
               "min_samples_leaf": list(range(5, 7))}
# The tree is seeded so that the grid search (and the best_estimator_ taken
# from it afterwards) gives the same result on every run; an unseeded tree
# permutes features at each split and can break ties differently.
grid_tree = GridSearchCV(DecisionTreeClassifier(random_state=0), tree_params)
grid_tree.fit(X_train, y_train)

In [0]:
# Replace the baseline tree with the estimator exposed by the finished grid
# search; later cells use this tuned tree under the same name.
tree_clf = grid_tree.best_estimator_

In [0]:
# 5-fold cross-validated score of the tuned tree on the training partition,
# reported as a percentage rounded to two decimals.
tree_score = cross_val_score(tree_clf, X_train, y_train, cv=5)
tree_mean_pct = round(tree_score.mean() * 100, 2)
print('DecisionTree Classifier Cross Validation Score', tree_mean_pct.astype(str) + '%')

In [0]:
from sklearn.model_selection import cross_val_predict

In [0]:
# Out-of-fold decision-function values of the logistic model (5 folds); these
# continuous scores are what the ROC / precision-recall cells below consume.
log_reg_pred = cross_val_predict(
    estimator=log_reg,
    X=X_train,
    y=y_train,
    cv=5,
    method="decision_function",
)

In [0]:
# Out-of-fold scores of the tuned tree (5 folds) for the ROC analysis below.
# ROC AUC needs a continuous ranking score: with the default method="predict"
# only hard 0/1 labels are returned and the ROC collapses to a single
# operating point. Use the predicted probability of the positive class
# instead (column 1 of predict_proba for the binary 0/1 target).
tree_pred = cross_val_predict(tree_clf, X_train, y_train, cv=5,
                              method="predict_proba")[:, 1]

In [0]:
from sklearn.metrics import roc_auc_score

In [0]:
# ROC AUC on the training partition for each model, computed from the
# cross-validated scores stored in log_reg_pred and tree_pred.
for auc_label, cv_scores in (
    ('Logistic Regression: ', log_reg_pred),
    ('Decision Tree Classifier: ', tree_pred),
):
    print(auc_label, roc_auc_score(y_train, cv_scores))


In [0]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [0]:
# ROC curve points (false-positive rate, true-positive rate, thresholds) for
# each model, from the cross-validated scores.
# NOTE: `log_thresold` is misspelled; the name is kept as-is because other
# cells may refer to it.
log_fpr, log_tpr, log_thresold = roc_curve(y_train, log_reg_pred)
tree_fpr, tree_tpr, tree_threshold = roc_curve(y_train, tree_pred)

In [0]:
def logistic_roc_curve(log_fpr, log_tpr):
    """Draw the logistic-regression ROC curve on a new 12x8 figure.

    Args:
        log_fpr: false-positive rates (x values of the curve).
        log_tpr: true-positive rates (y values of the curve).

    Returns:
        None. The figure is created and left as the current pyplot figure;
        the caller is responsible for calling ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(12, 8))
    ax.set_title('Logistic Regression ROC Curve', fontsize=16)
    # Model curve in solid blue, chance-level diagonal in dashed red.
    ax.plot(log_fpr, log_tpr, 'b-', linewidth=2)
    ax.plot([0, 1], [0, 1], 'r--')
    ax.set_xlabel('False Positive Rate', fontsize=16)
    ax.set_ylabel('True Positive Rate', fontsize=16)
    # Start x slightly below 0 so the curve's rise along the y-axis is visible.
    ax.axis([-0.01, 1, 0, 1])
    
    


In [0]:
# Draw the logistic-regression ROC curve and render the figure.
logistic_roc_curve(log_fpr, log_tpr)
plt.show()

In [0]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

In [0]:
# Precision/recall pairs over all decision thresholds for the logistic model,
# computed from its cross-validated decision-function scores.
precision, recall, threshold = precision_recall_curve(y_train, log_reg_pred)


In [0]:
# Hard class predictions of the tuned logistic model on the training rows.
# NOTE(review): the grid search that produced log_reg was fitted on this same
# X_train, so metrics computed from y_pred are in-sample and likely optimistic.
y_pred = log_reg.predict(X_train)

In [0]:
# Report recall, precision, F1 and accuracy of the logistic model's
# predictions (y_pred) against the training labels, framed by separator lines.
separator = '---' * 45
log_reg_metrics = (
    ('Recall Score: {:.2f}', recall_score),
    ('Precision Score: {:.2f}', precision_score),
    ('F1 Score: {:.2f}', f1_score),
    ('Accuracy Score: {:.2f}', accuracy_score),
)

print(separator)
for template, metric_fn in log_reg_metrics:
    print(template.format(metric_fn(y_train, y_pred)))
print(separator)

In [0]:
# Hard class predictions of the tuned tree on the training rows.
# NOTE(review): the grid search that produced tree_clf was fitted on this same
# X_train, so metrics computed from y_pred_tree are in-sample.
y_pred_tree = tree_clf.predict(X_train)

In [0]:
# Report recall, precision, F1 and accuracy of the tree's predictions
# (y_pred_tree) against the training labels, framed by separator lines.
separator = '---' * 45
tree_metrics = (
    ('Recall Score: {:.2f}', recall_score),
    ('Precision Score: {:.2f}', precision_score),
    ('F1 Score: {:.2f}', f1_score),
    ('Accuracy Score: {:.2f}', accuracy_score),
)

print(separator)
for template, metric_fn in tree_metrics:
    print(template.format(metric_fn(y_train, y_pred_tree)))
print(separator)

In [0]:
# Size of the training partition before any resampling.
X_train.shape

In [0]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

# Model setup: SMOTE with its default strategy, which is what the removed
# `ratio='auto', kind='regular'` arguments used to select. Those keywords and
# `fit_sample` no longer exist in current imbalanced-learn releases;
# `fit_resample` is the supported entry point. random_state makes the
# synthetic samples reproducible.
sm = SMOTE(random_state=0)

# Only the training partition is oversampled; the test partition is untouched.
# y is passed as a NumPy array so that y_resampled is also an array (a plain
# list would break the `== 1` comparisons and `.shape` used below).
X_resampled, y_resampled = sm.fit_resample(X_train, np.asarray(y_train))

print('After OverSampling, the shape of train_X: {}'.format(X_resampled.shape))
# Report the shape of the resampled labels (previously printed X's shape again).
print('After OverSampling, the shape of train_y: {} \n'.format(y_resampled.shape))

print("After OverSampling, counts of label '1': {}".format((y_resampled == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_resampled == 0).sum()))

In [0]:
# Number of labels after oversampling.
y_resampled.shape

In [0]:
# Refit the tuned tree on the oversampled training data. This mutates tree_clf
# in place, replacing the fit it had from the grid search.
tree_clf.fit(X_resampled, y_resampled)

In [0]:
# Predict the held-out test rows with the tree refitted on resampled data.
y_pred_tree_smote = tree_clf.predict(X_test)

In [0]:
# Report recall, precision, F1 and accuracy of the SMOTE-trained tree on the
# held-out test partition, framed by separator lines.
separator = '---' * 45
smote_metrics = (
    ('Recall Score: {:.2f}', recall_score),
    ('Precision Score: {:.2f}', precision_score),
    ('F1 Score: {:.2f}', f1_score),
    ('Accuracy Score: {:.2f}', accuracy_score),
)

print(separator)
for template, metric_fn in smote_metrics:
    print(template.format(metric_fn(y_test, y_pred_tree_smote)))
print(separator)

-----

In [0]:
# Load the competition test tables. Malformed rows are skipped with a warning:
# on_bad_lines='warn' is the replacement for the removed
# `error_bad_lines=False` keyword (with its default warn_bad_lines=True), which
# raises TypeError on pandas >= 2.0.
test_iden = pd.read_csv('test_identity.csv', on_bad_lines='warn')
test_trans = pd.read_csv('test_transaction.csv', on_bad_lines='warn')

TODO: test data 전처리 및 모델 스코어 확인 (preprocess the test data, then check the model's score on it)