In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from matplotlib.pyplot import figure
figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')


np.random.seed(123)

# Generate train data
X_inliers = 0.3 * np.random.randn(100, 2) ## 정규분포에서 100*2만들고
X_inliers = np.r_[X_inliers + 2, 2*X_inliers - 2] ## 각각 2,2 혹은 -2,-2만큼 평행이동한거를 vstack. 즉 cluster 2개

# Generate some outliers
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.r_[X_inliers, X_outliers] ##-4,4에서 뽑은 outlier와 inlier를 vstack

n_outliers = len(X_outliers)
ground_truth = np.ones(len(X), dtype=int)
ground_truth[-n_outliers:] = -1

# fit the model for outlier detection (default)
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
## use fit_predict to compute the predicted labels of the training samples
## (when LOF is used for outlier detection, the estimator has no predict, decision_function and score_samples methods).
y_pred = clf.fit_predict(X) # 1,-1로 나온다.
n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_ ## 음수 LOF scsore

plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color='black', s=3., label='Data points')
# plot circles with radius proportional to the outlier scores
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(X[:, 0], X[:, 1], s=500 * radius, edgecolors='g',
            facecolors='none', label='Outlier scores')
n = np.copy(X_scores)
n[n>-2] = np.nan ## LOF scsore(음수)의 threshold는 2
n = np.round(n,2)
n = -1 * n ## 양수변환

# LOF만 txt표기
for i, txt in enumerate(n):
    if np.isnan(txt):continue
    plt.annotate(txt, (X[i,0], X[i,1]))
legend = plt.legend(loc='upper left')
plt.show()


In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
os.chdir('/content/drive/Othercomputers/내 MacBook Air/MLDL/project/open')

In [3]:
train = pd.read_csv('./train.csv')
val = pd.read_csv('./val.csv')
test = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')

In [4]:
val_normal = val[val['Class'] == 0].iloc[:, 1:]
val_outlier = val[val['Class'] == 1].iloc[:, 1:]

val_normal = val_normal.drop(['Class'], axis=1)
val_outlier = val_outlier.drop(['Class'], axis=1)

val_x = val.drop(['Class'], axis=1).iloc[:, 1:]
val_label = val['Class']

In [5]:
train_x = train.iloc[:, 1:]
test_x = test.iloc[:, 1:]

In [6]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from matplotlib.pyplot import figure
figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')

np.random.seed(123)

<Figure size 640x480 with 0 Axes>

In [26]:
from sklearn.preprocessing import StandardScaler  # 표준화 
std = StandardScaler()
val_x_std = std.fit_transform(val_x) # 독립변인 표준화

In [49]:
# 이상치 탐지를위한 모델 적합 (기본값: n_neighbors=20, contamination=0.1)
clf = LocalOutlierFactor(n_neighbors=50, contamination=0.001)

# 훈련 샘플의 예측 레이블을 계산하기 위해 fit_predict 사용
# LOF가 이상치 탐지에 사용되는 경우 추정기는 예측(predict)이 없습니다.
# 결정 _ 함수 및 점수 _ 샘플 방법
y_pred = clf.fit_predict(val_x)
valid_scores = clf.negative_outlier_factor_

표준화 전 : 0.0822

표준화 후 : 0.0805

In [15]:
def get_pred_label(model_pred):
    # LOF 모델 출력 (1:정상, -1:이상치) 이므로 (0:정상, 1:불량(사기))로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

In [50]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import classification_report

val_pred = get_pred_label(y_pred)
val_score = f1_score(val_label, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_label, val_pred))

Validation F1 Score : [0.8218492504549133]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.66      0.63      0.64        30

    accuracy                           1.00     28462
   macro avg       0.83      0.82      0.82     28462
weighted avg       1.00      1.00      1.00     28462



In [51]:
n_errors = (val_label != val_pred).sum()
n_errors

21

In [52]:
valid_scores

array([-1.08621209, -1.26158085, -1.13228744, ..., -1.02632746,
       -1.08902685, -1.22347056])

In [53]:
# train 셋에 적합
clf = LocalOutlierFactor(n_neighbors=50, contamination=0.001)
train_y_pred = clf.fit_predict(val_x)
train_LOF_scores = clf.negative_outlier_factor_

In [70]:
train_label = get_pred_label(train_y_pred)
print('train_normal:', len(train_label[train_label == 0]))
print('train_outlier:', train_label[train_label == 1].sum())

train_normal: 28433
train_outlier: 29
