In [None]:
import pandas as pd
# Load the dataset from a CSV file (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv("data_0104.csv")
df.head()

In [None]:
# Count missing values in each column.
df.isnull().sum()

In [None]:
# Show pandas' default summary statistics for the frame.
df.describe()

In [None]:
# X : feature frame, i.e. the data with the label column '정체' removed (used for training).
# y : the label column '정체' on its own.
# X2: a separate copy of the raw features, kept for later feature-importance work.
#     A chained assignment (X = X2 = ...) would bind both names to the very same
#     object, so an in-place change through one name would silently alter the other.
X = df.drop(columns = '정체')
X2 = X.copy()
y = df['정체']
X

In [None]:
# Label encoding: 0 = congested, 1 = flowing smoothly.
y

In [None]:
# One-hot encode every feature column; drop_first=True drops the first level of
# each column so k levels become k-1 indicator columns.
# The encoding reads from X2 (the un-encoded feature frame created together with X)
# instead of from X itself, so re-running this cell does not encode columns that
# have already been encoded.
X = pd.get_dummies(X2, columns = X2.columns, drop_first = True)
X.head()

In [None]:
# Hold out 30% of the data as the test set (fixed seed for a reproducible split).
from sklearn.model_selection import train_test_split
# NOTE(review): no stratify= argument is passed, so the class ratio in each split
# is left to chance — consider stratify=y if the labels are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
# Oversample the training split with SMOTE; the test split is left untouched.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state = 11)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

# Shapes before and after resampling.
for tag, features, labels in (
    ('SMOTE 적용 전', X_train, y_train),
    ('SMOTE 적용 후', X_train_over, y_train_over),
):
    print(tag, features.shape, labels.shape)

# Label distribution before and after resampling.
for heading, labels in (
    ('SMOTE 적용 전 레이블 값 분포:\n', y_train),
    ('SMOTE 적용 후 레이블 값 분포:\n', y_train_over),
):
    print(heading, pd.Series(labels).value_counts())

In [None]:
# Standardise each feature to zero mean and unit variance.
# The scaler learns its statistics from the oversampled training set only and the
# same transformation is then applied to the test set.
# Note: both names are rebound to the scaled arrays, so this cell should be run
# once per fresh split (running it again would scale already-scaled data).
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train_over)

X_train_over = sc.transform(X_train_over)
X_test = sc.transform(X_test)

In [None]:
# Dimensionality reduction: project the scaled features onto 2 principal components.
# The projection is fitted on the training set and reused for the test set.
# random_state pins the result whenever scikit-learn picks a randomized/arpack
# solver; it has no effect with the exact solvers.
# NOTE(review): the classifier trained afterwards sees only these 2 components, so
# any feature importances it reports refer to components, not original columns.
from sklearn.decomposition import PCA
pca = PCA(n_components = 2, random_state = 42)

X_train_over = pca.fit_transform(X_train_over)
X_test = pca.transform(X_test)

In [None]:
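# Helper that prints the accuracy score, classification report, confusion matrix, etc.
# What is printed depends on whether the training set or the test set is selected.
# With train=True it additionally runs 10-fold cross-validation using all CPU cores
# and prints the mean and standard deviation of the fold accuracies.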

from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

def print_score(classifier, X_train_over, y_train_over, X_test, y_test, train = True):
    """Print evaluation metrics for an already-fitted classifier.

    Prints the accuracy score, the classification report and the confusion
    matrix for either the training data or the test data. For the training
    data it also prints the mean and standard deviation of a 10-fold
    cross-validated accuracy computed on all available CPU cores.

    Parameters
    ----------
    classifier : estimator
        Fitted estimator exposing ``predict``; it is also handed to
        ``cross_val_score`` when ``train`` is truthy.
    X_train_over, y_train_over : array-like
        Training features and labels, used when ``train`` is truthy.
    X_test, y_test : array-like
        Test features and labels, used when ``train`` is falsy.
    train : bool, default True
        Selects which data set is reported.

    Returns
    -------
    None
        Everything is written to standard output.

    Notes
    -----
    NOTE(review): if the training data was oversampled before being passed in,
    the validation folds contain synthetic samples and the cross-validated
    accuracy may be optimistic — confirm against the caller.
    """
    if train:
        header, features, labels = "Training results:\n", X_train_over, y_train_over
    else:
        header, features, labels = "Test results:\n", X_test, y_test

    print(header)

    # Predict once and reuse the result for all three metrics.
    predictions = classifier.predict(features)

    print("Accuracy Score: {0:.4f}\n".format(accuracy_score(labels, predictions)))
    print("Classification Report:\n{}\n".format(classification_report(labels, predictions)))
    print("Confusion Matrix:\n{}\n".format(confusion_matrix(labels, predictions)))

    if train:
        # 10-fold cross-validation on the training data, parallelised over all cores.
        res = cross_val_score(classifier, features, labels, cv = 10, n_jobs = -1, scoring = 'accuracy')

        print("Average Accuracy:\t{0:.4f}\n".format(res.mean()))
        print("Standard Deviation:\t{0:.4f}\n".format(res.std()))

In [None]:
# Random forest of 100 decision trees using the entropy split criterion,
# seeded so repeated runs build the same forest.
from sklearn.ensemble import RandomForestClassifier
rf_params = {'n_estimators': 100, 'criterion': 'entropy', 'random_state': 42}
classifier = RandomForestClassifier(**rf_params)
classifier.fit(X_train_over, y_train_over)

In [None]:
# Report metrics on the training data, including the cross-validated accuracy.
print_score(classifier, X_train_over, y_train_over, X_test, y_test, train = True)

In [None]:
# Report metrics on the held-out test data.
print_score(classifier, X_train_over, y_train_over, X_test, y_test, train = False)

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Score the test set: keep the second predict_proba column, i.e. the predicted
# probability of classifier.classes_[1], and compute the ROC AUC from it.
# (The lr_ prefix is kept because later cells read these names; the model scored
# here is the random forest.)
lr_probs = classifier.predict_proba(X_test)[:, 1]
lr_auc = roc_auc_score(y_test, lr_probs)

In [None]:
# ROC curve points (false / true positive rates) for the test set; the returned
# thresholds array is discarded.
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

In [None]:
import matplotlib.pyplot as plt
# Draw the ROC curve on an explicit Figure/Axes pair so the figure stands on its
# own: titled, labelled, and with the chance diagonal as a visual reference.
fig, ax = plt.subplots()
ax.plot(lr_fpr, lr_tpr, marker='.', label='RandomForest')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Chance')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

print('Random Forest: ROC AUC = %.3f' % (lr_auc))