In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


# 데이터 선택

In [None]:
# Candidate feature matrices, one file per preprocessing variant.
# The original cell had an extra quote on the "standard" and "try" paths
# (a SyntaxError) and reloaded x five times, keeping only the last load.
X_PATHS = {
    'raw': './data/x.npy',
    'percentile': './data/x_percentile.npy',
    'standard': './data/x_standard.npy',
    'try': './data/x_try.npy',
    'robust': './data/x_robust.npy',
}

# Change this key to switch datasets. 'robust' matches the final assignment
# of the original cell, i.e. the array every later cell ended up using.
X_VARIANT = 'robust'

x = np.load(X_PATHS[X_VARIANT])

## 데이터 읽기

- **y** 는 LabelEncoder를 적용한 라벨 **숫자** 데이터
- **yy** 는 글자로 지정된 라벨 **문자** 데이터
- **pred** 는 만들어진 모델에 적용할 테스트(test) 데이터

In [None]:
# Submission template; its index and columns are reused for every submission file.
sample_submission = pd.read_csv('./data/sample_submission.csv', index_col=0)

y = np.load('./data/y.npy')        # label-encoded (numeric) targets
pred = np.load('./data/pred.npy')  # test features the fitted models are applied to
yy = pd.read_csv('./data/yy.csv', header=None)  # string labels

# Feature names, one per line; used as bar labels in the importance plots.
path = './data/column_name.txt'
with open(path, 'r') as f:
    col_name = f.read()

x_name = col_name.split('\n')
# Drop the entry at index 20 when it exists. An explicit length check replaces
# the former bare `except: pass`, which would also have hidden unrelated errors.
# NOTE(review): presumably this is the empty string left by a trailing
# newline in the file - confirm against column_name.txt.
if len(x_name) > 20:
    del x_name[20]

# To_categorical

In [22]:
# One-hot encoding of y is left commented out: the Modeling notes in this
# notebook report that `to_categorical` gave the lowest accuracy.
#from keras.utils import to_categorical
#y=to_categorical(y)

# Train_test_Split

In [None]:
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV

# Keep 80% of the rows for training and hold out the rest for evaluation.
# The fixed random_state makes the shuffled split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=0,
)

# Modeling
- 1. RandomForestClassifier

Note: applying `to_categorical` to `y` gave the lowest accuracy, so that step is left disabled.

In [None]:
# Hyper-parameter search for the random forest.
# Every source of randomness (fold shuffling, the forest itself, candidate
# sampling) is seeded with 0, the same seed train_test_split uses, so the
# reported accuracy and best_params_ are repeatable across runs.
kfold_cv = KFold(n_splits=5, shuffle=True, random_state=0)
parameters = {"n_estimators": [100, 300, 500], "max_depth": [4, 8]}

clf1 = RandomForestClassifier(random_state=0)
n_iter_search = 6  # equals the grid size (3 x 2)

clf1 = RandomizedSearchCV(
    clf1,
    param_distributions=parameters,
    cv=kfold_cv,
    n_iter=n_iter_search,
    n_jobs=4,
    random_state=0,
)
clf1.fit(x_train, y_train)
y_pr1 = clf1.predict(x_test)

print("Acc:", accuracy_score(y_test, y_pr1))
print(clf1.best_params_)

In [None]:
# Refit a forest with the best hyper-parameters found by the search.
forest = RandomForestClassifier(**clf1.best_params_)
forest.fit(x_train, y_train)

res1 = forest.score(x_test, y_test)  # score() returns accuracy only
accuracy_pct = res1 * 100
print("RF accuracy: {:.3f}".format(accuracy_pct))

# Feature importances are drawn as two bar charts, one per half of the
# feature-name list.
mid = len(x_name) // 2
importances = forest.feature_importances_

for names, values in (
    (x_name[:mid], importances[:mid]),
    (x_name[mid:], importances[mid:]),
):
    plt.figure(figsize=(15, 6))
    plt.bar(names, values)
    plt.show()

In [None]:
# Class probabilities from the random forest for the test data.
y_pred1 = forest.predict_proba(pred)

# Build the submission file, reusing the template's index and columns.
submission1 = pd.DataFrame(
    data=y_pred1,
    index=sample_submission.index,
    columns=sample_submission.columns,
)
submission1.to_csv('./data/submission_data_1.csv', index=True)

- 2. LGBM

In [None]:
# Hyper-parameter search for LightGBM.
# Fold shuffling and candidate sampling are seeded with 0 (the seed
# train_test_split uses) so the search is repeatable.
kfold_cv = KFold(n_splits=5, shuffle=True, random_state=0)
parameters = {"n_estimators": [100, 300, 500], "max_depth": [4, 8]}

clf2 = LGBMClassifier()
n_iter_search = 6  # equals the grid size (3 x 2)

clf2 = RandomizedSearchCV(
    clf2,
    param_distributions=parameters,
    cv=kfold_cv,
    n_iter=n_iter_search,
    n_jobs=4,
    random_state=0,
)
clf2.fit(x_train, y_train)
y_pr2 = clf2.predict(x_test)

# Score this model's own predictions; the cell previously passed y_pr1
# (the random-forest predictions) and so reported the wrong accuracy.
print("Acc:", accuracy_score(y_test, y_pr2))
print(clf2.best_params_)

In [None]:
# Refit LightGBM with the best hyper-parameters found by the search.
LGBM = LGBMClassifier(**clf2.best_params_)

# NOTE(review): the same held-out split drives early stopping and is scored
# below, so the printed accuracy is likely optimistic.
eval_set = [(x_test, y_test)]
# LightGBM's multi-class log-loss metric is named "multi_logloss";
# "mlogloss" is the XGBoost spelling and is not a LightGBM metric name.
# NOTE(review): fit-time `verbose` / `early_stopping_rounds` are only accepted
# by older LightGBM releases; newer ones expect callbacks - confirm the
# installed version.
LGBM.fit(x_train, y_train, eval_metric="multi_logloss", eval_set=eval_set, verbose=True,early_stopping_rounds=50)
res2=LGBM.score(x_test, y_test)  # accuracy on the held-out split
print("LGBM accuracy: {:.3f}".format(res2*100))
from lightgbm import plot_importance
plot_importance(LGBM)

In [None]:
# Class probabilities from LightGBM for the test data.
y_pred2 = LGBM.predict_proba(pred)

# Build the submission file, reusing the template's index and columns.
submission2 = pd.DataFrame(
    data=y_pred2,
    index=sample_submission.index,
    columns=sample_submission.columns,
)
submission2.to_csv('./data/submission_data_2.csv', index=True)

- 3. XGB

In [None]:
# Hyper-parameter search for XGBoost.
# Fold shuffling and candidate sampling are seeded with 0 (the seed
# train_test_split uses) so the search is repeatable.
kfold_cv = KFold(n_splits=5, shuffle=True, random_state=0)
parameters = {"n_estimators": [100, 300, 500], "max_depth": [4, 8]}

clf3 = XGBClassifier()
n_iter_search = 6  # equals the grid size (3 x 2)

clf3 = RandomizedSearchCV(
    clf3,
    param_distributions=parameters,
    cv=kfold_cv,
    n_iter=n_iter_search,
    n_jobs=4,
    random_state=0,
)
clf3.fit(x_train, y_train)
y_pr3 = clf3.predict(x_test)

print("Acc:", accuracy_score(y_test, y_pr3))
print(clf3.best_params_)

In [None]:
# Refit XGBoost with the best hyper-parameters found by the search.
XGB = XGBClassifier(**clf3.best_params_)

# The held-out split is monitored during training for early stopping.
eval_set = [(x_test, y_test)]
XGB.fit(
    x_train,
    y_train,
    eval_set=eval_set,
    eval_metric="mlogloss",
    early_stopping_rounds=50,
    verbose=True,
)

res3 = XGB.score(x_test, y_test)  # accuracy on the held-out split
accuracy_pct = res3 * 100
print("XGB accuracy: {:.3f}".format(accuracy_pct))

from xgboost import plot_importance
plot_importance(XGB)

In [None]:
# Class probabilities from XGBoost for the test data.
y_pred3 = XGB.predict_proba(pred)

# Build the submission file, reusing the template's index and columns.
submission3 = pd.DataFrame(
    data=y_pred3,
    index=sample_submission.index,
    columns=sample_submission.columns,
)
submission3.to_csv('./data/submission_data_3.csv', index=True)