In [0]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import matplotlib.pyplot as plt

#dir 저장
input_dir = '/content/drive/My Drive/pro_data/'
out_dir   = '/content/drive/My Drive/pro_data/output/'
model_dir    = '/content/drive/My Drive/pro_data/'

#score로 rmsle 사용
def rmsle(predicted_values, actual_values):  #정밀도 0에 가까울수록 예측이 좋다
    predicted_values = np.array(predicted_values)
    actual_values = np.array(actual_values)
    print("actual_values\n", actual_values)
    log_predict = np.log(predicted_values + 1)
    log_actual = np.log(actual_values + 1)

    difference = (log_predict - log_actual) ** 2
    print("rmsle_difference\n",difference)

    mean_difference = difference.mean()
    score = np.sqrt(mean_difference)

    return score
rmsle_scorer = make_scorer(rmsle)
print(rmsle_scorer)

#최적 모델 찾기
def learn_models(X_train, y_train, X_test,Y_test):
    i = 0
    #gradient boosting regression 모델에 사용할 파라미터 지정
    param_grid_gbr = {
        "n_estimators": [2000,3000,4000,5000],
        'learning_rate':[0.01,0.05, 0.1],
        'alpha' :[0.1,0.2,0.3]
    }
    #xgboost regression 모델에 사용할 파라미터 지정
    param_xgr = {
        'learning_rate': [0.06],
        'max_depth': [2, 3],
        'n_estimators': [40000],
        'colsample_bytree': [0.5, 1],
    }
    #cross validation 지정
    shuffle_cv = KFold(n_splits=10, shuffle=True, random_state=2019)
    models = [GradientBoostingRegressor(),xgb.XGBRegressor()]
    model_names =["gbr",'xgr']
    param_grid = [param_grid_gbr,param_xgr]
    rmsle_s =[]

    #모델별로 최적의 하이퍼파라미터 찾기
    for model in models:
        grid_search = GridSearchCV(model, param_grid=param_grid[i], cv=shuffle_cv, n_jobs=1, verbose=2,
                                   scoring=rmsle_scorer)
        grid_search.fit(X_train, y_train)     #train
        prediction = grid_search.predict(X_test)  # 검증을 위한 테스트
        score = grid_search.best_score_           # 최고 점수 저장
        print(model_names[i] + "최고 점수 : ", score)
        print(model_names[i] + "최고 점수를 낸 파라미터 : ", grid_search.best_params_)
        print(model_names[i] + "최고 점수를 낸 파라미터를 가진 모형 : ", grid_search.best_estimator_)
        rmsle_s.append(score)
        joblib.dump(grid_search, model_dir + f'{model_names[i]}_log_reg.pkl') # 최고 점수를 낸 모델 저장
        i += 1
    d = {'Model': model_names, 'RMSLE': rmsle_s}
    print("d==",d)
    #gbr 최고 성능 모델과 xgr 최고 성능 모델 스코어 비교 후 더 좋은 모델 이름 return
    if rmsle_s[0] < rmsle_s[1]:
        print('best: '+model_names[0]+str(rmsle_s[0]))
        return model_names[0]
    else:
        print('best: '+model_names[1]+str(rmsle_s[1]))
        return model_names[1]

scaler = MinMaxScaler()
#예측 함수
def predict_data():
    pred_raw =pd.read_csv(input_dir + "nspdatatest12.csv")
    Xp = pred_raw.iloc[:, 0:2]
    Xp_scale = scaler.transform(Xp)
    #저장해놓은 최고 성능 모델 load
    grid_from_joblib = joblib.load(model_dir+f'{best_model}_log_reg.pkl')
    test_y_pred = grid_from_joblib.predict(Xp_scale)  #predict

    pred_df05 = pd.DataFrame()
    pred_df05['test_y_pred'] = test_y_pred
    pred_df05.to_csv(out_dir+'nsp_predict.csv')

#사용할 data 불러오기
dataset = pd.read_csv('/content/drive/My Drive/pro_data/nspdata12.csv')

#X, Y 편집
X = dataset.iloc[:,:2] #독립변수
X = X.dropna(axis=0)   #null 값 제거
print(X.head())
Y = dataset['correct'] #종속변수
Y = Y.dropna(axis=0)   #null 값 제거
print(Y.head())
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=1004)
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

#베스트 모델 찾기
best_model = learn_models(X_train_scale, Y_train, X_test_scale, Y_test)

predict_data()

make_scorer(rmsle)
     onetwo  twothree
0  0.977854  0.297051
1  0.050302  0.736681
2  0.995564  0.019322
3  0.882049  0.846484
4  0.949409  0.112150
0    0.5883
1    0.3640
2    0.4617
3    0.5927
4    0.4341
Name: correct, dtype: float64
Fitting 10 folds for each of 36 candidates, totalling 360 fits
[CV] alpha=0.1, learning_rate=0.01, n_estimators=2000 ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


actual_values
 [0.45827053 0.46168464 0.65972398 0.46948909]
rmsle_difference
 [0.00164549 0.00020511 0.01321284 0.01860917]
[CV] . alpha=0.1, learning_rate=0.01, n_estimators=2000, total=   0.4s
[CV] alpha=0.1, learning_rate=0.01, n_estimators=2000 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


actual_values
 [0.4393466  0.52911239 0.3817373  0.40071274]
rmsle_difference
 [0.00273338 0.02191615 0.00022888 0.00181637]
[CV] . alpha=0.1, learning_rate=0.01, n_estimators=2000, total=   0.4s
[CV] alpha=0.1, learning_rate=0.01, n_estimators=2000 ................
actual_values
 [0.59870458 0.56269999 0.45489514 0.32551738]
rmsle_difference
 [0.00156471 0.00268414 0.02507092 0.01852802]
[CV] . alpha=0.1, learning_rate=0.01, n_estimators=2000, total=   0.4s
[CV] alpha=0.1, learning_rate=0.01, n_estimators=2000 ................
actual_values
 [0.4914195  0.42393405 0.38918625 0.54359551]
rmsle_difference
 [1.51678403e-02 5.06086169e-05 2.49507106e-03 9.26053958e-06]
[CV] . alpha=0.1, learning_rate=0.01, n_estimators=2000, total=   0.5s
[CV] alpha=0.1, learning_rate=0.01, n_estimators=2000 ................
actual_values
 [0.54231831 0.4257837  0.50772788 0.45469207]
rmsle_difference
 [0.00024004 0.01165161 0.00300599 0.0008219 ]
[CV] . alpha=0.1, learning_rate=0.01, n_estimators=2000, t

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  4.1min finished


gbr_최고 점수 :  0.0767116125602502
gbr_최고 점수를 낸 파라미터 :  {'alpha': 0.3, 'learning_rate': 0.1, 'n_estimators': 2000}
gbr_최고 점수를 낸 파라미터를 가진 모형 :  GradientBoostingRegressor(alpha=0.3, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=2000,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
(5, 2)
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] colsample_bytree=0.5, learning_rate=0.06, max_depth=2, n_estimators=40000 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


actual_values
 [0.4579679  0.47625956 0.7617298  0.4846588 ]
rmsle_difference
 [0.0016287  0.00058776 0.03048233 0.02151668]
[CV]  colsample_bytree=0.5, learning_rate=0.06, max_depth=2, n_estimators=40000, total=   1.1s
[CV] colsample_bytree=0.5, learning_rate=0.06, max_depth=2, n_estimators=40000 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


actual_values
 [0.33834034 0.53093743 0.30791938 0.46973488]
rmsle_difference
 [1.56351823e-02 2.22707421e-02 4.90456721e-03 3.00511656e-05]
[CV]  colsample_bytree=0.5, learning_rate=0.06, max_depth=2, n_estimators=40000, total=   1.1s
[CV] colsample_bytree=0.5, learning_rate=0.06, max_depth=2, n_estimators=40000 
actual_values
 [0.5805057  0.5636623  0.44020408 0.32578862]
rmsle_difference
 [0.00079004 0.00274831 0.02838785 0.01847237]
[CV]  colsample_bytree=0.5, learning_rate=0.06, max_depth=2, n_estimators=40000, total=   1.1s
[CV] colsample_bytree=0.5, learning_rate=0.06, max_depth=2, n_estimators=40000 
actual_values
 [0.48883602 0.3823855  0.44959164 0.50680095]
rmsle_difference
 [0.0147438  0.00134886 0.00855892 0.00073814]
[CV]  colsample_bytree=0.5, learning_rate=0.06, max_depth=2, n_estimators=40000, total=   1.1s
[CV] colsample_bytree=0.5, learning_rate=0.06, max_depth=2, n_estimators=40000 
actual_values
 [0.50662875 0.44019946 0.40421504 0.39117157]
rmsle_difference
 [0.00

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   42.4s finished


xgr_최고 점수 :  0.08351729535036527
xgr_최고 점수를 낸 파라미터 :  {'colsample_bytree': 0.5, 'learning_rate': 0.06, 'max_depth': 2, 'n_estimators': 40000}
xgr_최고 점수를 낸 파라미터를 가진 모형 :  XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0,
             importance_type='gain', learning_rate=0.06, max_delta_step=0,
             max_depth=2, min_child_weight=1, missing=None, n_estimators=40000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)
(5, 2)
d==== {'Modelling Algo': ['gbr', 'xgr'], 'RMSLE': [0.0767116125602502, 0.08351729535036527]}
best: gbr0.0767116125602502


In [0]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import matplotlib.pyplot as plt

#dir 저장
input_dir = '/content/drive/My Drive/pro_data/'
out_dir   = '/content/drive/My Drive/pro_data/output/'
model_dir    = '/content/drive/My Drive/pro_data/'

#최적 모델 찾기
def learn_models(X_train, y_train, X_test,Y_test):
    i = 0
    param_grid_gbr = {
        "n_estimators": [100,500,1000,3000, 5000],
        'learning_rate':[0.01,0.05, 0.1],
        'subsample' :[0.5,0.7,0.9]
    }
    param_xgr = {
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [2, 3],
        'subsample' :[0.5,0.7,0.9],
        'colsample_bytree': [0.5, 1],
    }
    shuffle_cv = KFold(n_splits=5, shuffle=True, random_state=2019)
    models = [GradientBoostingClassifier(random_state=2019),xgb.XGBClassifier()]  #Classifier
    model_names =["gbr",'xgr']
    param_grid = [param_grid_gbr,param_xgr]
    accu_s =[]
    for model in models:
        grid_search = GridSearchCV(model, param_grid=param_grid[i], cv=shuffle_cv, n_jobs=1, verbose=5,
                                   scoring='accuracy')
        grid_search.fit(X_train, y_train)
        prediction = grid_search.predict(X_test)  # 검증을 위한 테스트
        score = grid_search.best_score_
        print(model_names[i] + "최고 점수 : ", score)
        print(model_names[i] + "최고 점수를 낸 파라미터 : ", grid_search.best_params_)
        print(model_names[i] + "최고 점수를 낸 파라미터를 가진 모형 : ", grid_search.best_estimator_)
        accu_s.append(score)
        joblib.dump(grid_search, model_dir + f'{model_names[i]}_cl.pkl')
        i += 1
    d = {'Model': model_names, 'Accuracy': accu_s}
    print("d==",d)
    if accu_s[0] > accu_s[1]:
        print('best: '+model_names[0]+str(accu_s[0]))
        return model_names[0]
    else:
        print('best: '+model_names[1]+str(accu_s[1]))
        return model_names[1]

scaler = StandardScaler()
def predict_data():
    pred_raw=pd.read_csv(input_dir + "nspdatatestcl.csv")
    Xp = pred_raw.iloc[:, 0:2]
    Xp_scale = scaler.transform(Xp)
    grid_from_joblib = joblib.load(model_dir+f'{best_model}_cl.pkl')
    if best_model == 'xgr':
      Xp.columns = ['f0', 'f1']
    test_y_pred = grid_from_joblib.predict(Xp_scale)
    pred_df = pd.DataFrame()
    pred_df['test_y_pred'] = test_y_pred
    pred_df.to_csv(out_dir+'nsp_predictcl2.csv')


dataset = pd.read_csv('/content/drive/My Drive/pro_data/nspdataclv2.csv')

X = dataset.iloc[:,:2]
X = X.dropna(axis=0)
print(X.head())
Y = dataset['correct']
Y = Y.dropna(axis=0)
print(Y.head())
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=1004)
scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

best_model = learn_models(X_train_scale, Y_train, X_test_scale, Y_test)

predict_data()

     onetwo  twothree
0  0.435944  0.448416
1  0.459359  0.452463
2  0.387551  0.443144
3  0.452032  0.455800
4  0.453465  0.456102
0    1.0
1    0.0
2    0.0
3    1.0
4    0.0
Name: correct, dtype: float64
Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] learning_rate=0.01, n_estimators=100, subsample=0.5 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.5, score=0.875, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.5 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.5, score=0.500, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.5 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.5, score=0.375, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.5 .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s


[CV]  learning_rate=0.01, n_estimators=100, subsample=0.5, score=0.750, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.5 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.5, score=0.875, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.7 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.7, score=0.750, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.7 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.7, score=0.750, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.7 .............


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s


[CV]  learning_rate=0.01, n_estimators=100, subsample=0.7, score=0.375, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.7 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.7, score=0.750, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.7 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.7, score=0.875, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.9 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.9, score=0.875, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.9 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.9, score=0.750, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.9 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.9, score=0.500, total=   0.1s
[CV] learning_rate=0.01, n_estimators=100, subsample=0.9 .............
[CV]  learning_rate=0.01, n_estimators=100, subsample=0.9,

[Parallel(n_jobs=1)]: Done 225 out of 225 | elapsed:  2.8min finished


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] colsample_bytree=0.5, learning_rate=0.01, max_depth=2, subsample=0.5 
[CV]  colsample_bytree=0.5, learning_rate=0.01, max_depth=2, subsample=0.5, score=0.500, total=   0.0s
[CV] colsample_bytree=0.5, learning_rate=0.01, max_depth=2, subsample=0.5 
[CV]  colsample_bytree=0.5, learning_rate=0.01, max_depth=2, subsample=0.5, score=0.500, total=   0.0s
[CV] colsample_bytree=0.5, learning_rate=0.01, max_depth=2, subsample=0.5 
[CV]  colsample_bytree=0.5, learning_rate=0.01, max_depth=2, subsample=0.5, score=0.375, total=   0.0s
[CV] colsample_bytree=0.5, learning_rate=0.01, max_depth=2, subsample=0.5 
[CV]  colsample_bytree=0.5, learning_rate=0.01, max_depth=2, subsample=0.5, score=0.875, total=   0.0s
[CV] colsample_bytree=0.5, learning_rate=0.01, max_depth=2, subsample=0.5 
[CV]  colsample_bytree=0.5, learning_rate=0.01, max_depth=2, subsample=0.5, score=0.500, total=   0.0s
[CV] colsample_bytree=0.5, learning_rate=0.01, m

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s


[CV]  colsample_bytree=0.5, learning_rate=0.01, max_depth=3, subsample=0.7, score=0.500, total=   0.0s
[CV] colsample_bytree=0.5, learning_rate=0.01, max_depth=3, subsample=0.7 
[CV]  colsample_bytree=0.5, learning_rate=0.01, max_depth=3, subsample=0.7, score=0.375, total=   0.0s
[CV] colsample_bytree=0.5, learning_rate=0.01, max_depth=3, subsample=0.7 
[CV]  colsample_bytree=0.5, learning_rate=0.01, max_depth=3, subsample=0.7, score=0.750, total=   0.0s
[CV] colsample_bytree=0.5, learning_rate=0.01, max_depth=3, subsample=0.7 
[CV]  colsample_bytree=0.5, learning_rate=0.01, max_depth=3, subsample=0.7, score=0.625, total=   0.0s
[CV] colsample_bytree=0.5, learning_rate=0.01, max_depth=3, subsample=0.9 
[CV]  colsample_bytree=0.5, learning_rate=0.01, max_depth=3, subsample=0.9, score=0.625, total=   0.0s
[CV] colsample_bytree=0.5, learning_rate=0.01, max_depth=3, subsample=0.9 
[CV]  colsample_bytree=0.5, learning_rate=0.01, max_depth=3, subsample=0.9, score=0.500, total=   0.0s
[CV] co

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:    1.8s finished


d==== {'Modelling Algo': ['gbr', 'xgr'], 'Accuracy': [0.775, 0.7]}
best: gbr0.775
