In [10]:
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [2]:
X_train = pd.read_csv('Day_048_HW/train.csv', header=None)
Y_train = pd.read_csv('Day_048_HW/trainLabels.csv', header=None)
X_test = pd.read_csv('Day_048_HW/test.csv', header=None)
print(X_train.shape)
print(Y_train.shape)

(1000, 40)
(1000, 1)


In [3]:
Scaler = MinMaxScaler()
X_train = Scaler.fit_transform(X_train)
X_test = Scaler.fit_transform(X_test)

In [4]:
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.25, random_state=42)

# Build a model by defaule parameter

## GBDT

In [5]:
gbdt = GradientBoostingClassifier()
gbdt.fit(X = x_train, y = y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [6]:
print(f'AUC: {roc_auc_score(y_val[0], gbdt.predict_proba(x_val)[:,1])}')
print(f'F1-score: {f1_score(y_val[0], gbdt.predict(x_val))}')

AUC: 0.9381410256410256
F1-score: 0.8790322580645161


## RF

In [8]:
rf = RandomForestClassifier()
rf.fit(X = x_train, y = y_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
print(f'AUC: {roc_auc_score(y_val[0], rf.predict_proba(x_val)[:,1])}')
print(f'F1-score: {f1_score(y_val[0], rf.predict(x_val))}')

AUC: 0.9045192307692308
F1-score: 0.8245614035087719


# Search best Parameter

## GBDT

In [13]:
gbdt_GridSearch = GridSearchCV(estimator = GradientBoostingClassifier(),
                               param_grid = dict(learning_rate = [0.1, 0.05, 0.01],
                                                 max_depth = [2, 3, 4, 5],
                                                 n_estimators = [100,200,300]),
                               scoring="neg_mean_squared_error",
                               n_jobs=-1,
                               verbose=1)

# 開始搜尋最佳參數
gbdt_GridSearch_result = gbdt_GridSearch.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:   23.3s finished
  y = column_or_1d(y, warn=True)


In [23]:
gbdt_GridSearch_result.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [14]:
print(f'AUC: {roc_auc_score(y_val[0], gbdt_GridSearch_result.predict_proba(x_val)[:,1])}')
print(f'F1-score: {f1_score(y_val[0], gbdt_GridSearch_result.predict(x_val))}')

AUC: 0.9451282051282052
F1-score: 0.8825910931174088


## RF

In [20]:
rf_GridSearch = GridSearchCV(estimator = RandomForestClassifier(),
                               param_grid = dict(n_estimators = [10, 100, 300, 500],
                                                 max_features = [5, 6, 7, 8]),
                               scoring="neg_mean_squared_error",
                               n_jobs=-1,
                               verbose=1)

# 開始搜尋最佳參數
rf_GridSearch_result = rf_GridSearch.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   13.2s finished
  self.best_estimator_.fit(X, y, **fit_params)


In [22]:
rf_GridSearch_result.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
print(f'AUC: {roc_auc_score(y_val[0], rf_GridSearch_result.predict_proba(x_val)[:,1])}')
print(f'F1-score: {f1_score(y_val[0], rf_GridSearch_result.predict(x_val))}')

AUC: 0.9419230769230769
F1-score: 0.8790322580645161


# export


In [38]:
gbdt_output = gbdt.predict(X_test)
gbdt_output = pd.DataFrame({'Id': list(range(1,len(gbdt_output)+1)), 'Solution': gbdt_output})
gbdt_output.to_csv('./Day_048_HW/gbdt_output.csv', index = False)

In [39]:
rf_output = rf.predict(X_test)
rf_output = pd.DataFrame({'Id': list(range(1,len(rf_output)+1)), 'Solution': rf_output})
rf_output.to_csv('./Day_048_HW/rf_output.csv', index = False)