In [1]:
import pandas as pd
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


In [13]:
X_train = pd.read_csv('x_train.csv', sep=';', header = -1) 
X_test = pd.read_csv('x_test.csv', sep=';', header = -1)
y_train = pd.read_csv('y_train.csv', sep=';', names = ['class'])
y_train = y_train.squeeze()

In [5]:
### берем необученную модель
skf_cv = StratifiedShuffleSplit(n_splits=8, test_size=0.2)
rfc = RandomForestClassifier(n_estimators=30, random_state=0)

In [6]:
### отбираем признаки
sfs3 = SFS(rfc, 
           k_features=25, 
           forward=True,
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=skf_cv,
          n_jobs=-1)

sfs3.fit(X_train.as_matrix(), y_train.as_matrix())


[2017-05-20 22:42:38] Features: 1/25 -- score: 0.502328080229
[2017-05-20 22:54:18] Features: 2/25 -- score: 0.518624641834
[2017-05-20 23:04:52] Features: 3/25 -- score: 0.587750716332
[2017-05-20 23:14:48] Features: 4/25 -- score: 0.599032951289
[2017-05-20 23:24:59] Features: 5/25 -- score: 0.609061604585
[2017-05-20 23:36:11] Features: 6/25 -- score: 0.618194842407
[2017-05-20 23:47:10] Features: 7/25 -- score: 0.609419770774
[2017-05-20 23:57:39] Features: 8/25 -- score: 0.610494269341
[2017-05-21 00:08:57] Features: 9/25 -- score: 0.620881088825
[2017-05-21 00:19:29] Features: 10/25 -- score: 0.618373925501
[2017-05-21 00:29:59] Features: 11/25 -- score: 0.621955587393
[2017-05-21 00:41:08] Features: 12/25 -- score: 0.620881088825
[2017-05-21 00:52:02] Features: 13/25 -- score: 0.618373925501
[2017-05-21 01:03:31] Features: 14/25 -- score: 0.609419770774
[2017-05-21 01:14:03] Features: 15/25 -- score: 0.608703438395
STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

SequentialFeatureSelector(clone_estimator=True,
             cv=StratifiedShuffleSplit(n_splits=8, random_state=None, test_size=0.2,
            train_size=None),
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=30, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False),
             floating=False, forward=True, k_features=25, n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='accuracy',
             skip_if_stuck=True, verbose=2)

In [10]:
### лучший результат с 9 признаками, оставляем их
top_feats_1 = list(sfs3.subsets_[9]['feature_idx'])
X_train = X_train[top_feats_1]
X_test = X_test[top_feats_1]

In [11]:
### попытка пойти в обратном направлении
### отбираем признаки
sfs4 = SFS(rfc, 
           k_features=6, 
           forward=False,
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=skf_cv,
          n_jobs=-1)

sfs4.fit(X_train.as_matrix(), y_train.as_matrix())


[2017-05-21 01:23:20] Features: 8/6 -- score: 0.61676217765
[2017-05-21 01:23:49] Features: 7/6 -- score: 0.611926934097
[2017-05-21 01:24:13] Features: 6/6 -- score: 0.617657593123

SequentialFeatureSelector(clone_estimator=True,
             cv=StratifiedShuffleSplit(n_splits=8, random_state=None, test_size=0.2,
            train_size=None),
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=30, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False),
             floating=False, forward=False, k_features=6, n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='accuracy',
             skip_if_stuck=True, verbose=2)

In [23]:
### поиск оптимальных гиперпараметров
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

def score(params):
    print('Params: ')
    print(params)
    xgb_rf = RandomForestClassifier(**params)
    skf_cv = StratifiedKFold(n_splits=5)
    score = -cross_val_score(xgb_rf, X_cleared, y_train, scoring='accuracy', cv=skf_cv).mean()
    print ("\tScore {0}\n\n".format(score))
    return {'loss': score, 'status': STATUS_OK}

def optimize(trials):
    space = {
            'max_features' : hp.quniform('max_features',  0.02, 0.4, 0.02),
            'min_samples_leaf' : hp.choice('min_samples_leaf', np.arange(1, 10, 1, dtype=int)),
            'n_estimators' : hp.choice('n_estimators', np.arange(30, 251, 5, dtype=int)),
            'random_state': 0
    }
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print(best)

In [14]:
### получили лучшие фичи, отделили данные
top_lst = [96, 131, 200, 138, 11, 76, 107, 79, 180, 156]
X_cleared = X_train[top_lst]
X_test_cleared = X_test[top_lst]

In [15]:
### пробуем дотюнивать на отобранных признаках
trials = Trials()
optimize(trials)

NameError: name 'Trials' is not defined

In [31]:
### выбираем лучший набор гиперпараметров
top_params = {'max_features': 0.08, 'n_estimators': 190, 'min_samples_leaf': 1, 'random_state': 0}
rfc = RandomForestClassifier(**top_params)

In [17]:
### глянем на разброс результатов по сидам
skf_cv = StratifiedKFold(n_splits=5)
for rs in [0, 42, 250, 121, 4]:
    top_params['random_state'] = rs
    rfc = RandomForestClassifier(**top_params)
    print(rs, '   ', cross_val_score(rfc, X_cleared, y_train, scoring='accuracy', cv=skf_cv).mean())

0     0.633990556002
42     0.635423642445
250     0.632846061528
121     0.629131381197
4     0.630850174837


In [15]:
### попробуем получить предсказание
rfc.fit(X_cleared, y_train)
answer = rfc.predict(X_test[top_lst])

In [16]:
np.savetxt('tuned_rf_select_from_all_feat_forward.csv', answer, fmt='%i', delimiter='\n')

In [17]:
### сгенерируем полинамиальные фичи на выбранном наборе
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)

In [18]:
X_cleared_and_poly = poly.fit_transform(X_cleared)

In [19]:
X_cleared_test = poly.transform(X_test_cleared)

In [9]:
### новый отбор фич

### отбираем признаки
sfs4 = SFS(rfc, 
           k_features=30, 
           forward=True,
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=skf_cv,
          n_jobs=-1)

sfs4.fit(X_cleared_and_poly, y_train.as_matrix())


[2017-05-19 02:54:19] Features: 1/30 -- score: 0.503861871046
[2017-05-19 03:03:04] Features: 2/30 -- score: 0.524512401392
[2017-05-19 03:11:59] Features: 3/30 -- score: 0.583277691791
[2017-05-19 03:20:25] Features: 4/30 -- score: 0.597312551548
[2017-05-19 03:28:45] Features: 5/30 -- score: 0.613362153132
[2017-05-19 03:37:00] Features: 6/30 -- score: 0.618252109643
[2017-05-19 03:45:05] Features: 7/30 -- score: 0.624256597567
[2017-05-19 03:52:57] Features: 8/30 -- score: 0.626556683809
[2017-05-19 04:00:43] Features: 9/30 -- score: 0.633720874468
[2017-05-19 04:08:19] Features: 10/30 -- score: 0.634857152875
[2017-05-19 04:15:46] Features: 11/30 -- score: 0.634566107292
[2017-05-19 04:23:08] Features: 12/30 -- score: 0.636300898427
[2017-05-19 04:30:16] Features: 13/30 -- score: 0.635718811947
[2017-05-19 04:37:17] Features: 14/30 -- score: 0.639444564845
[2017-05-19 04:44:11] Features: 15/30 -- score: 0.638032437303
[2017-05-19 04:50:56] Features: 16/30 -- score: 0.638600187165


SequentialFeatureSelector(clone_estimator=True,
             cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.08, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=190, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False),
             floating=False, forward=True, k_features=30, n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='accuracy',
             skip_if_stuck=True, verbose=2)

In [35]:
#top_feats2 = [2, 3, 4, 5, 6, 11, 14, 20, 21, 22, 23, 25, 27, 29, 30, 
#              32, 35, 39, 40, 42, 44, 46, 47, 54, 55, 56, 59, 60, 62]
top_feats2 = [2, 3, 4, 6, 11, 14, 20, 21, 22, 
              27, 29, 32, 39, 42, 44, 47, 56]

In [10]:
sfs4.subsets_

{1: {'avg_score': 0.50386187104607039,
  'cv_scores': array([ 0.51857143,  0.50859599,  0.47994269,  0.50932568,  0.50287356]),
  'feature_idx': (4,)},
 2: {'avg_score': 0.52451240139154653,
  'cv_scores': array([ 0.51571429,  0.53295129,  0.52292264,  0.52223816,  0.52873563]),
  'feature_idx': (27, 4)},
 3: {'avg_score': 0.5832776917910486,
  'cv_scores': array([ 0.55857143,  0.57593123,  0.59169054,  0.6241033 ,  0.56609195]),
  'feature_idx': (27, 4, 39)},
 4: {'avg_score': 0.59731255154841356,
  'cv_scores': array([ 0.58714286,  0.58739255,  0.60601719,  0.6169297 ,  0.58908046]),
  'feature_idx': (27, 4, 20, 39)},
 5: {'avg_score': 0.61336215313190956,
  'cv_scores': array([ 0.60428571,  0.61318052,  0.61318052,  0.62984218,  0.60632184]),
  'feature_idx': (42, 27, 4, 20, 39)},
 6: {'avg_score': 0.61825210964275645,
  'cv_scores': array([ 0.59285714,  0.61747851,  0.61891117,  0.6384505 ,  0.62356322]),
  'feature_idx': (4, 20, 39, 42, 27, 44)},
 7: {'avg_score': 0.62425659756746

In [36]:
#top_feats2 = list(sfs4.subsets_[30]['feature_idx'])
X_cleared_2 = pd.DataFrame(X_cleared_and_poly)[top_feats2]
X_test_cleared_2 = pd.DataFrame(X_cleared_test)[top_feats2]

In [37]:
X_cleared_2.shape

(3489, 17)

In [30]:
np.savetxt('X_train_top.csv', X_cleared_2, fmt='%f', delimiter=';')
np.savetxt('X_test_top.csv', X_test_cleared_2, fmt='%f', delimiter=';')

In [38]:
np.savetxt('X_train_top_2.csv', X_cleared_2, fmt='%f', delimiter=';')
np.savetxt('X_test_top_2.csv', X_test_cleared_2, fmt='%f', delimiter=';')

In [32]:
### проверка разброса по сидам
top_params['n_estimators'] = 300
seeds = [0, 42, 250, 121, 4, 78, 145, 38, 201, 17]
for i in range(10):
    top_params['random_state'] = seeds[i]
    rfc = RandomForestClassifier(**top_params)
    print(cross_val_score(rfc, X_cleared_2, y_train, scoring='accuracy', cv=skf_cv).mean())


0.635924068768
0.637893982808
0.637893982808
0.630372492837
0.634312320917
0.633775071633
0.630193409742
0.640759312321


KeyboardInterrupt: 

In [13]:
### пробуем сделать предсказание
rfc.fit(X_cleared_2, y_train)
ans = rfc.predict(X_test_cleared_2)

In [14]:
np.savetxt('tuned_rf_double_select_forward_30feats.csv', ans, fmt='%i', delimiter='\n')

In [15]:
import stacking

##### Создаем файлы разных типов для последующего стэкинга/голосования

In [16]:
### классификация с усреднением
X_train_new, X_test_new, cval_sc = stacking.one_classifier_voting_stacking(X_cleared_2, y_train, X_test_cleared_2, rfc)
np.savetxt('rf_30feats_stacking_class_mean_train.csv', X_train_new, fmt='%i', delimiter='\n')
np.savetxt('rf_30feats_stacking_class_mean_test.csv', X_test_new, fmt='%i', delimiter='\n')

In [17]:
### если не усреднять ответы для теста
X_train_new_2, X_test_new_2, cval_sc_2 = stacking.one_classifier_voting_stacking(X_cleared_2, y_train, X_test_cleared_2, rfc, test_mean=False)
np.savetxt('rf_30feats_stacking_class_not_mean_train.csv', X_train_new_2, fmt='%i', delimiter='\n')
np.savetxt('rf_30feats_stacking_class_not_mean_test.csv', X_test_new_2, fmt='%i', delimiter='\n')

In [18]:
### вероятностые результаты
X_train_new_3, X_test_new_3, cval_sc_3 = stacking.one_classifier_proba_stacking(X_cleared_2, y_train, X_test_cleared_2, rfc, 5)
np.savetxt('rf_30feats_stacking_proba_mean_train.csv', X_train_new_3, fmt='%f')
np.savetxt('rf_30feats_stacking_proba_mean_test.csv', X_test_new_3, fmt='%f')

In [19]:
### неусредненные вероятностые результаты
X_train_new_4, X_test_new_4, cval_sc_4 = stacking.one_classifier_proba_stacking(X_cleared_2, y_train, X_test_cleared_2, rfc, 5, test_mean=False)
np.savetxt('rf_30feats_stacking_proba_not_mean_train.csv', X_train_new_4, fmt='%f')
np.savetxt('rf_30feats_stacking_proba_not_mean_test.csv', X_test_new_4, fmt='%f')