In [1]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.impute import SimpleImputer

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

# Read the header-less label file; its first column (column 0) holds the raw class labels.
label_df = pd.read_csv('label.csv', header=None)

# Fit the encoder on the raw labels and keep it around (`le`) so that encoded
# predictions can later be mapped back with `le.inverse_transform`.
le = LabelEncoder()
y_train = le.fit_transform(label_df[0].values)

In [3]:
def feature_generate_tsfresh(train_path='../history/train.csv', test_path='../history/testB.csv',
                             target_col='type', top_k=35,
                             excluded_columns=('x__maximum', 'x__minimum', 'y__maximum', 'y__minimum'),
                             random_state=None):
    """Keep the feature columns that a LightGBM classifier ranks as most important.

    The training table is read from ``train_path``; ``target_col`` is split off as
    the label and every remaining column is treated as a feature. A LightGBM
    multiclass model is fitted on all features, the ``top_k`` columns with the
    highest ``feature_importances_`` are kept, and any column named in
    ``excluded_columns`` is then removed from that selection (so fewer than
    ``top_k`` columns may remain).

    Parameters
    ----------
    train_path : str
        CSV file with the training features plus the ``target_col`` column.
    test_path : str
        CSV file with the test features. It must contain every training feature
        column, otherwise pandas raises ``KeyError``.
    target_col : str
        Name of the label column in the training CSV.
    top_k : int
        Number of highest-importance columns considered before exclusion.
    excluded_columns : iterable of str
        Column names that are dropped from the selection even if they rank
        inside the top ``top_k``.
    random_state : int or None
        Forwarded to ``lgb.LGBMClassifier``. Pass an integer for a repeatable
        selection; ``None`` keeps the library default.

    Returns
    -------
    (X_train, y_train, X_test)
        ``X_train`` and ``X_test`` are DataFrames restricted to the selected
        columns (same column order); ``y_train`` is the ``target_col`` Series.
    """
    train_df = pd.read_csv(train_path)
    X_train = train_df.drop(columns=[target_col])
    y_train = train_df[target_col]

    # Align the test table to the training feature columns (order included).
    test_df = pd.read_csv(test_path)
    X_test = test_df[X_train.columns]

    base_model = lgb.LGBMClassifier(n_estimators=500, objective='multiclass', num_leaves=63,
                                    max_depth=7, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8,
                                    random_state=random_state)
    # No eval_set is supplied, so there is no evaluation output to throttle; the
    # `verbose` keyword of fit() is therefore omitted (LightGBM 4.x rejects it).
    base_model.fit(X_train.values, y_train)

    # Rank columns by descending importance, keep the first `top_k`, then filter.
    importance_order = np.argsort(base_model.feature_importances_)[::-1]
    selected_features = X_train.columns[importance_order[:top_k]]
    selected_features = selected_features[~selected_features.isin(list(excluded_columns))]
    print(selected_features)

    return X_train[selected_features], y_train, X_test[selected_features]

In [4]:
X_train_tsfresh, _, X_test_tsfresh = feature_generate_tsfresh()

Index(['x__quantile__q_0.1', 'y__quantile__q_0.9', 'y__quantile__q_0.8',
       '速度__number_crossing_m__m_1', '速度__quantile__q_0.7',
       '速度__agg_autocorrelation__f_agg_"median"__maxlag_40',
       'y__quantile__q_0.7',
       'y__percentage_of_reoccurring_datapoints_to_all_datapoints',
       'y__quantile__q_0.1', 'x__quantile__q_0.2', 'x__quantile__q_0.3',
       'x__quantile__q_0.9', '速度__ratio_beyond_r_sigma__r_2',
       '方向__ar_coefficient__k_10__coeff_0', 'x__quantile__q_0.4',
       '速度__quantile__q_0.6', 'y__quantile__q_0.4',
       '方向__approximate_entropy__m_2__r_0.5', '方向__quantile__q_0.9',
       'y__quantile__q_0.6', 'y__number_cwt_peaks__n_1',
       '速度__count_above_mean', 'x__number_peaks__n_1',
       '速度__linear_trend__attr_"pvalue"', 'y__quantile__q_0.3',
       'y__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_1__w_20',
       '速度__fft_coefficient__coeff_6__attr_"real"',
       '方向__approximate_entropy__m_2__r_0.7',
       '速度__agg_autocorrelation__f_agg_"var"_

In [5]:
from sklearn.feature_selection import SelectPercentile, f_classif

def feature_dv(train_path='train_preprocess_v1.csv', test_path='testB_preprocess_v1.csv',
               labels=None, top_k=15, random_state=None):
    """Select the ``top_k`` columns of the v1 feature tables by LightGBM importance.

    A LightGBM multiclass model is fitted on every column of the training CSV
    and the ``top_k`` columns with the highest ``feature_importances_`` are kept
    for both the training and the test table.

    Parameters
    ----------
    train_path, test_path : str
        CSV files holding the training and test feature tables. Every column of
        the training file is used as a feature; the test file must contain the
        selected columns.
    labels : array-like or None
        Training labels, one per training row. When ``None`` the module-level
        ``y_train`` is used, which must already be defined.
    top_k : int
        Number of highest-importance columns to keep.
    random_state : int or None
        Forwarded to ``lgb.LGBMClassifier``; ``None`` keeps the library default.

    Returns
    -------
    (train_selected, test_selected)
        The two DataFrames restricted to the selected columns, ordered from the
        most to the least important column.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of training rows.
    """
    train_data_1 = pd.read_csv(train_path)
    test_data_1 = pd.read_csv(test_path)

    # Fall back to the notebook-level encoded labels when none are passed in.
    if labels is None:
        labels = y_train
    if len(labels) != len(train_data_1):
        raise ValueError('labels has %d entries but %s has %d rows'
                         % (len(labels), train_path, len(train_data_1)))

    base_model = lgb.LGBMClassifier(n_estimators=300, objective='multiclass', num_leaves=63,
                                    max_depth=7, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8,
                                    random_state=random_state)
    base_model.fit(train_data_1.values, labels)

    # Importance-based ranking only; an ANOVA F-test (SelectPercentile / f_classif)
    # ranking was tried here earlier and is no longer used.
    importance_order = np.argsort(base_model.feature_importances_)[::-1]
    selected_features = train_data_1.columns[importance_order[:top_k]]
    print(selected_features)

    return train_data_1[selected_features], test_data_1[selected_features]


In [6]:
X_train_dv, X_test_dv = feature_dv()

Index(['dist__sum_of_reoccurring_values', 'dist__quantile__q_0.6',
       'speed__quantile__q_0.7', 'speed__number_cwt_peaks__n_5',
       'dist__quantile__q_0.9',
       'speed__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.6',
       'dist__length', 'speed__quantile__q_0.8',
       'dist__range_count__max_1__min_-1',
       'speed__partial_autocorrelation__lag_4',
       'speed__approximate_entropy__m_2__r_0.3',
       'dist__longest_strike_below_mean',
       'dist__percentage_of_reoccurring_datapoints_to_all_datapoints',
       'dist__quantile__q_0.7', 'speed__linear_trend__attr_"pvalue"'],
      dtype='object')


In [7]:
# Load the v3 pre-processed feature tables (training first, then test B).
train_data_3, test_data_3 = (
    pd.read_csv(csv_path)
    for csv_path in ('train_preprocess_v3.csv', 'testB_preprocess_v3.csv')
)

In [8]:
# Stack the three feature groups side by side (column-wise), using the same group
# order for train and test so that column positions line up between the two matrices.
X_train_all, X_test_all = (
    np.concatenate([frame.values for frame in feature_frames], axis=1)
    for feature_frames in (
        (X_train_tsfresh, train_data_3, X_train_dv),
        (X_test_tsfresh, test_data_3, X_test_dv),
    )
)

In [17]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_selection import RFE, VarianceThreshold

def get_model():
    """Return a new, unfitted classification pipeline.

    The steps, in order, are: recursive feature elimination driven by an
    extra-trees classifier, a variance threshold, standard scaling, a stacked
    SGD classifier and a final gradient-boosting classifier. After assembly,
    ``set_param_recursive`` is called on the pipeline steps with
    ``'random_state'`` and ``42``.
    """
    feature_eliminator = RFE(
        estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.8000000000000001,
                                       n_estimators=100),
        step=0.1,
    )
    stacked_sgd = StackingEstimator(
        estimator=SGDClassifier(alpha=0.001, eta0=0.001, fit_intercept=False, l1_ratio=1.0,
                                learning_rate="invscaling", loss="perceptron",
                                penalty="elasticnet", power_t=0.5)
    )
    final_classifier = GradientBoostingClassifier(
        learning_rate=0.05, max_depth=7, max_features=0.15000000000000002,
        min_samples_leaf=2, min_samples_split=2, n_estimators=400,
        subsample=0.8500000000000001,
    )

    ordered_steps = [
        feature_eliminator,
        VarianceThreshold(threshold=0.001),
        StandardScaler(),
        stacked_sgd,
        final_classifier,
    ]
    exported_pipeline = make_pipeline(*ordered_steps)
    set_param_recursive(exported_pipeline.steps, 'random_state', 42)
    return exported_pipeline

In [18]:
from sklearn.metrics import f1_score

from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer

# Replace +/-inf with NaN and mean-impute. The imputer is fitted on the TRAINING
# matrix only and then re-used (transform, not fit_transform) on the test matrix,
# so both matrices are filled with the same training column means and go through
# the same column handling. Re-fitting on the test matrix would fill test rows
# with test means and could drop a different set of all-NaN columns.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

X_train_var = imputer.fit_transform(pd.DataFrame(X_train_all).replace([np.inf, -np.inf], np.nan).values)
X_test_var = imputer.transform(pd.DataFrame(X_test_all).replace([np.inf, -np.inf], np.nan).values)

# 5-fold shuffled cross-validation with a fixed seed; one freshly built pipeline per fold.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

model_list_2 = []   # fitted fold models, re-used later for test-set prediction
score_list = []     # macro-F1 of each fold model on its held-out fold
for train_index, test_index in kf.split(X_train_var):
    model = get_model()
    model.fit(X_train_var[train_index], y_train[train_index])
    model_list_2.append(model)
    score_list.append(f1_score(y_train[test_index], model.predict(X_train_var[test_index]), average='macro'))

print(score_list)
print(np.mean(score_list), np.std(score_list))

[0.9179583653014433, 0.9251995674453738, 0.9174200682067202, 0.909435453378657, 0.9056389968948304]
0.915130490245405 0.00688690425619603


In [40]:
# from sklearn.model_selection import KFold
# from sklearn.metrics import f1_score

# def evaluate_macroF1_lgb(truth, predictions):  
#     pred_labels = predictions.reshape(len(np.unique(truth)),-1).argmax(axis=0)
#     f1 = f1_score(truth, pred_labels, average='macro')
#     return ('macroF1', f1, True)

# kf = KFold(n_splits=5, random_state=2020, shuffle=True)

# model_list_1 = []
# score_list = []
# for train_index, test_index in kf.split(X_train_all):
#     model = lgb.LGBMClassifier(n_estimators=1000, objective='multiclass', num_leaves=63, metric='custom',
#                                max_depth=7, learning_rate=0.027, subsample=0.8,
#                                colsample_bytree=0.8, reg_lambda=1)
#     eval_set = (X_train_all[test_index], y_train[test_index])
#     model.fit(X=X_train_all[train_index], y=y_train[train_index], eval_metric=evaluate_macroF1_lgb,
#               eval_set=eval_set, early_stopping_rounds=200, verbose=100)
#     model_list_1.append(model)
#     score_list.append(f1_score(y_train[test_index], model.predict(X_train_all[test_index]), average='macro'))
    
# print(score_list)
# print(np.mean(score_list), np.std(score_list))

Training until validation scores don't improve for 200 rounds
[100]	valid_0's macroF1: 0.842398
[200]	valid_0's macroF1: 0.85418
[300]	valid_0's macroF1: 0.865635
[400]	valid_0's macroF1: 0.863698
[500]	valid_0's macroF1: 0.86921
[600]	valid_0's macroF1: 0.873166
[700]	valid_0's macroF1: 0.873004
[800]	valid_0's macroF1: 0.876324
[900]	valid_0's macroF1: 0.878369
Early stopping, best iteration is:
[761]	valid_0's macroF1: 0.880383
Training until validation scores don't improve for 200 rounds
[100]	valid_0's macroF1: 0.862143
[200]	valid_0's macroF1: 0.870641
[300]	valid_0's macroF1: 0.868517
[400]	valid_0's macroF1: 0.869039
[500]	valid_0's macroF1: 0.876344
[600]	valid_0's macroF1: 0.876434
[700]	valid_0's macroF1: 0.877585
[800]	valid_0's macroF1: 0.878051
[900]	valid_0's macroF1: 0.879198
Early stopping, best iteration is:
[724]	valid_0's macroF1: 0.87972
Training until validation scores don't improve for 200 rounds
[100]	valid_0's macroF1: 0.832019
[200]	valid_0's macroF1: 0.854633

In [13]:
# from sklearn.metrics import f1_score

# def evaluate_macroF1_xgb(predictions, dtrain):
#     labels = dtrain.get_label()
#     pred_labels = predictions.reshape(len(np.unique(labels)), -1).argmax(axis=0)
#     f1 = f1_score(labels, pred_labels, average='macro')
#     return 'macroF1', 1-f1

# import xgboost as xgb

# kf = KFold(n_splits=5, random_state=2020, shuffle=True)

# model_list_2 = []
# score_list = []
# for train_index, test_index in kf.split(X_train_all):
#     model = xgb.XGBClassifier(n_estimators=1000, objective='multi:softmax', num_leaves=63,
#                               max_depth=7, learning_rate=0.035, subsample=0.8,
#                               colsample_bytree=0.8, reg_lambda=1)
#     eval_set = [(X_train_all[test_index], y_train[test_index])]
#     model.fit(X_train_all[train_index], y_train[train_index], eval_set=eval_set,
#               early_stopping_rounds=150, verbose=0)
#     model_list_2.append(model)
#     score_list.append(f1_score(y_train[test_index], model.predict(X_train_all[test_index]), average='macro'))
    
# print(score_list)
# print(np.mean(score_list), np.std(score_list))

In [11]:
# Accumulate the class-probability predictions of every fold model on the test matrix.
# NOTE: the result is the element-wise SUM over the models; nothing here divides by
# the number of models. `result_list` stays None when `model_list_2` is empty.
result_list = None

for model in model_list_2:
    fold_proba = model.predict_proba(X_test_var)
    result_list = fold_proba if result_list is None else result_list + fold_proba
        
# for model in model_list_2:
#     result_list = result_list + model.predict_proba(X_test_all)
    
# result = le.inverse_transform(np.argmax(result_list / 5, axis=1))
# pd.DataFrame(result, index=range(9000, 11000)).to_csv('result.csv', header=None)

In [16]:
# Write the accumulated probabilities as a bare matrix: no index column, no header row.
pd.DataFrame(data=result_list).to_csv(path_or_buf='result_prob.csv', header=False, index=False)