In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Load the v1 preprocessed train/test CSV files as plain NumPy matrices.
train_data_1, test_data_1 = (
    pd.read_csv(csv_path).to_numpy()
    for csv_path in ('train_preprocess_v1.csv', 'test_preprocess_v1.csv')
)

In [3]:
# Load the v2 preprocessed train/test CSV files as plain NumPy matrices.
train_data_2, test_data_2 = (
    pd.read_csv(csv_path).to_numpy()
    for csv_path in ('train_preprocess_v2.csv', 'test_preprocess_v2.csv')
)

In [4]:
# Load the v3 preprocessed train/test CSV files as plain NumPy matrices.
train_data_3, test_data_3 = (
    pd.read_csv(csv_path).to_numpy()
    for csv_path in ('train_preprocess_v3.csv', 'test_preprocess_v3.csv')
)

In [5]:
# Load the v4 preprocessed train/test CSV files as plain NumPy matrices.
train_data_4, test_data_4 = (
    pd.read_csv(csv_path).to_numpy()
    for csv_path in ('train_preprocess_v4.csv', 'test_preprocess_v4.csv')
)

In [6]:
# Display the (rows, columns) shape of each preprocessed training matrix.
tuple(matrix.shape for matrix in (train_data_1, train_data_2, train_data_3, train_data_4))

((7000, 920), (7000, 1537), (7000, 38), (7000, 1752))

In [7]:
# Put the four feature sets side by side (column-wise), separately for train and test.
train_parts = [train_data_1, train_data_2, train_data_3, train_data_4]
test_parts = [test_data_1, test_data_2, test_data_3, test_data_4]

X_train_all = np.hstack(train_parts)
X_test_all = np.hstack(test_parts)

In [106]:
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

# Read the raw class labels (column 0 of the header-less CSV) and map them to
# integer codes; `le` is kept so predictions can be decoded again later.
label_df = pd.read_csv('label.csv', header=None)

le = preprocessing.LabelEncoder()
y_train = le.fit_transform(label_df[0].values)

In [9]:
from feature_selector import FeatureSelector

# Identify features with a correlation magnitude above 0.90 and drop them from
# the training frame.
fs = FeatureSelector(data=pd.DataFrame(X_train_all), labels=y_train)
fs.identify_collinear(correlation_threshold=0.90)
train_no_coll_df = fs.remove(methods=['collinear'])

# Keep exactly the surviving columns in the test frame as well.
kept_columns = train_no_coll_df.columns
test_no_coll_df = pd.DataFrame(X_test_all).loc[:, kept_columns]

502 features with a correlation magnitude greater than 0.90.

Removed 502 features.


In [10]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer

# Mean-impute missing values; +/-inf is mapped to NaN first so that it is imputed too.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Drop features whose variance does not exceed 0.2.
selector = VarianceThreshold(0.2)

train_finite = train_no_coll_df.replace([np.inf, -np.inf], np.nan).values
test_finite = test_no_coll_df.replace([np.inf, -np.inf], np.nan).values

# Both transformers are fitted on the training data only. The test set is then
# passed through the already-fitted imputer (`transform`, not `fit_transform`),
# so it is filled with the training means and keeps the same column layout
# that the variance selector was fitted on.
X_train_var = selector.fit_transform(imputer.fit_transform(train_finite))
X_test_var = selector.transform(imputer.transform(test_finite))

In [11]:
# Compare the train/test shapes after imputation and variance filtering.
X_train_var.shape, X_test_var.shape

((7000, 3280), (2000, 3280))

In [12]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE, VarianceThreshold

# RFE round 1. With no explicit target, RFE keeps half of the input features;
# `step` is the fraction of features discarded per elimination iteration.
# `random_state` is pinned (same seed as the KFold splits) so that the selected
# subset is reproducible after a kernel restart.
rfe = RFE(
    estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.9000000000000001,
                                   n_estimators=100, random_state=2020),
    step=0.15000000000000002,
)

rfe.fit(X_train_var, y_train)

# Apply the same boolean column mask to train and test.
X_train_sub_1 = X_train_var[:, rfe.support_]
X_test_sub_1 = X_test_var[:, rfe.support_]

X_train_sub_1.shape, X_test_sub_1.shape

((7000, 1640), (2000, 1640))

In [13]:
# RFE round 2 on the output of round 1. With no explicit target, RFE keeps half
# of the input features; `step` is the fraction discarded per iteration.
# `random_state` is pinned (same seed as the KFold splits) so that the selected
# subset is reproducible after a kernel restart.
rfe = RFE(
    estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.9000000000000001,
                                   n_estimators=100, random_state=2020),
    step=0.15000000000000002,
)

rfe.fit(X_train_sub_1, y_train)

# Apply the same boolean column mask to train and test.
X_train_sub_2 = X_train_sub_1[:, rfe.support_]
X_test_sub_2 = X_test_sub_1[:, rfe.support_]

X_train_sub_2.shape, X_test_sub_2.shape

((7000, 820), (2000, 820))

In [14]:
# RFE round 3 on the output of round 2. With no explicit target, RFE keeps half
# of the input features; `step` is the fraction discarded per iteration.
# `random_state` is pinned (same seed as the KFold splits) so that the selected
# subset is reproducible after a kernel restart.
rfe = RFE(
    estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.9000000000000001,
                                   n_estimators=100, random_state=2020),
    step=0.15000000000000002,
)

rfe.fit(X_train_sub_2, y_train)

# Apply the same boolean column mask to train and test.
X_train_sub_3 = X_train_sub_2[:, rfe.support_]
X_test_sub_3 = X_test_sub_2[:, rfe.support_]

X_train_sub_3.shape, X_test_sub_3.shape

((7000, 410), (2000, 410))

In [15]:
# RFE round 4 on the output of round 3. With no explicit target, RFE keeps half
# of the input features; `step` is the fraction discarded per iteration.
# `random_state` is pinned (same seed as the KFold splits) so that the selected
# subset is reproducible after a kernel restart.
rfe = RFE(
    estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.9000000000000001,
                                   n_estimators=100, random_state=2020),
    step=0.15000000000000002,
)

rfe.fit(X_train_sub_3, y_train)

# Apply the same boolean column mask to train and test.
X_train_sub_4 = X_train_sub_3[:, rfe.support_]
X_test_sub_4 = X_test_sub_3[:, rfe.support_]

X_train_sub_4.shape, X_test_sub_4.shape

((7000, 205), (2000, 205))

In [23]:
# RFE round 5 on the output of round 4 (this round uses a smaller step of 0.1).
# With no explicit target, RFE keeps half of the input features.
# `random_state` is pinned (same seed as the KFold splits) so that the selected
# subset is reproducible after a kernel restart.
rfe = RFE(
    estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.9000000000000001,
                                   n_estimators=100, random_state=2020),
    step=0.1,
)

rfe.fit(X_train_sub_4, y_train)

# Apply the same boolean column mask to train and test.
X_train_sub_5 = X_train_sub_4[:, rfe.support_]
X_test_sub_5 = X_test_sub_4[:, rfe.support_]

X_train_sub_5.shape, X_test_sub_5.shape

((7000, 102), (2000, 102))

In [26]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

def evaluate_macroF1_lgb(truth, predictions):
    """Custom LightGBM evaluation metric: macro-averaged F1 for multi-class models.

    Parameters
    ----------
    truth : array-like of shape (n_samples,)
        True class labels of the evaluation set.
    predictions : numpy array
        Per-class scores in either layout LightGBM may hand to a custom metric:
        2-D ``(n_samples, n_classes)``, or 1-D flattened class by class
        (all samples of class 0 first, then class 1, ...).

    Returns
    -------
    tuple
        ``('macroF1', f1, True)``; the trailing ``True`` tells LightGBM that a
        higher value is better.
    """
    predictions = np.asarray(predictions)
    n_samples = len(truth)
    if predictions.ndim == 2:
        pred_labels = predictions.argmax(axis=1)
    else:
        # The class count is derived from the array size rather than from
        # np.unique(truth), so a fold that happens to miss a class still works.
        pred_labels = predictions.reshape(-1, n_samples).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True)

# 5-fold cross-validation of LightGBM on the RFE-selected features. Each fold's
# held-out part is used both for early stopping and for the reported macro-F1.
lgb_params = dict(n_estimators=1000, objective='multiclass', num_leaves=63,
                  max_depth=7, learning_rate=0.03, subsample=0.8, colsample_bytree=0.8)

kf = KFold(n_splits=5, random_state=2020, shuffle=True)

model_list_1, score_list = [], []
for fit_idx, val_idx in kf.split(X_train_sub_5):
    X_fit, y_fit = X_train_sub_5[fit_idx], y_train[fit_idx]
    X_val, y_val = X_train_sub_5[val_idx], y_train[val_idx]

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(X=X_fit, y=y_fit, eval_metric=evaluate_macroF1_lgb,
              eval_set=(X_val, y_val), early_stopping_rounds=100, verbose=0)
    model_list_1.append(model)

    fold_f1 = f1_score(y_val, model.predict(X_val), average='macro')
    score_list.append(fold_f1)

print(score_list)
print(np.mean(score_list), np.std(score_list))

[0.8450006921602006, 0.8771330042547852, 0.8629475921657391, 0.8615475971309241, 0.8629758108193352]
0.8619209393061968 0.010198115883483


In [31]:
# Sum the per-fold feature importances of the LightGBM models, then print the
# feature indices ordered from most to least important.
fold_importances = [fold_model.feature_importances_ for fold_model in model_list_1]
importances_1 = fold_importances[0] if fold_importances else None
for extra in fold_importances[1:]:
    importances_1 = importances_1 + extra
print(importances_1.argsort()[::-1])

[ 17  62  71  61   8  66  12  53  93  16  63  19  76  49  98  72  99   5
  37  78  96  21  55  31  54  43  95  39  29   2  97  81  10  27  51  86
  52  77  92  30  28  22  33  89  73  85  75  45   1  32   3  35  20   6
  82  40  90 101  65  11  42  84  64  80  18  87   9  74   4  13  25  70
  94  68  69  59  83  26  91  60  15  41  36   0  48  88  46  44  58   7
 100  79  38  47  50  23  34  24  14  56  57  67]


In [32]:
from sklearn.metrics import f1_score

def evaluate_macroF1_xgb(predictions, dtrain):
    """Custom XGBoost evaluation metric: ``1 - macro F1`` for multi-class models.

    Parameters
    ----------
    predictions : numpy array
        Model output for the evaluation set. Accepted layouts: per-class scores
        as ``(n_samples, n_classes)`` (or the same values flattened row by row),
        or already-decided class labels of shape ``(n_samples,)``.
    dtrain : object exposing ``get_label()``
        Evaluation data holding the true labels (an ``xgboost.DMatrix`` in practice).

    Returns
    -------
    tuple
        ``('macroF1', 1 - f1)``; the complement is returned so that a smaller
        value means a better model.
    """
    labels = dtrain.get_label()
    predictions = np.asarray(predictions)
    n_samples = len(labels)
    if predictions.ndim == 1 and predictions.size == n_samples:
        # One value per sample: these are hard class labels already.
        pred_labels = predictions.astype(int)
    else:
        # XGBoost stores multi-class scores sample-major (one row per sample),
        # so the class is the arg-max within each row.
        pred_labels = predictions.reshape(n_samples, -1).argmax(axis=1)
    f1 = f1_score(labels, pred_labels, average='macro')
    return 'macroF1', 1 - f1

import xgboost as xgb

# 5-fold cross-validation of XGBoost on the RFE-selected features. Each fold's
# held-out part is used both for early stopping and for the reported macro-F1.
kf = KFold(n_splits=5, random_state=2020, shuffle=True)

model_list_2 = []
score_list = []
for train_index, test_index in kf.split(X_train_sub_5):
    # `num_leaves` is a LightGBM parameter that XGBoost does not recognise, so
    # it is no longer passed; tree size is governed by `max_depth` here.
    model = xgb.XGBClassifier(n_estimators=1000, objective='multi:softmax',
                              max_depth=7, learning_rate=0.03, subsample=0.8, colsample_bytree=0.8)
    eval_set = [(X_train_sub_5[test_index], y_train[test_index])]
    # NOTE(review): no eval_metric is passed, so early stopping follows XGBoost's
    # default metric rather than evaluate_macroF1_xgb - confirm this is intended.
    model.fit(X_train_sub_5[train_index], y_train[train_index], eval_set=eval_set,
              early_stopping_rounds=100, verbose=0)
    model_list_2.append(model)
    score_list.append(f1_score(y_train[test_index], model.predict(X_train_sub_5[test_index]), average='macro'))

print(score_list)
print(np.mean(score_list), np.std(score_list))

[0.8579684624940369, 0.8763748252006675, 0.8697491357188434, 0.8557879852009572, 0.8782115525252889]
0.8676183922279588 0.009235907162900412


In [33]:
# Sum the per-fold feature importances of the XGBoost models, then print the
# feature indices ordered from most to least important.
fold_importances = [fold_model.feature_importances_ for fold_model in model_list_2]
importances_2 = fold_importances[0] if fold_importances else None
for extra in fold_importances[1:]:
    importances_2 = importances_2 + extra
print(importances_2.argsort()[::-1])

[ 71   4  17  30  31  61  40  53  19  51   2  49  56   8  62  43  57  38
  12  55  78  50  29  81   6   5  86  66   3  59  60  96  80 100  16  20
  41  15  52  99  21  48  90  10  28  64  45   0  24  95  39  33  54  37
  58   7  14  25  42  93  87  73  76  35  77  22  84  79  13  68  97  11
  89  70  85  63  32  27  82 101  65  44  67  46  98  34  23  74  47   1
  92  18  88  75  91  36  69   9  72  83  26  94]


In [87]:
# Union of the five highest-ranked feature indices of each model family.
top5_lgb = np.argsort(importances_1)[::-1][:5]
top5_xgb = np.argsort(importances_2)[::-1][:5]
important_features = np.union1d(top5_lgb, top5_xgb)
important_features.shape

(8,)

In [88]:
# Restrict train and test to the shared set of top-ranked feature columns.
X_train_important = np.take(X_train_sub_5, important_features, axis=1)
X_test_important = np.take(X_test_sub_5, important_features, axis=1)

In [44]:
def _read_csv_dir(dir_path):
    """Read every file in ``dir_path`` with ``pd.read_csv`` and stack the rows into one frame."""
    # sorted(): os.listdir returns entries in arbitrary order; sorting keeps runs repeatable.
    frames = [pd.read_csv(os.path.join(dir_path, file_name))
              for file_name in sorted(os.listdir(dir_path))]
    return pd.concat(frames)


def feature_generate_manually():
    """Build hand-crafted per-boat summary features from the raw trajectory files.

    All CSV files under the train and test directories are read and grouped by
    the boat id column ``渔船ID``. For each of x, y, speed (``速度``) and direction
    (``方向``) the min, max, mean, std, skew and sum are computed; coordinate
    ranges, a slope and a bounding-box area are derived from those.

    Boats are assigned to the training part when their id occurs in the training
    directory; every other boat goes to the test part. Rows are ordered by boat id.

    Returns
    -------
    X_train : pandas.DataFrame
        Features of the training boats, indexed by boat id.
    y_train : pandas.Series
        First ``type`` value recorded for each training boat.
    X_test : pandas.DataFrame
        Features of the remaining boats, same columns as ``X_train``.
    """
    train_path = '../data/hy_round1_train_20200102'
    test_path = '../data/hy_round1_testA_20200102'

    train_df = _read_csv_dir(train_path)
    test_df = _read_csv_dir(test_path)

    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

    all_df = pd.concat([train_df, test_df], sort=False)

    # One named aggregation per (signal, statistic) pair, e.g. x_min=('x', 'min').
    stat_names = ('min', 'max', 'mean', 'std', 'skew', 'sum')
    signals = (('x', 'x'), ('y', 'y'), ('v', '速度'), ('d', '方向'))
    named_aggs = {prefix + '_' + stat: (column, stat)
                  for prefix, column in signals
                  for stat in stat_names}
    # The label is collected in the same pass instead of a second groupby.
    named_aggs['type'] = ('type', 'first')

    new_df = all_df.groupby('渔船ID').agg(**named_aggs)

    new_df['x_max-x_min'] = new_df['x_max'] - new_df['x_min']
    new_df['y_max-y_min'] = new_df['y_max'] - new_df['y_min']
    new_df['x_max-y_min'] = new_df['x_max'] - new_df['y_min']
    new_df['y_max-x_min'] = new_df['y_max'] - new_df['x_min']

    # A zero x-range is replaced by 0.001 to avoid dividing by zero.
    new_df['slope'] = new_df['y_max-y_min'] / np.where(new_df['x_max-x_min'] == 0, 0.001, new_df['x_max-x_min'])
    new_df['area'] = new_df['x_max-x_min'] * new_df['y_max-y_min']

    # Split by where each boat came from rather than by a fixed row count, so
    # the function does not depend on the dataset holding exactly 7000 training boats.
    is_train = new_df.index.isin(train_df['渔船ID'].unique())
    features = new_df.drop(columns=['type'])

    X_train = features[is_train]
    y_train = new_df.loc[is_train, 'type']
    X_test = features[~is_train]

    return X_train, y_train, X_test

def feature_generate_tsfresh():
    """Select the strongest pre-computed features and append the manual range features.

    Reads ``../history/train.csv`` and ``../history/test.csv``, fits a LightGBM
    classifier on all training columns, keeps the 24 columns with the highest
    feature importance, and appends six hand-crafted columns produced by
    ``feature_generate_manually``. Missing values in the test frame are filled
    with the mean of the same test column.

    The manual columns are attached by position, so both sources must list the
    boats in the same row order.  # NOTE(review): not verified here - confirm.

    Returns
    -------
    X_train : pandas.DataFrame
        Selected training features plus the six manual columns.
    y_train : pandas.Series
        The ``type`` column of the training file.
    X_test : pandas.DataFrame
        Test features with the same columns as ``X_train``.
    """
    train_df = pd.read_csv('../history/train.csv')
    X_train = train_df.drop(columns=['type'])
    y_train = train_df['type']

    test_df = pd.read_csv('../history/test.csv')
    X_test = test_df[X_train.columns]

    base_model = lgb.LGBMClassifier(n_estimators=1000, subsample=0.8)
    base_model.fit(X_train.values, y_train)

    selected_columns = X_train.columns[np.argsort(base_model.feature_importances_)[::-1][:24]]
    print(selected_columns)

    # .copy() gives each frame its own data, so the column assignments below do
    # not write into a slice of the full frame (SettingWithCopyWarning).
    X_train = X_train[selected_columns].copy()
    X_test = X_test[selected_columns].copy()

    X_train_manully, _, X_test_manully = feature_generate_manually()

    manual_columns = ('x_max-x_min', 'x_max-y_min', 'y_max-x_min', 'y_max-y_min', 'slope', 'area')
    for column in manual_columns:
        X_train[column] = X_train_manully[column].values
        X_test[column] = X_test_manully[column].values

    # Assign the filled column back: a chained `X_test[column].fillna(..., inplace=True)`
    # operates on a temporary and leaves X_test untouched under pandas copy-on-write.
    for column in list(X_test.columns[X_test.isnull().sum() > 0]):
        mean_val = X_test[column].mean()
        X_test[column] = X_test[column].fillna(mean_val)

    return X_train, y_train, X_test

In [45]:
# The labels returned here are the raw `type` values, so they get their own name.
# `y_train` must remain the integer-encoded array produced by `le`: the final
# cells cast the predictions with int() and decode them via `le.inverse_transform`.
X_train_tsfresh, y_train_tsfresh, X_test_tsfresh = feature_generate_tsfresh()

assert len(X_train_tsfresh) == len(y_train), "tsfresh features and encoded labels differ in length"

Index(['x__quantile__q_0.1', 'x__minimum', 'y__quantile__q_0.9', 'y__maximum',
       'x__quantile__q_0.2', 'y__quantile__q_0.8', 'y__quantile__q_0.7',
       '速度__number_crossing_m__m_1', 'y__minimum', 'x__quantile__q_0.3',
       '速度__agg_autocorrelation__f_agg_"median"__maxlag_40',
       'y__number_cwt_peaks__n_1', 'x__maximum', 'x__quantile__q_0.9',
       '速度__quantile__q_0.7', 'x__quantile__q_0.4',
       '方向__ar_coefficient__k_10__coeff_0', 'y__quantile__q_0.6',
       '速度__fft_coefficient__coeff_6__attr_"real"',
       '方向__fft_coefficient__coeff_64__attr_"abs"',
       '速度__ratio_beyond_r_sigma__r_2',
       'x__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_0__w_2',
       'y__fft_coefficient__coeff_6__attr_"angle"',
       '速度__agg_autocorrelation__f_agg_"mean"__maxlag_40'],
      dtype='object')


In [92]:
# Final model inputs; currently the tsfresh-based frame is the only feature block.
train_blocks = [X_train_tsfresh.values]
test_blocks = [X_test_tsfresh.values]

X_train_concat = np.concatenate(train_blocks, axis=1)
X_test_concat = np.concatenate(test_blocks, axis=1)

In [108]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

# 5-fold cross-validation of LightGBM on the final feature matrix. Each fold's
# held-out part is used both for early stopping and for the reported macro-F1.
lgb_params = dict(n_estimators=1000, objective='multiclass', num_leaves=63,
                  max_depth=8, learning_rate=0.035, subsample=0.8, colsample_bytree=0.8)

kf = KFold(n_splits=5, random_state=2020, shuffle=True)

model_list_1, score_list = [], []
for fit_idx, val_idx in kf.split(X_train_concat):
    X_fit, y_fit = X_train_concat[fit_idx], y_train[fit_idx]
    X_val, y_val = X_train_concat[val_idx], y_train[val_idx]

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(X=X_fit, y=y_fit, eval_metric=evaluate_macroF1_lgb,
              eval_set=(X_val, y_val), early_stopping_rounds=100, verbose=100)
    model_list_1.append(model)

    fold_f1 = f1_score(y_val, model.predict(X_val), average='macro')
    score_list.append(fold_f1)

print(score_list)
print(np.mean(score_list), np.std(score_list))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 0.303469	valid_0's macroF1: 0.863354
[200]	valid_0's multi_logloss: 0.262285	valid_0's macroF1: 0.874823
[300]	valid_0's multi_logloss: 0.248333	valid_0's macroF1: 0.879888
[400]	valid_0's multi_logloss: 0.245759	valid_0's macroF1: 0.883678
Early stopping, best iteration is:
[355]	valid_0's multi_logloss: 0.244523	valid_0's macroF1: 0.883678
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 0.291884	valid_0's macroF1: 0.866632
[200]	valid_0's multi_logloss: 0.252675	valid_0's macroF1: 0.877922
[300]	valid_0's multi_logloss: 0.244308	valid_0's macroF1: 0.886179
[400]	valid_0's multi_logloss: 0.241206	valid_0's macroF1: 0.883278
Early stopping, best iteration is:
[308]	valid_0's multi_logloss: 0.244085	valid_0's macroF1: 0.888171
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 0.31689	valid_0's macroF1: 0.8400

In [109]:
from sklearn.metrics import f1_score

def evaluate_macroF1_xgb(predictions, dtrain):
    """Custom XGBoost evaluation metric: ``1 - macro F1`` for multi-class models.

    Parameters
    ----------
    predictions : numpy array
        Model output for the evaluation set. Accepted layouts: per-class scores
        as ``(n_samples, n_classes)`` (or the same values flattened row by row),
        or already-decided class labels of shape ``(n_samples,)``.
    dtrain : object exposing ``get_label()``
        Evaluation data holding the true labels (an ``xgboost.DMatrix`` in practice).

    Returns
    -------
    tuple
        ``('macroF1', 1 - f1)``; the complement is returned so that a smaller
        value means a better model.
    """
    labels = dtrain.get_label()
    predictions = np.asarray(predictions)
    n_samples = len(labels)
    if predictions.ndim == 1 and predictions.size == n_samples:
        # One value per sample: these are hard class labels already.
        pred_labels = predictions.astype(int)
    else:
        # XGBoost stores multi-class scores sample-major (one row per sample),
        # so the class is the arg-max within each row.
        pred_labels = predictions.reshape(n_samples, -1).argmax(axis=1)
    f1 = f1_score(labels, pred_labels, average='macro')
    return 'macroF1', 1 - f1

import xgboost as xgb

# 5-fold cross-validation of XGBoost on the final feature matrix. Each fold's
# held-out part is used both for early stopping and for the reported macro-F1.
kf = KFold(n_splits=5, random_state=2020, shuffle=True)

model_list_2 = []
score_list = []
for train_index, test_index in kf.split(X_train_concat):
    # `num_leaves` is a LightGBM parameter that XGBoost does not recognise, so
    # it is no longer passed; tree size is governed by `max_depth` here.
    model = xgb.XGBClassifier(n_estimators=1000, objective='multi:softmax',
                              max_depth=7, learning_rate=0.035, subsample=0.8, colsample_bytree=0.8)
    eval_set = [(X_train_concat[test_index], y_train[test_index])]
    # NOTE(review): no eval_metric is passed, so early stopping follows XGBoost's
    # default metric rather than evaluate_macroF1_xgb - confirm this is intended.
    model.fit(X_train_concat[train_index], y_train[train_index], eval_set=eval_set,
              early_stopping_rounds=100, verbose=100)
    model_list_2.append(model)
    score_list.append(f1_score(y_train[test_index], model.predict(X_train_concat[test_index]), average='macro'))

print(score_list)
print(np.mean(score_list), np.std(score_list))

[0]	validation_0-merror:0.158571
Will train until validation_0-merror hasn't improved in 100 rounds.
[100]	validation_0-merror:0.110714
[200]	validation_0-merror:0.099286
[300]	validation_0-merror:0.092857
[400]	validation_0-merror:0.085714
[500]	validation_0-merror:0.082143
Stopping. Best iteration:
[495]	validation_0-merror:0.081429

[0]	validation_0-merror:0.158571
Will train until validation_0-merror hasn't improved in 100 rounds.
[100]	validation_0-merror:0.100714
[200]	validation_0-merror:0.090714
[300]	validation_0-merror:0.09
Stopping. Best iteration:
[263]	validation_0-merror:0.087857

[0]	validation_0-merror:0.180714
Will train until validation_0-merror hasn't improved in 100 rounds.
[100]	validation_0-merror:0.117143
[200]	validation_0-merror:0.107857
[300]	validation_0-merror:0.105
[400]	validation_0-merror:0.1
[500]	validation_0-merror:0.102143
Stopping. Best iteration:
[433]	validation_0-merror:0.098571

[0]	validation_0-merror:0.164286
Will train until validation_0-merro

In [111]:
# Display the (rows, columns) shape of the final training matrix.
X_train_concat.shape

(7000, 30)

In [169]:
# One hard-label prediction vector per fold model: the models in model_list_1
# first, followed by those in model_list_2.
result = [fold_model.predict(X_test_concat) for fold_model in model_list_1 + model_list_2]

result

[array([1, 2, 1, ..., 1, 2, 0]),
 array([1, 2, 1, ..., 1, 2, 1]),
 array([1, 2, 1, ..., 1, 2, 1]),
 array([1, 2, 1, ..., 1, 2, 0]),
 array([1, 2, 1, ..., 0, 2, 0]),
 array([1, 2, 1, ..., 1, 2, 0]),
 array([1, 2, 1, ..., 1, 2, 1]),
 array([1, 2, 1, ..., 0, 2, 1]),
 array([1, 2, 1, ..., 1, 2, 0]),
 array([1, 2, 1, ..., 0, 2, 0])]

In [170]:
# Print the rows whose vote is tied. `mode(axis=1)` pads rows that have fewer
# modes with NaN, so a non-NaN second column means two labels share the top count.
vote_modes = pd.DataFrame(np.array(result).T).mode(axis=1)

# When no row is tied, the mode table has a single column and there is no
# second entry to inspect.
if vote_modes.shape[1] > 1:
    for v in vote_modes.values:
        if not np.isnan(v[1]):
            print(v)

[1. 2.]
[1. 2.]
[1. 2.]
[1. 2.]
[1. 2.]
[1. 2.]
[0. 1.]
[0. 1.]
[1. 2.]
[1. 2.]
[1. 2.]
[0. 2.]
[1. 2.]
[0. 2.]
[1. 2.]
[1. 2.]


In [162]:
# Majority vote per test row: one column per model, then the first entry of the
# row-wise mode is taken as the ensemble prediction.
votes = pd.DataFrame(np.column_stack(result))
prediction = votes.mode(axis=1)[0].values

In [157]:
# Decode every vote back to its original label in one vectorised call instead of
# one `inverse_transform` call per row. The integer cast is applied because the
# mode table can hold float values.
result_list = le.inverse_transform(prediction.astype(int)).tolist()

In [158]:
# Display how many decoded predictions were produced (one per test row).
len(result_list)

2000

In [160]:
# Write the decoded predictions to 'result.csv' without a header row; the row
# index runs over ids 7000..8999.
submission = pd.DataFrame(result_list, index=range(7000, 9000))
submission.to_csv('result.csv', header=None)