In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
train_path = '../data/hy_round1_train_20200102'
test_path = '../data/hy_round1_testA_20200102'

train_df_list = []
for file_name in os.listdir(train_path):
    df = pd.read_csv(os.path.join(train_path, file_name))
    train_df_list.append(df)
    
test_df_list = []
for file_name in os.listdir(test_path):
    df = pd.read_csv(os.path.join(test_path, file_name))
    test_df_list.append(df)

train_df = pd.concat(train_df_list)
test_df = pd.concat(test_df_list)

train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

all_df = pd.concat([train_df, test_df], sort=False)

In [3]:
d_sin = []
d_cos = []
for value in all_df['方向'].values:
    d_sin.append(np.sin(value * 2 * np.pi / 360))
    d_cos.append(np.cos(value * 2 * np.pi / 360))
all_df['d_sin'] = d_sin
all_df['d_cos'] = d_cos

In [4]:
dist = []
x = all_df['x'].values
y = all_df['y'].values
min_x = np.min(x)
min_y = np.min(y)
for i in range(len(x)):
    dist.append(np.sqrt((x[i] - min_x) ** 2 + (y[i] - min_y) ** 2))
all_df['dist'] = dist

In [5]:
v = []
for value in all_df['速度'].values:
    if value <= 3:
        v.append(0)
    elif value <= 14:
        v.append(1)
    elif value <= 23:
        v.append(2)
    else:
        v.append(3)
all_df['v'] = v

In [7]:
all_df = all_df[['渔船ID', 'x', 'y', 'time', 'type', 'd_sin', 'd_cos', 'dist', 'v']]

In [8]:
group_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    group = group.sort_values(by=['time'])
    group_list.append(group)

In [10]:
all_df = pd.concat(group_list)

In [11]:
df = all_df.drop(columns=['type'])
y = all_df['type']

In [12]:
from tsfresh import extract_features
extracted_df = extract_features(df, column_id='渔船ID', column_sort='time')

Feature Extraction: 100%|██████████| 30/30 [1:34:04<00:00, 188.15s/it]


In [13]:
train_df = extracted_df.iloc[:7000]
test_df = extracted_df.iloc[7000:]

In [14]:
y = []
for name, group in all_df.groupby('渔船ID'):
    y.append(group.iloc[0]['type'])

In [15]:
y_train = y[:7000]
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

In [16]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(train_df)
filtered_train_df = select_features(train_df, y_train)
filtered_test_df = test_df[filtered_train_df.columns]

 'v__friedrich_coefficients__m_3__r_30__coeff_1'
 'v__friedrich_coefficients__m_3__r_30__coeff_2'
 'v__friedrich_coefficients__m_3__r_30__coeff_3'
 'v__max_langevin_fixed_point__m_3__r_30'] did not have any finite values. Filling with zeros.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cond, other, inplace, axis, level, errors=errors, try_cast=try_cast




In [18]:
filtered_train_df['type'] = le.inverse_transform(y_train)
filtered_train_df.to_csv('train_v6.csv')
filtered_test_df.to_csv('test_v6.csv')

In [19]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

import os
import lightgbm as lgb
from sklearn import preprocessing

In [20]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, f_classif
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

import os
import lightgbm as lgb
from sklearn import preprocessing

In [21]:
def get_model_v1():
    exported_pipeline = make_pipeline(
        SelectPercentile(score_func=f_classif, percentile=48),
        StackingEstimator(estimator=SGDClassifier(alpha=0.01, eta0=0.01, fit_intercept=False, l1_ratio=0.25, learning_rate="invscaling", loss="modified_huber", penalty="elasticnet", power_t=10.0)),
        ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=0.6000000000000001, min_samples_leaf=1, min_samples_split=3, n_estimators=100)
    )

    set_param_recursive(exported_pipeline.steps, 'random_state', 42)
    return exported_pipeline

In [22]:
def get_model_v2():
    exported_pipeline = make_pipeline(
        make_union(
            make_pipeline(
                make_union(
                    FunctionTransformer(copy),
                    FunctionTransformer(copy)
                ),
                SelectPercentile(score_func=f_classif, percentile=18)
            ),
            FunctionTransformer(copy)
        ),
        StackingEstimator(estimator=SGDClassifier(alpha=0.01, eta0=0.1, fit_intercept=False, l1_ratio=1.0, learning_rate="constant", loss="hinge", penalty="elasticnet", power_t=0.1)),
        VarianceThreshold(threshold=0.05),
        ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=0.55, min_samples_leaf=1, min_samples_split=4, n_estimators=100)
    )
    set_param_recursive(exported_pipeline.steps, 'random_state', 42)
    return exported_pipeline

In [23]:
def feature_generate_manually():
    train_path = '../data/hy_round1_train_20200102'
    test_path = '../data/hy_round1_testA_20200102'

    train_df_list = []
    for file_name in os.listdir(train_path):
        df = pd.read_csv(os.path.join(train_path, file_name))
        train_df_list.append(df)

    test_df_list = []
    for file_name in os.listdir(test_path):
        df = pd.read_csv(os.path.join(test_path, file_name))
        test_df_list.append(df)

    train_df = pd.concat(train_df_list)
    test_df = pd.concat(test_df_list)

    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

    all_df = pd.concat([train_df, test_df], sort=False)

    new_df = all_df.groupby('渔船ID').agg(x_min=('x', 'min'), x_max=('x', 'max'), x_mean=('x', 'mean'), x_std=('x', 'std'), x_skew=('x', 'skew'), x_sum=('x', 'sum'),
                y_min=('y', 'min'), y_max=('y', 'max'), y_mean=('y', 'mean'), y_std=('y', 'std'), y_skew=('y', 'skew'), y_sum=('y', 'sum'),
                v_min=('速度', 'min'), v_max=('速度', 'max'), v_mean=('速度', 'mean'), v_std=('速度', 'std'), v_skew=('速度', 'skew'), v_sum=('速度', 'sum'),
                d_min=('方向', 'min'), d_max=('方向', 'max'), d_mean=('方向', 'mean'), d_std=('方向', 'std'), d_skew=('方向', 'skew'), d_sum=('方向', 'sum'))
    new_df['x_max-x_min'] = new_df['x_max'] - new_df['x_min']
    new_df['y_max-y_min'] = new_df['y_max'] - new_df['y_min']
    new_df['x_max-y_min'] = new_df['x_max'] - new_df['y_min']
    new_df['y_max-x_min'] = new_df['y_max'] - new_df['x_min']

    new_df['slope'] = new_df['y_max-y_min'] / np.where(new_df['x_max-x_min']==0, 0.001, new_df['x_max-x_min'])
    new_df['area'] = new_df['x_max-x_min'] * new_df['y_max-y_min']

    new_df['type'] = all_df.groupby('渔船ID').agg(type=('type', 'first'))['type'].values

    X_train = new_df.drop(columns=['type']).iloc[:7000]
    y_train = new_df.iloc[:7000]['type']

    X_test = new_df.drop(columns=['type']).iloc[7000:]

    return X_train, y_train, X_test


In [26]:
def feature_generate_tsfresh():
    train_df = pd.read_csv('./train_v6.csv')
    X_train = train_df.drop(columns=['type'])
    y_train = train_df['type']

    test_df = pd.read_csv('./test_v6.csv')
    X_test = test_df[X_train.columns]
    
    X_train_manully, _, X_test_manully = feature_generate_manually()

    X_train['x_max-x_min'] = X_train_manully['x_max-x_min'].values
    X_test['x_max-x_min'] = X_test_manully['x_max-x_min'].values
    X_train['x_max-y_min'] = X_train_manully['x_max-y_min'].values
    X_test['x_max-y_min'] = X_test_manully['x_max-y_min'].values
    X_train['y_max-x_min'] = X_train_manully['y_max-x_min'].values
    X_test['y_max-x_min'] = X_test_manully['y_max-x_min'].values
    X_train['y_max-y_min'] = X_train_manully['y_max-y_min'].values
    X_test['y_max-y_min'] = X_test_manully['y_max-y_min'].values

    X_train['slope'] = X_train_manully['slope'].values
    X_test['slope'] = X_test_manully['slope'].values
    X_train['area'] = X_train_manully['area'].values
    X_test['area'] = X_test_manully['area'].values

    base_model = lgb.LGBMClassifier(n_estimators=500, subsample=0.8)
    base_model.fit(X_train.values, y_train)

    selected_columns = X_train.columns[np.argsort(base_model.feature_importances_)[::-1][:200]]
    print(selected_columns)

    X_train = X_train[selected_columns]
    X_test = X_test[selected_columns]
    
    for column in list(X_test.columns[X_test.isnull().sum() > 0]):
        mean_val = X_test[column].mean()
        X_test[column].fillna(mean_val, inplace=True)

    return X_train.values, y_train.values, X_test.values

In [27]:
X_train_tsfresh, y_train, X_test_tsfresh = feature_generate_tsfresh()

Index(['y_max-x_min', 'x_max-y_min', 'x__quantile__q_0.1', 'y__minimum',
       'y__quantile__q_0.9', 'y__quantile__q_0.7', 'y__quantile__q_0.8',
       'x__minimum', 'v__agg_autocorrelation__f_agg_"var"__maxlag_40',
       'x__quantile__q_0.2',
       ...
       'y__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_2__w_20',
       'y__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.6',
       'y__fft_coefficient__coeff_76__attr_"angle"',
       'd_cos__change_quantiles__f_agg_"var"__isabs_True__qh_0.6__ql_0.2',
       'd_cos__fft_coefficient__coeff_55__attr_"abs"', 'y__number_peaks__n_50',
       'dist__agg_autocorrelation__f_agg_"var"__maxlag_40',
       'y__fft_coefficient__coeff_0__attr_"real"', 'x__sample_entropy',
       'd_cos__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_14__w_2'],
      dtype='object', length=200)


In [28]:
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

X_train = np.concatenate([X_train_tsfresh], axis=1)
X_test = np.concatenate([X_test_tsfresh], axis=1)

In [48]:
X_train = np.concatenate([X_train_tsfresh], axis=1)[:, :100]
X_test = np.concatenate([X_test_tsfresh], axis=1)[:, :100]

In [55]:
from sklearn.metrics import f1_score

def evaluate_macroF1_lgb(truth, predictions):  
    pred_labels = predictions.reshape(len(np.unique(truth)),-1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True) 

In [56]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

kf = KFold(n_splits=5, random_state=2020, shuffle=True)

model_list_v1 = []
score_list_v1 = []
for train_index, test_index in kf.split(X_train):
    model = get_model_v1()
    eval_set = (X_train[test_index], y_train[test_index])
    model.fit(X_train[train_index], y_train[train_index])
    model_list_v1.append(model)
    score_list_v1.append(f1_score(y_train[test_index], model.predict(X_train[test_index]), average='macro'))
    
print(score_list_v1)
print(np.mean(score_list_v1), np.std(score_list_v1))

[0.9014835513267121, 0.9118059060257062, 0.8786452202057298, 0.8937887748305907, 0.9060600001864018]
0.8983566905150282 0.011483967535097507


In [57]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

kf = KFold(n_splits=5, random_state=2020, shuffle=True)

model_list_v2 = []
score_list_v2 = []
for train_index, test_index in kf.split(X_train):
    model = get_model_v2()
    eval_set = (X_train[test_index], y_train[test_index])
    model.fit(X_train[train_index], y_train[train_index])
    model_list_v2.append(model)
    score_list_v2.append(f1_score(y_train[test_index], model.predict(X_train[test_index]), average='macro'))
    
print(score_list_v2)
print(np.mean(score_list_v2), np.std(score_list_v2))

[0.8995059849676208, 0.9029816218299925, 0.8829321990968673, 0.9063841998432226, 0.9040702027679505]
0.8991748417011307 0.008418599341669069


In [58]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

kf = KFold(n_splits=5, random_state=2020, shuffle=True)

model_list = []
score_list = []
for train_index, test_index in kf.split(X_train):
    model = lgb.LGBMClassifier(n_estimators=1000, objective='multiclass')
    eval_set = (X_train[test_index], y_train[test_index])
    model.fit(X=X_train[train_index], y=y_train[train_index], eval_metric=evaluate_macroF1_lgb,
              eval_set=eval_set, early_stopping_rounds=100, verbose=False)
    model_list.append(model)
    score_list.append(f1_score(y_train[test_index], model.predict(X_train[test_index]), average='macro'))
    
print(score_list)
print(np.mean(score_list), np.std(score_list))

[0.8789113806707363, 0.8921258111890232, 0.8527342066236795, 0.8842066146413973, 0.8877675088820176]
0.8791491044013707 0.013900025624801547
