In [13]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import f1_score
import lightgbm as lgb
from sklearn import preprocessing
from datetime import timedelta

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, f_classif
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

In [2]:
train_path = '../data/hy_round1_train_20200102'
test_path = '../data/hy_round1_testA_20200102'

# Tracks shorter than TRACK_SPAN are padded to it before being resampled on
# the RESAMPLE_RULE grid.
TRACK_SPAN = timedelta(days=3)
RESAMPLE_RULE = '10min'


def _read_csv_dir(dir_path):
    """Read every file in ``dir_path`` with ``pd.read_csv`` and return the frames as a list.

    File names are sorted so the row order of the concatenated frame does not
    depend on the arbitrary order returned by ``os.listdir``.
    """
    return [pd.read_csv(os.path.join(dir_path, file_name))
            for file_name in sorted(os.listdir(dir_path))]


train_df_list = _read_csv_dir(train_path)
test_df_list = _read_csv_dir(test_path)

train_df = pd.concat(train_df_list)
test_df = pd.concat(test_df_list)

# The format string carries no year, so the parsed timestamps use the
# parser's default year.
train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

all_df = pd.concat([train_df, test_df], sort=False)

group_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    type_ = group['type'].values[0]
    group = group.sort_values(by=['time'])
    group = group[['time', 'x', 'y', '速度', '方向']]
    group = group.set_index('time')
    if (group.index[-1] - group.index[0]) < TRACK_SPAN:
        # Add an all-NaN row at the end of the span so resampling extends the
        # track to the full span; ffill() below fills the added bins.
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        padding = pd.DataFrame(index=[group.index[0] + TRACK_SPAN])
        group = pd.concat([group, padding], sort=False)
    group = group.resample(RESAMPLE_RULE).mean().ffill()
    group['type'] = type_
    group['渔船ID'] = ship_id
    group['time'] = group.index.values
    # Replace the time index with a 0..n-1 positional index.
    group = group.set_index(pd.Index(range(group.shape[0])))
    group_list.append(group)
new_df = pd.concat(group_list)

In [3]:
# Compare the (rows, columns) of the raw concatenated frame with the
# per-ship resampled frame built from it.
all_df.shape, new_df.shape

((3482016, 7), (3897000, 7))

In [6]:
def feature_generate_tsfresh(train_csv='./train_v2.csv', test_csv='./test_v2.csv', n_features=24):
    """Load the feature tables and keep the features LightGBM ranks highest.

    A LightGBM multiclass model is fitted on all training columns (except
    ``type``) and the ``n_features`` columns with the largest
    ``feature_importances_`` are kept for both train and test.

    Parameters
    ----------
    train_csv : str
        Path of the training feature table; must contain a ``type`` column.
    test_csv : str
        Path of the test feature table; must contain every training feature column.
    n_features : int
        Number of top-ranked features to keep.

    Returns
    -------
    X_train : numpy.ndarray
        Training matrix restricted to the selected columns.
    y_train : numpy.ndarray
        Training labels in their original (un-encoded) form.
    X_test : numpy.ndarray
        Test matrix restricted to the selected columns, with NaNs replaced by
        the per-column mean of the test set.
    selected_columns : pandas.Index
        Names of the selected columns, most important first.
    """
    train_df = pd.read_csv(train_csv)
    X_train = train_df.drop(columns=['type'])

    # LightGBM is fitted on integer-encoded labels; the original labels are
    # restored before returning.
    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(train_df['type'])

    test_df = pd.read_csv(test_csv)
    X_test = test_df[X_train.columns]

    base_model = lgb.LGBMClassifier(n_estimators=400, objective='multiclass')
    base_model.fit(X_train.values, y_train)

    importance_order = np.argsort(base_model.feature_importances_)[::-1]
    selected_columns = X_train.columns[importance_order[:n_features]]
    print(selected_columns)

    # fillna returns a new frame. The previous per-column
    # `X_test[column].fillna(..., inplace=True)` was chained assignment on a
    # slice, which warns and does not modify X_test under Copy-on-Write.
    X_test = X_test[selected_columns]
    X_test = X_test.fillna(X_test.mean())

    return (
        X_train[selected_columns].values,
        le.inverse_transform(y_train),
        X_test.values,
        selected_columns,
    )

In [7]:
# Unpack, in order: the training matrix, the training labels (original,
# un-encoded values), the test matrix and the names of the selected columns.
X_train_tsfresh, y_train, X_test_tsfresh, feature_tsfresh = feature_generate_tsfresh()

Index(['x__quantile__q_0.1', 'y__maximum', 'x__minimum', 'y__quantile__q_0.8',
       'y__quantile__q_0.9', 'y__quantile__q_0.7', 'y__minimum',
       'x__quantile__q_0.2', '速度__number_crossing_m__m_1',
       'x__quantile__q_0.9', 'x__maximum', 'x__quantile__q_0.4',
       'y__number_cwt_peaks__n_1', '方向__quantile__q_0.9',
       '速度__agg_autocorrelation__f_agg_"median"__maxlag_40', 'x__median',
       'y__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_1__w_20',
       '速度__agg_autocorrelation__f_agg_"var"__maxlag_40', 'x__quantile__q_0.3',
       'y__quantile__q_0.6', '速度__quantile__q_0.7',
       'y__change_quantiles__f_agg_"mean"__isabs_True__qh_0.2__ql_0.0',
       'y__fft_coefficient__coeff_66__attr_"angle"',
       'x__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_0__w_2'],
      dtype='object')


In [8]:
def get_model():
    """Return a fresh, unfitted copy of the classification pipeline.

    Stages, in order:
      1. a feature union of (a) the input duplicated and filtered by
         ``SelectPercentile(f_classif, 18)`` and (b) the untouched input;
      2. an ``SGDClassifier`` wrapped in TPOT's ``StackingEstimator``;
      3. ``VarianceThreshold(0.05)``;
      4. an ``ExtraTreesClassifier`` as the final estimator.

    Every ``random_state`` parameter found in the pipeline is set to 42.
    """
    duplicated_input = make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy),
    )
    selected_branch = make_pipeline(
        duplicated_input,
        SelectPercentile(score_func=f_classif, percentile=18),
    )
    feature_stage = make_union(selected_branch, FunctionTransformer(copy))

    sgd_stacker = StackingEstimator(
        estimator=SGDClassifier(
            alpha=0.01,
            eta0=0.1,
            fit_intercept=False,
            l1_ratio=1.0,
            learning_rate="constant",
            loss="hinge",
            penalty="elasticnet",
            power_t=0.1,
        )
    )
    variance_filter = VarianceThreshold(threshold=0.05)
    forest = ExtraTreesClassifier(
        bootstrap=False,
        criterion="entropy",
        max_features=0.55,
        min_samples_leaf=1,
        min_samples_split=4,
        n_estimators=100,
    )

    pipeline = make_pipeline(feature_stage, sgd_stacker, variance_filter, forest)
    set_param_recursive(pipeline.steps, 'random_state', 42)
    return pipeline

In [9]:
# Integer-encode the labels; `le` is kept so predictions can be mapped back
# with le.inverse_transform.
# NOTE(review): this rebinds y_train in place, so the cell's result depends on
# which cell last assigned y_train.
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

In [10]:
# Column-wise concatenation of the feature blocks. Only one block is listed,
# so the result is a copy of the tsfresh matrices.
X_train = np.concatenate([X_train_tsfresh], axis=1)
X_test = np.concatenate([X_test_tsfresh], axis=1)

In [11]:
from sklearn.metrics import f1_score

def evaluate_macroF1_lgb(truth, predictions):
    """Macro-averaged F1 as a ``(name, value, is_higher_better)`` tuple.

    Parameters
    ----------
    truth : array-like of shape (n_samples,)
        True class labels.
    predictions : array-like
        Per-class scores, in either of two layouts:
        * a 2-D array with one row per sample, ``(n_samples, n_classes)``;
        * any other layout that reshapes to ``(n_classes, n_samples)``, e.g. a
          flat array grouped by class first and by sample second.

    Returns
    -------
    tuple
        ``('macroF1', f1, True)``.
    """
    predictions = np.asarray(predictions)
    if predictions.ndim == 2 and predictions.shape[0] == len(truth):
        # One row per sample: the predicted class is the best-scoring column.
        pred_labels = predictions.argmax(axis=1)
    else:
        # Class-major layout. The class count is inferred from the labels
        # present in `truth`, so every class must appear in it.
        n_classes = len(np.unique(truth))
        pred_labels = predictions.reshape(n_classes, -1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True)

In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

# 5-fold cross-validation of the pipeline: a fresh pipeline is fitted per
# fold and scored with macro-F1 on the held-out part.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

model_list = []   # fitted pipeline of every fold
score_list = []   # held-out macro-F1 of every fold
for train_index, test_index in kf.split(X_train):
    model = get_model()
    model.fit(X_train[train_index], y_train[train_index])
    model_list.append(model)

    fold_predictions = model.predict(X_train[test_index])
    score_list.append(f1_score(y_train[test_index], fold_predictions, average='macro'))

print(score_list)
print(np.mean(score_list), np.std(score_list))

[0.8990782336522005, 0.9169748183797771, 0.9219911473589092, 0.917974346596199, 0.9036324067282578]
0.9119301905430687 0.00891323686006115


In [24]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler

In [28]:
# Add a min-max scaled copy of each raw column, each with its own scaler
# fitted on the whole of new_df.
for raw_column, scaled_column in [
    ('x', 'x_transformed'),
    ('y', 'y_transformed'),
    ('速度', 'v_transformed'),
    ('方向', 'd_transformed'),
]:
    new_df[scaled_column] = MinMaxScaler().fit_transform(new_df[[raw_column]].values)

In [31]:
# One entry per ship, in groupby (ship id) order: the scaled feature rows of
# the ship and the label taken from its first row.
sequence_columns = ['x_transformed', 'y_transformed', 'v_transformed', 'd_transformed']

train_data, label_data = [], []
for ship_id, group in new_df.groupby('渔船ID'):
    train_data.append(group[sequence_columns].values)
    label_data.append(group['type'].values[0])

In [35]:
# Number of leading ships (in groupby order) treated as the training set.
N_TRAIN_SHIPS = 7000

# Both slices are taken from the full list in one statement. Truncating
# train_data first and slicing test_data from it afterwards always produced an
# empty test_data.
# NOTE: the cell still rebinds train_data, so re-running it without rebuilding
# the full list yields an empty test_data again.
train_data, test_data = train_data[:N_TRAIN_SHIPS], train_data[N_TRAIN_SHIPS:]
label_data = label_data[:N_TRAIN_SHIPS]

In [37]:
# Turn the list of per-ship arrays into a single array.
# NOTE(review): this rebinds X_train / y_train / le, which earlier cells used
# for the tabular tsfresh features.
X_train = np.array(train_data)

# Integer-encode the per-ship labels.
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(label_data)

In [48]:
# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(5, shuffle=True, random_state=42)

In [49]:
# Keep the full dataset under X / y.
X = X_train
y = y_train

In [64]:
from sklearn.preprocessing import LabelEncoder

# Encode y and expand it to a one-hot matrix for categorical cross-entropy.
# NOTE(review): this cell rebinds y, so running it a second time without first
# re-running the cell that assigns y would pass the one-hot matrix back into
# the encoder.
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = keras.utils.to_categorical(y)

In [71]:
# K-fold evaluation of a bidirectional-LSTM classifier: a new model is built
# and trained for every fold and scored with macro-F1 on the validation part.
# Fold slices use their own names so the global X_train / y_train are not
# overwritten by the loop.
fold_scores = []
for index, (train_index, valid_index) in enumerate(kf.split(X, y)):
    X_fold_train, X_fold_valid = X[train_index], X[valid_index]
    y_fold_train, y_fold_valid = y[train_index], y[valid_index]

    # Input shape and class count come from the data rather than hard-coded
    # values.
    inputs = keras.Input(shape=X.shape[1:])
    bilstm = keras.layers.Bidirectional(keras.layers.LSTM(30, return_sequences=True))(inputs)
    x = keras.layers.Flatten()(bilstm)
    x = keras.layers.Dense(32, activation='relu')(x)
    output = keras.layers.Dense(y.shape[1], activation='softmax')(x)
    model = keras.models.Model(inputs=inputs, outputs=output)

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(
        x=X_fold_train,
        y=y_fold_train,
        validation_data=(X_fold_valid, y_fold_valid),
        batch_size=32,
        epochs=20,
        verbose=2,
    )

    # Convert one-hot targets and softmax outputs to class indices.
    valid_labels = np.argmax(y_fold_valid, axis=1)
    pred_labels = np.argmax(model.predict(X_fold_valid), axis=1)
    valid_score = f1_score(valid_labels, pred_labels, average='macro')
    fold_scores.append(valid_score)
    print(valid_score)

print(np.mean(fold_scores), np.std(fold_scores))

Train on 5600 samples, validate on 1400 samples
Epoch 1/20
5600/5600 - 21s - loss: 0.8576 - accuracy: 0.6650 - val_loss: 0.6899 - val_accuracy: 0.7200
Epoch 2/20
5600/5600 - 18s - loss: 0.6706 - accuracy: 0.7129 - val_loss: 0.6474 - val_accuracy: 0.7557
Epoch 3/20
5600/5600 - 19s - loss: 0.6350 - accuracy: 0.7171 - val_loss: 0.6674 - val_accuracy: 0.7093
Epoch 4/20
5600/5600 - 19s - loss: 0.6000 - accuracy: 0.7371 - val_loss: 0.6088 - val_accuracy: 0.7286
Epoch 5/20
5600/5600 - 19s - loss: 0.5834 - accuracy: 0.7402 - val_loss: 0.5947 - val_accuracy: 0.7536
Epoch 6/20
5600/5600 - 19s - loss: 0.5688 - accuracy: 0.7421 - val_loss: 0.5638 - val_accuracy: 0.7557
Epoch 7/20
5600/5600 - 19s - loss: 0.5611 - accuracy: 0.7400 - val_loss: 0.5904 - val_accuracy: 0.7493
Epoch 8/20
5600/5600 - 19s - loss: 0.5452 - accuracy: 0.7532 - val_loss: 0.5645 - val_accuracy: 0.7657
Epoch 9/20
5600/5600 - 19s - loss: 0.5459 - accuracy: 0.7575 - val_loss: 0.6032 - val_accuracy: 0.7429
Epoch 10/20
5600/5600 - 1

KeyboardInterrupt: 

In [74]:
# Print the raw records of every ship whose mean speed is exactly zero.
for ship_id, group in all_df.groupby('渔船ID'):
    mean_speed = np.mean(group['速度'].values)
    if mean_speed != 0:
        continue
    print(group)

    渔船ID             x             y   速度   方向                time type
0   1709  6.165599e+06  5.202660e+06  0.0  360 1900-11-17 10:31:37   刺网
1   1709  6.165599e+06  5.202660e+06  0.0  360 1900-11-17 09:58:20   刺网
2   1709  6.165599e+06  5.202660e+06  0.0  360 1900-11-17 08:27:46   刺网
3   1709  6.165599e+06  5.202660e+06  0.0  360 1900-11-17 07:57:58   刺网
4   1709  6.165599e+06  5.202660e+06  0.0  360 1900-11-17 03:57:26   刺网
..   ...           ...           ...  ...  ...                 ...  ...
71  1709  6.165599e+06  5.202660e+06  0.0  360 1900-11-14 19:49:01   刺网
72  1709  6.165599e+06  5.202660e+06  0.0  360 1900-11-14 19:18:16   刺网
73  1709  6.165599e+06  5.202660e+06  0.0  360 1900-11-14 18:48:06   刺网
74  1709  6.165599e+06  5.202660e+06  0.0  360 1900-11-14 18:18:00   刺网
75  1709  6.165599e+06  5.202660e+06  0.0  360 1900-11-14 17:48:16   刺网

[76 rows x 7 columns]
     渔船ID             x             y   速度   方向                time type
0    3423  6.165599e+06  5.202660e+06  0

In [75]:
# Keep only the rows with a strictly positive speed.
runing_df = all_df[all_df['速度'] > 0]

In [77]:
# Separate the label column from the remaining columns.
# NOTE(review): y holds one label per row here, and both df and y rebind names
# used by earlier cells.
df = runing_df.drop(columns=['type'])
y = runing_df['type']

In [79]:
from tsfresh import extract_features
# tsfresh feature extraction with '渔船ID' as the series id and 'time' as the
# sort column. The recorded run took about 35 minutes.
extracted_df = extract_features(df, column_id='渔船ID', column_sort='time')

Feature Extraction: 100%|██████████| 30/30 [35:13<00:00, 70.47s/it]  
