In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

train_path = '../data/hy_round1_train_20200102'
test_path = '../data/hy_round1_testA_20200102'


def _read_data_dir(dir_path):
    """Read every regular, non-hidden file directly inside ``dir_path`` as CSV.

    ``os.listdir`` returns all directory entries in arbitrary order, so
    sub-directories and dot-files (for example ``.ipynb_checkpoints`` or
    ``.DS_Store``) are skipped and the remaining names are sorted to make
    the load order reproducible between runs.

    Parameters
    ----------
    dir_path : str
        Directory holding the CSV files.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, in sorted file-name order.
    """
    file_names = sorted(
        name
        for name in os.listdir(dir_path)
        if not name.startswith('.') and os.path.isfile(os.path.join(dir_path, name))
    )
    return [pd.read_csv(os.path.join(dir_path, name)) for name in file_names]


train_df_list = _read_data_dir(train_path)
test_df_list = _read_data_dir(test_path)

# Stack the per-file frames; the original per-file row labels are kept as-is.
train_df = pd.concat(train_df_list)
test_df = pd.concat(test_df_list)

# The raw 'time' column is text in month-day hour:minute:second form (the
# format carries no year); parse it in place on both frames.
time_format = '%m%d %H:%M:%S'
for frame in (train_df, test_df):
    frame['time'] = pd.to_datetime(frame['time'], format=time_format)

# Train rows followed by test rows, with column order left unsorted.
all_df = pd.concat([train_df, test_df], sort=False)

# For each '渔船ID' group (iterated in group-key order), record the x and y
# of the row with the earliest 'time'.
start_x_list, start_y_list = [], []
for _, track in all_df.groupby('渔船ID'):
    ordered = track.sort_values(by=['time'])
    start_x_list.append(ordered['x'].values[0])
    start_y_list.append(ordered['y'].values[0])

In [25]:
# One feature row per '渔船ID' group: for each hour of day 0..23 the mean x
# followed by the mean y of the rows falling in that hour, or the pair
# -1, -1 when the group has no row in that hour (48 values per row).
data_list = []
for _, track in all_df.groupby('渔船ID'):
    ordered = track.sort_values(by=['time'])
    hour_of_row = pd.DatetimeIndex(ordered['time'].values).hour
    xs = ordered['x'].values
    ys = ordered['y'].values

    row = []
    for hour in range(24):
        in_hour = hour_of_row == hour
        if in_hour.any():
            row.extend([np.mean(xs[in_hour]), np.mean(ys[in_hour])])
        else:
            row.extend([-1, -1])
    data_list.append(row)

In [28]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

def get_model(random_state=2020):
    """Build the (unfitted) classification pipeline.

    The steps are, in order: ``RFE`` driven by an ``ExtraTreesClassifier``,
    ``StandardScaler``, a ``StackingEstimator`` wrapping an ``SGDClassifier``,
    and a final ``GradientBoostingClassifier``.

    Parameters
    ----------
    random_state : int, default 2020
        Value handed to ``set_param_recursive`` for the ``'random_state'``
        parameter of the pipeline steps. The default reproduces the seed
        that was previously hard-coded.

    Returns
    -------
    The pipeline object created by ``make_pipeline``; ``fit`` has not been
    called on it.
    """
    # Hyperparameter literals are kept exactly as given (including the
    # long float forms) so the configuration is unchanged.
    feature_selector = RFE(
        estimator=ExtraTreesClassifier(
            criterion="entropy",
            max_features=0.7000000000000001,
            n_estimators=100,
        ),
        step=0.1,
    )
    stacked_sgd = StackingEstimator(
        estimator=SGDClassifier(
            alpha=0.001,
            eta0=0.01,
            fit_intercept=False,
            l1_ratio=1.0,
            learning_rate="invscaling",
            loss="perceptron",
            penalty="elasticnet",
            power_t=0.5,
        )
    )
    final_classifier = GradientBoostingClassifier(
        learning_rate=0.5,
        max_depth=7,
        max_features=0.15000000000000002,
        min_samples_leaf=2,
        min_samples_split=2,
        n_estimators=100,
        subsample=0.8500000000000001,
    )
    exported_pipeline = make_pipeline(
        feature_selector,
        StandardScaler(),
        stacked_sgd,
        final_classifier,
    )
    set_param_recursive(exported_pipeline.steps, 'random_state', random_state)
    return exported_pipeline