In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
train_path = '../data/hy_round1_train_20200102'
test_path = '../data/hy_round1_testA_20200102'


def _read_csv_dir(dir_path):
    """Read every ``*.csv`` file located directly inside ``dir_path``.

    Parameters
    ----------
    dir_path : str
        Directory holding the CSV files.

    Returns
    -------
    list of pandas.DataFrame
        One frame per CSV file, in sorted file-name order.

    Raises
    ------
    ValueError
        If the directory contains no ``*.csv`` file.
    """
    # os.listdir returns entries in arbitrary order and also returns
    # non-data entries (checkpoint folders, OS metadata files); filter and
    # sort so the load is deterministic and does not choke on stray entries.
    csv_names = sorted(
        name for name in os.listdir(dir_path) if name.lower().endswith('.csv')
    )
    if not csv_names:
        raise ValueError('No CSV files found in {!r}'.format(dir_path))
    return [pd.read_csv(os.path.join(dir_path, name)) for name in csv_names]


train_df_list = _read_csv_dir(train_path)
test_df_list = _read_csv_dir(test_path)

# ignore_index=True gives every row a unique label. Without it each file
# contributes its own 0..n-1 labels, and label-aligned operations on the
# concatenated frame (e.g. Series.cov / Series.corr) see duplicates.
train_df = pd.concat(train_df_list, ignore_index=True)
test_df = pd.concat(test_df_list, ignore_index=True)

# The raw timestamps carry no year ('%m%d %H:%M:%S'), so pandas fills in its
# default year when parsing.
train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

# Train and test rows are stacked so per-ship features are computed in one pass.
all_df = pd.concat([train_df, test_df], sort=False, ignore_index=True)

# Per-ship summary statistics. Each (source column, short prefix) pair is
# expanded into six named aggregations '<prefix>_<stat>', giving one row per
# 渔船ID. The nesting order (column outer, statistic inner) fixes the column
# order of the resulting frame.
stat_names = ('min', 'max', 'mean', 'std', 'skew', 'sum')
column_prefixes = (('x', 'x'), ('y', 'y'), ('速度', 'v'), ('方向', 'd'))

agg_spec = {
    prefix + '_' + stat: (column, stat)
    for column, prefix in column_prefixes
    for stat in stat_names
}

new_df = all_df.groupby('渔船ID').agg(**agg_spec)

# Differences and ratios between the per-ship coordinate extremes. Every
# (upper, lower) pair yields one '<upper>-<lower>' and one '<upper>/<lower>'
# column; all differences are added before any ratio so the column order is
# differences first, then ratios.
extreme_pairs = (
    ('x_max', 'x_min'),
    ('y_max', 'y_min'),
    ('x_max', 'y_min'),
    ('y_max', 'x_min'),
)

for upper, lower in extreme_pairs:
    new_df[upper + '-' + lower] = new_df[upper] - new_df[lower]

# NOTE: unlike the slope below, the ratios are not guarded against a zero
# denominator.
for upper, lower in extreme_pairs:
    new_df[upper + '/' + lower] = new_df[upper] / new_df[lower]

x_span = new_df['x_max-x_min']
y_span = new_df['y_max-y_min']

# A zero x-extent is replaced by 0.001 so the slope stays finite.
safe_x_span = np.where(x_span == 0, 0.001, x_span)
new_df['slope'] = y_span / safe_x_span

# Area of the axis-aligned bounding box of the ship's positions.
new_df['area'] = x_span * y_span

# Per-ship covariance and correlation between x/y and between speed/direction.
# The ship id of each group is recorded alongside the statistics so that the
# results can be attached to new_df by ship id instead of by list position.
ship_ids = []
xy_cov = []
vd_cov = []
xy_corr = []
vd_corr = []
for ship_id, group in all_df.groupby('渔船ID'):
    ship_ids.append(ship_id)
    xy_cov.append(group['x'].cov(group['y']))
    vd_cov.append(group['速度'].cov(group['方向']))
    xy_corr.append(group['x'].corr(group['y']))
    vd_corr.append(group['速度'].corr(group['方向']))

pair_stats = pd.DataFrame(
    {'xy_cov': xy_cov, 'vd_cov': vd_cov, 'xy_corr': xy_corr, 'vd_corr': vd_corr},
    index=ship_ids,
)

# Fail loudly if the two groupings disagree on the set of ships; a silent
# mismatch would otherwise surface only as NaN features.
assert set(pair_stats.index) == set(new_df.index), \
    'ship ids of pairwise statistics do not match new_df'

# Column-by-column assignment aligns on the ship-id index.
for stat_name in pair_stats.columns:
    new_df[stat_name] = pair_stats[stat_name]

# Attach each ship's label: the first non-null 'type' value among its rows.
# Ships without any non-null 'type' value end up with a missing label.
# Assigning the Series directly aligns on the 渔船ID index rather than on
# row position.
new_df['type'] = all_df.groupby('渔船ID')['type'].first()

# Split by where each ship came from instead of a hard-coded row count:
# a ship belongs to the training set iff its id occurs in train_df. The
# previous positional split (first 7000 rows) was only correct when exactly
# 7000 training ships all sort before every test ship.
train_ship_ids = train_df['渔船ID'].unique()
is_train = new_df.index.isin(train_ship_ids)

features = new_df.drop(columns=['type'])

X_train = features.loc[is_train]
y_train = new_df.loc[is_train, 'type']

X_test = features.loc[~is_train]

In [4]:
# Write both feature matrices to CSV. index=False omits the row index from
# the files, so rows are identified by position only. y_train is not written
# by this cell.
for feature_frame, csv_name in (
    (X_train, 'train_preprocess_v3.csv'),
    (X_test, 'test_preprocess_v3.csv'),
):
    feature_frame.to_csv(csv_name, index=False)