In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train_path = '../data/hy_round1_train_20200102'
test_path = '../data/hy_round1_testA_20200102'

train_df_list = []
for file_name in os.listdir(train_path):
    df = pd.read_csv(os.path.join(train_path, file_name))
    train_df_list.append(df)
    
test_df_list = []
for file_name in os.listdir(test_path):
    df = pd.read_csv(os.path.join(test_path, file_name))
    test_df_list.append(df)

train_df = pd.concat(train_df_list)
test_df = pd.concat(test_df_list)

train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

all_df = pd.concat([train_df, test_df], sort=False)

In [3]:
from datetime import timedelta

data_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    type_ = group['type'].values[0]
    group = group.sort_values(by=['time'])

    x = group['x'].values
    y = group['y'].values
    time_ = group['time'].values

    data_list.append([])
    time_list = []
    id_list = []
    dist_list = []
    speed_list = []
    for i in range(len(x) - 2):
        if i == 0:
            continue
        a = np.sqrt((x[i-1] - x[i]) ** 2 + (y[i-1] - y[i]) ** 2)
        during = (time_[i] - time_[i-1]) / np.timedelta64(1, 'h')

        dist_list.append(a)
        speed_list.append(a / during)
        id_list.append(int(ship_id))
        time_list.append(time_[i])

    dist_list = pd.Series(dist_list).values
    speed_list = pd.Series(speed_list).values

    data_list[-1].append(id_list)
    data_list[-1].append(time_list)
    data_list[-1].append(dist_list)
    data_list[-1].append(speed_list)

In [4]:
df_list = []
for i in range(len(data_list)):
    df = pd.DataFrame(np.array(data_list[i]).T, columns=['id', 'time', 'dist', 'speed'])
    df_list.append(df)
all_df_new = pd.concat(df_list)

In [5]:
all_df_new = all_df_new.fillna(-1)

In [6]:
from tsfresh import extract_features
extracted_df = extract_features(all_df_new, column_id='id', column_sort='time')

Feature Extraction: 100%|██████████| 30/30 [30:11<00:00, 60.37s/it]  


In [7]:
y = []
for name, group in all_df.groupby('渔船ID'):
    y.append(group.iloc[0]['type'])

y_train = y[:7000]
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

In [8]:
from feature_selector import FeatureSelector

In [9]:
train_df = extracted_df.iloc[:7000]
test_df = extracted_df.iloc[7000:]

In [13]:
train_df.to_csv('train_v2.csv', index=False)
test_df.to_csv('test_v2.csv', index=False)

In [15]:
train_df = pd.read_csv('train_v2.csv')
test_df = pd.read_csv('test_v2.csv')

In [16]:
fs = FeatureSelector(data=train_df, labels=y_train)

In [20]:
fs.identify_missing(missing_threshold=0.6)

10 features with greater than 0.60 missing values.



In [21]:
fs.identify_single_unique()

37 features with a single unique value.



In [22]:
fs.identify_collinear(correlation_threshold=0.98)

472 features with a correlation magnitude greater than 0.98.



In [24]:
fs.identify_zero_importance(task='classification', eval_metric='multi_logloss', 
                            n_iterations=20, early_stopping=True)

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[193]	valid_0's multi_logloss: 0.510096
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[179]	valid_0's multi_logloss: 0.507505
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[135]	valid_0's multi_logloss: 0.541779
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[186]	valid_0's multi_logloss: 0.532689
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[128]	valid_0's multi_logloss: 0.507977
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[127]	valid_0's multi_logloss: 0.511174
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[137]	valid_0's multi_logloss: 0.521454
Trainin

In [25]:
fs.identify_low_importance(cumulative_importance=0.99)

1297 features required for cumulative importance of 0.99 after one hot encoding.
211 features do not contribute to cumulative importance of 0.99.



In [26]:
fs.ops

{'missing': ['dist__friedrich_coefficients__m_3__r_30__coeff_0',
  'dist__friedrich_coefficients__m_3__r_30__coeff_1',
  'dist__friedrich_coefficients__m_3__r_30__coeff_2',
  'dist__friedrich_coefficients__m_3__r_30__coeff_3',
  'dist__max_langevin_fixed_point__m_3__r_30',
  'speed__friedrich_coefficients__m_3__r_30__coeff_0',
  'speed__friedrich_coefficients__m_3__r_30__coeff_1',
  'speed__friedrich_coefficients__m_3__r_30__coeff_2',
  'speed__friedrich_coefficients__m_3__r_30__coeff_3',
  'speed__max_langevin_fixed_point__m_3__r_30'],
 'single_unique': ['dist__fft_coefficient__coeff_0__attr_"angle"',
  'dist__fft_coefficient__coeff_0__attr_"imag"',
  'dist__large_standard_deviation__r_0.5',
  'dist__large_standard_deviation__r_0.55',
  'dist__large_standard_deviation__r_0.6000000000000001',
  'dist__large_standard_deviation__r_0.65',
  'dist__large_standard_deviation__r_0.7000000000000001',
  'dist__large_standard_deviation__r_0.75',
  'dist__large_standard_deviation__r_0.8',
  'dist

In [28]:
fs.remove(methods = ['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'])

Removed 588 features.


Unnamed: 0,dist__abs_energy,"dist__agg_autocorrelation__f_agg_""mean""__maxlag_40","dist__agg_autocorrelation__f_agg_""median""__maxlag_40","dist__agg_autocorrelation__f_agg_""var""__maxlag_40","dist__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","dist__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","dist__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""","dist__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""rvalue""","dist__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""stderr""","dist__agg_linear_trend__f_agg_""mean""__chunk_len_10__attr_""rvalue""",...,speed__ratio_beyond_r_sigma__r_6,speed__ratio_beyond_r_sigma__r_7,speed__sample_entropy,speed__skewness,speed__spkt_welch_density__coeff_2,speed__sum_of_reoccurring_data_points,speed__symmetry_looking__r_0.1,speed__time_reversal_asymmetry_statistic__lag_1,speed__time_reversal_asymmetry_statistic__lag_2,speed__time_reversal_asymmetry_statistic__lag_3
0,8.502048e+07,0.154229,-0.002052,0.067793,-333.211973,0.379570,-854.321799,0.676204,175.105245,0.427994,...,0.014599,0.002433,0.099516,5.643435,4.798438e+02,0.000000,1.0,1.059136e+08,-1.083863e+09,-3.991336e+09
1,3.615321e+08,0.477598,0.402533,0.035973,-470.249751,0.571745,-373.390262,0.609963,276.138520,0.650535,...,0.000000,0.000000,0.771483,1.730982,1.272873e+05,3055.209780,0.0,9.204880e+08,1.129006e+09,4.847130e+08
2,6.662529e+05,0.018031,-0.006706,0.004836,115.907950,-0.226858,409.912952,-0.715490,57.095436,-0.189717,...,0.004348,0.004348,0.246190,8.982205,2.280769e+05,2410.673574,1.0,8.040876e+05,6.067487e+05,6.401649e+05
3,2.990019e+08,0.344827,0.246602,0.051397,1014.773657,-0.160105,2013.509317,-0.329337,325.242906,-0.143634,...,0.000000,0.000000,0.630813,1.986827,6.372508e+08,17315.462853,0.0,-2.206569e+09,-2.132749e+09,-1.027519e+09
4,3.185586e+08,-0.000178,-0.082480,0.052468,1015.180429,0.028286,3445.443349,-0.372198,191.368991,0.040231,...,0.000000,0.000000,0.643937,2.198007,2.287240e+08,1675.704583,0.0,7.068736e+09,1.278099e+10,1.709082e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,8.883228e+08,0.305008,0.358928,0.060839,1221.410419,0.276469,2812.570494,0.075905,407.396019,0.331983,...,0.000000,0.000000,1.267992,0.920439,2.945048e+09,3613.147784,1.0,-1.511410e+09,-7.555274e+09,-1.762101e+10
6996,5.613266e+08,0.019762,-0.005820,0.013272,2085.144961,-0.088875,3886.696386,-0.357014,210.413320,-0.196791,...,0.000000,0.000000,1.751236,1.931891,6.390505e+07,0.000000,1.0,-4.466334e+09,-7.419109e+09,-9.757936e+09
6997,3.780000e+08,0.218592,0.164225,0.032468,1971.106171,-0.357971,4657.016752,-0.592879,493.832332,-0.344077,...,0.000000,0.000000,0.646043,2.043544,1.448885e+08,10807.516891,0.0,-7.624368e+09,-5.359380e+09,-1.368612e+10
6998,3.170230e+08,0.494707,0.488403,0.005041,-56.544848,0.618470,521.510170,0.520709,329.470098,0.795845,...,0.000000,0.000000,1.235688,0.696609,6.532409e+08,3095.287624,0.0,-1.649296e+09,1.803980e+08,-3.712039e+09


In [29]:
train_new_df = fs.remove(methods = ['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'])
test_new_df = test_df[train_new_df.columns]

Removed 588 features.


In [30]:
train_new_df.shape, test_new_df.shape

((7000, 920), (2000, 920))

In [31]:
train_new_df.to_csv('train_preprocess_v1.csv', index=False)
test_new_df.to_csv('test_preprocess_v1.csv', index=False)