In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train_path = '../data/hy_round1_train_20200102'
test_path = '../data/hy_round1_testB_20200221'

train_df_list = []
for file_name in os.listdir(train_path):
    df = pd.read_csv(os.path.join(train_path, file_name))
    train_df_list.append(df)
    
test_df_list = []
for file_name in os.listdir(test_path):
    df = pd.read_csv(os.path.join(test_path, file_name))
    test_df_list.append(df)

train_df = pd.concat(train_df_list)
test_df = pd.concat(test_df_list)

train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

all_df = pd.concat([train_df, test_df], sort=False)

In [3]:
from datetime import timedelta

data_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    type_ = group['type'].values[0]
    group = group.sort_values(by=['time'])

    x = group['x'].values
    y = group['y'].values
    time_ = group['time'].values

    data_list.append([])
    time_list = []
    id_list = []
    dist_list = []
    speed_list = []
    for i in range(len(x) - 2):
        if i == 0:
            continue
        a = np.sqrt((x[i-1] - x[i]) ** 2 + (y[i-1] - y[i]) ** 2)
        during = (time_[i] - time_[i-1]) / np.timedelta64(1, 'h')

        dist_list.append(a)
        speed_list.append(a / during)
        id_list.append(int(ship_id))
        time_list.append(time_[i])

    dist_list = pd.Series(dist_list).values
    speed_list = pd.Series(speed_list).values

    data_list[-1].append(id_list)
    data_list[-1].append(time_list)
    data_list[-1].append(dist_list)
    data_list[-1].append(speed_list)

In [4]:
df_list = []
for i in range(len(data_list)):
    df = pd.DataFrame(np.array(data_list[i]).T, columns=['id', 'time', 'dist', 'speed'])
    df_list.append(df)
all_df_new = pd.concat(df_list)

In [5]:
all_df_new = all_df_new.fillna(-1)

In [6]:
from tsfresh import extract_features
extracted_df = extract_features(all_df_new, column_id='id', column_sort='time')

Feature Extraction: 100%|██████████| 30/30 [32:52<00:00, 65.75s/it]  


In [7]:
y = []
for name, group in all_df.groupby('渔船ID'):
    y.append(group.iloc[0]['type'])

y_train = y[:7000]
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

In [8]:
from feature_selector import FeatureSelector

In [9]:
train_df = extracted_df.iloc[:7000]
test_df = extracted_df.iloc[7000:]

In [10]:
train_df.to_csv('train_temp.csv', index=False)
test_df.to_csv('test_temp.csv', index=False)

In [11]:
train_df = pd.read_csv('train_temp.csv')
test_df = pd.read_csv('test_temp.csv')

In [12]:
fs = FeatureSelector(data=train_df, labels=y_train)

In [13]:
fs.identify_missing(missing_threshold=0.6)

10 features with greater than 0.60 missing values.



In [14]:
fs.identify_single_unique()

37 features with a single unique value.



In [18]:
fs.identify_collinear(correlation_threshold=0.95)

566 features with a correlation magnitude greater than 0.95.



In [19]:
fs.identify_zero_importance(task='classification', eval_metric='multi_logloss', 
                            n_iterations=10, early_stopping=True)

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[152]	valid_0's multi_logloss: 0.518897
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[182]	valid_0's multi_logloss: 0.524332
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[115]	valid_0's multi_logloss: 0.562417
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[161]	valid_0's multi_logloss: 0.527295
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[145]	valid_0's multi_logloss: 0.521978
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[148]	valid_0's multi_logloss: 0.548216
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[163]	valid_0's multi_logloss: 0.533949
Trainin

In [20]:
fs.identify_low_importance(cumulative_importance=0.99)

1294 features required for cumulative importance of 0.99 after one hot encoding.
214 features do not contribute to cumulative importance of 0.99.



In [21]:
fs.ops

{'missing': ['dist__friedrich_coefficients__m_3__r_30__coeff_0',
  'dist__friedrich_coefficients__m_3__r_30__coeff_1',
  'dist__friedrich_coefficients__m_3__r_30__coeff_2',
  'dist__friedrich_coefficients__m_3__r_30__coeff_3',
  'dist__max_langevin_fixed_point__m_3__r_30',
  'speed__friedrich_coefficients__m_3__r_30__coeff_0',
  'speed__friedrich_coefficients__m_3__r_30__coeff_1',
  'speed__friedrich_coefficients__m_3__r_30__coeff_2',
  'speed__friedrich_coefficients__m_3__r_30__coeff_3',
  'speed__max_langevin_fixed_point__m_3__r_30'],
 'single_unique': ['dist__fft_coefficient__coeff_0__attr_"angle"',
  'dist__fft_coefficient__coeff_0__attr_"imag"',
  'dist__large_standard_deviation__r_0.5',
  'dist__large_standard_deviation__r_0.55',
  'dist__large_standard_deviation__r_0.6000000000000001',
  'dist__large_standard_deviation__r_0.65',
  'dist__large_standard_deviation__r_0.7000000000000001',
  'dist__large_standard_deviation__r_0.75',
  'dist__large_standard_deviation__r_0.8',
  'dist

In [22]:
fs.remove(methods = ['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'])

Removed 672 features.


Unnamed: 0,dist__abs_energy,"dist__agg_autocorrelation__f_agg_""mean""__maxlag_40","dist__agg_autocorrelation__f_agg_""var""__maxlag_40","dist__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","dist__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","dist__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""rvalue""","dist__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""stderr""","dist__agg_linear_trend__f_agg_""mean""__chunk_len_50__attr_""rvalue""","dist__agg_linear_trend__f_agg_""mean""__chunk_len_50__attr_""stderr""","dist__agg_linear_trend__f_agg_""min""__chunk_len_10__attr_""intercept""",...,speed__ratio_beyond_r_sigma__r_3,speed__ratio_beyond_r_sigma__r_5,speed__ratio_beyond_r_sigma__r_6,speed__ratio_beyond_r_sigma__r_7,speed__sample_entropy,speed__skewness,speed__spkt_welch_density__coeff_2,speed__sum_of_reoccurring_data_points,speed__time_reversal_asymmetry_statistic__lag_2,speed__time_reversal_asymmetry_statistic__lag_3
0,8.502048e+07,0.154229,0.067793,0.379570,26.125016,0.676204,175.105245,0.639512,55.032681,-74.973825,...,0.026764,0.021898,0.014599,0.002433,0.099516,5.643435,4.798438e+02,0.000000,-1.083863e+09,-3.991336e+09
1,3.615321e+08,0.477598,0.035973,0.571745,67.073959,0.609963,276.138520,0.738509,83.063237,-227.852358,...,0.036649,0.000000,0.000000,0.000000,0.771483,1.730982,1.272873e+05,3055.209780,1.129006e+09,4.847130e+08
2,6.662529e+05,0.018031,0.004836,-0.226858,-4.311280,-0.715490,57.095436,-0.472466,2.977608,0.000000,...,0.004348,0.004348,0.004348,0.004348,0.246190,8.982205,2.280769e+05,2410.673574,6.067487e+05,6.401649e+05
3,2.990019e+08,0.344827,0.051397,-0.160105,-17.798107,-0.329337,325.242906,-0.174147,143.515528,375.962124,...,0.027108,0.000000,0.000000,0.000000,0.630813,1.986827,6.372508e+08,17315.462853,-2.132749e+09,-1.027519e+09
4,3.185586e+08,-0.000178,0.052468,0.028286,3.315642,-0.372198,191.368991,0.071064,47.281040,18.595279,...,0.040201,0.000000,0.000000,0.000000,0.643937,2.198007,2.287240e+08,1675.704583,1.278099e+10,1.709082e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,8.883228e+08,0.305008,0.060839,0.276469,53.874488,0.075905,407.396019,0.365710,172.688433,313.404543,...,0.000000,0.000000,0.000000,0.000000,1.267992,0.920439,2.945048e+09,3613.147784,-7.555274e+09,-1.762101e+10
6996,5.613266e+08,0.019762,0.013272,-0.088875,-8.265576,-0.357014,210.413320,-0.465123,25.008031,651.532537,...,0.036842,0.002632,0.000000,0.000000,1.751236,1.931891,6.390505e+07,0.000000,-7.419109e+09,-9.757936e+09
6997,3.780000e+08,0.218592,0.032468,-0.357971,-64.683701,-0.592879,493.832332,-0.519323,95.246307,99.155986,...,0.038585,0.000000,0.000000,0.000000,0.646043,2.043544,1.448885e+08,10807.516891,-5.359380e+09,-1.368612e+10
6998,3.170230e+08,0.494707,0.005041,0.618470,83.063162,0.520709,329.470098,0.801398,77.119980,-126.391098,...,0.006231,0.000000,0.000000,0.000000,1.235688,0.696609,6.532409e+08,3095.287624,1.803980e+08,-3.712039e+09


In [23]:
train_new_df = fs.remove(methods = ['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'])
test_new_df = test_df[train_new_df.columns]

Removed 672 features.


In [24]:
train_new_df.shape, test_new_df.shape

((7000, 836), (2000, 836))

In [25]:
train_new_df.to_csv('train_preprocess_v1.csv', index=False)
test_new_df.to_csv('testB_preprocess_v1.csv', index=False)