In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
train_path = '../data/hy_round1_train_20200102'
test_path = '../data/hy_round1_testA_20200102'


def _read_csv_frames(dir_path):
    """Load every regular, non-hidden file in ``dir_path`` with ``pd.read_csv``.

    Args:
        dir_path: Directory holding the CSV files.

    Returns:
        A list of DataFrames, one per file, ordered by file name.
    """
    frames = []
    # os.listdir gives entries in arbitrary order; sorting makes the load
    # reproducible from run to run and machine to machine.
    for file_name in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, file_name)
        # Hidden entries and sub-directories (e.g. .ipynb_checkpoints,
        # .DS_Store) are not data files and would make pd.read_csv fail.
        if file_name.startswith('.') or not os.path.isfile(file_path):
            continue
        frames.append(pd.read_csv(file_path))
    return frames


train_df_list = _read_csv_frames(train_path)
test_df_list = _read_csv_frames(test_path)

# ignore_index=True: each file carries its own 0..n-1 row labels, so a plain
# concat would leave the combined frame with heavily duplicated index labels.
train_df = pd.concat(train_df_list, ignore_index=True)
test_df = pd.concat(test_df_list, ignore_index=True)

# The 'time' strings are parsed as month-day plus clock time; the format has
# no year component.
TIME_FORMAT = '%m%d %H:%M:%S'
train_df['time'] = pd.to_datetime(train_df['time'], format=TIME_FORMAT)
test_df['time'] = pd.to_datetime(test_df['time'], format=TIME_FORMAT)

# Train rows first, then test rows; sort=False keeps the column order as-is.
all_df = pd.concat([train_df, test_df], sort=False, ignore_index=True)

In [3]:
# Input for feature extraction: every column of all_df except the label.
df = all_df.drop('type', axis=1)
# Row-level label column. A later cell rebinds `y` to a list with one label
# per '渔船ID' group.
y = all_df.loc[:, 'type']

In [5]:
# Run tsfresh feature extraction on `df`, using '渔船ID' as the series
# identifier (column_id) and 'time' as the ordering column (column_sort).
# The recorded run of this cell took about 57 minutes, so avoid re-running it
# needlessly.
from tsfresh import extract_features
extracted_df = extract_features(df, column_id='渔船ID', column_sort='time')

Feature Extraction: 100%|██████████| 30/30 [56:54<00:00, 113.80s/it] 


In [6]:
# Positional train/test split of the extracted feature table.
# NOTE(review): this assumes the first N_TRAIN rows of extracted_df are exactly
# the training IDs, i.e. the row order matches the sorted-ID order used when
# the labels are built. Confirm against the data / tsfresh output ordering.
N_TRAIN = 7000
train_df = extracted_df.iloc[:N_TRAIN]
test_df = extracted_df.iloc[N_TRAIN:]

In [7]:
# Round-trip both feature tables through CSV. Writing with index=False drops
# the existing index, so the reloaded frames come back with a default
# integer index.
TRAIN_TEMP_CSV = 'train_temp.csv'
TEST_TEMP_CSV = 'test_temp.csv'

for frame, csv_path in ((train_df, TRAIN_TEMP_CSV), (test_df, TEST_TEMP_CSV)):
    frame.to_csv(csv_path, index=False)

train_df = pd.read_csv(TRAIN_TEMP_CSV)
test_df = pd.read_csv(TEST_TEMP_CSV)

In [8]:
from feature_selector import FeatureSelector

In [9]:
from sklearn import preprocessing

# Number of leading groups treated as training data.
# NOTE(review): must match the row count used for the positional feature
# split — confirm the training IDs really are the first 7000 groups.
N_TRAIN = 7000

# One label per '渔船ID' group: the 'type' value of the group's first row.
y = [group['type'].iloc[0] for _, group in all_df.groupby('渔船ID')]

# Encode the training labels as integers; `le` is kept so predictions can be
# mapped back with le.inverse_transform.
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y[:N_TRAIN])

In [10]:
# Wrap the training feature table and its encoded labels in a FeatureSelector;
# the identify_* and remove calls in the following cells operate on this object.
fs = FeatureSelector(data=train_df, labels=y_train)

In [11]:
# Flag features whose fraction of missing values is greater than the
# threshold (the recorded run flagged 20 features).
MISSING_THRESHOLD = 0.6
fs.identify_missing(missing_threshold=MISSING_THRESHOLD)

20 features with greater than 0.60 missing values.



In [12]:
# Flag features that hold a single unique value (the recorded run found 51).
fs.identify_single_unique()

51 features with a single unique value.



In [14]:
# Flag features whose correlation magnitude with another feature is greater
# than the threshold (the recorded run flagged 732 features).
CORRELATION_THRESHOLD = 0.96
fs.identify_collinear(correlation_threshold=CORRELATION_THRESHOLD)

732 features with a correlation magnitude greater than 0.96.



In [15]:
# Settings for the importance estimate: a multi-class gradient boosting model
# is trained n_iterations times with early stopping on multi_logloss.
zero_importance_params = dict(
    task='classification',
    eval_metric='multi_logloss',
    n_iterations=10,
    early_stopping=True,
)
fs.identify_zero_importance(**zero_importance_params)

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[236]	valid_0's multi_logloss: 0.28863
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[318]	valid_0's multi_logloss: 0.236665
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[195]	valid_0's multi_logloss: 0.294681
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[197]	valid_0's multi_logloss: 0.314341
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[149]	valid_0's multi_logloss: 0.339961
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[250]	valid_0's multi_logloss: 0.292286
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[198]	valid_0's multi_logloss: 0.316347
Training

In [18]:
# Flag features that are not needed to reach this share of the total feature
# importance (the recorded run flagged 864 features).
CUMULATIVE_IMPORTANCE = 0.96
fs.identify_low_importance(cumulative_importance=CUMULATIVE_IMPORTANCE)

2152 features required for cumulative importance of 0.96 after one hot encoding.
864 features do not contribute to cumulative importance of 0.96.



In [19]:
# Preview the training table with all flagged features removed; the returned
# frame is the cell's displayed output and is not assigned here.
removal_methods = [
    'missing',
    'single_unique',
    'collinear',
    'zero_importance',
    'low_importance',
]
fs.remove(methods=removal_methods)

Removed 1264 features.


Unnamed: 0,x__abs_energy,x__absolute_sum_of_changes,"x__agg_autocorrelation__f_agg_""mean""__maxlag_40","x__agg_autocorrelation__f_agg_""var""__maxlag_40","x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","x__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""stderr""","x__agg_linear_trend__f_agg_""mean""__chunk_len_10__attr_""stderr""","x__agg_linear_trend__f_agg_""mean""__chunk_len_50__attr_""stderr""","x__agg_linear_trend__f_agg_""min""__chunk_len_10__attr_""stderr""",...,速度__spkt_welch_density__coeff_5,速度__spkt_welch_density__coeff_8,速度__standard_deviation,速度__sum_of_reoccurring_values,速度__symmetry_looking__r_0.05,速度__symmetry_looking__r_0.1,速度__time_reversal_asymmetry_statistic__lag_1,速度__time_reversal_asymmetry_statistic__lag_2,速度__time_reversal_asymmetry_statistic__lag_3,速度__value_count__value_0
0,1.550284e+16,33690.547057,0.185206,0.087132,6.115128e+06,83.249805,1153.372742,73.916996,1020.868730,64.322082,...,0.006591,0.005671,1.319651,14.83,1.0,1.0,0.091496,-0.248293,-0.457245,369.0
1,1.428587e+16,127842.730154,0.863539,0.010430,6.111946e+06,139.080111,1278.736309,148.888266,1521.146349,158.568694,...,124.460879,59.702687,2.409553,109.98,0.0,0.0,0.111732,-0.019020,-0.132295,131.0
2,8.907502e+15,1444.368738,0.376594,0.115759,6.182832e+06,5.303632,19.390261,5.008848,43.888637,5.314295,...,75.081499,32.177442,3.408486,5.02,1.0,1.0,-5.297964,2.993956,0.578998,54.0
3,9.195447e+15,123298.502649,0.714708,0.055327,5.245016e+06,337.523491,5369.168019,308.580179,3180.617199,277.702500,...,59.834443,23.925398,2.524816,108.78,0.0,0.0,-0.302473,-0.133740,-0.305196,70.0
4,1.999865e+16,87290.066782,0.470109,0.107981,7.064092e+06,81.075765,490.138830,77.233644,593.453124,78.266891,...,135.702669,21.729297,2.493721,123.75,0.0,0.0,-0.985539,1.144961,0.964508,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,1.266336e+16,157651.711649,0.874213,0.008204,6.232871e+06,404.425763,5108.338990,394.806545,3979.656907,388.730315,...,265.160642,29.926829,2.797322,181.67,1.0,1.0,-0.408664,-1.130359,-2.039815,30.0
6996,1.600072e+16,174210.500941,0.724258,0.032199,6.464921e+06,187.137640,2608.434477,185.984234,1932.740946,186.796668,...,27.829023,48.768572,1.349902,110.06,1.0,1.0,-0.784376,-1.466676,-1.557038,0.0
6997,8.622519e+15,122587.630518,0.722386,0.052827,5.252540e+06,366.424765,4770.332022,336.905536,3057.147261,310.713540,...,19.618620,17.796027,2.466203,93.52,0.0,0.0,-0.682603,-0.603062,-0.939807,65.0
6998,1.316108e+16,62496.924051,0.770850,0.023609,6.363303e+06,110.142229,1368.180381,107.658757,1044.557933,103.183571,...,15.503903,13.991880,2.030258,112.48,0.0,1.0,0.295247,0.242054,-0.410336,28.0


In [20]:
# Drop every feature flagged by the five selection steps from the training
# table, then keep the same columns, in the same order, in the test table.
selection_steps = [
    'missing',
    'single_unique',
    'collinear',
    'zero_importance',
    'low_importance',
]
train_new_df = fs.remove(methods=selection_steps)
test_new_df = test_df.loc[:, train_new_df.columns]

Removed 1264 features.


In [21]:
# Persist the selected-feature tables, without the index column.
preprocessed_outputs = (
    (train_new_df, 'train_preprocess_v4.csv'),
    (test_new_df, 'test_preprocess_v4.csv'),
)
for frame, csv_path in preprocessed_outputs:
    frame.to_csv(csv_path, index=False)