In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train_path = '../data/hy_round1_train_20200102'
test_path = '../data/hy_round1_testA_20200102'

train_df_list = []
for file_name in os.listdir(train_path):
    df = pd.read_csv(os.path.join(train_path, file_name))
    train_df_list.append(df)
    
test_df_list = []
for file_name in os.listdir(test_path):
    df = pd.read_csv(os.path.join(test_path, file_name))
    test_df_list.append(df)

train_df = pd.concat(train_df_list)
test_df = pd.concat(test_df_list)

train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

all_df = pd.concat([train_df, test_df], sort=False)

In [3]:
group_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    group = group.sort_values(by=['time'])
    values = group.values
    idx = []
    for i in range(values.shape[0]):
        if i == 0:
            idx.append(i)
            continue
        if (values[i][1] == values[idx[-1]][1]) and (values[i][2] == values[idx[-1]][2]):
            if i < (values.shape[0] - 1) and (values[i+1][1] != values[idx[-1]][1]) and \
                (values[i+1][2] != values[idx[-1]][2]):
                idx.append(i)
            elif i == (values.shape[0] - 1):
                idx.append(i)
        else:
            idx.append(i)
    group_list.append(group.iloc[idx].copy())

In [4]:
new_df = pd.concat(group_list)

In [5]:
df = new_df.drop(columns=['type'])
y = new_df['type']

In [6]:
from tsfresh import extract_features
extracted_df = extract_features(df, column_id='渔船ID', column_sort='time')

Feature Extraction: 100%|██████████| 30/30 [19:51<00:00, 39.72s/it]  


In [7]:
y = []
for name, group in all_df.groupby('渔船ID'):
    y.append(group.iloc[0]['type'])

y_train = y[:7000]
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

In [8]:
train_df = extracted_df.iloc[:7000]
test_df = extracted_df.iloc[7000:]

In [9]:
train_df.to_csv('train_temp.csv', index=False)
test_df.to_csv('test_temp.csv', index=False)

In [10]:
train_df = pd.read_csv('train_temp.csv')
test_df = pd.read_csv('test_temp.csv')

In [11]:
from feature_selector import FeatureSelector

fs = FeatureSelector(data=train_df, labels=y_train)

In [16]:
fs.identify_missing(missing_threshold=0.5)

20 features with greater than 0.50 missing values.



In [17]:
fs.identify_single_unique()

78 features with a single unique value.



In [20]:
fs.identify_collinear(correlation_threshold=0.96)

642 features with a correlation magnitude greater than 0.96.



In [18]:
fs.identify_zero_importance(task='classification', eval_metric='multi_logloss', 
                            n_iterations=10, early_stopping=True)

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[219]	valid_0's multi_logloss: 0.304588
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[228]	valid_0's multi_logloss: 0.30879
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[211]	valid_0's multi_logloss: 0.27868
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[270]	valid_0's multi_logloss: 0.270058
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[211]	valid_0's multi_logloss: 0.302644
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[169]	valid_0's multi_logloss: 0.303408
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[264]	valid_0's multi_logloss: 0.280363
Training 

In [23]:
fs.identify_low_importance(cumulative_importance=0.95)

1907 features required for cumulative importance of 0.95 after one hot encoding.
1109 features do not contribute to cumulative importance of 0.95.



In [25]:
fs.remove(methods = ['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'])

Removed 1479 features.


Unnamed: 0,x__abs_energy,x__absolute_sum_of_changes,"x__agg_autocorrelation__f_agg_""mean""__maxlag_40","x__agg_autocorrelation__f_agg_""var""__maxlag_40","x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","x__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""rvalue""","x__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""slope""",...,速度__spkt_welch_density__coeff_5,速度__spkt_welch_density__coeff_8,速度__standard_deviation,速度__sum_of_reoccurring_values,速度__symmetry_looking__r_0.05,速度__symmetry_looking__r_0.1,速度__time_reversal_asymmetry_statistic__lag_1,速度__time_reversal_asymmetry_statistic__lag_2,速度__time_reversal_asymmetry_statistic__lag_3,速度__value_count__value_0
0,1.015798e+15,33690.547057,-0.544305,0.864339,6.126324e+06,0.945490,14276.868050,4917.335900,,,...,8.713979,0.877411,3.436702,14.62,0.0,1.0,1.508892,-4.427531,-8.883531,5.0
1,5.348298e+15,127842.730154,0.287245,0.177655,6.087388e+06,-0.493192,-1504.860165,736.186036,-0.631487,-9006.430268,...,12.532110,11.142516,2.188740,108.58,1.0,1.0,0.298941,-0.051265,-0.359078,2.0
2,1.605675e+15,1444.368738,-0.115474,0.096877,6.183191e+06,-0.577350,-20.252467,16.536070,,,...,304.086227,331.849073,7.676900,3.83,1.0,1.0,-31.198307,-6.538440,7.671854,7.0
3,5.202202e+15,123298.502649,0.597626,0.105392,5.266374e+06,-0.482720,-1933.220908,850.655555,-0.894165,-18259.322872,...,143.286696,58.867825,2.986908,107.59,0.0,0.0,-0.531040,-0.203989,-0.317883,19.0
4,1.226863e+16,87290.066782,0.146953,0.245593,7.065129e+06,-0.164974,-132.700631,165.424942,-0.716999,-1353.503594,...,359.719096,16.933681,2.922024,122.94,0.0,0.0,-1.579079,1.622056,0.799536,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,9.943028e+15,157651.711649,0.614133,0.068381,6.272655e+06,0.721162,3042.704551,609.466912,0.807046,9023.394621,...,27.198934,25.711786,2.620994,181.67,1.0,1.0,-0.523965,-1.390346,-2.560391,6.0
6996,1.600072e+16,174210.500941,0.724258,0.032199,6.464921e+06,0.015168,17.267696,187.137640,-0.161090,-1042.880009,...,27.829023,48.768572,1.349902,110.06,1.0,1.0,-0.784376,-1.466676,-1.557038,0.0
6997,4.738671e+15,122587.630518,0.535359,0.146582,5.259090e+06,-0.226418,-953.517077,1025.488678,-0.688335,-12659.395978,...,14.495332,66.493119,2.895133,92.82,0.0,1.0,-1.257698,-1.516622,-1.503026,18.0
6998,8.096422e+15,62496.924051,0.297433,0.205359,6.376790e+06,0.266816,329.550372,280.567502,-0.744432,-1881.168264,...,0.697823,14.748912,1.844245,106.60,1.0,1.0,-0.066371,0.417902,-0.542198,5.0


In [26]:
train_new_df = fs.remove(methods = ['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'])
test_new_df = test_df[train_new_df.columns]

Removed 1479 features.


In [27]:
train_new_df.shape, test_new_df.shape

((7000, 1537), (2000, 1537))

In [28]:
train_new_df.to_csv('train_preprocess_v2.csv', index=False)
test_new_df.to_csv('test_preprocess_v2.csv', index=False)