In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
train_path = '../data/hy_round1_train_20200102'
test_path = '../data/hy_round1_testA_20200102'

train_df_list = []
for file_name in os.listdir(train_path):
    df = pd.read_csv(os.path.join(train_path, file_name))
    train_df_list.append(df)

test_df_list = []
for file_name in os.listdir(test_path):
    df = pd.read_csv(os.path.join(test_path, file_name))
    test_df_list.append(df)

train_df = pd.concat(train_df_list)
test_df = pd.concat(test_df_list)

train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

all_df = pd.concat([train_df, test_df], sort=False)

In [3]:
from datetime import timedelta
data_list = []
for ship_id, group in all_df.groupby('渔船ID'):
    type_ = group['type'].values[0]
    group = group.sort_values(by=['time'])
    
    group = group.set_index('time')
    if (group.index[-1] - group.index[0]) < timedelta(days=3):
        group = group.append(pd.DataFrame(index=[group.index[0] + timedelta(days=3)]), sort=False)
    group = group.resample('10min').last().bfill()
    x = group['x'].values
    y = group['y'].values
    time_ = group.index.values
    data_list.append([])
    dist_list = []
    dist_diff_list = []
    angle_list = []
    time_list = []
    id_list = []
    for i in range(len(x) - 2):
        if i == 0:
            continue
        a = np.sqrt((x[i-1] - x[i]) ** 2 + (y[i-1] - y[i]) ** 2)
        b = np.sqrt((x[i+1] - x[i]) ** 2 + (y[i+1] - y[i]) ** 2)
        c = np.sqrt((x[i+1] - x[i-1]) ** 2 + (y[i+1] - y[i-1]) ** 2)
        dist_list.append(a)
        dist_diff_list.append(b - a)
        angle_list.append(np.arccos((a ** 2 + b ** 2 - c ** 2) / (2 * a * b)))
        time_list.append(time_[i])
        id_list.append(int(ship_id))
    data_list[-1].append(id_list)
    data_list[-1].append(time_list)
    data_list[-1].append(angle_list)
    data_list[-1].append(dist_list)
    data_list[-1].append(dist_diff_list)



In [4]:
df_list = []
for i in range(len(data_list)):
    df = pd.DataFrame(np.array(data_list[i]).T, columns=['id', 'time', 'angle', 'dist', 'dist_diff'])
    df_list.append(df)
all_df_new = pd.concat(df_list)

In [5]:
all_df_new = all_df_new.fillna(-1)

In [6]:
from tsfresh import extract_features
extracted_df = extract_features(all_df_new, column_id='id', column_sort='time')

Feature Extraction: 100%|██████████| 30/30 [54:55<00:00, 109.86s/it]  


In [7]:
extracted_df

variable,angle__abs_energy,angle__absolute_sum_of_changes,"angle__agg_autocorrelation__f_agg_""mean""__maxlag_40","angle__agg_autocorrelation__f_agg_""median""__maxlag_40","angle__agg_autocorrelation__f_agg_""var""__maxlag_40","angle__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","angle__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","angle__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","angle__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","angle__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",...,dist_diff__symmetry_looking__r_0.9,dist_diff__symmetry_looking__r_0.9500000000000001,dist_diff__time_reversal_asymmetry_statistic__lag_1,dist_diff__time_reversal_asymmetry_statistic__lag_2,dist_diff__time_reversal_asymmetry_statistic__lag_3,dist_diff__value_count__value_-1,dist_diff__value_count__value_0,dist_diff__value_count__value_1,dist_diff__variance,dist_diff__variance_larger_than_standard_deviation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,566.088052,13.443763,0.210820,-0.002160,0.087908,-1.344664,0.364881,0.025552,0.010183,-1.644234,...,1.0,1.0,1.229803e+08,6.050465e+07,1.241170e+07,0.0,403.0,0.0,1.026739e+05,1.0
1,1312.968628,110.704674,0.646027,0.645547,0.004970,-2.216865,0.844368,0.132099,0.013090,-1.743616,...,1.0,1.0,2.400691e+08,-5.643202e+07,-7.119081e+07,1.0,276.0,0.0,5.135877e+05,1.0
2,425.000000,10.000000,0.008434,-0.012823,0.004117,-0.858351,-0.046768,-0.001208,0.004030,-0.666667,...,1.0,1.0,1.121639e+06,-4.877405e+03,1.720875e+04,1.0,385.0,0.0,2.829004e+03,1.0
3,1114.779659,74.050222,0.609500,0.584823,0.022624,0.251463,0.041493,0.005806,0.021833,1.806206,...,1.0,1.0,1.434127e+08,-1.203991e+07,-1.809027e+07,1.0,247.0,0.0,1.551720e+05,1.0
4,930.426206,204.473726,-0.028277,-0.116590,0.039695,1.181313,0.034841,0.004563,0.020440,3.448895,...,1.0,1.0,1.723460e+08,1.752289e+08,3.587929e+08,0.0,187.0,0.0,4.794494e+05,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,1117.366468,39.773034,0.757009,0.758979,0.015294,1.928385,-0.634805,-0.087966,0.016722,2.424272,...,1.0,1.0,1.659673e+06,-1.761717e+06,1.128162e+06,0.0,309.0,0.0,1.951023e+04,1.0
8996,1276.372078,293.108027,0.054712,0.027339,0.016537,1.462756,0.100181,0.015026,0.023306,3.101542,...,1.0,1.0,2.083220e+11,-9.079134e+08,-5.869457e+09,0.0,175.0,0.0,1.020810e+07,1.0
8997,427.000000,4.000000,0.000671,-0.007729,0.002771,-0.959831,0.017798,0.000302,0.002650,-0.777778,...,1.0,1.0,1.366301e+04,-4.694836e-03,-7.075472e-03,72.0,348.0,0.0,2.287471e+02,1.0
8998,1505.920017,172.190128,0.507764,0.487105,0.009995,0.492192,0.188457,0.028437,0.023144,1.599692,...,1.0,1.0,2.587827e+09,-1.421425e+07,-6.828271e+08,72.0,124.0,0.0,1.014267e+06,1.0


In [8]:
train_df = extracted_df.iloc[:7000]
test_df = extracted_df.iloc[7000:]

In [9]:
y = []
for name, group in all_df.groupby('渔船ID'):
    y.append(group.iloc[0]['type'])

In [10]:
y_train = y[:7000]
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

In [11]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(train_df)
filtered_train_df = select_features(train_df, y_train)
filtered_test_df = test_df[filtered_train_df.columns]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cond, other, inplace, axis, level, errors=errors, try_cast=try_cast




In [12]:
filtered_test_df

variable,dist__count_below_mean,dist__count_above_mean,"dist_diff__fft_coefficient__coeff_0__attr_""abs""",dist__quantile__q_0.6,"dist__change_quantiles__f_agg_""mean""__isabs_False__qh_0.8__ql_0.0",dist__percentage_of_reoccurring_datapoints_to_all_datapoints,dist__quantile__q_0.7,angle__ar_coefficient__k_10__coeff_2,dist__median,angle__quantile__q_0.6,...,"angle__fft_coefficient__coeff_49__attr_""angle""","dist_diff__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""",dist_diff__first_location_of_minimum,"dist_diff__fft_coefficient__coeff_28__attr_""abs""","dist_diff__linear_trend__attr_""pvalue""","dist_diff__fft_coefficient__coeff_1__attr_""angle""",dist__symmetry_looking__r_0.15000000000000002,"angle__fft_coefficient__coeff_36__attr_""real""","dist__fft_coefficient__coeff_28__attr_""abs""","dist_diff__fft_coefficient__coeff_30__attr_""abs"""
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7000,350.0,80.0,2.429590e+03,95.065513,2.101180e+00,0.078788,110.852662,-0.141445,0.000000,-1.000000,...,-49.036453,-0.105245,0.606977,6577.483065,0.902902,160.685698,1.0,-7.592766,22123.445994,4982.721859
7001,217.0,213.0,2.805950e+03,1187.244432,-4.641217e+00,0.008427,1271.953370,-0.163329,1040.894689,3.023707,...,163.596940,0.044504,0.200000,15074.058817,0.958259,-177.256326,1.0,26.191342,30475.083894,13588.379247
7002,273.0,157.0,2.366717e+03,508.964592,1.982332e+01,0.002525,1494.922983,-0.369476,324.143194,3.063300,...,-158.668637,-0.259283,0.016279,1476.761258,0.893588,165.287043,1.0,-11.802323,9452.215531,397.948008
7003,377.0,53.0,8.026086e-04,0.000000,1.696818e-16,0.115385,0.000000,-0.153596,0.000000,-1.000000,...,81.217465,0.030126,0.620930,8920.503337,0.935429,-114.454533,1.0,-7.695798,21955.999859,2423.234447
7004,334.0,96.0,5.552821e+02,0.000000,5.621720e+00,0.030000,0.000000,-0.268745,0.000000,-1.000000,...,-149.931699,0.077859,0.090698,1652.008205,0.951370,113.757683,1.0,-2.347371,5142.983679,2639.025419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,344.0,86.0,1.731255e+03,0.000000,-3.214462e-01,0.063158,0.000000,0.208279,0.000000,-1.000000,...,170.086474,-0.315233,0.197674,1758.889658,0.584552,90.323012,1.0,1.047066,5306.655433,5277.440497
8996,308.0,122.0,7.275958e-12,600.391188,-1.211039e+01,0.005376,825.354974,-0.166997,0.000000,-1.000000,...,145.535238,-0.027348,0.537209,15184.999851,0.965166,-122.527528,1.0,46.978812,37374.782416,14093.291135
8997,422.0,8.0,7.200000e+01,0.000000,-2.403846e-03,1.000000,0.000000,-0.148673,0.000000,-1.000000,...,-142.500775,-0.164956,0.179070,222.323900,0.726017,-143.766809,1.0,0.769597,542.365438,169.010846
8998,289.0,141.0,8.436174e+02,109.427135,5.285792e+00,0.034682,827.195762,-0.234709,0.000000,-1.000000,...,-143.707413,0.161370,0.639535,5729.888722,0.966504,-134.455575,1.0,-46.364182,16184.389940,3687.461237


In [13]:
filtered_train_df.to_csv('train.csv')
filtered_test_df.to_csv('test.csv')