In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
train_path = '../data/hy_round1_train_20200102'
test_path = '../data/hy_round1_testA_20200102'

train_df_list = []
for file_name in os.listdir(train_path):
    df = pd.read_csv(os.path.join(train_path, file_name))
    train_df_list.append(df)
    
test_df_list = []
for file_name in os.listdir(test_path):
    df = pd.read_csv(os.path.join(test_path, file_name))
    test_df_list.append(df)

In [3]:
train_df = pd.concat(train_df_list)
test_df = pd.concat(test_df_list)

In [4]:
train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

In [5]:
all_df = pd.concat([train_df, test_df], sort=False)

In [6]:
from tsfresh import extract_features

In [7]:
all_df.shape

(3482016, 7)

In [8]:
runing_df = all_df[all_df['速度'] > 0]

In [9]:
runing_df.shape

(2783228, 7)

In [10]:
df = runing_df.drop(columns=['type'])
y = runing_df['type']

In [11]:
extracted_df = extract_features(df, column_id='渔船ID', column_sort='time')

Feature Extraction: 100%|██████████| 30/30 [32:42<00:00, 65.41s/it]  


In [20]:
train_df = extracted_df.iloc[:6996]

In [21]:
test_df = extracted_df.iloc[6996:]

In [23]:
y = []
for name, group in all_df.groupby('渔船ID'):
    y.append(group.iloc[0]['type'])

In [24]:
y_train = y[:6996]
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

In [25]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(train_df)
filtered_train_df = select_features(train_df, y_train)
filtered_test_df = test_df[filtered_train_df.columns]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cond, other, inplace, axis, level, errors=errors, try_cast=try_cast


In [32]:
filtered_train_df['type'] = le.inverse_transform(y_train)

In [33]:
filtered_train_df.to_csv('train_v3.csv')
filtered_test_df.to_csv('test_v3.csv')

In [6]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

import os
import lightgbm as lgb
from sklearn import preprocessing

In [7]:
def get_model():
    exported_pipeline = make_pipeline(
        SelectPercentile(score_func=f_classif, percentile=48),
        StackingEstimator(estimator=SGDClassifier(alpha=0.01, eta0=0.01, fit_intercept=False, l1_ratio=0.25, learning_rate="invscaling", loss="modified_huber", penalty="elasticnet", power_t=10.0)),
        ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=0.6000000000000001, min_samples_leaf=1, min_samples_split=3, n_estimators=100)
    )

    set_param_recursive(exported_pipeline.steps, 'random_state', 42)
    return exported_pipeline

In [8]:
def feature_generate_manually():
    train_path = '../data/hy_round1_train_20200102'
    test_path = '../data/hy_round1_testA_20200102'

    train_df_list = []
    for file_name in os.listdir(train_path):
        df = pd.read_csv(os.path.join(train_path, file_name))
        train_df_list.append(df)

    test_df_list = []
    for file_name in os.listdir(test_path):
        df = pd.read_csv(os.path.join(test_path, file_name))
        test_df_list.append(df)

    train_df = pd.concat(train_df_list)
    test_df = pd.concat(test_df_list)

    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

    all_df = pd.concat([train_df, test_df], sort=False)

    new_df = all_df.groupby('渔船ID').agg(x_min=('x', 'min'), x_max=('x', 'max'), x_mean=('x', 'mean'), x_std=('x', 'std'), x_skew=('x', 'skew'), x_sum=('x', 'sum'),
                y_min=('y', 'min'), y_max=('y', 'max'), y_mean=('y', 'mean'), y_std=('y', 'std'), y_skew=('y', 'skew'), y_sum=('y', 'sum'),
                v_min=('速度', 'min'), v_max=('速度', 'max'), v_mean=('速度', 'mean'), v_std=('速度', 'std'), v_skew=('速度', 'skew'), v_sum=('速度', 'sum'),
                d_min=('方向', 'min'), d_max=('方向', 'max'), d_mean=('方向', 'mean'), d_std=('方向', 'std'), d_skew=('方向', 'skew'), d_sum=('方向', 'sum'))
    new_df['x_max-x_min'] = new_df['x_max'] - new_df['x_min']
    new_df['y_max-y_min'] = new_df['y_max'] - new_df['y_min']
    new_df['x_max-y_min'] = new_df['x_max'] - new_df['y_min']
    new_df['y_max-x_min'] = new_df['y_max'] - new_df['x_min']

    new_df['slope'] = new_df['y_max-y_min'] / np.where(new_df['x_max-x_min']==0, 0.001, new_df['x_max-x_min'])
    new_df['area'] = new_df['x_max-x_min'] * new_df['y_max-y_min']

    new_df['type'] = all_df.groupby('渔船ID').agg(type=('type', 'first'))['type'].values

    X_train = new_df.drop(columns=['type']).iloc[:7000]
    y_train = new_df.iloc[:7000]['type']

    X_test = new_df.drop(columns=['type']).iloc[7000:]

    return X_train, y_train, X_test


In [23]:
def feature_generate_tsfresh():
    train_df = pd.read_csv('./train_v3.csv', index_col=['id'])
    X_train = train_df.drop(columns=['type'])
    
    y_train = []
    for ship_id, group in all_df.groupby('渔船ID'):
        if int(ship_id) >= 7000:
            break
        if not (int(ship_id) in [1709, 3423, 6605, 6791]):
            y_train.append(group['type'].values[0])
        
    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)

    test_df = pd.read_csv('./test_v3.csv', index_col=['id'])
    X_test = test_df[X_train.columns]
    
    print(X_train.shape, y_train.shape, X_test.shape)

    base_model = lgb.LGBMClassifier(n_estimators=1000, subsample=0.8)
    base_model.fit(X_train.values, y_train)

    selected_columns = X_train.columns[np.argsort(base_model.feature_importances_)[::-1][:24]]
    print(selected_columns)

    X_train = X_train[selected_columns]
    X_test = X_test[selected_columns]

    X_train_manully, _, X_test_manully = feature_generate_manually()

    X_train['x_max-x_min'] = X_train_manully.iloc[X_train.index]['x_max-x_min'].values
    X_test['x_max-x_min'] = X_test_manully['x_max-x_min'].values
    X_train['x_max-y_min'] = X_train_manully.iloc[X_train.index]['x_max-y_min'].values
    X_test['x_max-y_min'] = X_test_manully['x_max-y_min'].values
    X_train['y_max-x_min'] = X_train_manully.iloc[X_train.index]['y_max-x_min'].values
    X_test['y_max-x_min'] = X_test_manully['y_max-x_min'].values
    X_train['y_max-y_min'] = X_train_manully.iloc[X_train.index]['y_max-y_min'].values
    X_test['y_max-y_min'] = X_test_manully['y_max-y_min'].values

    X_train['slope'] = X_train_manully.iloc[X_train.index]['slope'].values
    X_test['slope'] = X_test_manully['slope'].values
    X_train['area'] = X_train_manully.iloc[X_train.index]['area'].values
    X_test['area'] = X_test_manully['area'].values
    
    for column in list(X_test.columns[X_test.isnull().sum() > 0]):
        mean_val = X_test[column].mean()
        X_test[column].fillna(mean_val, inplace=True)

    return X_train.values, y_train, X_test.values

In [24]:
X_train_tsfresh, y_train, X_test_tsfresh = feature_generate_tsfresh()

(6996, 962) (6996,) (2000, 962)
Index(['x__minimum', 'y__maximum', 'x__quantile__q_0.1', 'y__quantile__q_0.9',
       'y__quantile__q_0.8', 'y__minimum', 'x__quantile__q_0.2',
       'y__quantile__q_0.7', 'y__number_cwt_peaks__n_1',
       '速度__agg_autocorrelation__f_agg_"median"__maxlag_40',
       'x__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_8__w_2',
       'x__quantile__q_0.3',
       '速度__agg_linear_trend__f_agg_"min"__chunk_len_50__attr_"stderr"',
       '速度__quantile__q_0.7', '速度__time_reversal_asymmetry_statistic__lag_1',
       '速度__agg_autocorrelation__f_agg_"var"__maxlag_40',
       'y__partial_autocorrelation__lag_9', '方向__quantile__q_0.9',
       'x__number_peaks__n_1', '速度__number_crossing_m__m_1',
       '速度__fft_coefficient__coeff_48__attr_"angle"',
       '速度__partial_autocorrelation__lag_6',
       '速度__time_reversal_asymmetry_statistic__lag_2',
       '方向__fft_coefficient__coeff_62__attr_"abs"'],
      dtype='object')


In [25]:
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

In [26]:
X_train = np.concatenate([X_train_tsfresh], axis=1)
X_test = np.concatenate([X_test_tsfresh], axis=1)

In [27]:
from sklearn.metrics import f1_score

def evaluate_macroF1_lgb(truth, predictions):  
    pred_labels = predictions.reshape(len(np.unique(truth)),-1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True) 

In [28]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

kf = KFold(n_splits=5, random_state=42, shuffle=True)

model_list = []
score_list = []
for train_index, test_index in kf.split(X_train):
    model = get_model()
    eval_set = (X_train[test_index], y_train[test_index])
    model.fit(X_train[train_index], y_train[train_index])
    model_list.append(model)
    score_list.append(f1_score(y_train[test_index], model.predict(X_train[test_index]), average='macro'))
    
print(score_list)
print(np.mean(score_list), np.std(score_list))

[0.902171387393112, 0.9216105661128132, 0.9011279063237007, 0.9132802492988671, 0.8986905944376665]
0.907376140713232 0.00870684752206015


In [29]:
import pandas as pd
train_df = pd.read_csv('./train_v3.csv', index_col=['id'])

In [30]:
train_df

Unnamed: 0_level_0,y__minimum,y__quantile__q_0.1,y__quantile__q_0.2,y__quantile__q_0.3,y__quantile__q_0.4,y__median,y__c3__lag_2,y__c3__lag_3,y__c3__lag_1,y__mean,...,"速度__fft_coefficient__coeff_15__attr_""abs""",x__autocorrelation__lag_9,"速度__fft_coefficient__coeff_65__attr_""abs""",方向__median,"速度__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.6",速度__large_standard_deviation__r_0.30000000000000004,速度__ar_coefficient__k_10__coeff_1,x__partial_autocorrelation__lag_1,"y__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.8",type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5.124873e+06,5.126307e+06,5.126719e+06,5.127111e+06,5.129242e+06,5.130672e+06,1.349376e+20,1.349404e+20,1.349338e+20,5.129033e+06,...,8.474355,0.410255,15.395705,0.0,0.031629,1.0,0.892681,0.967901,1236.217301,拖网
1,5.042857e+06,5.049682e+06,5.054848e+06,5.058180e+06,5.061701e+06,5.091435e+06,1.314462e+20,1.314442e+20,1.314481e+20,5.084401e+06,...,9.309036,0.965380,13.515751,39.5,0.051857,0.0,0.760300,1.000674,104.834255,拖网
2,5.193576e+06,5.193685e+06,5.193685e+06,5.193685e+06,5.193685e+06,5.193685e+06,1.400960e+20,1.400960e+20,1.400960e+20,5.193681e+06,...,60.842952,0.612394,43.192536,91.0,0.000000,0.0,0.004636,0.953903,0.824480,拖网
3,4.577467e+06,4.579736e+06,4.581699e+06,4.591708e+06,4.608404e+06,4.608404e+06,9.731194e+19,9.730781e+19,9.731581e+19,4.599709e+06,...,64.267162,0.950035,9.844886,112.0,0.412632,0.0,0.431002,1.001149,0.000000,拖网
4,6.094996e+06,6.095371e+06,6.108676e+06,6.109696e+06,6.114141e+06,6.116621e+06,2.288184e+20,2.288154e+20,2.288213e+20,6.116444e+06,...,141.927654,0.754618,15.912169,117.5,0.269757,0.0,0.538506,0.996226,370398.627927,围网
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,5.150309e+06,5.167117e+06,5.176224e+06,5.181443e+06,5.186291e+06,5.195146e+06,1.408763e+20,1.408629e+20,1.408889e+20,5.203407e+06,...,67.424659,0.952407,13.253961,145.5,0.036273,0.0,0.488132,0.997044,0.274667,刺网
6996,5.432442e+06,5.440002e+06,5.447349e+06,5.452473e+06,5.456563e+06,5.460574e+06,1.628617e+20,1.628667e+20,1.628566e+20,5.460851e+06,...,45.325139,0.909120,6.471273,173.0,0.009037,0.0,0.647951,0.993324,207786.170097,围网
6997,4.577723e+06,4.579870e+06,4.581411e+06,4.584697e+06,4.608071e+06,4.608404e+06,9.725518e+19,9.725023e+19,9.725988e+19,4.598836e+06,...,78.257584,0.948747,5.626785,119.0,0.172758,0.0,0.671539,1.000849,1746.428620,拖网
6998,5.343505e+06,5.360136e+06,5.365466e+06,5.370404e+06,5.382932e+06,5.414303e+06,1.569168e+20,1.569179e+20,1.569161e+20,5.393606e+06,...,15.726933,0.917927,22.248792,153.0,0.041369,0.0,0.552531,0.997847,288991.578913,拖网


In [6]:
for i in range(7000):
    if not (i in train_df.index.values):
        print(i)

1709
3423
6605
6791
