In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import f1_score
import lightgbm as lgb
from sklearn import preprocessing
from datetime import timedelta

In [2]:
def _read_track_dir(path):
    """Read every CSV file found directly inside ``path`` into one DataFrame."""
    # sorted() only makes the load order deterministic; rows are regrouped by ship ID later.
    return pd.concat([pd.read_csv(os.path.join(path, file_name))
                      for file_name in sorted(os.listdir(path))])


def _resample_track(ship_id, group):
    """Put one ship's records on a regular 10-minute grid spanning at least three days."""
    type_ = group['type'].values[0]
    track = group.sort_values(by=['time'])[['time', 'x', 'y', '速度', '方向']].set_index('time')
    if (track.index[-1] - track.index[0]) < timedelta(days=3):
        # Append one all-NaN row three days after the first record so the resampled grid
        # always covers three days. DataFrame.append was removed in pandas 2.0; pd.concat
        # is the equivalent call.
        pad = pd.DataFrame(index=[track.index[0] + timedelta(days=3)])
        track = pd.concat([track, pad], sort=False)
    # Average records inside each 10-minute bin, then carry the last value into empty bins.
    track = track.resample('10min').mean().ffill()
    track['type'] = type_
    track['渔船ID'] = ship_id
    track['time'] = track.index.values
    return track.set_index(pd.Index(range(track.shape[0])))


def feature_generate_manually(train_path='../data/hy_round1_train_20200102',
                              test_path='../data/hy_round1_testA_20200102'):
    """Build hand-crafted per-ship summary features from the raw trajectory CSVs.

    All CSV files under ``train_path`` and ``test_path`` are loaded, every ship's
    track is resampled to a 10-minute grid (padded to at least three days and
    forward-filled), and min/max/mean/std/skew/sum of x, y, speed and direction
    are computed per ship, plus a few extent-based features.

    Parameters
    ----------
    train_path : str
        Directory of training CSVs (must contain a ``type`` column).
    test_path : str
        Directory of test CSVs.

    Returns
    -------
    X_train : numpy.ndarray
        Feature rows of the ships found in ``train_path``, ordered by ship ID.
    y_train : numpy.ndarray
        ``type`` label of each training ship, aligned with ``X_train``.
    X_test : numpy.ndarray
        Feature rows of the remaining ships, ordered by ship ID.
    columns : pandas.Index
        Feature names, in the column order of ``X_train`` / ``X_test``.
    """
    train_df = _read_track_dir(train_path)
    test_df = _read_track_dir(test_path)

    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

    # Remember which ships are training ships so the split below does not depend on
    # a hard-coded row count.
    train_ids = train_df['渔船ID'].unique()

    all_df = pd.concat([train_df, test_df], sort=False)
    all_df = pd.concat([_resample_track(ship_id, group)
                        for ship_id, group in all_df.groupby('渔船ID')])

    # Named aggregations: <prefix>_<stat> for each signal, in the order x, y, v, d.
    stats = ('min', 'max', 'mean', 'std', 'skew', 'sum')
    signals = (('x', 'x'), ('y', 'y'), ('v', '速度'), ('d', '方向'))
    named_aggs = {'{}_{}'.format(prefix, stat): (column, stat)
                  for prefix, column in signals for stat in stats}
    new_df = all_df.groupby('渔船ID').agg(**named_aggs)

    new_df['x_max-x_min'] = new_df['x_max'] - new_df['x_min']
    new_df['y_max-y_min'] = new_df['y_max'] - new_df['y_min']
    new_df['x_max-y_min'] = new_df['x_max'] - new_df['y_min']
    new_df['y_max-x_min'] = new_df['y_max'] - new_df['x_min']

    # A zero x extent is replaced by 0.001 to avoid dividing by zero.
    new_df['slope'] = new_df['y_max-y_min'] / np.where(new_df['x_max-x_min'] == 0, 0.001, new_df['x_max-x_min'])
    new_df['area'] = new_df['x_max-x_min'] * new_df['y_max-y_min']

    # Both groupby calls sort by ship ID, so the label rows line up with new_df.
    new_df['type'] = all_df.groupby('渔船ID').agg(type=('type', 'first'))['type'].values

    features = new_df.drop(columns=['type'])
    print(features.columns)

    is_train = new_df.index.isin(train_ids)
    X_train = features.loc[is_train].values
    y_train = new_df.loc[is_train, 'type'].values
    X_test = features.loc[~is_train].values

    return X_train, y_train, X_test, features.columns

In [3]:
def feature_generate_tsfresh():
    """Load the precomputed feature tables and keep the most important columns.

    Reads ``./train_v2.csv`` and ``./test_v2.csv``, fits a LightGBM multiclass
    model on all training columns, takes the 24 columns with the highest
    feature importance and drops four x/y minimum/maximum columns from them.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, selected_columns)`` where the X arrays
        contain only the selected columns, ``y_train`` holds the original
        (decoded) ``type`` labels and ``selected_columns`` is a pandas Index.
    """
    train_table = pd.read_csv('./train_v2.csv')
    train_features = train_table.drop(columns=['type'])

    # LightGBM is trained on integer class ids; the encoder maps them back at the end.
    encoder = preprocessing.LabelEncoder()
    encoded_labels = encoder.fit_transform(train_table['type'])

    test_table = pd.read_csv('./test_v2.csv')
    test_features = test_table[train_features.columns]

    ranker = lgb.LGBMClassifier(n_estimators=400, objective='multiclass')
    ranker.fit(train_features.values, encoded_labels)

    # Column positions sorted from most to least important.
    by_importance = np.argsort(ranker.feature_importances_)[::-1]
    top_columns = train_features.columns[by_importance[:24]]
    excluded = ['x__minimum', 'y__maximum', 'y__minimum', 'x__maximum']
    selected_columns = top_columns[~np.isin(top_columns, excluded)]
    print(selected_columns)

    return (
        train_features[selected_columns].values,
        encoder.inverse_transform(encoded_labels),
        test_features[selected_columns].values,
        selected_columns,
    )

In [4]:
# Build the hand-crafted summary-statistic features; also yields the training
# labels and the feature names (the column names are printed as a side effect).
X_train_manually, y_train, X_test_manually, feature_manually = feature_generate_manually()

Index(['x_min', 'x_max', 'x_mean', 'x_std', 'x_skew', 'x_sum', 'y_min',
       'y_max', 'y_mean', 'y_std', 'y_skew', 'y_sum', 'v_min', 'v_max',
       'v_mean', 'v_std', 'v_skew', 'v_sum', 'd_min', 'd_max', 'd_mean',
       'd_std', 'd_skew', 'd_sum', 'x_max-x_min', 'y_max-y_min', 'x_max-y_min',
       'y_max-x_min', 'slope', 'area'],
      dtype='object')


In [5]:
# Build the second feature set from ./train_v2.csv and ./test_v2.csv (selected column
# names are printed as a side effect).
# NOTE(review): this rebinds y_train to the labels read from ./train_v2.csv. The two
# feature matrices are later concatenated column-wise, so the rows of train_v2.csv /
# test_v2.csv presumably follow the same ship order as the manual features — confirm.
X_train_tsfresh, y_train, X_test_tsfresh, feature_tsfresh = feature_generate_tsfresh()

Index(['x__quantile__q_0.1', 'y__quantile__q_0.8', 'y__quantile__q_0.9',
       'y__quantile__q_0.7', 'x__quantile__q_0.2',
       '速度__number_crossing_m__m_1', 'x__quantile__q_0.9',
       'x__quantile__q_0.4', 'y__number_cwt_peaks__n_1', '方向__quantile__q_0.9',
       '速度__agg_autocorrelation__f_agg_"median"__maxlag_40', 'x__median',
       'y__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_1__w_20',
       '速度__agg_autocorrelation__f_agg_"var"__maxlag_40', 'x__quantile__q_0.3',
       'y__quantile__q_0.6', '速度__quantile__q_0.7',
       'y__change_quantiles__f_agg_"mean"__isabs_True__qh_0.2__ql_0.0',
       'y__fft_coefficient__coeff_66__attr_"angle"',
       'x__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_0__w_2'],
      dtype='object')


In [6]:
# Turn the class labels into integer ids; `le` is kept so predictions can be decoded.
le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

# Put the two feature sets side by side (same rows, more columns).
X_train = np.concatenate((X_train_manually, X_train_tsfresh), axis=1)
X_test = np.concatenate((X_test_manually, X_test_tsfresh), axis=1)

In [7]:
from sklearn.metrics import f1_score

def evaluate_macroF1_lgb(truth, predictions):
    """Custom LightGBM evaluation metric: macro-averaged F1 for a multiclass model.

    Parameters
    ----------
    truth : array-like of shape (n_samples,)
        True class ids.
    predictions : numpy.ndarray
        Predicted class scores, either as a 2-D ``(n_samples, n_classes)`` array
        or as a flat array of ``n_classes * n_samples`` values grouped by class
        first (all samples of class 0, then all samples of class 1, ...).

    Returns
    -------
    tuple
        ``('macroF1', score, True)`` — metric name, value, and a flag telling
        LightGBM that higher is better.
    """
    truth = np.asarray(truth)
    predictions = np.asarray(predictions)
    if predictions.ndim == 2:
        # One row per sample: the best class is the arg-max along the class axis.
        pred_labels = predictions.argmax(axis=1)
    else:
        # Derive the class count from the array size rather than from the labels
        # present in `truth`, which under-counts when a fold is missing a class.
        num_classes = predictions.size // len(truth)
        pred_labels = predictions.reshape(num_classes, -1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True)

In [8]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

# 5-fold cross-validation: one LightGBM model per fold, early-stopped on the held-out
# fold, scored with macro F1 on that same fold.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

model_list = []
score_list = []
for train_index, test_index in kf.split(X_train):
    model = lgb.LGBMClassifier(n_estimators=1000, objective='multiclass')
    eval_set = [(X_train[test_index], y_train[test_index])]
    # Early stopping and log silencing go through callbacks: the `early_stopping_rounds`
    # and `verbose` keyword arguments of fit() were removed in LightGBM 4.0.
    # (lgb.log_evaluation requires LightGBM >= 3.3.)
    model.fit(X=X_train[train_index], y=y_train[train_index], eval_metric=evaluate_macroF1_lgb,
              eval_set=eval_set,
              callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False),
                         lgb.log_evaluation(period=0)])
    model_list.append(model)
    score_list.append(f1_score(y_train[test_index], model.predict(X_train[test_index]), average='macro'))

print(score_list)
print(np.mean(score_list), np.std(score_list))

[0.873776499668245, 0.8933404341669421, 0.8966928987031043, 0.873174568533544, 0.8680298972431643]
0.881002859663 0.011663560729452257


In [9]:
# Class probabilities for the test set from every fold model.
result_list = [fold_model.predict_proba(X_test) for fold_model in model_list]

# Average the probabilities over the folds and take the most likely class
# (the arg-max is unaffected by the scale of the average).
result = np.mean(result_list, axis=0).argmax(axis=1)

# Decode class ids back to the original labels and write the submission file,
# indexed 7000..8999 and without a header row.
result = le.inverse_transform(result)
submission = pd.DataFrame(result, index=range(7000, 9000))
submission.to_csv('result.csv', header=None)

In [12]:
# Feature names in the same column order as X_train: manual features first, then tsfresh.
feature_names = np.concatenate([feature_manually.values, feature_tsfresh])

# One importance table per fold model.
ret = [
    pd.DataFrame({'name': feature_names,
                  'score': fold_model.feature_importances_,
                  'fold': fold})
    for fold, fold_model in enumerate(model_list)
]

# Mean importance per feature across folds, highest first.
df = (
    pd.concat(ret)
    .groupby('name', as_index=False)['score']
    .mean()
    .sort_values(['score'], ascending=False)
)

In [13]:
# Show the fold-averaged feature importances, sorted from highest to lowest score.
df

Unnamed: 0,name,score
38,y_max-x_min,546.6
23,x_max-y_min,492.0
12,v_std,443.0
47,"速度__agg_autocorrelation__f_agg_""var""__maxlag_40",438.4
29,"y__change_quantiles__f_agg_""mean""__isabs_True_...",429.6
32,y__number_cwt_peaks__n_1,428.8
48,速度__number_crossing_m__m_1,426.8
30,"y__cwt_coefficients__widths_(2, 5, 10, 20)__co...",414.0
14,"x__cwt_coefficients__widths_(2, 5, 10, 20)__co...",414.0
46,"速度__agg_autocorrelation__f_agg_""median""__maxla...",365.8
