In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import f1_score
import lightgbm as lgb

In [2]:
def rmsle_cv(model=None, X_train=None, y_train=None):
    seed = 42
    n_folds = 5
    kf = KFold(n_folds, shuffle=True, random_state=seed).get_n_splits(X_train)
    score = cross_val_score(model, X_train, y_train,
                           scoring="f1_macro", cv = kf)
    return score

In [3]:
def feature_generate_manually():
    print('reading csv')
    train_path = '../data/hy_round1_train_20200102'
    test_path = '../data/hy_round1_testA_20200102'

    train_df_list = []
    for file_name in os.listdir(train_path):
        df = pd.read_csv(os.path.join(train_path, file_name))
        train_df_list.append(df)

    test_df_list = []
    for file_name in os.listdir(test_path):
        df = pd.read_csv(os.path.join(test_path, file_name))
        test_df_list.append(df)
        
    train_df = pd.concat(train_df_list)
    test_df = pd.concat(test_df_list)

    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')
    
    all_df = pd.concat([train_df, test_df], sort=False)

    print('calculating features')
    data_list = []
    type_list = []
    for ship_id, group in all_df.groupby('渔船ID'):
        data_list.append([])
        data_list[-1].append(ship_id)

        group = group.sort_values(by=['time'])

        data_list[-1].append(np.mean(group['速度'].values))
        data_list[-1].append(np.std(group['速度'].values))

        data_list[-1].append(np.mean(group['方向'].values))
        data_list[-1].append(np.std(group['方向'].values))

        data_list[-1].append(np.max(group['x'].values) - np.min(group['x'].values))
        data_list[-1].append(np.max(group['y'].values) - np.min(group['y'].values))
        data_list[-1].append(data_list[-1][-1] * data_list[-1][-2])

        type_list.append(group['type'].values[0])

    all_df = pd.DataFrame(np.array(data_list)[:, 1:], columns=['v_mean', 'v_std', 'd_mean', 'd_std',
                                                               'x_range', 'y_range', 'area'],
                          index=np.array(data_list)[:, 0])
    all_df['type'] = type_list

    X_train = all_df.drop(columns=['type']).iloc[:7000].values
    y_train = all_df.iloc[:7000]['type'].values

    X_test = all_df.drop(columns=['type']).iloc[7000:].values

    return X_train, y_train, X_test

In [4]:
def feature_generate_tsfresh():
    train_df = pd.read_csv('./train.csv')
    X_train = train_df.drop(columns=['type'])
    y_train = train_df['type']

    test_df = pd.read_csv('./test.csv')
    X_test = test_df[X_train.columns]
    
    base_model =  lgb.LGBMClassifier(n_estimators=1000, subsample=0.8)
    base_model.fit(X_train.values, y_train)
    
    selected_columns = X_train.columns[np.argsort(base_model.feature_importances_)[::-1][:20]]
    print(selected_columns)
    
    X_train = X_train[selected_columns].values
    X_test = X_test[selected_columns].values
    
    return X_train, y_train, X_test

In [5]:
X_train_manually, y_train, X_test_manually = feature_generate_manually()

reading csv
calculating features


In [6]:
X_train_tsfresh, y_train, X_test_tsfresh = feature_generate_tsfresh()

Index(['x__quantile__q_0.1', 'x__minimum', 'y__quantile__q_0.9', 'y__maximum',
       'x__quantile__q_0.2', 'y__quantile__q_0.8', 'y__quantile__q_0.7',
       '速度__number_crossing_m__m_1', 'y__minimum', 'x__quantile__q_0.3',
       '速度__agg_autocorrelation__f_agg_"median"__maxlag_40',
       'y__number_cwt_peaks__n_1', 'x__maximum', 'x__quantile__q_0.9',
       '速度__quantile__q_0.7', 'x__quantile__q_0.4',
       '方向__ar_coefficient__k_10__coeff_0', 'y__quantile__q_0.6',
       '速度__fft_coefficient__coeff_6__attr_"real"',
       '方向__fft_coefficient__coeff_64__attr_"abs"'],
      dtype='object')


In [7]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

In [8]:
lc = lgb.LGBMClassifier(n_estimators=1000, subsample=0.8)

cv_result = rmsle_cv(lc, X_train_manually, y_train)
print(np.mean(cv_result), np.std(cv_result))

cv_result = rmsle_cv(lc, X_train_tsfresh, y_train)
print(np.mean(cv_result), np.std(cv_result))

0.6220162695040433 0.012900053865828282
0.8988165642852117 0.008442984669025915


In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

kf = KFold(n_splits=5, random_state=42, shuffle=True)

model_list = []
score_list = []
for train_index, test_index in kf.split(X_train_tsfresh):
    model = lgb.LGBMClassifier(n_estimators=1000, subsample=0.8)
    model.fit(X_train_tsfresh[train_index], y_train[train_index])
    model_list.append(model)
    score_list.append(f1_score(y_train[test_index], model.predict(X_train_tsfresh[test_index]), average='macro'))
    
print(score_list)
print(np.mean(score_list), np.std(score_list))

[0.8988532592994866, 0.899006025314213, 0.9163367360450878, 0.8865767547090165, 0.8938826099175392]
0.8989310770570687 0.0098090258312469


In [10]:
X_train = np.concatenate([X_train_tsfresh], axis=1)
X_test = np.concatenate([X_test_tsfresh], axis=1)

In [11]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)

model_list = []
score_list = []
for train_index, test_index in kf.split(X_train):
    model = lgb.LGBMClassifier(n_estimators=1000, subsample=0.8)
    model.fit(X_train[train_index], y_train[train_index])
    model_list.append(model)
    score_list.append(f1_score(y_train[test_index], model.predict(X_train[test_index]), average='macro'))
    
print(score_list)
print(np.mean(score_list), np.std(score_list))

result_list = []
for model in model_list:
    result = model.predict_proba(X_test)
    result_list.append(result)

result = np.argmax(np.sum(np.array(result_list), axis=0) / 5, axis=1)

[0.8988532592994866, 0.899006025314213, 0.9163367360450878, 0.8865767547090165, 0.8938826099175392]
0.8989310770570687 0.0098090258312469


In [12]:
result = le.inverse_transform(result)
pd.DataFrame(result, index=range(7000, 9000)).to_csv('result.csv', header=None)