In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
def get_feature_manually(train_path='../data/hy_round1_train_20200102',
                         test_path='../data/hy_round1_testA_20200102',
                         n_train=7000):
    """Build hand-crafted per-ship summary statistics.

    Every CSV file found in ``train_path`` and ``test_path`` is loaded, the rows
    are grouped by ``渔船ID`` and, for each of ``x``, ``y``, ``速度`` and ``方向``,
    the min / max / mean / std / skew / sum are computed.  Four range features,
    a bounding-box slope and a bounding-box area are appended, which gives
    30 feature columns per ship.

    Parameters
    ----------
    train_path, test_path : str
        Directories holding the per-ship CSV files.
    n_train : int
        Number of leading rows (ships ordered by ``渔船ID``) treated as the
        training set; the remaining rows form the test set.

    Returns
    -------
    X_train : numpy.ndarray
        Feature rows of the first ``n_train`` ships.
    y_train : pandas.Series
        ``type`` label (first value per ship) of those ships.
    X_test : numpy.ndarray
        Feature rows of the remaining ships.
    """
    def _read_dir(dir_path):
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # load order reproducible between runs and machines.
        frames = [pd.read_csv(os.path.join(dir_path, file_name))
                  for file_name in sorted(os.listdir(dir_path))]
        frame = pd.concat(frames, ignore_index=True)
        frame['time'] = pd.to_datetime(frame['time'], format='%m%d %H:%M:%S')
        return frame

    all_df = pd.concat([_read_dir(train_path), _read_dir(test_path)],
                       sort=False, ignore_index=True)

    # Generate the named-aggregation spec instead of spelling out 24 entries;
    # the resulting column order is x_*, y_*, v_*, d_* with the stats in the
    # order listed below.
    stats = ('min', 'max', 'mean', 'std', 'skew', 'sum')
    sources = (('x', 'x'), ('y', 'y'), ('v', '速度'), ('d', '方向'))
    agg_spec = {'{}_{}'.format(prefix, stat): (column, stat)
                for prefix, column in sources
                for stat in stats}

    grouped = all_df.groupby('渔船ID')
    new_df = grouped.agg(**agg_spec)

    new_df['x_max-x_min'] = new_df['x_max'] - new_df['x_min']
    new_df['y_max-y_min'] = new_df['y_max'] - new_df['y_min']
    new_df['x_max-y_min'] = new_df['x_max'] - new_df['y_min']
    new_df['y_max-x_min'] = new_df['y_max'] - new_df['x_min']

    # A zero x-extent is replaced by 0.001 so the slope stays finite.
    x_extent = np.where(new_df['x_max-x_min'] == 0, 0.001, new_df['x_max-x_min'])
    new_df['slope'] = new_df['y_max-y_min'] / x_extent
    new_df['area'] = new_df['x_max-x_min'] * new_df['y_max-y_min']

    features = new_df.copy()
    # groupby sorts by ship ID, so `features` and the label vector share the
    # same row order.
    labels = grouped['type'].first().values

    X_train = features.iloc[:n_train]
    y_train = pd.Series(labels, index=features.index, name='type').iloc[:n_train]
    X_test = features.iloc[n_train:]

    return X_train.values, y_train, X_test.values


In [3]:
def get_feature_tsfresh(train_csv='../history/train.csv', test_csv='../history/test.csv'):
    """Load precomputed feature tables and impute the test set.

    The training table is split into features (every column except ``type``)
    and the ``type`` label.  The test table is restricted to the training
    feature columns, in the training column order; infinities are converted
    to NaN and every NaN is replaced by the mean of its test column.

    NOTE(review): only the test matrix is sanitised here; the training matrix
    is returned exactly as read.

    Parameters
    ----------
    train_csv, test_csv : str
        Paths of the CSV files to read.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test)``.
    """
    train_df = pd.read_csv(train_csv)
    X_train = train_df.drop(columns=['type'])
    y_train = train_df['type']

    X_test = pd.read_csv(test_csv)[X_train.columns]
    X_test = X_test.replace([np.inf, -np.inf], np.nan)

    # Assignment-based fill: the previous `X_test[col].fillna(..., inplace=True)`
    # operated on a temporary column object and is a no-op under pandas
    # Copy-on-Write.  Columns that are entirely NaN stay NaN, as before.
    X_test = X_test.fillna(X_test.mean(numeric_only=True))

    return X_train.values, y_train.values, X_test.values

In [4]:
def get_cross_features(train_path='../data/hy_round1_train_20200102',
                       test_path='../data/hy_round1_testA_20200102',
                       n_train=7000):
    """Build per-ship features conditioned on direction sector and speed band.

    Each row is assigned

    * a speed band ``v`` in 0..5 using the upper edges 3, 8, 14, 19, 23
      (values above 23 fall in band 5), and
    * a direction sector ``d`` in 0..7 made of 45-degree sectors, where
      sector 0 covers ``> 337.5`` or ``<= 22.5``.

    For every ship (``渔船ID``) and every sector the features are
    ``[min x, max x, min y, max y, mean 速度]``; for every speed band they are
    ``[min x, max x, min y, max y, mean 方向]``.  A sector/band without rows
    contributes five ``-1`` placeholders.  That yields 8*5 + 6*5 = 70 columns.

    Parameters
    ----------
    train_path, test_path : str
        Directories holding the per-ship CSV files.
    n_train : int
        Number of leading rows (ships ordered by ``渔船ID``) returned as the
        training block.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_features, test_features)``.
    """
    speed_edges = [3, 8, 14, 19, 23]
    direction_edges = [22.5, 67.5, 112.5, 157.5, 202.5, 247.5, 292.5, 337.5]

    def _read_dir(dir_path):
        # Sorted so the load order does not depend on the file system.
        frames = [pd.read_csv(os.path.join(dir_path, file_name))
                  for file_name in sorted(os.listdir(dir_path))]
        frame = pd.concat(frames, ignore_index=True)
        frame['time'] = pd.to_datetime(frame['time'], format='%m%d %H:%M:%S')
        return frame

    def _bin_features(rows, mean_column):
        # Bounding box of the rows plus the mean of `mean_column`;
        # five -1 placeholders when the bin is empty.
        if rows.shape[0] == 0:
            return [-1] * 5
        x = rows['x'].values
        y = rows['y'].values
        return [np.min(x), np.max(x), np.min(y), np.max(y),
                np.mean(rows[mean_column].values)]

    all_df = pd.concat([_read_dir(train_path), _read_dir(test_path)],
                       sort=False, ignore_index=True)

    # right=True reproduces the `value <= edge` comparisons of the original
    # if/elif chains.
    all_df['v'] = np.digitize(all_df['速度'].values, speed_edges, right=True)
    # digitize returns 8 for values above 337.5; `% 8` folds that back into
    # sector 0.  NaN directions also land in sector 0 here, whereas the
    # previous row loop appended nothing for them and then failed on the
    # column assignment because of the length mismatch.
    all_df['d'] = np.digitize(all_df['方向'].values, direction_edges, right=True) % 8

    cross_features = []
    for _, group in all_df.groupby('渔船ID'):
        # Kept from the original so rows reach the reducers in the same order.
        group = group.sort_values(by=['time'])
        features = []
        for sector in range(8):
            features.extend(_bin_features(group[group['d'] == sector], '速度'))
        for band in range(6):
            features.extend(_bin_features(group[group['v'] == band], '方向'))
        cross_features.append(np.array(features))
    cross_features = np.array(cross_features)

    return cross_features[:n_train], cross_features[n_train:]

In [5]:
def get_strict_features(train_csv='./train.csv', test_csv='./test.csv'):
    """Load the feature tables stored next to the notebook and impute the test set.

    The training table is returned with all of its columns.  The test table is
    restricted to those columns, in the same order; infinities are converted to
    NaN and every NaN is replaced by the mean of its test column.

    NOTE(review): only the test matrix is sanitised here; the training matrix
    is returned exactly as read.

    Parameters
    ----------
    train_csv, test_csv : str
        Paths of the CSV files to read.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test)``.
    """
    X_train = pd.read_csv(train_csv)

    X_test = pd.read_csv(test_csv)[X_train.columns]
    X_test = X_test.replace([np.inf, -np.inf], np.nan)

    # Assignment-based fill: the previous `X_test[col].fillna(..., inplace=True)`
    # operated on a temporary column object and is a no-op under pandas
    # Copy-on-Write.  Columns that are entirely NaN stay NaN, as before.
    X_test = X_test.fillna(X_test.mean(numeric_only=True))

    return X_train.values, X_test.values

In [6]:
# Hand-crafted per-ship statistics; also provides an initial y_train.
X_train_manually, y_train, X_test_manually = get_feature_manually()

In [7]:
# Rebinds y_train: from here on the labels are the ones read from
# ../history/train.csv, replacing those returned by get_feature_manually().
# NOTE(review): the row order of that CSV is not visible in this notebook;
# confirm it matches the ship-ID ordering of the other feature blocks before
# they are concatenated column-wise.
X_train_tsfresh, y_train, X_test_tsfresh = get_feature_tsfresh()

In [8]:
# Direction-sector / speed-band features (no labels returned).
X_train_cross, X_test_cross = get_cross_features()

In [9]:
# Feature tables read from ./train.csv and ./test.csv (no labels returned).
X_train_strict, X_test_strict = get_strict_features()

In [10]:
# Sanity check: the four training blocks must share the same row count
# before they are concatenated column-wise.
X_train_manually.shape, X_train_tsfresh.shape, X_train_cross.shape, X_train_strict.shape

((7000, 30), (7000, 1587), (7000, 70), (7000, 1276))

In [11]:
# Stack the four feature blocks side by side, in the same block order for
# train and test so that column positions line up.
X_train = np.hstack((X_train_manually, X_train_tsfresh, X_train_cross, X_train_strict))
X_test = np.hstack((X_test_manually, X_test_tsfresh, X_test_cross, X_test_strict))

In [12]:
from feature_selector import FeatureSelector

In [15]:
# Wrap the combined training matrix for the feature_selector helper.
# pd.DataFrame on a bare array assigns integer column labels, which are the
# labels used to filter X_train / X_test later on.
fs = FeatureSelector(data=pd.DataFrame(X_train), labels=y_train)

In [16]:
# Flag features whose correlation magnitude exceeds 0.98 (see the summary
# printed below).
# NOTE(review): one_hot=False presumably skips one-hot encoding before the
# correlations are computed - confirm against the feature_selector docs.
fs.identify_collinear(correlation_threshold=0.98, one_hot=False)

1047 features with a correlation magnitude greater than 0.98.



In [17]:
# Drop the features flagged by identify_collinear.
# NOTE: despite its name, train_no_missing is the collinearity-filtered
# frame; only the 'collinear' method is applied here.
train_no_missing = fs.remove(methods=['collinear'])

Removed 1047 features.


In [18]:
# Keep only the columns that survived the collinearity filter, applying the
# same selection to train and test.
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

print(X_train.shape, X_test.shape)

kept_columns = train_no_missing.columns
X_train = X_train.loc[:, X_train.columns.isin(kept_columns)].values
X_test = X_test.loc[:, X_test.columns.isin(kept_columns)].values

print(X_train.shape, X_test.shape)

(7000, 2963) (2000, 2963)
(7000, 1916) (2000, 1916)


In [13]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
import os
import lightgbm as lgb
from sklearn import preprocessing

In [14]:
# Encode the class labels as integers; `le` is kept so that predictions can
# be mapped back to the original labels later.
le = preprocessing.LabelEncoder().fit(y_train)
y_train = le.transform(y_train)

In [30]:
# Recursive feature elimination driven by an extra-trees classifier;
# n_features_to_select is left at its default.
rfe_estimator = ExtraTreesClassifier(
    criterion="entropy",
    max_features=0.7000000000000001,
    n_estimators=100,
    random_state=42,
)
selector = RFE(estimator=rfe_estimator, step=0.1)
selector.fit(X_train, y_train)

RFE(estimator=ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                   class_weight=None, criterion='entropy',
                                   max_depth=None,
                                   max_features=0.7000000000000001,
                                   max_leaf_nodes=None, max_samples=None,
                                   min_impurity_decrease=0.0,
                                   min_impurity_split=None, min_samples_leaf=1,
                                   min_samples_split=2,
                                   min_weight_fraction_leaf=0.0,
                                   n_estimators=100, n_jobs=None,
                                   oob_score=False, random_state=42, verbose=0,
                                   warm_start=False),
    n_features_to_select=None, step=0.1, verbose=0)

In [31]:
# Apply the fitted RFE mask to both matrices so their columns stay aligned.
support_mask = selector.support_
X_train, X_test = X_train[:, support_mask], X_test[:, support_mask]

In [32]:
# Train and test must have the same number of columns after the RFE mask.
X_train.shape, X_test.shape

((7000, 119), (2000, 119))

In [34]:
def get_model():
    """Create a fresh, unfitted copy of the classification pipeline.

    Steps, in order: RFE with an extra-trees estimator, standard scaling,
    a TPOT StackingEstimator wrapping an SGD perceptron, and a gradient
    boosting classifier as the final estimator.  TPOT's set_param_recursive
    is then asked to set ``random_state`` to 2020 on the pipeline steps.
    """
    feature_selector = RFE(
        estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.7000000000000001, n_estimators=100),
        step=0.1,
    )
    stacked_sgd = StackingEstimator(
        estimator=SGDClassifier(
            alpha=0.001,
            eta0=0.01,
            fit_intercept=False,
            l1_ratio=1.0,
            learning_rate="invscaling",
            loss="perceptron",
            penalty="elasticnet",
            power_t=0.5,
        )
    )
    booster = GradientBoostingClassifier(
        learning_rate=0.5,
        max_depth=7,
        max_features=0.15000000000000002,
        min_samples_leaf=2,
        min_samples_split=2,
        n_estimators=100,
        subsample=0.8500000000000001,
    )

    pipeline = make_pipeline(feature_selector, StandardScaler(), stacked_sgd, booster)
    set_param_recursive(pipeline.steps, 'random_state', 2020)
    return pipeline

from sklearn.metrics import f1_score

def evaluate_macroF1_lgb(truth, predictions):
    """Macro-F1 in the ``(name, value, is_higher_better)`` tuple format.

    Parameters
    ----------
    truth : array-like
        True class indices, one per sample.  The predicted label is the
        argmax position, so the labels are expected to be 0..n_classes-1.
    predictions : array-like
        Class scores, either 2-D with shape ``(n_samples, n_classes)`` or
        flattened 1-D in class-major order (all samples' scores for class 0,
        then class 1, ...).

    Returns
    -------
    tuple
        ``('macroF1', score, True)``.
    """
    predictions = np.asarray(predictions)
    if predictions.ndim == 2:
        # Already one row per sample.
        pred_labels = predictions.argmax(axis=1)
    else:
        # Derive the class count from the array sizes rather than from
        # np.unique(truth): a split that happens to miss a class would
        # otherwise reshape the scores incorrectly (or fail outright).
        n_classes = predictions.size // len(truth)
        pred_labels = predictions.reshape(n_classes, -1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True)

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

# 5-fold cross-validation of the pipeline; every fitted fold model is kept in
# model_list so the folds can be ensembled at prediction time.
# NOTE(review): X_train was already reduced by an RFE fitted on all rows and
# labels before this split, so the held-out folds influenced feature selection
# and the scores below are likely optimistic.
kf = KFold(n_splits=5, random_state=2020, shuffle=True)

model_list = []
score_list = []
for train_index, test_index in kf.split(X_train):
    model = get_model()
    model.fit(X_train[train_index], y_train[train_index])
    model_list.append(model)

    fold_pred = model.predict(X_train[test_index])
    score_list.append(f1_score(y_train[test_index], fold_pred, average='macro'))

print(score_list)
print(np.mean(score_list), np.std(score_list))

[0.8882297551789078, 0.8972624798711756, 0.8755395133549012, 0.8843454639155391, 0.9030915599258206]
0.8896937544492689 0.009673436230771593


In [50]:
# Ensemble the fold models by averaging their class probabilities.
result_list = [fold_model.predict_proba(X_test) for fold_model in model_list]

# np.mean averages over however many fold models exist, so this cell no
# longer has to be edited when n_splits changes.
result = np.argmax(np.mean(result_list, axis=0), axis=1)

# Map the encoded class indices back to the original labels.
result = le.inverse_transform(result)

# Submission rows are indexed from 7000 upwards, one per test row; no header
# line is written.
pd.DataFrame(result, index=range(7000, 7000 + len(result))).to_csv('result.csv', header=False)

In [45]:
# NOTE(review): the recorded output (7000, 92) differs from the (7000, 119)
# recorded after the RFE mask was applied, and the execution counts are out
# of order; re-verify with Restart Kernel -> Run All.
X_train.shape

(7000, 92)

In [43]:
# Rebuild labelled frames from the selected feature matrices; the label column
# holds the original (decoded) class names.
new_train_df = pd.DataFrame(X_train).assign(type=le.inverse_transform(y_train))
new_test_df = pd.DataFrame(X_test)

In [44]:
# Persist the selected features. to_csv writes the row index as the first
# column by default, so readers of these files get an extra unnamed column.
new_train_df.to_csv('new_train.csv')
new_test_df.to_csv('new_test.csv')

In [35]:
# Displays the array repr of the training matrix (abbreviated by numpy).
X_train

array([[6.11835176e+06, 6.15203843e+06, 5.12487338e+06, ...,
        1.00000000e+00, 8.47637237e-02, 1.91638605e-01],
       [6.04947187e+06, 6.10244988e+06, 5.04285734e+06, ...,
        6.00000000e+00, 5.90770458e-01, 8.60079680e-01],
       [6.18248219e+06, 6.18319102e+06, 5.19357554e+06, ...,
        1.00000000e+00, 2.33085218e-02, 6.33547753e-02],
       ...,
       [5.22870046e+06, 5.28746208e+06, 4.57772301e+06, ...,
        7.00000000e+00, 4.69325158e-01, 7.49281328e-01],
       [6.36501974e+06, 6.38762489e+06, 5.34350537e+06, ...,
        1.20000000e+01, 6.74404855e-01, 9.38994248e-01],
       [6.20637253e+06, 6.24031719e+06, 5.13202799e+06, ...,
        3.00000000e+00, 2.57359193e-01, 4.89900342e-01]])