In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import f1_score
import lightgbm as lgb
from tpot import TPOTClassifier
from sklearn import preprocessing

# Read the train/test feature tables; the first CSV column becomes the index.
train_df = pd.read_csv('./new_train.csv', index_col=0)
test_df = pd.read_csv('./new_test.csv', index_col=0)

# Feature matrices as plain arrays: every training column except the target
# column 'type', and every column of the test table.
X_train = train_df.drop('type', axis=1).values
X_test = test_df.values

# Map the 'type' labels to integer codes; `le` is kept so that predictions
# can be mapped back to the original labels with inverse_transform.
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(train_df['type'].values)

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tpot.export_utils import set_param_recursive

In [3]:
def get_model():
    """Build a fresh, unfitted copy of the exported classification pipeline.

    The pipeline chains, in order: a variance-threshold filter, recursive
    feature elimination driven by an extra-trees classifier, standard
    scaling, and a gradient-boosting classifier.

    Returns:
        The assembled scikit-learn pipeline, with 'random_state' fixed to 42
        through ``set_param_recursive``.
    """
    # Average CV score on the training set was: 0.9211528057469444
    variance_filter = VarianceThreshold(threshold=0.1)

    # Estimator used by RFE to rank features.
    ranking_estimator = ExtraTreesClassifier(
        criterion="entropy",
        max_features=0.9000000000000001,
        n_estimators=100,
    )
    feature_eliminator = RFE(
        estimator=ranking_estimator,
        step=0.15000000000000002,
    )

    scaler = StandardScaler()

    classifier = GradientBoostingClassifier(
        learning_rate=0.5,
        max_depth=7,
        max_features=0.1,
        min_samples_leaf=5,
        min_samples_split=16,
        n_estimators=100,
        subsample=0.9500000000000001,
    )

    exported_pipeline = make_pipeline(
        variance_filter,
        feature_eliminator,
        scaler,
        classifier,
    )

    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 42)

    return exported_pipeline

In [4]:
from sklearn.metrics import f1_score

def evaluate_macroF1_lgb(truth, predictions):
    """Macro-averaged F1 packaged as a LightGBM-style custom metric.

    Args:
        truth: 1-D array of true class labels. The number of classes is
            inferred from its distinct values, so for the flattened layout
            it must contain every class at least once.
        predictions: Per-class scores in one of two layouts:
            * 2-D, shape (n_samples, n_classes); or
            * flattened 1-D, grouped by class first (all samples' scores
              for class 0, then class 1, ...).

    Returns:
        Tuple ``('macroF1', score, True)``: metric name, metric value, and a
        flag that, in LightGBM's custom-metric convention, marks higher
        values as better.
    """
    predictions = np.asarray(predictions)
    if predictions.ndim == 2:
        # Already one row per sample: pick the best-scoring class per row.
        # Reshaping this layout to (n_classes, -1) would scramble the scores.
        pred_labels = predictions.argmax(axis=1)
    else:
        # Flattened class-major scores: restore (n_classes, n_samples) and
        # pick the best-scoring class per column.
        n_classes = len(np.unique(truth))
        pred_labels = predictions.reshape(n_classes, -1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True)

In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

# 5-fold shuffled cross-validation with a fixed seed so the splits are
# reproducible between runs.
kf = KFold(n_splits=5, random_state=2020, shuffle=True)

model_list = []  # one fitted pipeline per fold
score_list = []  # macro-F1 of each fold's pipeline on its held-out part
for train_index, test_index in kf.split(X_train):
    # Fit a fresh pipeline on this fold's training part only.
    model = get_model()
    model.fit(X_train[train_index], y_train[train_index])
    model_list.append(model)

    # Score on the held-out part of the fold.
    fold_pred = model.predict(X_train[test_index])
    score_list.append(f1_score(y_train[test_index], fold_pred, average='macro'))

print(score_list)
print(np.mean(score_list), np.std(score_list))

[0.9052682474350086, 0.9152470872513735, 0.9046503852301164, 0.9040152678011903, 0.9198496219868453]
0.9098061219409068 0.006498983219123005


In [6]:
# Class-probability matrix from each fold's fitted pipeline on the test set.
result_list = [fold_model.predict_proba(X_test) for fold_model in model_list]

# Soft voting: average the probabilities over however many fold models exist
# (no hard-coded fold count), then take the most probable column per row.
# NOTE(review): using the column position as the encoded label assumes every
# fold model saw all classes during fit -- confirm.
mean_proba = np.mean(np.array(result_list), axis=0)
encoded_pred = np.argmax(mean_proba, axis=1)

# Map integer codes back to the original 'type' labels.
result = le.inverse_transform(encoded_pred)

# NOTE(review): the ids 7000..8999 are hard-coded, which requires exactly
# 2000 test rows; presumably they equal test_df.index -- confirm.
pd.DataFrame(result, index=range(7000, 9000)).to_csv('result.csv', header=None)