In [3]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae

In [2]:
import pandas
import numpy
import sklearn
import lightgbm

# Report the library versions this notebook was run with, one per line.
print('\n'.join([
    ' '.join(('pandas Version :', pandas.__version__)),
    ' '.join(('numpy Version :', numpy.__version__)),
    ' '.join(('sklearn Version :', sklearn.__version__)),
    ' '.join(('lightgbm Version :', lightgbm.__version__)),
]))

pandas Version : 1.5.1
numpy Version : 1.21.5
sklearn Version : 1.1.3
lightgbm Version : 3.2.1


In [None]:


def seed_everything(seed=42):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed as a string in the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): ``PYTHONHASHSEED`` is presumably only read at interpreter
    start-up, so setting it here likely affects child processes only, not the
    hash randomisation of the running kernel -- confirm.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
def nmae(true, pred, norm=406.22247394653374):
    """Return the mean absolute error of ``pred`` against ``true`` divided by ``norm``.

    Args:
        true: Ground-truth target values.
        pred: Predicted values, aligned with ``true``.
        norm: Normalisation constant the MAE is divided by. The default keeps
            the previously hard-coded value.
            NOTE(review): presumably the mean of the training target --
            confirm and derive it from the data instead of hard-coding.

    Returns:
        ``mae(true, pred) / norm``.
    """
    return mae(true, pred) / norm

def permutation_importances(model, vv):
    """Select the features whose shuffling makes the validation MAE worse.

    For each column of the validation features, the column is replaced by a
    random permutation of its own values (drawn from NumPy's global generator),
    the model predicts on the modified frame, and the resulting MAE is compared
    with the MAE on the untouched frame.

    Args:
        model: Fitted estimator exposing ``predict``.
        vv: Pair ``(features, target)`` for the validation fold; ``features``
            is a DataFrame.

    Returns:
        List of column names, in column order, whose permuted MAE is strictly
        greater than the baseline MAE.
    """
    features = vv[0]
    target = vv[1]
    baseline = mae(target, model.predict(features))

    def _hurts_when_shuffled(col):
        # Work on a copy so the caller's validation frame is never modified.
        shuffled = features.copy()
        shuffled[col] = np.random.permutation(shuffled[col])
        return mae(target, model.predict(shuffled)) > baseline

    return [col for col in features.columns if _hurts_when_shuffled(col)]

def _sprout_columns(df_train):
    """Return the columns of ``df_train`` whose name contains '새순'."""
    return [col for col in df_train.columns if '새순' in col]


def _new_regressor(seed):
    """Build the LightGBM regressor configuration shared by every CV pass."""
    # NOTE(review): subsample_freq is not set; LightGBM appears to ignore
    # ``subsample`` unless the bagging frequency is > 0 -- confirm.
    return LGBMRegressor(
        objective='regression',
        n_estimators=10000,
        learning_rate=0.01,
        max_depth=-1,
        subsample=0.7,
        colsample_bytree=0.3,
        random_state=seed,
    )


def _fold_xy(df_train, idx, cols, target):
    """Return the ``(features, target)`` pair for the rows labelled ``idx``."""
    return (df_train.loc[idx, cols], df_train.loc[idx, target])


def _fit_with_early_stopping(reg, tt, vv):
    """Fit ``reg`` on ``tt``, evaluating on both ``tt`` and ``vv``."""
    reg.fit(tt[0], tt[1], eval_set=[tt, vv], early_stopping_rounds=200, verbose=500)


def _cv_with_permutation_selection(df_train, df_test, target, splits, use_cols, seed):
    """Cross-validate, refitting each fold on its permutation-selected features.

    Each fold fits once on the current feature list, keeps the features
    returned by ``permutation_importances`` and refits on them.  The surviving
    list is carried into the next fold, so the feature set can only shrink as
    folds progress.
    NOTE(review): confirm this carry-over is intended rather than selecting
    from the full feature list in every fold.

    Returns:
        ``(oof, prediction)``: out-of-fold predictions for ``df_train`` and the
        fold-averaged predictions for ``df_test``.
    """
    n_folds = len(splits)
    oof = np.zeros(len(df_train))
    prediction = np.zeros(len(df_test))

    for fold, (trn_idx, val_idx) in enumerate(splits):
        print(f'-------------------------- {fold}/{n_folds} --------------------------')
        tt = _fold_xy(df_train, trn_idx, use_cols, target)
        vv = _fold_xy(df_train, val_idx, use_cols, target)

        reg = _new_regressor(seed)
        _fit_with_early_stopping(reg, tt, vv)

        permu_features = permutation_importances(reg, vv)
        # An empty selection would make the refit fail on a zero-column frame;
        # keep the current feature list in that case.
        if permu_features:
            use_cols = permu_features
        tt = _fold_xy(df_train, trn_idx, use_cols, target)
        vv = _fold_xy(df_train, val_idx, use_cols, target)

        _fit_with_early_stopping(reg, tt, vv)

        oof[val_idx] = reg.predict(vv[0])
        prediction += reg.predict(df_test[use_cols]) / n_folds

    return oof, prediction


def _cv_fixed_features(df_train, df_test, target, splits, use_cols, seed):
    """Cross-validate on a fixed feature list.

    Returns:
        ``(oof, prediction, importance)`` where ``importance`` is a DataFrame
        with columns ``feature`` and ``value`` (mean ``feature_importances_``
        over folds), sorted by ``value`` descending.
    """
    n_folds = len(splits)
    oof = np.zeros(len(df_train))
    prediction = np.zeros(len(df_test))
    # Collect per-fold frames and concatenate once; DataFrame.append is
    # deprecated and re-copies the accumulated frame on every call.
    fold_importances = []

    for fold, (trn_idx, val_idx) in enumerate(splits):
        print(f'-------------------------- {fold}/{n_folds} --------------------------')
        tt = _fold_xy(df_train, trn_idx, use_cols, target)
        vv = _fold_xy(df_train, val_idx, use_cols, target)

        reg = _new_regressor(seed)
        _fit_with_early_stopping(reg, tt, vv)

        oof[val_idx] = reg.predict(vv[0])
        prediction += reg.predict(df_test[use_cols]) / n_folds
        fold_importances.append(
            pd.DataFrame(zip(use_cols, reg.feature_importances_), columns=['feature', 'value'])
        )

    importance = (
        pd.concat(fold_importances)
        .groupby('feature')
        .mean()
        .sort_values('value', ascending=False)
        .reset_index()
    )
    return oof, prediction, importance


def main():
    """Train two LightGBM ensembles and write their 50/50 blend as a submission.

    Reads ``../open/train.csv``, ``../open/test.csv`` and
    ``../open/sample_submission.csv``.  For every seed in ``SEEDS`` it runs:

    1. a 10-fold CV whose folds refit on permutation-selected features, and
    2. a 10-fold CV on all '새순' columns to rank features by mean importance,
       followed by a second 10-fold CV on the 20 highest-ranked features.

    Test predictions are averaged over seeds within each approach, blended with
    equal weights and written to ``../submit/submission.csv``.
    """
    df_train = pd.read_csv('../open/train.csv')
    df_test = pd.read_csv('../open/test.csv')
    sub = pd.read_csv('../open/sample_submission.csv')
    target = '착과량(int)'

    seed_permu_prediction = []
    seed_ff_prediction = []
    FOLDS = 10
    SEEDS = [42, 1028, 1234, 0, 24]

    # Approach 1: per-fold permutation-based feature selection.
    for seed in SEEDS:
        seed_everything(seed=seed)

        kf = KFold(n_splits=FOLDS, random_state=seed, shuffle=True)
        splits = list(kf.split(df_train))

        oof, prediction = _cv_with_permutation_selection(
            df_train, df_test, target, splits, _sprout_columns(df_train), seed
        )
        print(nmae(df_train[target], oof), mae(df_train[target], oof), mse(df_train[target], oof))
        seed_permu_prediction.append(prediction)

    # Approach 2: rank features on a full CV pass, then retrain on the top 20.
    for seed in SEEDS:
        seed_everything(seed=seed)

        kf = KFold(n_splits=FOLDS, random_state=seed, shuffle=True)
        splits = list(kf.split(df_train))

        _, _, importance = _cv_fixed_features(
            df_train, df_test, target, splits, _sprout_columns(df_train), seed
        )
        top_cols = importance.iloc[:20, 0].tolist()

        _, prediction, _ = _cv_fixed_features(
            df_train, df_test, target, splits, top_cols, seed
        )
        seed_ff_prediction.append(prediction)

    sub[target] = np.mean(seed_permu_prediction, 0)*0.5 + np.mean(seed_ff_prediction, 0)*0.5

    # Create the output directory first so a missing folder cannot discard the
    # result of the whole training run.
    out_path = '../submit/submission.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    sub.to_csv(out_path, index=False)


# Run the full training / submission pipeline when this cell or script is
# executed directly.
if __name__ == '__main__':
    main()