In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split,KFold
import optuna
from optuna.samplers import TPESampler

In [None]:
# Move into the competition's input directory so that later cells can read
# the CSV files by bare filename.  The path is relative, so an unconditional
# os.chdir() raises FileNotFoundError when this cell is executed a second
# time; the guard below makes the cell safe to re-run.
DATA_DIR = "../input/tabular-playground-series-jan-2021"
if os.path.basename(os.getcwd()) != os.path.basename(DATA_DIR):
    os.chdir(DATA_DIR)

# 1. Data loading

In [None]:
# Read the training and test tables from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Preview the first rows of the training table.
train.head()

There is no missing data in either the train data or the test data.

# 2. Data visualization

In [None]:
# Box plot of the target.  The series is passed by keyword (x=) because
# seaborn deprecated passing the data vector positionally to this function.
# The trailing semicolon hides the Axes repr so only the figure is shown.
sns.boxplot(x=train["target"]);

There is a possible outlier with a value of 0.

In [None]:
# Histogram of the target with a kernel-density overlay, on a density scale.
# sns.distplot is deprecated in seaborn (v0.11+); histplot with kde=True and
# stat="density" is the documented replacement for its default appearance.
sns.histplot(x=train["target"], kde=True, stat="density");

This is a bimodal distribution.

# **3. Data Processing**

Feature engineering

In [None]:
# Feature matrix: every training column except the row id and the label.
X = train.drop(columns=["id", "target"])
# Regression target.
y = train.target

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the raw features with all polynomial terms up to degree 2, without
# the constant bias column.
# Note: interaction_only=True was tried and gave slightly worse results.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = pd.DataFrame(poly.fit_transform(X))
X_poly

In [None]:
# Convert the integer column labels to strings so they can be matched
# against the string names used in the selected-feature list.
X_poly.columns=X_poly.columns.astype("str")

In [None]:
# Features chosen with BorutaShap; see the previous notebook for details.
# Each entry is the (stringified) column position in the polynomial expansion.
selected_columns = [
    '49', '107', '27', '1', '83', '110', '28', '86', '2', '14',
    '80', '11', '87', '6', '61', '3', '0', '16', '114', '37',
    '89', '98', '70', '10', '5', '62', '4', '12', '81', '104',
    '51', '39', '23', '111', '13', '9', '26', '102', '33', '75',
    '88', '94', '8', '99', '79', '108', '44', '50', '71', '24',
    '7', '17', '59', '112', '22', '57',
]
X_poly = X_poly.loc[:, selected_columns]
X_poly.head()

In [None]:
# Build the test feature matrix with exactly the same layout as the training
# one.
#
# * Columns are taken from X (the training feature frame) rather than
#   "everything except id": a later cell adds a `target` column to `test`, so
#   re-running this cell afterwards would otherwise feed an extra column into
#   the expansion and silently shift every selected column position.
# * The transformer fitted on the training features is reused (transform, not
#   fit_transform on a fresh instance), so the output column order is
#   guaranteed to match the one `selected_columns` refers to.
test1 = test[X.columns]
test_poly = pd.DataFrame(poly.transform(test1))
test_poly.columns = test_poly.columns.astype("str")
test_poly = test_poly[selected_columns]
test_poly.head()

In [None]:
# params from this kernel https://www.kaggle.com/kailex/tabular-playground
#
# Corrections relative to the copied dictionary:
#   * 'max_dept' was a typo and is now 'max_depth' (the value -1 is unchanged).
#   * 'num_iterations': 5000 was removed: it is an alias of 'n_estimators',
#     which is already set to the same value.
#   * 'min_child_samples': 20 was removed: it is an alias of
#     'min_data_in_leaf', which is set to 100.  Supplying two different values
#     for one setting is ambiguous; only the 'min_data_in_leaf' entry is kept.
params = {
    'random_state': 33,
    'n_estimators': 5000,
    'min_data_per_group': 5,
    'boosting_type': 'gbdt',
    'num_leaves': 256,
    'max_depth': -1,
    'learning_rate': 0.005,
    'subsample_for_bin': 200000,
    'lambda_l1': 1.074622455507616e-05,
    'lambda_l2': 2.0521330798729704e-06,
    'n_jobs': -1,
    'cat_smooth': 1.0,
    'silent': True,
    'importance_type': 'split',
    'metric': 'rmse',
    'feature_pre_filter': False,
    'bagging_fraction': 0.8206341150202605,
    'min_data_in_leaf': 100,
    'min_sum_hessian_in_leaf': 0.001,
    'bagging_freq': 6,
    'feature_fraction': 0.5,
    'min_gain_to_split': 0.0,
}


# Instantiate a regressor from the parameter set; the cross-validation cell
# builds a fresh model from the same `params` for every fold.
model = lgbm.LGBMRegressor(**params)

# 4. Modeling

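LightGBM regressor + 5-fold cross-validation (rows are assigned to folds at random in balanced groups of five; this is not scikit-learn's StratifiedKFold)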

In [None]:
import random

# Assign every training row to one of N_FOLDS validation folds.
#
# Rows are labelled in consecutive groups of N_FOLDS; inside each group the
# labels 1..N_FOLDS are shuffled, so the folds stay balanced in size.
#
# * A dedicated, seeded generator makes the assignment reproducible across
#   kernel restarts (an unseeded shuffle gives different folds on every run).
# * Labels are generated until every row is covered and then truncated, so a
#   row count that is not a multiple of N_FOLDS no longer causes a length
#   mismatch when the column is assigned.
N_FOLDS = 5
FOLD_SEED = 33

tmp = X_poly.copy()
fold_rng = random.Random(FOLD_SEED)
fold_list = list(range(1, N_FOLDS + 1))
folds = []
while len(folds) < len(tmp):
    fold_rng.shuffle(fold_list)
    folds.extend(fold_list)
tmp['fold'] = folds[:len(tmp)]
tmp.head(7)

In [None]:
# Train one LightGBM model per fold and average the test-set predictions.
#
# Rows are selected with boolean masks instead of feeding index *labels* to
# the positional .iloc indexer, which is only correct while every frame keeps
# a default 0..n-1 index.  The number of folds is read from the fold column,
# so the loop and the final average cannot disagree about it.
fold_ids = sorted(tmp['fold'].unique())
predictions = np.zeros(len(test_poly))
for fold in fold_ids:
    # True for the rows held out as this fold's validation set.
    val_mask = (tmp['fold'] == fold).to_numpy()

    X_train, y_train = X_poly[~val_mask], y[~val_mask]
    X_val, y_val = X_poly[val_mask], y[val_mask]

    model = lgbm.LGBMRegressor(**params)
    eval_set = [(X_val, y_val)]
    # Early stopping is not used: every model trains for the full number of
    # boosting rounds given in `params`.
    model.fit(X_train, y_train, eval_metric='rmse', eval_set=eval_set, verbose=False)
    predictions += model.predict(test_poly)
predictions = predictions / len(fold_ids)

In [None]:
# Store the fold-averaged predictions next to the test ids.
test = test.assign(target=predictions)

# 5. Submission

In [None]:
# Load the sample submission file from the current working directory and
# preview its first rows.
sample_submission=pd.read_csv("sample_submission.csv")
sample_submission.head()

In [None]:
# Keep only the ids from the sample file and join the predicted targets
# onto them by id (inner join, the merge default).
submission_ids = sample_submission[["id"]]
predicted_targets = test[["id", "target"]]
sample_submission = submission_ids.merge(predicted_targets, on="id")
sample_submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv("/kaggle/working/submission6.csv", index=False)