In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import norm, skew

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the competition files: training set, test set and the submission template.
train = pd.read_csv("../input/tabular-playground-series-jan-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-jan-2021/test.csv")
sample_submission = pd.read_csv("../input/tabular-playground-series-jan-2021/sample_submission.csv")


In [3]:
# Set the row identifiers aside and remove them from the frames so that
# 'id' is never treated as a feature. `pop` returns the column and drops it.
train_ID = train.pop('id')
test_ID = test.pop('id')

# data processing

In [4]:
# Inspect the target: histogram with a fitted normal curve (left) next to a
# probability plot (right), then print the parameters of the normal fit.
target_fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(12, 4))
sns.distplot(train.target, fit=norm, ax=hist_ax)
stats.probplot(train.target, plot=qq_ax)
print(norm.fit(train.target))

In [5]:
# Feature columns are the ones whose name contains "cont"; the target is kept
# as a one-element list so that selecting with it yields a DataFrame.
sel_cols = list(filter(lambda name: "cont" in name, train.columns))
sel_tar = ["target"]

In [6]:
# Standardize the selected feature columns into a copy of the training frame.
# `train` itself is left untouched; `train2` carries the scaled values.
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
train2 = train.copy()
d1 = ss.fit_transform(train[sel_cols].values)  # scaler is fitted on the training rows only
train2[sel_cols] = d1

In [7]:
# Compare every selected feature before (left column) and after (right column)
# standardization.
# The number of rows is derived from `sel_cols` instead of a hard-coded 14, so
# the grid cannot run past the end of the list or silently skip features.
n_features = len(sel_cols)
f, ax = plt.subplots(
    n_features, 2,
    figsize=(14, 24 * n_features / 14),  # same 14x24 canvas when there are 14 features
    squeeze=False,                        # keep `ax` 2-D even for a single row
)
for i, col in enumerate(sel_cols):
    sns.distplot(train[col], ax=ax[i][0])
    sns.distplot(train2[col], ax=ax[i][1])

# feature engineering

In [8]:
# Pairwise correlation of all columns in the training frame, drawn as an
# annotated heatmap on a dedicated 16x16 figure.
corr = train.corr()
corr_fig, corr_ax = plt.subplots(figsize=(16, 16))
sns.heatmap(corr, annot=True, fmt='.1f', cmap="coolwarm", ax=corr_ax)

In [9]:
# Stack train and test into one feature frame (target removed) and remember
# how many rows came from each so they can be separated again.
ntrain = train.shape[0]
ntest = test.shape[0]
y = train['target'].values
all_data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .drop(columns=['target'])
)
print("All_data size is : {}".format(all_data.shape))
print(f"ntrain:{ntrain}, ntest:{ntest}")

In [10]:
# Fraction of missing values per column of the combined frame; display the
# (at most) ten columns with the highest non-zero missing rate.
all_datat_na = all_data.isnull().sum() / len(all_data)
missing_data = pd.DataFrame({"Missing Rate": all_datat_na})
(
    missing_data
    .loc[missing_data['Missing Rate'] > 0]
    .sort_values(['Missing Rate'], ascending=False)
    .head(10)
)

In [11]:
# Model inputs: X holds the raw features, X2 the standardized ones, and y the
# target column of the training frame (as a Series).
X = train[sel_cols].values
X2 = train2[sel_cols].values
y = train["target"]

# model

In [12]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, KFold,train_test_split
from sklearn.metrics import mean_squared_error

import datetime

In [13]:
!pip install pytorch-tabnet
from pytorch_tabnet.tab_model import TabNetRegressor

In [59]:
# optuna xgb
#Best_trial={
# 'gpu_id':0,
# 'tree_method': 'gpu_hist',
# 'lambda': 0.001152391713011597,
# 'alpha': 0.2809535002670384,
# 'colsample_bytree': 0.4,
# 'subsample': 0.4,
# 'learning_rate': 0.02,
# 'max_depth': 9,
# 'random_state': 2020,
# 'min_child_weight': 106,
# 'n_estimators': 10000,
#           }
#XGBR = xgb.XGBRegressor(**Best_trial)

In [79]:
# 220603
# Hyper-parameters reported by an Optuna search, used to build the tuned
# XGBoost regressor `XGBR`.
# NOTE(review): 'gpu_id' / 'gpu_hist' require a GPU-enabled XGBoost build;
# confirm they are still accepted by the installed XGBoost version.
Best_trial = {
    'gpu_id': 0,
    'tree_method': 'gpu_hist',
    'lambda': 0.27576390147239227,
    'alpha': 0.5880286226930626,
    'colsample_bytree': 0.4,
    'subsample': 0.7,
    'learning_rate': 0.01,
    'max_depth': 11,
    'random_state': 2020,
    'min_child_weight': 25,
    'n_estimators': 10000,
}
XGBR = xgb.XGBRegressor(**Best_trial)

In [61]:
# Hand-tuned XGBoost configuration, kept for reference.
#
# It is bound to its own name instead of `XGBR`: when the notebook is run top
# to bottom this cell executes after the Optuna-tuned `XGBR` is built, and
# re-assigning `XGBR` here would make the entry labelled "XGB_Optuna" in the
# model comparison silently use these manual parameters instead.
# NOTE(review): both `seed` and `random_state` are supplied; which one takes
# effect depends on the XGBoost version, so confirm and keep only one.
XGBR_BASELINE = xgb.XGBRegressor(
    gpu_id=0,
    tree_method="gpu_hist",
    learning_rate=0.01,
    n_estimators=6000,
    max_depth=4,
    min_child_weight=0,
    gamma=0.6,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006,
    random_state=42,
)

In [80]:
random_state = 2

# Models to cross-validate, declared as (label, estimator) pairs so a label can
# never drift out of step with its estimator. The two parallel lists consumed
# by the evaluation cells are derived from these pairs.
# Disabled candidates:
#   ("RF", RandomForestRegressor(random_state=random_state))
#   ("GB", GradientBoostingRegressor(random_state=random_state))
_candidates = [
    ("XGB", xgb.XGBRegressor(gpu_id=0, tree_method="gpu_hist")),
    ("XGB_Optuna", XGBR),
    ("LGBM", lgb.LGBMRegressor(device="gpu", gpu_platform_id=0, gpu_device_id=0)),
]

regressors = [estimator for _, estimator in _candidates]
cv_algorithms = [label for label, _ in _candidates]

In [81]:
# Columns used by the TabNet evaluation. The filter is "cont", the same one
# used to build `sel_cols`, so both selections always pick the same features
# (the previous "con" filter would also match any other column name that
# merely contains "con").
sel_col = [col for col in train.columns if "cont" in col]
sel_tar = ["target"]

# Per-model lists of fold scores, filled by the evaluation cells below.
cv_results_all = []

# Switches for the (slow) evaluation cells.
CV = True       # cross-validate the models in `regressors`
TABNET = False  # additionally cross-validate a TabNet regressor
TABNET=False

In [82]:
# Cross-validate every candidate model with the same shuffled 5-fold split and
# store its per-fold RMSE. The scorer returns negated RMSE, so the sign is
# flipped before storing. Timestamps are printed to track how long each model takes.
if CV:
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    for model in regressors:
        print(datetime.datetime.now(), model)
        neg_rmse = cross_val_score(
            model, X, y,
            scoring="neg_root_mean_squared_error",
            cv=cv,
        )
        cv_results_all.append(-neg_rmse)

    print(datetime.datetime.now())

In [None]:
# Optional 5-fold evaluation of a TabNet regressor, appended to the same
# result/label lists as the other models.
if TABNET:
    # Selecting the target with a list keeps y2 two-dimensional.
    # NOTE(review): this re-binds X2, which earlier held the standardized features.
    X2 = train[sel_col].to_numpy()
    y2 = train[sel_tar].to_numpy()
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = []
    for train_idx, valid_idx in kf.split(X):
        X_train, y_train = X2[train_idx], y2[train_idx]
        X_valid, y_valid = X2[valid_idx], y2[valid_idx]
        regressor = TabNetRegressor(verbose=0, seed=42)
        regressor.fit(
            X_train=X_train, y_train=y_train,
            eval_set=[(X_valid, y_valid)],
            patience=300, max_epochs=2000,
            eval_metric=['rmse'],
        )
        # presumably the best validation RMSE seen during training - verify against pytorch-tabnet
        cv_results.append(regressor.best_cost)

    cv_results_all.append(cv_results)
    cv_algorithms.append("TABNET")

In [None]:
# Summarize the cross-validation: mean and standard deviation of the fold
# scores for each model, one row per algorithm label.
if CV:
    fold_scores = np.asarray(cv_results_all)  # one row per model, one column per fold
    cv_means = fold_scores.mean(axis=1)
    cv_std = fold_scores.std(axis=1)
    cv_res = pd.DataFrame({
        "CrossValMeans": cv_means,
        "CrossValerrors": cv_std,
        "Algorithm": cv_algorithms,
    })
    print(cv_res)

In [None]:
# Display the raw per-fold scores collected for each evaluated model.
cv_results_all

# Optuna

In [24]:
import optuna
# Set to True to actually run the (slow) hyper-parameter search cells.
OPTUNA = False


def objective(trial, data=X, target=y):
    """Optuna objective: validation RMSE of an XGBoost regressor.

    Samples one hyper-parameter configuration from ``trial``, trains an
    ``XGBRegressor`` on an 80% split of ``data``/``target`` and returns the
    RMSE on the remaining 20%.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data, target :
        Features and target to split. The defaults are bound to the notebook's
        ``X`` and ``y`` at the moment this cell is executed, so re-run the cell
        after changing them.

    Returns
    -------
    float
        Root mean squared error on the held-out split (lower is better).

    Notes
    -----
    The held-out split is used both for early stopping and for the returned
    score, so the reported RMSE is optimistic.
    """
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    param = {
        'gpu_id': 0,
        'tree_method': 'gpu_hist',  # train on the GPU to speed up each trial
        # suggest_float(..., log=True) is the non-deprecated form of suggest_loguniform
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 10000,  # upper bound; early stopping below ends training sooner
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        # single-choice categorical: fixes the seed while still recording it in the trial params
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgb.XGBRegressor(**param)
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # estimator constructor rather than fit(); confirm against the installed version.
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)
    preds = model.predict(test_x)
    # sqrt(MSE) instead of mean_squared_error(squared=False): the `squared`
    # keyword is deprecated/removed in recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(test_y, preds)))
    return rmse

In [25]:
# XGBoost hyper-parameter search with Optuna: 30 trials minimizing the
# validation RMSE returned by `objective`. Skipped unless OPTUNA is True.
if OPTUNA:
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=30)
    n_finished = len(study.trials)
    best_params = study.best_trial.params
    print('Number of finished trials:', n_finished)
    print('Best_trial=', best_params)

In [26]:
# XGBoost hyper-parameter search with Optuna (second run).
# NOTE(review): this cell is a verbatim copy of the previous one - it starts a
# fresh 30-trial study and re-binds `study`; confirm the repetition is intended.
if OPTUNA:
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=30)
    finished_trials = study.trials
    winning_trial = study.best_trial
    print('Number of finished trials:', len(finished_trials))
    print('Best_trial=', winning_trial.params)