In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import norm, skew

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by any
# library call further down are hidden as well.
warnings.filterwarnings('ignore')

In [47]:
# Load the three input tables. The training frame is read from a different
# dataset directory ("tsp-dataset") than the test set and sample submission.
train = pd.read_csv("../input/tsp-dataset/train_2101_0.3.csv")
test = pd.read_csv("../input/tabular-playground-series-jan-2021/test.csv")
sample_submission = pd.read_csv("../input/tabular-playground-series-jan-2021/sample_submission.csv")

In [48]:
# Keep the row identifiers aside and remove them from both frames in one step:
# DataFrame.pop returns the column and deletes it from the frame.
train_ID = train.pop("id")
test_ID = test.pop("id")

# data processing

In [49]:
# Target distribution check: histogram with a fitted normal curve (left panel)
# and a probability plot (right panel).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
sns.distplot(train['target'], fit=norm, ax=ax[0])
stats.probplot(train['target'], plot=ax[1])
# Print the parameters of the normal distribution fitted to the target.
print(norm.fit(train['target']))

In [50]:
#train['target']=np.log1p(train['target'])
#f,ax=plt.subplots(1,2,figsize=(12,4))
#sns.distplot(train['target'],fit=norm,ax=ax[0])
#(mu,sigma)=norm.fit(train['target'])
#print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#res=stats.probplot(train['target'],plot=ax[1])

In [51]:
# Feature columns: every column of `train` whose name contains "cont".
sel_cols = [name for name in train.columns if "cont" in name]

In [52]:
# Standardize the selected feature columns into a copy (`train2`);
# `train` itself keeps the unscaled values.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
train2 = train.copy()
d1 = ss.fit_transform(train[sel_cols].to_numpy())
train2[sel_cols] = d1

In [53]:
# One row per selected feature: unscaled distribution on the left (`train`),
# standardized distribution on the right (`train2`).
# The grid size follows len(sel_cols) instead of a hard-coded 14, so the cell
# keeps working if the feature list changes; squeeze=False keeps `ax` 2-D even
# when there is a single feature.
f, ax = plt.subplots(len(sel_cols), 2, figsize=(14, 24), squeeze=False)
for i, col in enumerate(sel_cols):
    sns.distplot(train[col], ax=ax[i][0])
    sns.distplot(train2[col], ax=ax[i][1])

# feature engineering

In [54]:
# outliers
#for col in sel_cols:
#    plt.boxplot([train[col],test[col]], labels=['train', 'test'])
#    plt.title(col),plt.legend(),plt.show()

In [55]:
# Pairwise correlation between the columns of `train`, drawn as an annotated
# heatmap on an explicitly created 16x16 axes.
corr = train.corr()
corr_fig, corr_ax = plt.subplots(figsize=(16, 16))
sns.heatmap(corr, annot=True, fmt='.1f', cmap="coolwarm", ax=corr_ax)

In [56]:
# Stack train and test rows into a single feature frame. The target is pulled
# out first (as an array) and then removed from the combined frame.
ntrain, ntest = len(train), len(test)
y = train['target'].to_numpy()
all_data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .drop(columns=['target'])
)
print("All_data size is : {}".format(all_data.shape))
print(f"ntrain:{ntrain}, ntest:{ntest}")

In [57]:
# Share of missing values per column of the combined frame; display at most the
# ten columns with the highest non-zero missing rate.
all_datat_na = all_data.isnull().sum() / len(all_data)
missing_data = pd.DataFrame({"Missing Rate": all_datat_na})
(
    missing_data.loc[missing_data['Missing Rate'] > 0]
    .sort_values(by='Missing Rate', ascending=False)
    .head(10)
)

In [58]:
# Model inputs: X holds the unscaled features, X2 the standardized ones.
# `y` is rebound here to the target Series (it was a plain array in the merge cell).
X = train[sel_cols].to_numpy()
X2 = train2[sel_cols].to_numpy()
y = train['target']

# model

In [59]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, KFold,train_test_split
from sklearn.metrics import mean_squared_error

import datetime


In [60]:
!pip install pytorch-tabnet
from pytorch_tabnet.tab_model import TabNetRegressor

In [61]:
# XGBoost hyperparameters taken from an earlier Optuna run, and the regressor
# built from them.
Best_trial = {
    # regularisation
    'lambda': 0.001152391713011597,
    'alpha': 0.2809535002670384,
    'min_child_weight': 106,
    # sampling
    'colsample_bytree': 0.4,
    'subsample': 0.4,
    # boosting schedule / tree shape
    'learning_rate': 0.02,
    'n_estimators': 10000,
    'max_depth': 9,
    # runtime
    'random_state': 2020,
    'tree_method': 'gpu_hist',
}
XGBR = xgb.XGBRegressor(**Best_trial)

In [62]:
# LightGBM hyperparameters taken from an earlier Optuna run, and the regressor
# built from them.
# NOTE(review): 'subsample' is set without 'subsample_freq' — confirm against the
# LightGBM docs whether row subsampling is actually active in this configuration.
Best_trial2 = {
    'learning_rate': 0.0020197307704413006,
    'n_estimators': 2873,
    'max_depth': 10,
    'min_child_samples': 92,
    'subsample': 0.5874209921471439,
}
LGBMR = lgb.LGBMRegressor(**Best_trial2)

In [63]:
random_state = 2
Cross_Val = True

# Candidate models, in the order used by the cross-validation loop:
# library defaults next to the Optuna-tuned XGBoost / LightGBM instances.
regressors = [
    RandomForestRegressor(random_state=random_state),
    GradientBoostingRegressor(random_state=random_state),
    xgb.XGBRegressor(),
    XGBR,
    lgb.LGBMRegressor(),
    LGBMR,
]

In [65]:
# 5-fold cross-validation for TabNet. Each fold trains a fresh TabNetRegressor
# with the held-out fold as its eval_set and records the regressor's best_cost.
# NOTE(review): the fold used for early stopping is also the fold being scored,
# so these numbers are likely optimistic compared with an untouched hold-out.
sel_col = [col for col in train.columns if "con" in col]
sel_tar = ["target"]
cv_results=[]
if Cross_Val:
    # NOTE: X2 is rebound here from the standardized matrix to the unscaled
    # features; y2 is 2-D (n_rows, 1) because the target is selected with a list.
    X2=train[sel_col].to_numpy()
    y2=train[sel_tar].to_numpy()
    kf=KFold(5, shuffle=True, random_state=42)
    # Split on the array that is actually indexed below (previously the
    # unrelated global X, which only worked because the row counts match).
    for t,v in kf.split(X2):
        X_train, X_valid = X2[t], X2[v]
        y_train, y_valid = y2[t], y2[v]
        regressor = TabNetRegressor(verbose=0,seed=42)
        regressor.fit(X_train=X_train, y_train=y_train,
                  eval_set=[(X_valid, y_valid)],
                  patience=300, max_epochs=2000,
                  eval_metric=['rmse'])
        cv_results.append(regressor.best_cost)
# Summary statistics over the folds. When cross-validation is switched off there
# are no fold scores; report NaN explicitly rather than via an empty-array mean.
if cv_results:
    tab_mean=np.mean(cv_results)
    tab_std=np.std(cv_results)
else:
    tab_mean=tab_std=np.nan

In [None]:
# Cross-validate every model in `regressors` with the same 5-fold split and
# collect per-model RMSE mean / std; the TabNet summary is appended last.
if Cross_Val:
    cv = KFold(5, shuffle=True, random_state=42)

    cv_results = []
    for model in regressors:
        print(datetime.datetime.now(),model)
        # The scorer returns negated RMSE; flip the sign to get RMSE per fold.
        fold_rmse = -cross_val_score(model, X, y, scoring="neg_root_mean_squared_error", cv=cv)
        cv_results.append(fold_rmse)

    cv_means = [scores.mean() for scores in cv_results] + [tab_mean]
    cv_std = [scores.std() for scores in cv_results] + [tab_std]

In [None]:
# Horizontal bar chart of the cross-validated RMSE per algorithm, with the
# per-model standard deviation as error bars, followed by the summary table.
if Cross_Val:
    # The algorithm labels must stay in the same order as `regressors`, with TabNet last.
    cv_res = pd.DataFrame({"CrossValMeans":cv_means,"CrossValerrors": cv_std,"Algorithm":["RandomForest","GradientBoosting","XGBoost","XGBoost_optuna","LGBM","LGBM_optuna","TabNet"]})
    f,ax=plt.subplots(1,1,figsize=(12,4))
    # x / y are passed by keyword: current seaborn releases no longer accept
    # them positionally.
    g = sns.barplot(x="CrossValMeans", y="Algorithm", data=cv_res, palette="Set3", orient="h", xerr=cv_std, ax=ax)
    # The plotted values are RMSE (lower is better), not accuracy.
    g.set_xlabel("Mean RMSE")
    g.set_title("Cross validation scores")
    print(cv_res)

# Optuna

In [None]:
# Switch for the Optuna cells below: the hyperparameter searches only run when
# this is True.
OPTUNA=False

In [None]:
#xgboost optuna 
import optuna
def objective(trial,data=X,target=y):
    """Optuna objective for XGBoost.

    Splits ``data``/``target`` 80/20 with a fixed seed, fits an XGBRegressor
    using the hyperparameters suggested by ``trial`` (early stopping on the
    held-out part) and returns the held-out RMSE, which the study minimizes.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: Feature matrix; defaults to the global ``X`` at definition time.
        target: Target values; defaults to the global ``y`` at definition time.

    Returns:
        RMSE of the fitted model on the held-out 20%.
    """
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    # Values suggested by the trial, requested in a fixed order.
    searched = {
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    # Settings that are not searched: GPU histogram tree builder and a large
    # tree budget that early stopping is expected to cut short.
    fixed = {'tree_method': 'gpu_hist', 'n_estimators': 10000}

    booster = xgb.XGBRegressor(**fixed, **searched)
    booster.fit(
        X_fit, y_fit,
        eval_set=[(X_hold, y_hold)],
        early_stopping_rounds=100,
        verbose=False,
    )
    return mean_squared_error(y_hold, booster.predict(X_hold), squared=False)

In [None]:
# XGBoost hyperparameter search (runs only when OPTUNA is True).
# `objective` is resolved when this cell executes; the name is defined twice in
# this notebook, so this cell must run before the LightGBM definition rebinds it.
if OPTUNA:
    # Minimize the value returned by `objective` over 30 trials.
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=30)
    print('Number of finished trials:', len(study.trials))
    print('Best trial:', study.best_trial.params)

In [None]:
#lgbm optuna
def objective(trial,data=X,target=y):
    """Optuna objective for LightGBM.

    Splits ``data``/``target`` 80/20 with a fixed seed, fits an LGBMRegressor
    using the hyperparameters suggested by ``trial`` (early stopping on the
    held-out part) and returns the held-out RMSE, which the study minimizes.

    Note: this definition rebinds the name ``objective`` that the XGBoost
    search also uses, so the cells have to be executed in notebook order.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: Feature matrix; defaults to the global ``X`` at definition time.
        target: Target values; defaults to the global ``y`` at definition time.

    Returns:
        RMSE of the fitted model on the held-out 20%.
    """
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)

    param = {
        'device':'gpu',
        'objective': 'regression', # regression task
        'metric': 'rmse', 
        'max_depth': trial.suggest_int('max_depth',3, 15),
        'learning_rate': trial.suggest_loguniform("learning_rate", 1e-8, 1e-2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
        # LightGBM only performs row subsampling when subsample_freq > 0 (the
        # default 0 disables it), so without this the tuned 'subsample' value
        # would have no effect on the model.
        'subsample_freq': 1,
    }

    model = lgb.LGBMRegressor(**param)
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)],early_stopping_rounds=100,verbose=False)
    preds=model.predict(test_x)
    rmse = mean_squared_error(test_y,preds,squared=False)
    return rmse

In [None]:
# LightGBM hyperparameter search (runs only when OPTUNA is True).
# Uses whichever `objective` is currently bound; in notebook order that is the
# LightGBM definition from the preceding cell.
if OPTUNA:
    # Minimize the value returned by `objective` over 120 trials; this rebinds `study`.
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=120)
    print('Number of finished trials:', len(study.trials))
    print('Best_trial=', study.best_trial.params)