In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import norm, skew

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the competition files.
# NOTE(review): the training frame is read from a separate dataset
# ("tps-jan-21-sample-01/train_0.3.csv") — presumably a subsample of the full
# training set; confirm before comparing scores with the full data.
train = pd.read_csv("../input/tps-jan-21-sample-01/train_0.3.csv")
test = pd.read_csv("../input/tabular-playground-series-jan-2021/test.csv")
sample_submission = pd.read_csv(
    "../input/tabular-playground-series-jan-2021/sample_submission.csv"
)

In [5]:
# Show the training frame as this cell's output.
train

In [6]:
# Print the first rows of the training data, then its column labels.
train_preview = train.head()
print(train_preview)
print(train.columns)

In [7]:
# Print the first rows of the test data, then its column labels and their count.
test_columns = test.columns
print(test.head())
print(test_columns, len(test_columns))

In [8]:
# Detach the "id" column from both frames and keep it aside.
# ``pop`` returns the column and removes it from the frame in one step.
# This cell is not re-runnable: a second run raises KeyError because "id"
# is already gone.
train_ID = train.pop("id")
test_ID = test.pop("id")

# data processing

In [9]:
# Target distribution check: histogram with a fitted normal curve (left) and a
# probability plot (right), followed by the (loc, scale) pair from norm.fit.
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
hist_ax, prob_ax = ax
sns.distplot(train.target, fit=norm, ax=hist_ax)
stats.probplot(train.target, plot=prob_ax)
print(norm.fit(train.target))

In [10]:
#train['target']=np.log1p(train['target'])
#f,ax=plt.subplots(1,2,figsize=(12,4))
#sns.distplot(train['target'],fit=norm,ax=ax[0])
#(mu,sigma)=norm.fit(train['target'])
#print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#res=stats.probplot(train['target'],plot=ax[1])

In [11]:
# Feature columns: every column of ``train`` whose name contains "cont".
sel_cols = list(filter(lambda name: "cont" in name, train.columns))

In [12]:
# StandardScaler: fit on the training features and store the scaled values in a
# copy (``train2``), leaving ``train`` itself unchanged.
from sklearn.preprocessing import StandardScaler

train2 = train.copy()
ss = StandardScaler()
d1 = ss.fit_transform(train[sel_cols].to_numpy())
train2[sel_cols] = d1

In [13]:
# Compare each feature's distribution before (left column, ``train``) and after
# (right column, ``train2``) standardisation, one grid row per feature.
# The row count comes from ``sel_cols`` instead of a hard-coded 14, so the grid
# always matches the selected features; squeeze=False keeps ``ax`` 2-D even
# when only one feature is selected.
n_features = len(sel_cols)
f, ax = plt.subplots(n_features, 2, figsize=(14, 24), squeeze=False)
for i, col in enumerate(sel_cols):
    sns.distplot(train[col], ax=ax[i][0])
    sns.distplot(train2[col], ax=ax[i][1])

# feature engineering

In [14]:
# outlier check (currently disabled)
#for col in sel_cols:
#    plt.boxplot([train[col],test[col]], labels=['train', 'test'])
#    plt.title(col),plt.legend(),plt.show()

In [15]:
# Correlation matrix of every column in ``train``, drawn as an annotated heat map.
corr = train.corr()
corr_fig, corr_ax = plt.subplots(figsize=(16, 16))
sns.heatmap(corr, annot=True, fmt='.1f', cmap="coolwarm", ax=corr_ax)

In [16]:
# Stack train and test into one frame (without the target) and remember the
# row counts of each part; the target values are kept separately in ``y``.
ntrain, ntest = len(train), len(test)
y = train['target'].to_numpy()
all_data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .drop(['target'], axis=1)
)
print("All_data size is : {}".format(all_data.shape))
print(f"ntrain:{ntrain}, ntest:{ntest}")

In [17]:
# Missing-value check: fraction of nulls per column of ``all_data``; the cell
# output lists at most ten columns with a non-zero rate, highest first.
n_rows = all_data.shape[0]
all_datat_na = all_data.isnull().sum() / n_rows
missing_data = pd.DataFrame({"Missing Rate": all_datat_na})
has_missing = missing_data['Missing Rate'] > 0
missing_data[has_missing].sort_values(['Missing Rate'], ascending=False).head(10)

In [18]:
# Model inputs: raw features (X), standardised features (X2) and the target.
# ``y`` is rebound here to the target column of ``train`` (a pandas Series).
X, X2 = train[sel_cols].to_numpy(), train2[sel_cols].to_numpy()
y = train['target']

# model

In [19]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, KFold,train_test_split
from sklearn.metrics import mean_squared_error

In [20]:
# XGBoost regressor configured with hard-coded hyper-parameters; per the
# original note these come from an Optuna search.
# 'tree_method': 'gpu_hist' requests GPU training.
Best_trial = {
    'lambda': 0.001152391713011597,
    'alpha': 0.2809535002670384,
    'colsample_bytree': 0.4,
    'subsample': 0.4,
    'learning_rate': 0.02,
    'max_depth': 9,
    'random_state': 2020,
    'min_child_weight': 106,
    'n_estimators': 10000,
    'tree_method': 'gpu_hist',
}
XGBR = xgb.XGBRegressor(**Best_trial)

In [21]:
# LightGBM regressor configured with hard-coded hyper-parameters; per the
# original note these come from an Optuna search.
# NOTE(review): no 'subsample_freq' is set here — check whether 'subsample'
# has any effect in LightGBM without it.
Best_trial2 = {
    'max_depth': 10,
    'learning_rate': 0.0020197307704413006,
    'n_estimators': 2873,
    'min_child_samples': 92,
    'subsample': 0.5874209921471439,
}
LGBMR = lgb.LGBMRegressor(**Best_trial2)

In [22]:
random_state = 2

# Candidate models. The order matters: the results table built after
# cross-validation labels its rows positionally ("RandomForest",
# "GradientBoosting", "XGBoost", "XGBoost_optuna", "LGBM", "LGBM_optuna").
regressors = [
    RandomForestRegressor(random_state=random_state),
    GradientBoostingRegressor(random_state=random_state),
    xgb.XGBRegressor(),
    XGBR,
    lgb.LGBMRegressor(),
    LGBMR,
]

In [23]:
# Validation: score every candidate regressor with shuffled 5-fold CV on the
# raw features ``X``; scores are stored as positive RMSE values per fold.
import datetime

cv = KFold(5, shuffle=True, random_state=42)
cv_results = []
for model in regressors:
    # Timestamp each model so the run time per model is visible in the log.
    print(datetime.datetime.now(), model)
    fold_scores = cross_val_score(model, X, y, scoring="neg_root_mean_squared_error", cv=cv)
    cv_results.append(-fold_scores)

# Mean and spread of the fold scores, one entry per regressor.
cv_means = [scores.mean() for scores in cv_results]
cv_std = [scores.std() for scores in cv_results]

In [1]:
# Summarise the cross-validation scores: one row per regressor, in the same
# order as the ``regressors`` list, then draw them as a horizontal bar chart
# with the per-model standard deviation as error bars.
cv_res = pd.DataFrame({"CrossValMeans":cv_means,"CrossValerrors": cv_std,"Algorithm":["RandomForest","GradientBoosting","XGBoost","XGBoost_optuna","LGBM","LGBM_optuna"]})
f, ax = plt.subplots(1, 1, figsize=(12, 4))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally. The placeholder axis labels were dropped because barplot and
# the set_xlabel call below overwrite them anyway.
g = sns.barplot(x="CrossValMeans", y="Algorithm", data=cv_res, palette="Set3", orient="h", xerr=cv_std, ax=ax)
# The scores come from "neg_root_mean_squared_error", so this axis is RMSE
# (lower is better), not accuracy.
g.set_xlabel("Mean RMSE")
g.set_title("Cross validation scores")
print(cv_res)

# Optuna

In [None]:
#xgboost optuna
import optuna
def objective(trial,data=X,target=y):
    """Optuna objective for XGBoost: return the hold-out RMSE of one trial.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyper-parameters.
    data, target : features and target; the defaults are the ``X`` and ``y``
        objects that exist when this cell is executed (default arguments are
        bound at definition time).

    Returns
    -------
    RMSE of the fitted model on a 20% hold-out split (random_state=42).

    NOTE(review): the same hold-out set drives early stopping and the reported
    score, so the score may be optimistic.
    """
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2, random_state=42)

    param = {
        'tree_method': 'gpu_hist',  # train on the GPU to speed up the search
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 10000,  # upper bound; early stopping decides the real count
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgb.XGBRegressor(**param)
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)
    preds = model.predict(test_x)
    # Same np.sqrt(mean_squared_error(...)) form as the notebook's rmsle helper;
    # it does not rely on the `squared` keyword of mean_squared_error.
    rmse = np.sqrt(mean_squared_error(test_y, preds))
    return rmse

In [None]:
# xgboost optuna: minimise the value returned by ``objective`` over 30 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)
finished_trials = len(study.trials)
print('Number of finished trials:', finished_trials)
print('Best trial:', study.best_trial.params)

In [None]:
# xgboost optuna: minimise the value returned by ``objective`` over 30 trials.
# NOTE(review): this repeats the search of the preceding cell and replaces
# ``study``; only the label of the last print differs. Consider keeping one.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)
finished_trials = len(study.trials)
print('Number of finished trials:', finished_trials)
print('Best_trial=', study.best_trial.params)

In [None]:
#lgbm optuna
def objective(trial,data=X,target=y):
    """Optuna objective for LightGBM: return the hold-out RMSE of one trial.

    This definition replaces any ``objective`` defined earlier in the notebook.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyper-parameters.
    data, target : features and target; the defaults are the ``X`` and ``y``
        objects that exist when this cell is executed.

    Returns
    -------
    RMSE of the fitted model on a 20% hold-out split (random_state=42).
    """
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2, random_state=42)

    param = {
        'device': 'gpu',
        'objective': 'regression',  # regression task
        'metric': 'rmse',
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_loguniform("learning_rate", 1e-8, 1e-2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
        # Row subsampling (bagging) only takes effect in LightGBM when the
        # bagging frequency is non-zero; without this the tuned 'subsample'
        # value would have no influence on the model.
        'subsample_freq': 1,
    }

    model = lgb.LGBMRegressor(**param)
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)
    preds = model.predict(test_x)
    # Same np.sqrt(mean_squared_error(...)) form as the notebook's rmsle helper;
    # it does not rely on the `squared` keyword of mean_squared_error.
    rmse = np.sqrt(mean_squared_error(test_y, preds))
    return rmse

In [None]:
# lightgbm optuna: minimise the value returned by the currently defined
# ``objective`` over 120 trials and report the best parameter set.
import optuna

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=120)
finished_trials = len(study.trials)
print('Number of finished trials:', finished_trials)
print('Best_trial=', study.best_trial.params)

# optuna visualization

In [None]:
# plot_optimization_history: shows the scores from all trials as well as the
# best score so far at each point, for whichever ``study`` is currently bound.
optuna.visualization.plot_optimization_history(study)

In [None]:
# plot_parallel_coordinate: interactively visualizes the hyperparameters and scores.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# plot_slice: shows the evolution of the search. You can see where in the
# hyperparameter space your search went and which parts of the space were
# explored more.
optuna.visualization.plot_slice(study)

In [None]:
# plot_contour: plots parameter interactions on an interactive chart. You can
# choose which hyperparameters you would like to explore.
# The duplicated 'subsample' entry was removed from the list.
# NOTE(review): 'alpha' and 'lambda' are only sampled by the XGBoost objective;
# confirm that ``study`` holds the XGBoost search when this cell runs.
optuna.visualization.plot_contour(study, params=['alpha',
                            #'max_depth',
                            'lambda',
                            'subsample',
                            'learning_rate'])

In [None]:
# Visualize parameter importances for the current ``study``.
optuna.visualization.plot_param_importances(study)

In [None]:
# Visualize the empirical distribution function of the current ``study``.
optuna.visualization.plot_edf(study)

In [None]:
# Take the best hyper-parameters of the current ``study`` and pin the training
# settings (tree count and GPU tree method) used for the XGBoost regressor.
# NOTE(review): ``study`` is whichever search ran last; confirm it is the
# XGBoost one before using these values with XGBRegressor.
Best_trial = study.best_trial.params
Best_trial["n_estimators"] = 10000
Best_trial["tree_method"] = 'gpu_hist'
Best_trial

In [None]:
# XGBoost: build the regressor from the hard-coded tuned parameters and fit it
# on the full training data.
# The former trailing lines that read ``gsXGBC`` were removed: that grid-search
# object is only created in a later cell, so on a fresh kernel they raised
# NameError here. The grid-search cell itself still reports its best score.
Best_trial={'lambda': 0.001152391713011597,
 'alpha': 0.2809535002670384,
 'colsample_bytree': 0.4,
 'subsample': 0.4,
 'learning_rate': 0.02,
 'max_depth': 9,
 'random_state': 2020,
 'min_child_weight': 106,
 'n_estimators': 10000,
 'tree_method': 'gpu_hist'}
XGBR = xgb.XGBRegressor(**Best_trial)
XGBR.fit(X, y)

In [None]:
# 5-fold training with the tuned XGBoost parameters: each fold's model is
# early-stopped on its validation part, scored there, and its test predictions
# are averaged into ``preds``.
# The feature list is ``sel_cols``; the previously used name ``columns`` is not
# defined anywhere in this notebook.
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5, random_state=48, shuffle=True)
rmse = []  # validation RMSE of each fold
for fold, (trn_idx, val_idx) in enumerate(kf.split(train[sel_cols], train['target']), start=1):
    X_tr, X_val = train[sel_cols].iloc[trn_idx], train[sel_cols].iloc[val_idx]
    y_tr, y_val = train['target'].iloc[trn_idx], train['target'].iloc[val_idx]
    model = xgb.XGBRegressor(**Best_trial)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    # Each fold contributes an equal share to the averaged test prediction.
    preds += model.predict(test[sel_cols]) / kf.n_splits
    # Same np.sqrt(mean_squared_error(...)) form as the notebook's rmsle helper.
    rmse.append(np.sqrt(mean_squared_error(y_val, model.predict(X_val))))
    print(f"fold: {fold} ==> rmse: {rmse[-1]}")

# Grid Search

In [None]:
# Random forest: exhaustive grid search scored by negative MSE on the shared
# ``cv`` splitter.
RFC = RandomForestRegressor()

# Search grid for optimal parameters
rf_param_grid = dict(
    max_depth=[None],
    max_features=[1, 3, 10],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[False],
    n_estimators=[100, 300, 500],
)

gsRFC = GridSearchCV(
    RFC,
    param_grid=rf_param_grid,
    cv=cv,
    scoring="neg_mean_squared_error",
    n_jobs=4,
    verbose=1,
)
gsRFC.fit(X, y)
RFC_best = gsRFC.best_estimator_

# Best score: square root of the best mean MSE across folds
np.sqrt(-gsRFC.best_score_)

In [None]:
# Gradient boosting tuning: exhaustive grid search scored by negative MSE on
# the shared ``cv`` splitter.
GBC = GradientBoostingRegressor()
gb_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.1, 0.05, 0.01],
    max_depth=[3, 6, 9],
)

gsGBC = GridSearchCV(
    GBC,
    param_grid=gb_param_grid,
    cv=cv,
    scoring="neg_mean_squared_error",
    n_jobs=4,
    verbose=1,
)
gsGBC.fit(X, y)
GBC_best = gsGBC.best_estimator_

# Best score: square root of the best mean MSE across folds
np.sqrt(-gsGBC.best_score_)

In [None]:
# XGBoost: grid search scored by negative MSE on the shared ``cv`` splitter.
# Note that this rebinds ``XGBR`` to an unconfigured regressor.
XGBR = xgb.XGBRegressor()
xgb_param_grid = {'nthread':[4], #when use hyperthread, xgboost may become slower
              # Every grid value must be a list: a bare string is rejected by
              # GridSearchCV when the parameter grid is validated.
              'tree_method':['gpu_hist'],
              # 'reg:squarederror' is the current name of the deprecated
              # 'reg:linear' objective.
              'objective':['reg:squarederror'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

gsXGBC = GridSearchCV(XGBR,param_grid = xgb_param_grid, cv=cv, scoring="neg_mean_squared_error", n_jobs= 4, verbose = 1)

gsXGBC.fit(X, y)
XGBC_best = gsXGBC.best_estimator_

# Best score: square root of the best mean MSE across folds
np.sqrt(-gsXGBC.best_score_)

In [None]:
# LGBM: exhaustive grid search scored by negative MSE on the shared ``cv``
# splitter. Note that this rebinds ``LGBMR`` to an unconfigured regressor.
LGBMR = lgb.LGBMRegressor()
lgbm_param_grid = dict(
    num_leaves=[11, 21, 31, 41],
    learning_rate=[.03, 0.05, .07, .1],
    n_estimators=[100, 200, 300],
)

gsLGBM = GridSearchCV(
    LGBMR,
    param_grid=lgbm_param_grid,
    cv=cv,
    scoring="neg_mean_squared_error",
    n_jobs=4,
    verbose=1,
)
gsLGBM.fit(X, y)
LGBM_best = gsLGBM.best_estimator_

# Best score: square root of the best mean MSE across folds
np.sqrt(-gsLGBM.best_score_)

In [None]:
def rmsle(y, y_pred):
    """Return the square root of ``mean_squared_error(y, y_pred)``.

    Despite the name, no logarithm is applied to either argument inside this
    function: the result is a plain RMSE of the values as passed in.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)