<a href="https://colab.research.google.com/github/jeawoo0594/colabproject/blob/main/tabular_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Basic Imports 
import numpy as np
import pandas as pd

# Plotting 
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline

# Preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder

# Metrics 
from sklearn.metrics import mean_squared_error, mean_absolute_error

# ML Models
import lightgbm as lgb
from lightgbm import LGBMRegressor 
import xgboost as xg 
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm

# Ignore Warnings 
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive so the data files are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install shap

In [None]:
!pip install Bayesian-Optimization

In [None]:
# Feature Importance 
import shap

# Model Tuning 
from bayes_opt import BayesianOptimization

In [None]:
# Folder on the mounted Drive that holds the competition CSV files.
DATA_DIR = '/content/drive/My Drive/tabular'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Preview only the first rows instead of rendering the whole frame.
train.head()

In [None]:
# The `%tensorflow_version 1.x` magic was removed here: Colab no longer ships
# TensorFlow 1.x, and under TF1 graph mode the benchmark functions below would
# only build graph nodes without ever executing the convolution.
import tensorflow as tf
import timeit

# Abort early when the runtime has no GPU attached, since the GPU benchmark
# below pins its ops to '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

def cpu():
  """Run one 7x7, 32-filter convolution over a random batch on the CPU.

  Returns the sum of all convolution outputs so the whole op is consumed.
  """
  with tf.device('/cpu:0'):
    images = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    return tf.math.reduce_sum(conv_layer(images))

def gpu():
  """Run one 7x7, 32-filter convolution over a random batch on the first GPU.

  Returns the sum of all convolution outputs so the whole op is consumed.
  """
  with tf.device('/device:GPU:0'):
    images = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    return tf.math.reduce_sum(conv_layer(images))
  
# Warm-up pass so one-off initialisation cost is excluded from the timings;
# see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Time ten invocations of each op and report the totals.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')

print('CPU (s):')
cpu_time = timeit.timeit(cpu, number=10)
print(cpu_time)

print('GPU (s):')
gpu_time = timeit.timeit(gpu, number=10)
print(gpu_time)

speedup = int(cpu_time/gpu_time)
print('GPU speedup over CPU: {}x'.format(speedup))

In [None]:
# Only the last expression of a cell is rendered, so the intermediate
# summaries are shown explicitly with display().
display(train.describe())
display(train.isnull().sum())

# Pearson correlation is only defined for numeric columns; restricting the
# frame first avoids errors on the object-typed (categorical) columns.
numeric_train = train.select_dtypes(include='number')
correlations = numeric_train.corr(method='pearson')
display(correlations)

# Show the correlations as a heatmap
import seaborn as sb
plt.rcParams["figure.figsize"] = (40,40)
sb.heatmap(correlations,
           annot = True, # print the actual value in every cell
           cmap = 'Greens', # colour map
           vmin = -1, vmax=1 , # colour scale spans -1 .. +1
          )
plt.show()

# Linear regression
import statsmodels.api as sm
# x_train / y_train were never defined in this notebook. They are built here
# from the numeric columns, dropping 'id' and 'target' the same way the
# Preprocessor class does.
# NOTE(review): confirm this is the intended design matrix. No intercept term
# is added, matching the original call.
x_train = numeric_train.drop(columns=['target', 'id'])
y_train = train['target']
multi_model = sm.OLS(y_train, x_train)
fitted_multi_model = multi_model.fit()
fitted_multi_model.summary()

In [None]:
from sklearn.decomposition import PCA
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, StandardScaler,RobustScaler
class Preprocessor():
    """Fit-on-train / apply-on-test feature pipeline for the tabular data.

    Categorical (object-typed) columns are label-encoded and numeric columns
    are scaled with ``RobustScaler``; optionally the scaled numeric block is
    replaced by PCA components. Every transformer is fitted on the training
    frame only and then re-used unchanged for the test frame.

    Args:
        threshold: categories whose relative frequency is below this value
            are treated as minority categories.
        composite_category: label that replaces every minority category.
        n_components: ``n_components`` argument forwarded to ``PCA``.
    """

    def __init__(self, threshold=0.05, composite_category='z', n_components=0.85):
        self.en_dic = None             # column name -> fitted LabelEncoder
        self.standard_scaler = None    # kept for backward compatibility (unused)
        self.Robust_Scaler = None      # fitted RobustScaler for numeric columns
        self.pca = None                # fitted PCA (only when PCA features are requested)
        self.pca_cols = None
        self.num_cols = None
        self.cat_cols = None
        self.threshold = threshold
        self.composite_category = composite_category
        self.n_components = n_components
        self.minority_col_dict = {}
        self.minority_map_dic = {}

    def preprocess(self,data_df,train=True,combine_min_cats=False, add_pca_feats=False):
        """Turn a raw frame into the model feature matrix.

        Args:
            data_df: raw DataFrame containing an 'id' column (and, for the
                training frame, a 'target' column).
            train: when true, fit all transformers on ``data_df``; otherwise
                apply the transformers fitted by a previous training call.
            combine_min_cats: merge rare categories into ``composite_category``
                using the mapping learnt on the training frame.
            add_pca_feats: replace the scaled numeric columns by PCA components.

        Returns:
            DataFrame with the encoded categorical columns followed by the
            numeric (or PCA) columns, on a fresh integer index.

        Raises:
            RuntimeError: if called with ``train=False`` before any training call.
        """
        if train:
            self.train_ids = data_df.loc[:, 'id']
            cats = data_df.loc[:, data_df.dtypes == object]
            self.cat_cols = cats.columns

            if combine_min_cats:
                self._find_minority_cats(cats)
                cats = self._combine_minority_feats(cats)

            self.en_dic = defaultdict(LabelEncoder)
            cats_en = cats.apply(lambda col: self.en_dic[col.name].fit_transform(col))

            nums = data_df.loc[:, data_df.dtypes != object].drop(columns=['target','id'])
            self.num_cols = nums.columns
            self.Robust_Scaler = RobustScaler()
            nums_scaled = self.Robust_Scaler.fit_transform(nums)
        else:
            if self.cat_cols is None or self.en_dic is None or self.Robust_Scaler is None:
                raise RuntimeError('preprocess(train=True) must be called before preprocess(train=False)')

            self.test_ids = data_df.loc[:, 'id']
            cats = data_df.loc[:, self.cat_cols]
            if combine_min_cats:
                # Re-use the mapping learnt on train; recomputing it from the
                # test distribution would merge a different set of categories.
                cats = self._combine_minority_feats(cats)

            # Apply (never re-fit) the encoders and the scaler so that test
            # features live in the same space as the training features.
            cats_en = self._encode_with_fitted(cats)
            nums_scaled = self.Robust_Scaler.transform(data_df.loc[:, self.num_cols])

        # Kept for its side effect: refreshes self.cats_df / self.cats_onehot_cols.
        self.cats_onehot(cats_en)

        if add_pca_feats:
            num_block = self._return_num_pca(nums_scaled, train=train)
            num_names = list(self.pca_cols)
        else:
            num_block = nums_scaled
            num_names = list(self.num_cols)

        return pd.DataFrame(np.hstack((cats_en, num_block)), columns=list(cats_en) + num_names)

    def _encode_with_fitted(self, cats_df):
        """Encode categorical columns with the encoders fitted on train.

        Labels that were not seen during fitting are encoded as -1 instead of
        raising, so an unexpected test category does not abort the run.
        """
        def encode(col):
            classes = self.en_dic[col.name].classes_
            lookup = {label: code for code, label in enumerate(classes)}
            return col.map(lookup).fillna(-1).astype(int)

        return cats_df.apply(encode)

    def cats_onehot(self,data_df):
        """One-hot encode the categorical columns of ``data_df``.

        Stores the result on ``self.cats_df`` / ``self.cats_onehot_cols`` and
        returns both.
        """
        self.cats_df = pd.get_dummies(data=data_df,columns=self.cat_cols, prefix= self.cat_cols)
        self.cats_onehot_cols = self.cats_df.columns
        return self.cats_df, self.cats_onehot_cols

    def _find_minority_cats(self, data_df):
        """Collect, per categorical column, the categories rarer than ``threshold``.

        Returns:
            tuple: (``minority_map_dic``, ``minority_col_dict``) where the first
            maps column -> {rare category: composite label} and the second maps
            column -> list of rare categories.
        """
        self.minority_col_dict = {}
        self.minority_map_dic = {}
        for feature in self.cat_cols:
            shares = data_df[feature].value_counts(normalize=True)
            # .items() replaces Series.iteritems(), which pandas 2.0 removed.
            rare = [category for category, share in shares.items() if share < self.threshold]
            self.minority_col_dict[feature] = rare
            self.minority_map_dic[feature] = {category: self.composite_category for category in rare}
        return self.minority_map_dic, self.minority_col_dict

    def _combine_minority_feats(self, data_df, replace = False):
        """Return a copy of ``data_df`` with rare categories merged.

        ``replace`` is accepted for backward compatibility only: the merged
        values always overwrite the original column in the returned copy, which
        is what ``preprocess`` relies on.
        """
        new_df = data_df.copy()
        for feat in self.cat_cols:
            mapping = self.minority_map_dic.get(feat, {})
            if mapping:
                new_df[feat] = new_df[feat].replace(mapping)
        return new_df

    def _return_num_pca(self,num_df,train=True):
        """Project the scaled numeric block onto PCA components.

        Fits the PCA when ``train`` is true, otherwise re-uses the fitted one.

        Raises:
            RuntimeError: if asked to transform before a PCA has been fitted.
        """
        if train:
            self.pca = PCA(n_components = self.n_components)
            num_rd = self.pca.fit_transform(num_df)
            print(f'pca_explain: {self.pca.explained_variance_ratio_}')
        else:
            if self.pca is None:
                raise RuntimeError('PCA has not been fitted; call preprocess(train=True, add_pca_feats=True) first')
            num_rd = self.pca.transform(num_df)

        self.pca_cols = [f'pca_{x}' for x in range(num_rd.shape[1])]
        return pd.DataFrame(num_rd, columns = self.pca_cols)

In [None]:
# Switches shared by the train and test preprocessing calls.
prep_options = dict(combine_min_cats=False, add_pca_feats=True)

# Build the training features/target first, then the test features using the
# same Preprocessor instance.
data_proc = Preprocessor()
X = data_proc.preprocess(train, **prep_options)
y = train['target']
X_test = data_proc.preprocess(test, train=False, **prep_options)

pd.set_option('display.max_columns', 500)
print(X)

In [None]:
# ad_x = X.drop(X.columns[10:24],axis=1)
# ad_x
# ad_x = ad_x.drop(['pca_6'],axis=1)
# ad_test = X_test.drop(X_test.columns[10:24],axis=1)
# ad_test = ad_test.drop(['pca_6'],axis=1)
# ad_test

In [None]:
!pip install optuna

In [None]:
def search_best_param(x_train,y_train,cat_features):
    """Tune LightGBM regression hyper-parameters with Bayesian optimisation.

    Each candidate is scored by 5-fold ``lgb.cv`` on the supplied data; the
    optimiser maximises the negated minimum mean RMSE.

    Args:
        x_train: feature DataFrame used to build the LightGBM dataset.
        y_train: target values aligned with ``x_train``.
        cat_features: names of the categorical columns. When ``None`` the first
            ten columns of ``x_train`` are used, which is what this function
            previously hard-coded.

    Returns:
        dict: ``BayesianOptimization.max`` - the best score under 'target'
        (negative CV RMSE) and the corresponding raw values under 'params'.
    """
    SEED = 123
    if cat_features is None:
        cat_features = x_train.columns[0:10]
    # LightGBM expects a plain list of names rather than a pandas Index.
    cat_features = list(cat_features)

    # Built from the arguments (not the notebook globals X / y) so the search
    # really runs on the data the caller passed in.
    train_set = lgb.Dataset(data=x_train, label=y_train, categorical_feature=cat_features,
                            free_raw_data=False)

    def lightGBM_CV(max_depth, num_leaves, n_estimators, learning_rate, subsample, colsample_bytree,
                lambda_l1, lambda_l2, min_child_weight):
        """Objective for the optimiser: negative best mean CV RMSE."""
        params = {'boosting_type': 'gbdt', 'objective': 'regression', 'metric':'rmse', 'verbose': -1,
                  'early_stopping_round':100}
        # The optimiser proposes floats; integer-valued parameters are rounded.
        params['max_depth'] = int(round(max_depth))
        params["num_leaves"] = int(round(num_leaves))
        params["n_estimators"] = int(round(n_estimators))
        params['learning_rate'] = learning_rate
        params['subsample'] = subsample
        params['colsample_bytree'] = colsample_bytree
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_child_weight'] = int(round(min_child_weight))

        # `verbose_eval` is not passed: it was removed in LightGBM 4 and
        # 'verbose': -1 already silences the per-iteration log.
        cv_result = lgb.cv(params, train_set, nfold=5, seed=SEED, stratified=False, metrics=['rmse'])

        # The key is 'rmse-mean' in LightGBM 3.x and 'valid rmse-mean' in 4.x.
        rmse_mean = next(values for key, values in cv_result.items() if key.endswith('rmse-mean'))
        return -np.min(rmse_mean)

    lightGBM_BO = BayesianOptimization(lightGBM_CV, {
                                          'max_depth': (5, 50),
                                          'num_leaves': (20, 100),
                                          'n_estimators': (1000, 30000),
                                          'learning_rate': (0.05, 0.3),
                                          'subsample': (0.7, 0.8),
                                          'colsample_bytree' :(0.5, 0.99),
                                          'lambda_l1': (0, 5),
                                          'lambda_l2': (0, 3),
                                          'min_child_weight': (2, 50)
                                      },
                                       random_state = SEED,
                                       verbose = -1)
    np.random.seed(SEED)
    lightGBM_BO.maximize(init_points=5, n_iter=25)

    # Hand the result back; previously it was discarded with the local optimiser.
    return lightGBM_BO.max

In [None]:
# save submission in csv format
submission_df = pd.read_csv('/content/drive/My Drive//tabular/sample_submission.csv')
submission_df['target'] = preds
submission_df.to_csv('submission_tabular_ensemble.csv',index=False)
!ls
from google.colab import files
files.download('submission_tabular_ensemble.csv')