<a href="https://colab.research.google.com/github/jeawoo0594/colabproject/blob/main/tabular_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText
from mpl_toolkits.axes_grid1 import make_axes_locatable
from google.colab import drive

# Attach Google Drive at the path the later cells read their CSV files from.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
import pandas as pd

In [None]:
# Read the training and test tables from the mounted Drive folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive//tabular/train.csv',
        '/content/drive/My Drive//tabular/test.csv',
    )
)
train  # last expression of the cell: renders the training frame

In [None]:
%tensorflow_version 1.x
import tensorflow as tf
import timeit

# Stop the notebook early unless TensorFlow reports '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
gpu_available = device_name == '/device:GPU:0'
if not gpu_available:
  no_gpu_hint = (
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  print(no_gpu_hint)
  raise SystemError('GPU device not found')

def cpu():
  """Convolve a random (100, 100, 100, 3) tensor on '/cpu:0' and return the summed output.

  The convolution is a fresh ``Conv2D`` layer with 32 filters and a kernel
  size of 7, created on every call.
  """
  with tf.device('/cpu:0'):
    images = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    return tf.math.reduce_sum(conv_layer(images))

def gpu():
  """Convolve a random (100, 100, 100, 3) tensor on '/device:GPU:0' and return the summed output.

  The convolution is a fresh ``Conv2D`` layer with 32 filters and a kernel
  size of 7, created on every call.
  """
  with tf.device('/device:GPU:0'):
    images = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    return tf.math.reduce_sum(conv_layer(images))
  
# Warm-up: call each op once before timing; see: https://stackoverflow.com/a/45067900
for warmup in (cpu, gpu):
  warmup()

# Time ten calls of each op and report the totals.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
elapsed = {}
for fn_name, heading in (('cpu', 'CPU (s):'), ('gpu', 'GPU (s):')):
  print(heading)
  elapsed[fn_name] = timeit.timeit(fn_name + '()', number=10,
                                   setup='from __main__ import ' + fn_name)
  print(elapsed[fn_name])
cpu_time, gpu_time = elapsed['cpu'], elapsed['gpu']
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))

In [None]:
import seaborn as sb
import statsmodels.api as sm

# Summary statistics and per-column missing-value counts.  They are printed
# explicitly because a notebook cell only renders its last expression; a bare
# expression in the middle of the cell is computed and then discarded.
print(train.describe())
print(train.isnull().sum())

# Pearson correlation, computed once and reused for the heatmap.  Only
# numeric/bool columns are passed in: pandas >= 2.0 raises on object columns
# instead of silently skipping them.
corr_matrix = train.select_dtypes(include=['number', 'bool']).corr(method='pearson')
print(corr_matrix)

# Show the correlations as a heatmap.
plt.rcParams["figure.figsize"] = (40, 40)
sb.heatmap(corr_matrix,
           annot=True,  # write the actual value into each cell
           cmap='Greens',  # colour map
           vmin=-1, vmax=1,  # colour scale spans -1 .. +1
           )

# Linear regression (ordinary least squares).
# NOTE(review): x_train / y_train are not defined in any cell visible here;
# confirm they are created before this cell runs.
multi_model = sm.OLS(y_train, x_train)
fitted_multi_model = multi_model.fit()
fitted_multi_model.summary()

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, StandardScaler,RobustScaler
class Preprocessor():
    """Build a numeric feature matrix from a raw train or test frame.

    Object-dtype columns are label-encoded and then one-hot encoded; the
    remaining columns (minus ``id`` and ``target``) are scaled with
    ``RobustScaler`` and optionally replaced by PCA components.

    Everything that is learned (minority-category mapping, label encoders,
    one-hot column layout, scaler, PCA) is fitted on the training pass and
    only *applied* on the test pass, so both matrices share the same columns
    and the same encoding.
    """

    def __init__(self):
        self.en_dic = None             # column name -> fitted LabelEncoder
        self.standard_scaler = None    # unused; kept for backward compatibility
        self.Robust_Scaler = None      # RobustScaler fitted on the training numerics
        self.num_cols = None           # numeric column names seen in training
        self.cat_cols = None           # object-dtype column names seen in training
        self.train_onehot_cols = None  # one-hot column layout of the training pass
        self.minority_map_dic = None   # column name -> {rare category: composite}
        self.minority_col_dict = None  # column name -> list of rare categories

    def preprocess(self, data_df, train=True, combine_min_cats=False, add_pca_feats=False):
        """Return the feature matrix for ``data_df`` as a DataFrame.

        Args:
            data_df: raw frame containing an ``id`` column; the training
                frame must also contain ``target``.
            train: ``True`` fits all transformers on ``data_df``; ``False``
                applies the transformers fitted by an earlier training pass.
            combine_min_cats: merge categories rarer than the threshold into
                one composite category before encoding.
            add_pca_feats: replace the scaled numeric columns with their PCA
                projection.

        Raises:
            RuntimeError: ``train=False`` before any training pass.
            ValueError: ``combine_min_cats=True`` on the test pass although
                the training pass did not build the minority mapping.
        """
        if train:
            return self._fit_transform(data_df, combine_min_cats, add_pca_feats)
        return self._transform(data_df, combine_min_cats, add_pca_feats)

    def _fit_transform(self, data_df, combine_min_cats, add_pca_feats):
        """Fit every transformer on the training frame and return its features."""
        self.train_ids = data_df.loc[:, 'id']
        train_cats = data_df.loc[:, data_df.dtypes == object]
        self.cat_cols = train_cats.columns

        if combine_min_cats:
            self._find_minority_cats(train_cats)
            train_cats = self._combine_minority_feats(train_cats)

        # One LabelEncoder per categorical column, created on first access.
        self.en_dic = defaultdict(LabelEncoder)
        train_cats_en = train_cats.apply(
            lambda col: self.en_dic[col.name].fit_transform(col))
        cats_onehot, onehot_cols = self.cats_onehot(train_cats_en)
        self.train_onehot_cols = onehot_cols

        train_num = data_df.loc[:, data_df.dtypes != object].drop(columns=['target', 'id'])
        self.num_cols = train_num.columns
        self.Robust_Scaler = RobustScaler()
        num_scaled = self.Robust_Scaler.fit_transform(train_num)

        return self._assemble(cats_onehot, onehot_cols, num_scaled, add_pca_feats, train=True)

    def _transform(self, data_df, combine_min_cats, add_pca_feats):
        """Apply the transformers fitted on the training frame to ``data_df``."""
        if self.en_dic is None or self.Robust_Scaler is None:
            raise RuntimeError(
                'preprocess(..., train=False) called before a training pass')

        self.test_ids = data_df.loc[:, 'id']
        test_cats = data_df.loc[:, self.cat_cols]

        if combine_min_cats:
            if self.minority_map_dic is None:
                raise ValueError(
                    'combine_min_cats=True needs a training pass that also '
                    'used combine_min_cats=True')
            # Reuse the mapping learned on the training data; recomputing it
            # on the test data would merge a different set of categories.
            test_cats = self._combine_minority_feats(test_cats)

        # transform (not fit_transform): codes must mean the same as in training.
        test_cats_en = test_cats.apply(
            lambda col: self.en_dic[col.name].transform(col))
        cats_onehot, _ = self.cats_onehot(test_cats_en)
        cats_onehot = self._align_onehot(cats_onehot)

        test_num = data_df.loc[:, self.num_cols]
        num_scaled = self.Robust_Scaler.transform(test_num)

        return self._assemble(
            cats_onehot, self.train_onehot_cols, num_scaled, add_pca_feats, train=False)

    def _align_onehot(self, onehot_df):
        """Reorder/pad ``onehot_df`` to the training one-hot layout (missing columns become 0)."""
        aligned = onehot_df.reindex(columns=self.train_onehot_cols, fill_value=0)
        # A single float dtype keeps the later np.hstack from producing an object array.
        return aligned.astype(np.float64)

    def _assemble(self, cats_onehot, onehot_cols, num_scaled, add_pca_feats, train):
        """Stack the one-hot block and the numeric (or PCA) block side by side."""
        if add_pca_feats:
            num_part = self._return_num_pca(num_scaled, train=train)
            num_names = self.pca_cols
        else:
            num_part = num_scaled
            num_names = self.num_cols
        return pd.DataFrame(np.hstack((cats_onehot, num_part)),
                            columns=list(onehot_cols) + list(num_names))

    def cats_onehot(self, data_df):
        """One-hot encode the categorical columns of ``data_df``.

        Returns the dummy frame and its column index; both are also stored on
        the instance as ``cats_df`` and ``cats_onehot_cols``.
        """
        cat_names = list(self.cat_cols)
        self.cats_df = pd.get_dummies(data=data_df, columns=cat_names, prefix=cat_names)
        self.cats_onehot_cols = self.cats_df.columns
        return self.cats_df, self.cats_onehot_cols

    def _find_minority_cats(self, data_df):
        """Record, per categorical column, the categories rarer than the threshold.

        A category is a minority when its share of the rows is below
        ``self.threshold`` (0.05).  Each one is mapped to the composite
        category ``'z'``.

        Returns:
            ``(minority_map_dic, minority_col_dict)``: the replacement mapping
            and the list of minority categories, both keyed by column name.
        """
        self.composite_category = 'z'
        self.threshold = 0.05
        self.minority_col_dict = {}
        self.minority_map_dic = {}

        # Fall back to the frame's own columns when called before preprocess().
        features = data_df.columns if self.cat_cols is None else self.cat_cols
        for feature in features:
            shares = data_df[feature].value_counts(normalize=True)
            rare = [category for category, proportion in shares.items()
                    if proportion < self.threshold]
            self.minority_col_dict[feature] = rare
            self.minority_map_dic[feature] = {
                category: self.composite_category for category in rare}
        return self.minority_map_dic, self.minority_col_dict

    def _combine_minority_feats(self, data_df, replace=False):
        """Return a copy of ``data_df`` with minority categories merged.

        ``replace`` is accepted for backward compatibility and ignored: the
        categorical columns are always overwritten in the returned copy.
        """
        new_df = data_df.copy()
        for feat, mapping in self.minority_map_dic.items():
            if mapping:  # nothing to merge for this column otherwise
                new_df[feat] = new_df[feat].replace(mapping)
        return new_df

    def _return_num_pca(self, num_df, train=True):
        """Project the scaled numeric block onto PCA components.

        With ``train=True`` a new PCA keeping 80% of the variance
        (``n_components=0.8``) is fitted; otherwise the fitted PCA is reused.
        Returns a DataFrame with columns ``pca_0 .. pca_k``.
        """
        self.n_components = 0.8
        if train:
            self.pca = PCA(n_components=self.n_components)
            num_rd = self.pca.fit_transform(num_df)
            print(f'pca_explain: {self.pca.explained_variance_ratio_}')
        else:
            num_rd = self.pca.transform(num_df)

        self.pca_cols = [f'pca_{x}' for x in range(num_rd.shape[1])]
        return pd.DataFrame(num_rd, columns=self.pca_cols)

In [None]:
# Build the train and test feature matrices with the same switches, and pull
# out the regression target.
prep_options = {'combine_min_cats': True, 'add_pca_feats': True}
data_proc = Preprocessor()
X = data_proc.preprocess(train, **prep_options)
X_test = data_proc.preprocess(test, train=False, **prep_options)
y = train.loc[:, 'target']

# Allow up to 500 columns in printed frames before showing the matrix.
pd.set_option('display.max_columns', 500)
print(X)

In [None]:
# Features removed before modelling: the columns at positions 10..23 of the
# training matrix plus the 'pca_6' component.  The names are resolved once on
# the training matrix and then dropped from both frames, so train and test
# are guaranteed to lose exactly the same features even if their column
# order differs.
dropped_cols = list(X.columns[10:24]) + ['pca_6']
ad_x = X.drop(columns=dropped_cols)
ad_test = X_test.drop(columns=dropped_cols)
ad_test

In [None]:
from sklearn.model_selection import KFold
# ------------------------------------------------------------------------------
# Cross-validation and seeding constants
# ------------------------------------------------------------------------------
N_FOLDS = 5
N_ESTIMATORS = 30000
SEED = 1234
BAGGING_SEED = 481

# ------------------------------------------------------------------------------
# LightGBM hyper-parameters
# ------------------------------------------------------------------------------
lgb_params = dict(
    random_state=SEED,
    metric='rmse',
    n_estimators=N_ESTIMATORS,
    n_jobs=-1,
    # disabled: 'cat_feature': [x for x in range(len(cat_cols))]
    bagging_seed=SEED,
    feature_fraction_seed=SEED,
    learning_rate=0.0011323124124,
    max_depth=99,
    num_leaves=63,
    reg_alpha=9.562925363678952,
    reg_lambda=9.355810045480153,
    colsample_bytree=0.2256038826485174,
    min_child_samples=290,
    subsample_freq=1,
    subsample=0.8805303688019942,
    max_bin=882,
    min_data_per_group=127,
    # disabled: 'cat_smooth': 96
    # disabled: 'cat_l2': 19
)

# ------------------------------------------------------------------------------
# Per-model parameter sets handed to the ensemble, keyed by model name
# ------------------------------------------------------------------------------
ensemble_params = dict(
    lgb=lgb_params,
    xgb=dict(
        random_state=SEED,
        max_depth=13,
        learning_rate=0.0011323124124,
        gamma=3.5746731812451156,
        min_child_weight=564,
        n_estimators=8000,
        colsample_bytree=0.5015940592112956,
        subsample=0.6839489639112909,
        reg_lambda=18.085502002853246,
        reg_alpha=0.17532087359570606,
        objective='reg:squarederror',
        tree_method='gpu_hist',
        eval_metric='rmse',
        n_jobs=-1,
    ),
    # disabled 'cat' entry:
    #     'random_state': SEED,
    #     'depth': 3.0,
    #     'fold_len_multiplier': 1.1425259013471902,
    #     'l2_leaf_reg': 7.567589781752637,
    #     'leaf_estimation_backtracking': 'AnyImprovement',
    #     'learning_rate': 0.25121635918496565,
    #     'max_bin': 107.0,
    #     'min_data_in_leaf': 220.0,
    #     'random_strength': 3.2658690042589726,
    #     'n_estimators': 8000,
    #     'eval_metric': 'RMSE',
)
# mean_squared_error is used below; imported here so the cell runs on its own.
from sklearn.metrics import mean_squared_error

# K-fold training: fit one ensemble per fold, average the test predictions
# over the folds and collect the validation RMSE of each fold.
# NOTE(review): EnsembleModel is not defined in any cell visible here; its
# fit/predict signatures are used exactly as before - confirm it is defined
# before this cell runs.
colums = ad_x.columns
preds = np.zeros(ad_test.shape[0])
k_fold = KFold(n_splits=N_FOLDS, random_state=122, shuffle=True)
rmse = []

for n, (trn_idx, test_idx) in enumerate(k_fold.split(ad_x[colums], y), start=1):

    X_tr, X_val = ad_x[colums].iloc[trn_idx], ad_x[colums].iloc[test_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[test_idx]

    model = EnsembleModel(ensemble_params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=1)

    # Each fold contributes an equal share of the final test prediction.
    preds += model.predict(ad_test[colums], weights=[1.0, 1.0]) / k_fold.n_splits

    # RMSE taken as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error was deprecated and later removed from scikit-learn.
    fold_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    rmse.append(fold_rmse)

    print(f"Fold {n}, RMSE: {fold_rmse}")


print("Mean RMSE: ", np.mean(rmse))

In [None]:
# save submission in csv format
submission_df = pd.read_csv('/content/drive/My Drive//tabular/sample_submission.csv')
submission_df['target'] = preds
submission_df.to_csv('submission_tabular_ensemble.csv',index=False)
!ls
from google.colab import files
files.download('submission_tabular_ensemble.csv')