<a href="https://colab.research.google.com/github/jaeuHeo/colab_project/blob/main/tabular_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Basic Imports 
import numpy as np
import pandas as pd

# Plotting 
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline

# Preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder

# Metrics 
from sklearn.metrics import mean_squared_error, mean_absolute_error

# ML Models
import lightgbm as lgb
from lightgbm import LGBMRegressor 
import xgboost as xg 
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm

# Ignore Warnings 
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install shap

In [None]:
!pip install Bayesian-Optimization

In [None]:
# Feature Importance 
# import shap

# Model Tuning 
from bayes_opt import BayesianOptimization

In [None]:
# Read the train and test tables from the mounted Google Drive folder.
train_path = '/content/drive/My Drive//tabular/train.csv'
test_path = '/content/drive/My Drive//tabular/test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
train.head()

In [None]:
%tensorflow_version 1.x
import tensorflow as tf
import timeit

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

def cpu():
  with tf.device('/cpu:0'):
    random_image_cpu = tf.random.normal((100, 100, 100, 3))
    net_cpu = tf.keras.layers.Conv2D(32, 7)(random_image_cpu)
    return tf.math.reduce_sum(net_cpu)

def gpu():
  with tf.device('/device:GPU:0'):
    random_image_gpu = tf.random.normal((100, 100, 100, 3))
    net_gpu = tf.keras.layers.Conv2D(32, 7)(random_image_gpu)
    return tf.math.reduce_sum(net_gpu)
  
# We run each op once to warm up; see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Run the op several times.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
print(gpu_time)
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))

In [None]:
# Summary statistics and missing-value counts. Only the last expression of a
# cell is rendered automatically, so these are displayed explicitly.
display(train.describe())
display(train.isnull().sum())

# Pearson correlation over the numeric columns only; the object-typed
# categorical columns cannot be correlated directly.
numeric_train = train.select_dtypes(include='number')
corr_matrix = numeric_train.corr(method='pearson')

# Show the correlations as a heatmap.
import seaborn as sb
plt.rcParams["figure.figsize"] = (40,40)
sb.heatmap(corr_matrix,
           annot = True, # print the coefficient inside every cell
           cmap = 'Greens', # colour map
           vmin = -1, vmax=1 , # colour scale spans -1 .. +1
          )
plt.show()

# Linear regression of the target on the numeric features.
# The design matrix is built from `train` here so the cell does not depend on
# fold variables that only exist after the cross-validation loop has run.
import statsmodels.api as sm
ols_features = sm.add_constant(numeric_train.drop(columns=['id', 'target']))  # OLS fits no intercept unless one is added
multi_model = sm.OLS(train['target'], ols_features)
fitted_multi_model = multi_model.fit()
fitted_multi_model.summary()

In [None]:
from sklearn.decomposition import PCA
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, StandardScaler,RobustScaler
class Preprocessor():
    """Turn a raw table into a numeric feature matrix.

    Object-typed columns are label-encoded and the remaining columns (minus
    'id' and 'target') are scaled with a RobustScaler, optionally followed by
    PCA. Everything is fitted on the training table (`train=True`) and then
    re-applied unchanged to other tables (`train=False`).
    """

    def __init__(self):
        self.en_dic = None             # column name -> fitted LabelEncoder
        self.standard_scaler = None    # kept for backward compatibility; not used
        self.Robust_Scaler = None      # RobustScaler fitted on the training numerics
        self.num_cols = None
        self.cat_cols = None
        self.minority_map_dic = None   # per-column {rare category -> composite category}
        self.minority_col_dict = None  # per-column list of rare categories
        self.pca = None

    def preprocess(self,data_df,train=True,combine_min_cats=False, add_pca_feats=False):
        """Build the feature matrix for `data_df`.

        Args:
            data_df: input table. Must contain an 'id' column; with `train=True`
                it must also contain 'target'.
            train: when True, fit the encoders/scaler/PCA on `data_df`; when
                False, reuse the objects fitted by the last `train=True` call.
            combine_min_cats: replace rare categories (found on the training
                table) by one composite category before encoding.
            add_pca_feats: use PCA components of the scaled numeric columns
                instead of the scaled columns themselves.

        Returns:
            DataFrame with the encoded categorical columns followed by the
            numeric (or PCA) columns.

        Raises:
            RuntimeError: if called with `train=False` before a `train=True`
                call, or with `combine_min_cats=True` when no minority map was
                learned on the training table.
            ValueError: raised by `LabelEncoder.transform` when `data_df`
                contains a category that was not present at fit time.
        """
        if train:
            self.train_ids = data_df.loc[:, 'id']
            cats = data_df.loc[:, data_df.dtypes == object]
            self.cat_cols = cats.columns

            if combine_min_cats:
                self._find_minority_cats(cats)
                cats = self._combine_minority_feats(cats)

            self.en_dic = defaultdict(LabelEncoder)
            cats_en = cats.apply(lambda col: self.en_dic[col.name].fit_transform(col))

            nums = data_df.loc[:, data_df.dtypes != object].drop(columns=['target', 'id'])
            self.num_cols = nums.columns
            self.Robust_Scaler = RobustScaler()
            nums_scaled = self.Robust_Scaler.fit_transform(nums)
        else:
            if self.en_dic is None or self.Robust_Scaler is None:
                raise RuntimeError('preprocess(train=True) must be called before preprocess(train=False)')

            self.test_ids = data_df.loc[:, 'id']
            cats = data_df.loc[:, self.cat_cols]
            if combine_min_cats:
                # Reuse the mapping learned on the training table; recomputing it
                # here would make train and test use different category sets.
                if getattr(self, 'minority_map_dic', None) is None:
                    raise RuntimeError('combine_min_cats=True requires a minority map learned with train=True')
                cats = self._combine_minority_feats(cats)

            # transform (not fit_transform): codes and scaling must match the training fit.
            cats_en = cats.apply(lambda col: self.en_dic[col.name].transform(col))
            nums = data_df.loc[:, self.num_cols]
            nums_scaled = self.Robust_Scaler.transform(nums)

        # The one-hot frame is only stored on the instance (cats_df /
        # cats_onehot_cols); it is not part of the returned matrix.
        self.cats_onehot(cats_en)

        if add_pca_feats:
            num_block = self._return_num_pca(nums_scaled, train=train)
            num_names = list(self.pca_cols)
        else:
            num_block = nums_scaled
            num_names = list(self.num_cols)

        X = pd.DataFrame(np.hstack((cats_en, num_block)), columns=list(cats_en) + num_names)
        return X

    def cats_onehot(self,data_df):
        """One-hot encode the categorical columns of `data_df`.

        Stores the result in `self.cats_df` / `self.cats_onehot_cols` and
        returns both.
        """
        self.cats_df = pd.get_dummies(data=data_df,columns=self.cat_cols, prefix= self.cat_cols)
        self.cats_onehot_cols = self.cats_df.columns
        return self.cats_df, self.cats_onehot_cols

    def _find_minority_cats(self, data_df):
        """Find, per categorical column, the categories rarer than `self.threshold`.

        Returns:
            (minority_map_dic, minority_col_dict): per column, a mapping from
            each rare category to the composite category 'z', and the list of
            rare categories.
        """
        self.composite_category = 'z'
        self.threshold = 0.05
        self.minority_col_dict = {}
        self.minority_map_dic = {}
        for feature in self.cat_cols:
            shares = data_df[feature].value_counts(normalize=True)
            # Series.items() exists in every pandas version; iteritems() was removed in pandas 2.
            rare = [category for category, share in shares.items() if share < self.threshold]
            self.minority_col_dict[feature] = rare
            self.minority_map_dic[feature] = {category: self.composite_category for category in rare}
        return self.minority_map_dic, self.minority_col_dict

    def _combine_minority_feats(self, data_df, replace = False):
        """Return a copy of `data_df` with rare categories replaced by the composite one.

        `replace` is accepted for backward compatibility and has no effect: the
        categorical columns are always overwritten in the returned copy.
        """
        new_df = data_df.copy()
        for feat in self.cat_cols:
            mapping = self.minority_map_dic[feat]
            if mapping:  # nothing to replace for columns without rare categories
                new_df[feat] = new_df[feat].replace(mapping)
        return new_df

    def _return_num_pca(self,num_df,train=True):
        """Project the scaled numeric columns onto PCA components.

        With `train=True` a new PCA keeping 85% of the variance is fitted; with
        `train=False` the previously fitted PCA is applied.

        Raises:
            RuntimeError: if `train=False` and no PCA has been fitted yet.
        """
        self.n_components = 0.85
        if train:
            self.pca = PCA(n_components = self.n_components)
            num_rd = self.pca.fit_transform(num_df)
            print(f'pca_explain: {self.pca.explained_variance_ratio_}')
        else:
            if getattr(self, 'pca', None) is None:
                raise RuntimeError('PCA has not been fitted; call with train=True first')
            num_rd = self.pca.transform(num_df)

        self.pca_cols = [f'pca_{x}' for x in range(num_rd.shape[1])]
        return pd.DataFrame(num_rd, columns = self.pca_cols)

In [None]:
# Fit the preprocessing on the training table, then apply it to the test table.
pd.set_option('display.max_columns', 500)
data_proc = Preprocessor()
preprocess_flags = dict(combine_min_cats=False, add_pca_feats=False)
X = data_proc.preprocess(train, **preprocess_flags)
y = train.loc[:, 'target']
X_test = data_proc.preprocess(test, train=False, **preprocess_flags)
print(X.head())

In [None]:
# Sanity check: shapes of the training features, training target and test features.
print(f'xshape: {X.shape} \n yshape: {y.shape} \n X_test: {X_test.shape}')

In [None]:
# Missing-value counts for the training features, the target and the test features.
for frame in (X, y, X_test):
    display(frame.isnull().sum())

In [None]:
def score_log(df:pd.DataFrame, seed: int, num_fold: int, model_name: str, cv:float):
    """Append one cross-validation result to the score log and print the log.

    Args:
        df: the log so far (may be empty or None).
        seed: random seed used for the run.
        num_fold: index of the fold the score belongs to.
        model_name: short label of the model.
        cv: validation score of the fold.

    Returns:
        A new DataFrame with columns 'seed', 'fold', 'model', 'cv' and one
        extra row; rows are numbered consecutively from 0.
    """
    row = pd.DataFrame([{'seed':seed, 'fold': num_fold, 'model': model_name, 'cv': cv}])
    if df is None or df.empty:
        # Concatenating with an empty frame is deprecated in pandas and can upcast dtypes.
        df = row
    else:
        # ignore_index gives every row its own label instead of repeating 0.
        df = pd.concat([df, row], ignore_index=True)
    print(df)
    return df

In [None]:

# 5-fold cross-validation of a LightGBM regressor on the preprocessed training data.
cat_features = [f'cat{i}' for i in range(10)]
oof = np.zeros(X.shape[0])  # out-of-fold predictions, filled fold by fold
preds = 0
score_df = pd.DataFrame()
feature_importance = pd.DataFrame()
SEED = 2021
kf = KFold(n_splits=5,shuffle =True, random_state=SEED)
rf_params = {'random_state': SEED,
          'metric': 'rmse',
          'n_estimators': 30000,
          'n_jobs': -1,
          'cat_feature': [x for x in range(len(cat_features))],
          'bagging_seed': SEED,
          'feature_fraction_seed': SEED,
          'learning_rate': 0.003899156646724397,
          'max_depth': 99,
          'num_leaves': 63,
          'reg_alpha': 9.562925363678952,
          'reg_lambda': 9.355810045480153,
          'colsample_bytree': 0.2256038826485174,
          'min_child_samples': 290,
          'subsample_freq': 1,
          'subsample': 0.8805303688019942,
          'max_bin': 882,
          'min_data_per_group': 127,
          'cat_smooth': 96,
          'cat_l2': 19}
fold_importances = []
for fold,(train_idx,val_idx) in enumerate(kf.split(X=X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model_lgbm = LGBMRegressor(**rf_params)
    # Early stopping and logging go through callbacks: the `early_stopping_rounds`
    # and `verbose` arguments of fit() were removed in LightGBM 4.
    model_lgbm.fit(X_train,y_train, eval_set=[(X_val, y_val)],
                  eval_metric='rmse',
                  callbacks=[lgb.early_stopping(stopping_rounds=100),
                             lgb.log_evaluation(period=2)])

    # Collect per-fold importances; they are combined once after the loop.
    fold_importances.append(pd.DataFrame({'feature': model_lgbm.feature_name_,
                                          'importance': model_lgbm.feature_importances_,
                                          'fold': fold,
                                          'seed': SEED}))

    oof[val_idx] = model_lgbm.predict(X_val)
    # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
    rmse = np.sqrt(mean_squared_error(y_val, oof[val_idx]))
    score_df = score_log(score_df, SEED, fold, 'lgb', rmse)
    print(f"rmse {rmse}")
feature_importance = pd.concat(fold_importances, ignore_index=True)
print('*'*100)
print(score_df)
    
    
    


In [None]:
# LGBMRegressor has no `summary()` method; report what the fitted estimator
# does expose: the early-stopped iteration, its validation score and the parameters.
print(f'best iteration: {model_lgbm.best_iteration_}')
print(f'best score: {dict(model_lgbm.best_score_)}')
model_lgbm.get_params()

In [None]:
# Predict the test set with `model_lgbm`, i.e. the estimator left over from the
# last fold of the cross-validation loop (the folds are not averaged).
preds = model_lgbm.predict(X_test)
preds

In [None]:
# Wrap the predictions in a DataFrame so they can be written out with `to_csv`.
preds = pd.DataFrame(preds)
preds

In [None]:
preds.to_csv('tabular_Pseudo Labelling.csv',index=False)
!ls
from google.colab import files
files.download('tabular_Pseudo Labelling.csv')

In [None]:
!pip install optuna

In [None]:
import optuna

# Keep a handle on the labelled training table before pseudo-labelled rows are added.
original_train = train

# Read the saved test-set predictions back and attach them to `test` as its target.
test_target = pd.read_csv('/content/drive/My Drive//tabular/tabular_Pseudo Labelling.csv')
test['target'] = test_target
test

In [None]:
# Stack the labelled rows and the pseudo-labelled test rows into one training table.
# ignore_index renumbers the rows so the combined frame has no duplicate index labels.
train = pd.concat([original_train,test],axis=0, ignore_index=True)
train

In [None]:
# Column groups of the table: identifier, categorical, continuous, target.
id_feature = ['id']
cat_features = [f'cat{idx}' for idx in range(10)]
cont_features = [f'cont{idx}' for idx in range(14)]
target_feature = ['target']
all_features = [*id_feature, *cat_features, *cont_features, *target_feature]

# Selecting with a list keeps `target` a one-column DataFrame.
train_df = train[all_features]
target = train[target_feature]
test_df = test[all_features]
test_df

In [None]:
# Re-fit the preprocessing on the combined training table and apply it to the test table.
pd.set_option('display.max_columns', 500)
shared_flags = dict(combine_min_cats=False, add_pca_feats=False)
train_x = data_proc.preprocess(train_df, **shared_flags)
train_y = target
test_x = data_proc.preprocess(test_df, train=False, **shared_flags)
print(test_x.head())

In [None]:
import optuna
import sklearn
from sklearn.model_selection import cross_val_score
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of an LGBMRegressor.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        Mean RMSE over the folds (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators',20000,30000),
        'learning_rate': trial.suggest_float('learning_rate',0.001,0.01,log=True),
        'max_depth': trial.suggest_int('max_depth',1,100),
        # LightGBM requires num_leaves > 1, so the search starts at 2.
        'num_leaves': trial.suggest_int('num_leaves',2,100),
        'reg_alpha': trial.suggest_float('reg_alpha',1,10,log=True),
        'reg_lambda': trial.suggest_float('reg_lambda',1,10,log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree',0.01,0.99,log=True),
        'min_child_samples': trial.suggest_int('min_child_samples',1,500),
        'subsample': trial.suggest_float('subsample',0.01,0.99,log=True),
    }

    # subsample only takes effect when bagging is switched on via subsample_freq;
    # the final model is trained with subsample_freq=1, so it is tuned the same way.
    regrs = LGBMRegressor(subsample_freq=1, random_state=SEED, **params)
    # ravel: train_y is a one-column DataFrame, the estimator expects a 1-D target.
    cross_score = cross_val_score(regrs,train_x,np.ravel(train_y),n_jobs=-1,scoring='neg_mean_squared_error' ,cv=5)
    rmse_scores = np.sqrt(-cross_score)
    return rmse_scores.mean()

# The objective is an error (RMSE), so the study has to minimise it.
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(objective, n_trials=10)
trial = study.best_trial

print(f'rmse: {trial.value}')
print(f'best_params: {trial.params}')



In [None]:
# 5-fold cross-validation of the tuned LightGBM regressor on the combined
# (labelled + pseudo-labelled) training data.
final_oof = np.zeros(train_x.shape[0])  # sized to the data that is actually split below
final_preds = 0
final_score_df = pd.DataFrame()
feature_importance = pd.DataFrame()
fi_SEED = 1234
kf = KFold(n_splits=5,shuffle =True, random_state=fi_SEED)
# NOTE(review): with subsample_freq=1 a subsample of ~0.01 trains every tree on
# about 1% of the rows - confirm this tuned value is intended.
rf_params = {'random_state': fi_SEED,
          'metric': 'rmse',
          'n_estimators': 22294,
          'n_jobs': -1,
          'cat_feature': [x for x in range(len(cat_features))],
          'bagging_seed': fi_SEED,
          'feature_fraction_seed': fi_SEED,
          'learning_rate': 0.0022862168892729263,
          'max_depth': 54,
          'num_leaves': 46,
          'reg_alpha': 1.1244285255428748,
          'reg_lambda': 3.163004564816904,
          'colsample_bytree': 0.6505855225331759,
          'min_child_samples': 352,
          'subsample_freq': 1,
          'subsample': 0.010450736902586575,
          'max_bin': 882,
          'min_data_per_group': 127,
          'cat_smooth': 96,
          'cat_l2': 19}
fold_importances = []
for fold,(train_idx,val_idx) in enumerate(kf.split(X=train_x)):
    # The fold indices refer to train_x, so the rows must be taken from train_x
    # (not from the smaller, earlier feature matrix X).
    X_train, X_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
    y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

    fi_lgbm = LGBMRegressor(**rf_params)
    # Early stopping and logging go through callbacks: the `early_stopping_rounds`
    # and `verbose` arguments of fit() were removed in LightGBM 4.
    fi_lgbm.fit(X_train,y_train, eval_set=[(X_val, y_val)],
                  eval_metric='rmse',
                  callbacks=[lgb.early_stopping(stopping_rounds=100),
                             lgb.log_evaluation(period=2)])

    # Collect per-fold importances; they are combined once after the loop.
    fold_importances.append(pd.DataFrame({'feature': fi_lgbm.feature_name_,
                                          'importance': fi_lgbm.feature_importances_,
                                          'fold': fold,
                                          'seed': fi_SEED}))

    final_oof[val_idx] = fi_lgbm.predict(X_val)
    # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
    rmse = np.sqrt(mean_squared_error(y_val, final_oof[val_idx]))
    final_score_df = score_log(final_score_df, fi_SEED, fold, 'lgb', rmse)
    print(f"rmse {rmse}")
feature_importance = pd.concat(fold_importances, ignore_index=True)
print('*'*100)
print(final_score_df)

In [None]:
# Predict the test set with `fi_lgbm`, i.e. the estimator left over from the
# last fold of the final cross-validation loop (the folds are not averaged).
final_test_pred = fi_lgbm.predict(test_x)
final_test_pred

In [None]:
# save submission in csv format
submission_df2 = pd.read_csv('/content/drive/My Drive//tabular/sample_submission.csv')
submission_df2['target'] = final_test_pred
submission_df2.to_csv('submission_tabular_ensemble_final.csv',index=False)
!ls
from google.colab import files
files.download('submission_tabular_ensemble_final.csv')

In [None]:
# Scratch check: print True when the word contains the substring 'on'.
word = 'pyhton'
if word.find('on') != -1:
    print(True)