In [2]:
# Mount Google Drive at /content/drive; every CSV read in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# install PyTorch Tabular first
!pip install pytorch_tabular
# This is for a custom optimizer. PyTorch Tabular is flexible enough to use custom optimizers
!pip install torch_optimizer

In [1]:
# packages
# standard
import numpy as np
import pandas as pd
import time
# plots
import matplotlib.pyplot as plt
import seaborn as sns
# NODE and ML tools
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
from torch_optimizer import QHAdam
import category_encoders as ce
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings(action='ignore')



In [2]:
# Load the train and test CSVs for each of the three datasets (suffixes a, o, t).
# NOTE(review): the suffix is presumably a product code (the CV cell prints "productcode a") — confirm.
# dataset "a": train + test
df_train_a = pd.read_csv('/content/drive/MyDrive/lg/train_a.csv')
df_test_a = pd.read_csv('/content/drive/MyDrive/lg/test_a.csv')
# dataset "o": train + test
df_train_o = pd.read_csv('/content/drive/MyDrive/lg/train_o.csv')
df_test_o = pd.read_csv('/content/drive/MyDrive/lg/test_o.csv')
# dataset "t": train + test
df_train_t = pd.read_csv('/content/drive/MyDrive/lg/train_t.csv')
df_test_t = pd.read_csv('/content/drive/MyDrive/lg/test_t.csv')

In [3]:
# Replace the T target with its natural log (only df_train_t is transformed).
# Caution: this overwrites the column in place, so re-running the cell applies the log again.
df_train_t['Y_Quality'] = np.log(df_train_t['Y_Quality'])

In [None]:
# Disabled experiment, kept as a bare string literal so it never executes:
# it would multiply Y_Quality by 1000 in the a, t and o training frames.
'''df_train_a['Y_Quality'] = df_train_a['Y_Quality']*1000
df_train_t['Y_Quality'] = df_train_t['Y_Quality']*1000
df_train_o['Y_Quality'] = df_train_o['Y_Quality']*1000'''

In [4]:
# The string literal below is a disabled variant of the same drop for dataset "a"; it is a no-op.
'''df_train_a = df_train_a.drop(columns = ['X_1167','X_1168','X_1169','X_1170','X_1171','X_1274','X_1275','X_1276','X_1277','X_1278'])
df_test_a = df_test_a.drop(columns = ['X_1167','X_1168','X_1169','X_1170','X_1171','X_1274','X_1275','X_1276','X_1277','X_1278'])
'''
# Columns removed from both the T train and T test frames; listed once so the
# two drops cannot drift apart.
t_cols_to_drop = [
    'X_529', 'X_530', 'X_531', 'X_532', 'X_533',
    'X_600', 'X_601', 'X_602', 'X_603', 'X_604',
]
df_train_t = df_train_t.drop(columns=t_cols_to_drop)
df_test_t = df_test_t.drop(columns=t_cols_to_drop)

In [5]:
# Collect every column summarised by describe() whose standard deviation in the
# T training frame is at least 500; these are dropped in a later cell.
numeric_cols = df_train_t.describe().columns
del_col = [name for name in numeric_cols if df_train_t[name].std() >= 500]

In [None]:
# Inspect which columns were flagged by the std >= 500 filter.
del_col

In [6]:
# Remove the high-variance columns from the T train frame and the T test frame alike.
df_train_t = df_train_t.drop(del_col, axis='columns')
df_test_t = df_test_t.drop(del_col, axis='columns')

In [None]:
# (rows, columns) of df_train_t after the column drops.
df_train_t.shape

In [27]:
def get_configs(train, epochs=70, batch_size=25, target='Y_Quality', max_lr=0.033):
    """Build the PyTorch Tabular config objects for a NODE regression model.

    Parameters
    ----------
    train : pandas.DataFrame
        Training fold. Must contain the ``target`` column; every other column
        is declared as a continuous feature.
    epochs : int, default 70
        Maximum number of training epochs (also given to the LR scheduler).
    batch_size : int, default 25
        Mini-batch size.
    target : str, default 'Y_Quality'
        Name of the regression target column.
    max_lr : float, default 0.033
        Used both as the model learning rate and as the OneCycleLR ``max_lr``.

    Returns
    -------
    tuple
        ``(data_config, trainer_config, optimizer_config, model_config)``.
    """
    # Pick features by name rather than by position: the previous
    # `train.columns[:-1]` silently used the target as a feature (and dropped a
    # real feature) whenever the target was not the last column.
    feature_cols = [name for name in train.columns if name != target]

    # 0.9 factor kept from the original. The floor of 1 avoids passing
    # steps_per_epoch=0 to the scheduler when the fold is smaller than a batch.
    # NOTE(review): confirm whether the installed pytorch_tabular steps the
    # scheduler per batch or per epoch; that decides whether this value matches
    # the real number of scheduler steps.
    steps_per_epoch = max(1, int((len(train) // batch_size) * 0.9))

    data_config = DataConfig(
        target=[target],
        continuous_cols=feature_cols,
    )
    trainer_config = TrainerConfig(
        auto_lr_find=False,  # do not run the LR finder; the rate is fixed below
        batch_size=batch_size,
        max_epochs=epochs,
        # NOTE(review): the original comment called this a GPU index with 0
        # meaning CPU — confirm against the installed TrainerConfig.
        gpus=1,
    )
    optimizer_config = OptimizerConfig(
        lr_scheduler="OneCycleLR",
        lr_scheduler_params={
            "max_lr": max_lr,
            "epochs": epochs,
            "steps_per_epoch": steps_per_epoch,
        },
    )

    model_config = NodeConfig(
        task="regression",
        num_layers=20,  # number of dense layers
        num_trees=250,  # number of trees in each layer
        depth=6,  # depth of each tree
        embed_categorical=True,  # learned embedding for categorical columns (none are declared here)
        learning_rate=max_lr,
        # Constrain predictions to the target range observed in this training fold.
        target_range=[(float(train[target].min()), float(train[target].max()))],
    )

    return data_config, trainer_config, optimizer_config, model_config

In [8]:
# Random seeds: one drives the CV shuffling; the other is defined for
# regressors but is not referenced in the cells shown here.
rnd_seed_cv = rnd_seed_reg = 1234
# 5-fold shuffled cross-validation splitter.
kf = KFold(shuffle=True, n_splits=5, random_state=rnd_seed_cv)
# Add a zero-filled placeholder target column to every test frame
# (presumably so they carry the same target column as the training frames — confirm).
for _test_frame in (df_test_a, df_test_t, df_test_o):
    _test_frame['Y_Quality'] = 0

In [9]:
def node(train, valid, df_test):
    """Fit a NODE model on one fold, then score and predict with it.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Training and validation folds.
    df_test : pandas.DataFrame
        Frame to predict on after fitting.

    Returns
    -------
    tuple
        ``(rmse, valid_predictions, test_predictions, fitted_model)`` where
        ``rmse`` is the square root of the ``test_mean_squared_error`` reported
        by ``evaluate(valid)`` and both prediction arrays come from the
        ``Y_Quality_prediction`` column of ``predict``.
    """
    data_cfg, trainer_cfg, optimizer_cfg, model_cfg = get_configs(train)
    model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )

    # Train with the QHAdam optimizer from torch_optimizer.
    qhadam_params = {"nus": (0.7, 1.0), "betas": (0.95, 0.998)}
    model.fit(train=train, validation=valid, optimizer=QHAdam,
              optimizer_params=qhadam_params)

    metrics = model.evaluate(valid)
    print('-------------')
    print(metrics)

    rmse = np.sqrt(metrics[0]["test_mean_squared_error"])
    valid_pred = model.predict(valid)["Y_Quality_prediction"].values
    test_pred = model.predict(df_test)["Y_Quality_prediction"].values
    return rmse, valid_pred, test_pred, model

In [None]:
# Per-fold CV scores and predictions.
# NOTE(review): the loop below trains on df_train_t and predicts df_test_t, yet it
# stores its results in the *_a lists and prints "a" labels. The *_t lists are
# created here but never appended to in this cell.
CV_node_a = []
preds_train_node_a = []
preds_test_node_a = []
CV_node_t = []
preds_train_node_t = []
preds_test_node_t = []

t1 = time.time()
for train_index, test_index in kf.split(df_train_t):
    print("------start a-------")
    # Positional split of the T training frame into this fold's train / validation parts.
    train = df_train_t.iloc[train_index]
    valid = df_train_t.iloc[test_index]
    # Copy of the validation fold that receives the fold's predictions;
    # it is overwritten on every iteration, so only the last fold's copy survives.
    cv_val = valid.copy()
    # node() returns (RMSE on valid, predictions on valid, predictions on df_test_t, fitted model).
    node_score, node_train_pred, node_test_pred, tabular_model = node(train, valid, df_test_t)
    CV_node_a.append(node_score)
    cv_val['pred_node'] = node_train_pred
    # Despite the name, preds_train_node_a holds validation-fold predictions.
    preds_train_node_a.append(node_train_pred)
    preds_test_node_a.append(node_test_pred)
t2 = time.time()
# Wall-clock time for all folds.
print('productcode a :: Elapsed time [s]: ', t2-t1)

In [15]:
# Per-dataset ID tables; later cells attach the predicted quality and class to them.
# NOTE(review): the submission cell reads a PRODUCT_ID column from each frame — confirm the CSV schema.
t = pd.read_csv('/content/drive/MyDrive/lg/pid_t.csv')
a = pd.read_csv('/content/drive/MyDrive/lg/pid_a.csv')
o = pd.read_csv('/content/drive/MyDrive/lg/pid_o.csv')

In [None]:
# Dump the per-fold test predictions collected by the CV loop (one array per fold).
preds_test_node_a

In [17]:
# Average the test predictions element-wise over all CV folds.
# np.mean(axis=0) follows the number of folds actually collected instead of
# hard-coding five indices, so it stays correct if n_splits changes.
# The assertion is needed because warnings are globally silenced in this notebook,
# so averaging an empty list would otherwise quietly produce NaN.
assert len(preds_test_node_a) > 0, "no fold predictions found: run the cross-validation cell first"
ttt = np.mean(preds_test_node_a, axis=0)

In [None]:
# Hard-coded list of 239 floats, apparently a pasted snapshot of an earlier prediction array.
# NOTE(review): no other cell shown in this notebook reads `ppp`; confirm whether it is still needed.
ppp = [5.293466 , 5.323602 , 5.3159866, 5.2875233, 5.301686 , 5.3124886,
       5.312852 , 5.3081994, 5.2980056, 5.327128 , 5.285574 , 5.2929673,
       5.2973633, 5.3043036, 5.304208 , 5.29397  , 5.29274  , 5.3028646,
       5.2828455, 5.317207 , 5.2896957, 5.293864 , 5.3092995, 5.2806263,
       5.285693 , 5.324366 , 5.314264 , 5.2572546, 5.2900324, 5.2625136,
       5.2954617, 5.2545996, 5.280834 , 5.291468 , 5.283338 , 5.274568 ,
       5.2747736, 5.2731113, 5.312507 , 5.321829 , 5.318307 , 5.309747 ,
       5.306521 , 5.3131037, 5.324361 , 5.3144503, 5.325783 , 5.3345814,
       5.320949 , 5.32036  , 5.3153753, 5.3266754, 5.326373 , 5.3323607,
       5.3213644, 5.3256793, 5.3239923, 5.3268714, 5.306283 , 5.326499 ,
       5.318757 , 5.328589 , 5.3119073, 5.325813 , 5.307254 , 5.3227816,
       5.3027487, 5.3261323, 5.317509 , 5.319888 , 5.3200817, 5.3084903,
       5.3342404, 5.3218603, 5.313556 , 5.303539 , 5.3309293, 5.3196945,
       5.3366036, 5.3230104, 5.3166914, 5.3306713, 5.3378615, 5.3216395,
       5.3140507, 5.3219533, 5.309964 , 5.312243 , 5.315069 , 5.310431 ,
       5.31872  , 5.328703 , 5.2971153, 5.3250284, 5.322083 , 5.3211546,
       5.335465 , 5.3231773, 5.327879 , 5.296985 , 5.3130403, 5.3071227,
       5.320326 , 5.3293705, 5.3153977, 5.3115883, 5.3294296, 5.2975845,
       5.3189464, 5.3320637, 5.31026  , 5.3343387, 5.3348727, 5.3204947,
       5.33422  , 5.3021483, 5.3294697, 5.3221784, 5.313323 , 5.313703 ,
       5.3211045, 5.329157 , 5.3162274, 5.325688 , 5.3115015, 5.3252087,
       5.30281  , 5.3283787, 5.3408837, 5.31062  , 5.3282514, 5.331292 ,
       5.322797 , 5.3288217, 5.336482 , 5.322403 , 5.3204603, 5.3191023,
       5.330142 , 5.317251 , 5.326909 , 5.3090773, 5.328071 , 5.3127904,
       5.331016 , 5.3169203, 5.331262 , 5.3148994, 5.3039536, 5.3292103,
       5.314355 , 5.328689 , 5.314101 , 5.3277416, 5.29677  , 5.3288174,
       5.319478 , 5.3320765, 5.3131824, 5.3151608, 5.328886 , 5.318504 ,
       5.316331 , 5.314597 , 5.3273635, 5.3289323, 5.304873 , 5.31398  ,
       5.330834 , 5.307636 , 5.329036 , 5.317323 , 5.313458 , 5.3314157,
       5.303113 , 5.301588 , 5.3081894, 5.3134904, 5.31059  , 5.3215895,
       5.314023 , 5.311284 , 5.308907 , 5.333932 , 5.3255897, 5.305844 ,
       5.3110156, 5.3137407, 5.315883 , 5.325205 , 5.3028135, 5.306897 ,
       5.3198276, 5.2975717, 5.3020663, 5.3024406, 5.3078475, 5.30654  ,
       5.304393 , 5.3158402, 5.31215  , 5.304424 , 5.298312 , 5.318548 ,
       5.295038 , 5.3057055, 5.3157225, 5.3097687, 5.3154836, 5.3116393,
       5.3048534, 5.317833 , 5.300263 , 5.304865 , 5.3151712, 5.304819 ,
       5.30724  , 5.294277 , 5.3012357, 5.306077 , 5.3005614, 5.300459 ,
       5.311871 , 5.2805376, 5.3024087, 5.2882757, 5.3033013, 5.2963567,
       5.293053 , 5.292839 , 5.2934318, 5.3037577, 5.3033304, 5.299841 ,
       5.2982554, 5.29531  , 5.3104777, 5.3024836, 5.304693 ]

In [18]:
# Attach the fold-averaged prediction to the T ID table and bucket it into classes:
#   0 below the lower cut-off, 2 above the upper cut-off, 1 otherwise.
# NOTE(review): the cut-offs look like natural logs of the 0.52507 / 0.5349 thresholds
# used in the submission cell (the T target was log-transformed earlier) — confirm.
t['Y_qaulity'] = ttt
predicted_quality = t['Y_qaulity']
t['Y_Class'] = np.select(
    [predicted_quality < -0.6442119, predicted_quality > -0.62578],
    [0, 2],
    default=1,
).astype('int64')

In [21]:
# Distribution of the class labels assigned to t.
t['Y_Class'].value_counts()

1    239
Name: Y_Class, dtype: int64

In [None]:
# Build the submission file: average the five fold predictions per dataset, bucket
# them into classes with fixed thresholds, and merge the classes onto the sample submission.
# NOTE(review): in the cells shown in this notebook the CV loop fills preds_test_node_a with
# predictions for df_test_t, and preds_test_node_t is never appended to, so this cell depends
# on state produced elsewhere — confirm before running.
a['Y_qaulity'] = (preds_test_node_a[0]+preds_test_node_a[1]+preds_test_node_a[2]+preds_test_node_a[3]+preds_test_node_a[4])/5
t['Y_qaulity'] = (preds_test_node_t[0]+preds_test_node_t[1]+preds_test_node_t[2]+preds_test_node_t[3]+preds_test_node_t[4])/5
# Dataset "o" gets a constant value for exactly four rows instead of model predictions.
o['Y_qaulity'] = [0.53,0.53,0.53,0.53]

# Default class is 1; values outside the (0.52507, 0.5349) band become 0 or 2.
a['Y_Class'] = 1
t['Y_Class'] = 1
o['Y_Class'] = 1

# NOTE(review): these thresholds are on the raw Y_Quality scale, whereas the T training
# target was replaced by its natural log earlier in the notebook — confirm the scale of
# the T predictions before applying them.
a.loc[(a['Y_qaulity']<0.52507), 'Y_Class'] = 0
a.loc[(a['Y_qaulity']>0.5349), 'Y_Class'] = 2
t.loc[(t['Y_qaulity']<0.52507), 'Y_Class'] = 0
t.loc[(t['Y_qaulity']>0.5349), 'Y_Class'] = 2
o.loc[(o['Y_qaulity']<0.52507), 'Y_Class'] = 0
o.loc[(o['Y_qaulity']>0.5349), 'Y_Class'] = 2

# The same sample submission is read three times, once per dataset.
submita = pd.read_csv('/content/drive/MyDrive/lg/sample_submission.csv')
submitt = pd.read_csv('/content/drive/MyDrive/lg/sample_submission.csv')
submito = pd.read_csv('/content/drive/MyDrive/lg/sample_submission.csv')

# Default (inner) merge on PRODUCT_ID keeps only the rows belonging to each dataset.
submita = pd.merge(submita[['PRODUCT_ID']],a[['PRODUCT_ID','Y_Class']],on='PRODUCT_ID')
submitt = pd.merge(submitt[['PRODUCT_ID']],t[['PRODUCT_ID','Y_Class']],on='PRODUCT_ID')
submito = pd.merge(submito[['PRODUCT_ID']],o[['PRODUCT_ID','Y_Class']],on='PRODUCT_ID')

# Stack the three parts, order by PRODUCT_ID and write the CSV to the working directory.
pd.concat([submita,submitt,submito]).sort_values(by='PRODUCT_ID').to_csv('node0219.csv',index=False)

In [None]:
# Distribution of the class labels assigned to t.
t['Y_Class'].value_counts()

1    239
Name: Y_Class, dtype: int64