In [199]:
import pandas as pd
import numpy as np
import optuna
import dacon_law_class as dlc
from dacon_law_class import SimpleOps as so
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier as lgb
import re
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from sklearn.model_selection import GridSearchCV as GSCV
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import xgboost.sklearn as xgb
import warnings
warnings.filterwarnings('ignore')


from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
# train
# test
# sample_submission

In [3]:
train.info()
print('\n\n\n')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  2478 non-null   object
 1   first_party         2478 non-null   object
 2   second_party        2478 non-null   object
 3   facts               2478 non-null   object
 4   first_party_winner  2478 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 96.9+ KB




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1240 entries, 0 to 1239
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            1240 non-null   object
 1   first_party   1240 non-null   object
 2   second_party  1240 non-null   object
 3   facts         1240 non-null   object
dtypes: object(4)
memory usage: 38.9+ KB


In [4]:
train_cleansed = dlc.alpha_only_3_cols(train, 'first_party', 'second_party', 'facts')
train_cleansed = train.drop(columns='ID')
test_cleansed = dlc.alpha_only_3_cols(test, 'first_party', 'second_party', 'facts')
test_cleansed = test.drop(columns='ID')

## BERT

@article{turc2019,
  title={Well-Read Students Learn Better: On the Importance of Pre-training Compact Models},
  author={Turc, Iulia and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  journal={arXiv preprint arXiv:1908.08962v2 },
  year={2019}
}


In [None]:
train_facts = pd.DataFrame(train_cleansed['facts'])
test_fact = pd.DataFrame(test_cleansed['facts'])

In [None]:
test_fact

In [None]:
train_to_ml, test_ready_to_ml = dlc.rename_tokenized(train_cleansed, test_cleansed, 'first_party', 'second_party', 'facts', 'first_party_winner')


# 여기

In [163]:
train_to_ml = pd.read_csv('./embeddings/1_train_legal.csv')
test_ready_to_ml = pd.read_csv('./embeddings/2_test_legal.csv')


train_to_ml

test_ready_to_ml

to_be_X, to_be_test_x = dlc.X2_T2(train_to_ml, test_ready_to_ml, 'first_party_winner')

In [164]:
train_fp_df = pd.DataFrame(train_to_ml['first_party_berted'])
train_sp_df = pd.DataFrame(train_to_ml['second_party_berted'])
train_facts_df = pd.DataFrame(train_to_ml['facts_berted'])

test_fp_df = pd.DataFrame(test_ready_to_ml['first_party_berted'])
test_sp_df = pd.DataFrame(test_ready_to_ml['second_party_berted'])
test_facts_df = pd.DataFrame(test_ready_to_ml['facts_berted'])

In [165]:
train_fp_df = dlc.tensor_separator(train_fp_df, 'first_party_berted')
train_sp_df = dlc.tensor_separator(train_sp_df, 'second_party_berted')
train_facts_df = dlc.tensor_separator(train_facts_df, 'facts_berted')
test_fp_df = dlc.tensor_separator(test_fp_df, 'first_party_berted')
test_sp_df = dlc.tensor_separator(test_sp_df, 'second_party_berted')
test_facts_df = dlc.tensor_separator(test_facts_df, 'facts_berted')
train_fp_df = train_fp_df.astype('float64')
train_sp_df = train_sp_df.astype('float64')
train_facts_df = train_facts_df.astype('float64')
test_fp_df = test_fp_df.astype('float64')
test_sp_df = test_sp_df.astype('float64')
test_facts_df = test_facts_df.astype('float64')

100%|██████████████████████| 2478/2478 [00:01<00:00, 2160.80it/s]
100%|██████████████████████| 2478/2478 [00:01<00:00, 2228.75it/s]
100%|██████████████████████| 2478/2478 [00:01<00:00, 2259.91it/s]
100%|██████████████████████| 1240/1240 [00:00<00:00, 2254.79it/s]
100%|██████████████████████| 1240/1240 [00:00<00:00, 2272.71it/s]
100%|██████████████████████| 1240/1240 [00:00<00:00, 2142.32it/s]


In [166]:
t_f_p = pd.Series(train['first_party'])

In [167]:
label_encoder = LabelEncoder()
y_train_first_party = label_encoder.fit_transform(t_f_p)

In [182]:
y_train_first_party = pd.Series(y_train_first_party)
y_train_first_party = y_train_first_party.set_axis(train['ID'])

In [184]:
y_train_first_party

ID
TRAIN_0000    1559
TRAIN_0001     711
TRAIN_0002     223
TRAIN_0003    1213
TRAIN_0004    2040
              ... 
TRAIN_2473     877
TRAIN_2474    1759
TRAIN_2475    1555
TRAIN_2476     912
TRAIN_2477      80
Length: 2478, dtype: int64

In [185]:
train_facts_df = train_facts_df.set_index(train['ID'])

In [186]:
X = train_facts_df
X


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TRAIN_0000,-0.033830,0.011217,0.021617,-0.010866,0.032392,-0.011158,0.041054,0.025290,-0.003034,-0.026867,...,-0.041736,-0.006480,-0.024972,0.022913,0.003499,-0.013559,-0.056562,-0.020698,0.003666,-0.021622
TRAIN_0001,-0.038728,0.011993,0.011954,-0.009163,0.013441,-0.006253,0.049216,0.016051,0.003694,-0.029397,...,-0.021081,-0.004596,-0.020107,0.020765,-0.001840,-0.008348,-0.056933,-0.008860,0.011146,-0.012924
TRAIN_0002,-0.049457,0.009322,0.026977,-0.003048,0.018507,-0.012445,0.039873,0.004032,-0.004505,-0.018547,...,-0.024970,0.005365,-0.023475,0.023403,0.001966,-0.016449,-0.067600,-0.012318,0.021940,-0.015223
TRAIN_0003,-0.068268,0.014492,0.018610,0.007739,0.045458,-0.029632,0.019533,0.015546,-0.011195,-0.046313,...,-0.003665,-0.003942,-0.015112,0.014614,-0.015617,-0.004508,-0.037920,-0.019136,0.023042,-0.018693
TRAIN_0004,-0.039468,0.010202,0.022965,-0.003174,0.036305,-0.011503,0.041285,0.014516,0.000238,-0.039284,...,-0.014759,-0.013484,-0.016750,0.010149,-0.005003,-0.010588,-0.055886,-0.029736,0.021987,-0.014409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TRAIN_2473,-0.038044,0.016973,0.025856,0.017497,0.034284,-0.011928,0.013811,0.021045,-0.022041,-0.034271,...,-0.028971,-0.032330,-0.023707,0.034244,0.009421,-0.030279,-0.023804,-0.008327,0.003461,-0.017760
TRAIN_2474,-0.052684,0.014010,0.043104,0.012380,0.038227,-0.001981,0.007584,0.016783,0.012787,-0.033677,...,-0.001821,-0.013834,-0.000324,0.022426,0.014847,-0.018254,-0.063964,-0.009734,0.003993,0.003585
TRAIN_2475,-0.062706,-0.007085,0.018795,-0.002208,0.011292,-0.012623,0.041803,0.015978,-0.013886,-0.016605,...,-0.009397,0.004355,-0.002171,0.011380,-0.009125,-0.017224,-0.055894,-0.013227,0.014364,-0.011625
TRAIN_2476,-0.042257,0.023962,0.020516,-0.015240,0.027739,-0.015085,0.050874,-0.005060,0.009996,-0.040408,...,-0.012835,-0.001382,-0.018021,0.019966,0.004999,-0.027998,-0.057431,-0.015786,0.009186,-0.018314


In [175]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Columns: 768 entries, 0 to 767
dtypes: float64(768)
memory usage: 14.5 MB


In [187]:
y = y_train_first_party

In [188]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# to_be_X.info(), to_be_test_x.info()

X_train, X_val, y_train, y_val, test_X = dlc.test_val_separator(train_fp_df, train_fp_df, 0.3)


y_train = y_train['first_party'].astype(int)
y_val = y_val['first_party'].astype(int)

In [189]:
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(1734, 768) (744, 768) (1734,) (744,)


In [190]:
print(X_train, X_val)

                 0         1         2         3         4         5    \
ID                                                                       
TRAIN_1490 -0.048583  0.017428  0.041742 -0.004135  0.027642 -0.010154   
TRAIN_0552 -0.039892  0.016475  0.034347  0.014294  0.037033 -0.017109   
TRAIN_0087 -0.027427  0.031979  0.010358 -0.011227  0.027284 -0.006320   
TRAIN_0690 -0.030769  0.015130  0.028391  0.004179  0.020853  0.002137   
TRAIN_1929 -0.033910  0.023402  0.033990  0.003569  0.025818 -0.019636   
...              ...       ...       ...       ...       ...       ...   
TRAIN_1638 -0.021466  0.022234  0.039807  0.012977  0.034737 -0.019180   
TRAIN_1095 -0.032917  0.014242  0.021594 -0.005423  0.031188 -0.011909   
TRAIN_1130 -0.025655  0.000746  0.030883  0.009022  0.036475 -0.032818   
TRAIN_1294 -0.029932  0.021282  0.033949  0.004277  0.032167 -0.001339   
TRAIN_0860 -0.002219  0.007421  0.034606 -0.015948  0.037497  0.002776   

                 6         7         

In [153]:
y_train = y_train.rename('first_name')

In [154]:
y_val = y_val.rename('first_name')

In [191]:
y_train

ID
TRAIN_1490    1028
TRAIN_0552     635
TRAIN_0087     908
TRAIN_0690    1453
TRAIN_1929     925
              ... 
TRAIN_1638     251
TRAIN_1095    1777
TRAIN_1130     289
TRAIN_1294      79
TRAIN_0860     196
Length: 1734, dtype: int64

In [192]:
y_val.info()

<class 'pandas.core.series.Series'>
Index: 744 entries, TRAIN_1753 to TRAIN_1975
Series name: None
Non-Null Count  Dtype
--------------  -----
744 non-null    int64
dtypes: int64(1)
memory usage: 11.6+ KB


In [204]:
def xgb_objective(trial):
    xgb_params = {
        'objective': 'multi:softmax',
        'num_class' : 1500,
        'eval_metric': 'merror',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'nthread': trial.suggest_int('nthread', 1, 15),
        'n_estimators' : trial.suggest_int('n_estimators', 25, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        'subsample': trial.suggest_uniform('subsample', 0.1, 0.3),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.7),
        'lambda': trial.suggest_loguniform('lambda', 0.2, 200),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 3)
    }
    
    X_train_np = X_train.to_numpy()
    y_train_np = y_train.to_numpy()
    
    X_train_np, X_val_np, y_train_np, y_val_np = train_test_split(
    X_train_np, y_train_np, test_size=0.2, random_state=42)
    
    dtrain = xgb.DMatrix(X_train_np, label=y_train_np)
    dval = xgb.DMatrix(X_val_np, label = y_val_np)
    
    evallist = [(dval, 'eval')]
    
    
    xgb_model = xgb.train(
        xgb_params,
        dtrain,
        evals=evallist,
        verbose_eval=False,
        early_stopping_rounds=10,
    )
    
    xgb_preds = xgb_model.predict(dval)
    accuracy = 1.0 - accuracy_score(y_val_np, xgb_preds)
    
    return accuracy


def lgb_objective(trial):
    lgb_params = {
        'application': 'binary',
        'max_depth': -1,
        'metric': 'accuracy',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt',  'dart']),
        'num_leaves': trial.suggest_int('num_leaves', 10, 2000),
        'n_estimators' : trial.suggest_int('n_estimators', 1, 500),
        'num_iteration': 500,
        'n_jobs': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.1),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.7, 0.9),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 0.8),
        'bagging_freq' : trial.suggest_int('bagging_freq', 1, 500),
        'random_state': 42
                                           }
    
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    lgb_model.fit(X_train, y_train)
    lgb_preds = lgb_model.predict(X_val)
    
    return accuracy_score(y_val, lgb_preds)


def cat_objective(trial):
    params = {
            'loss_function': 'Logloss',
            'learning_rate': learning_rate,
            'depth': trial.suggest_int('depth', 3, 10),
            'random_state': 42
        }


    model = cat.CatBoostClassifier(**params)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    
    return accuracy_score(y_val, pred)

In [194]:
y_train, y_val

(ID
 TRAIN_1490    1028
 TRAIN_0552     635
 TRAIN_0087     908
 TRAIN_0690    1453
 TRAIN_1929     925
               ... 
 TRAIN_1638     251
 TRAIN_1095    1777
 TRAIN_1130     289
 TRAIN_1294      79
 TRAIN_0860     196
 Length: 1734, dtype: int64,
 ID
 TRAIN_1753     719
 TRAIN_0259     814
 TRAIN_2072     230
 TRAIN_1000    2085
 TRAIN_0056    1601
               ... 
 TRAIN_0591      73
 TRAIN_2415    1488
 TRAIN_1446     220
 TRAIN_1839     195
 TRAIN_1975    1940
 Length: 744, dtype: int64)

In [205]:
study = optuna.create_study(direction='minimize')
study.optimize(xgb_objective, n_trials=500)

[I 2023-06-12 23:57:55,034] A new study created in memory with name: no-name-e811a640-1ba0-4236-9aff-08364a04b518
[W 2023-06-12 23:57:55,094] Trial 0 failed with parameters: {'booster': 'dart', 'nthread': 5, 'n_estimators': 824, 'max_depth': 12, 'subsample': 0.21932483628574134, 'learning_rate': 0.031631224227365605, 'colsample_bytree': 0.5965356605554805, 'lambda': 2.886113872543456, 'min_child_weight': 2} because of the following error: XGBoostError('[23:57:55] /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/objective/multiclass_obj.cu:123: SoftmaxMultiClassObj: label must be in [0, num_class).\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x0000000162dddbc8 dmlc::LogMessageFatal::~LogMessageFatal() + 124\n  [bt] (1) 2   libxgboost.dylib                    0x0000000162f33f1c xgboost::obj::SoftmaxMultiClassObj::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDevic

Parameters: { "n_estimators" } are not used.



XGBoostError: [23:57:55] /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/objective/multiclass_obj.cu:123: SoftmaxMultiClassObj: label must be in [0, num_class).
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000162dddbc8 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x0000000162f33f1c xgboost::obj::SoftmaxMultiClassObj::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float>>*) + 844
  [bt] (2) 3   libxgboost.dylib                    0x0000000162ede6ac xgboost::LearnerImpl::UpdateOneIter(int, std::__1::shared_ptr<xgboost::DMatrix>) + 608
  [bt] (3) 4   libxgboost.dylib                    0x0000000162df7418 XGBoosterUpdateOneIter + 144
  [bt] (4) 5   libffi.8.dylib                      0x0000000104d2804c ffi_call_SYSV + 76
  [bt] (5) 6   libffi.8.dylib                      0x0000000104d257d4 ffi_call_int + 1336
  [bt] (6) 7   _ctypes.cpython-310-darwin.so       0x0000000104d0811c _ctypes_callproc + 944
  [bt] (7) 8   _ctypes.cpython-310-darwin.so       0x0000000104d023fc PyCFuncPtr_call + 228
  [bt] (8) 9   python3.10                          0x000000010455f350 _PyEval_EvalFrameDefault + 59104



print('Number of finished XGB trials: {}'.format(len(xgb_study.trials)))
print('XGB Best trial:')
xgb_trial = xgb_study.best_trial

print('  Value: {}'.format(xgb_trial.value))
print('  Params: ')

for key, value in xgb_trial.params.items():
    print('    {}: {}'.format(key, value))

In [None]:
print('Number of finished study trials: {}'.format(len(study.trials)))
print('study Best trial:')
study_trial = study.best_trial

print('  Value: {}'.format(study_trial.value))
print('  Params: ')

for key, value in study_trial.params.items():
    print('    {}: {}'.format(key, value))


In [None]:
study_best_params = study.best_params
study_best_params['random_state'] = 42
# if study_best_params['model_type'] == 'lgbm':
model = lgb.LGBMClassifier(**study_best_params)
# elif study_best_params['model_type'] == 'xgb':
#     model = xgb(**study_best_params)
model.fit(X_train, y_train)


In [None]:
preds = model.predict(X_val)
accuracy = accuracy_score(y_val, preds)

XGB_pred = XGB.predict(X_val)
accuracy = accuracy_score(y_val, XGB_pred)
print("\nAccuracy after tuning: %.2f%%" % (accuracy * 100.0))

In [None]:
print(f"-- Best_Model: {study_best_params} --")
print("Train ACC : %.3f" % accuracy_score(y_train, model.predict(X_train)))
print("Val ACC : %.3f" % accuracy_score(y_val, model.predict(X_val)))

In [None]:
print(classification_report(y_val, preds))


In [None]:
X_test = pd.get_dummies(data=test_X)
print(X_test)
preds = model.predict(X_test)

In [None]:
sample_submission = pd.read_csv('./sample_submission.csv')
final_pred = pd.DataFrame(preds.astype(int))
final_pred = final_pred.rename(columns={0:'first_party_winner'})

In [None]:
final_pred

In [None]:
sample_submission['first_party_winner'] = final_pred['first_party_winner']

In [None]:
sample_submission['first_party_winner'].value_counts()

In [None]:
sample_submission.to_csv("./results/{Model}_submission_{Train:.03f}_{Val:.03f}.csv".format(Model='LGBM', Train=accuracy_score(y_train, model.predict(X_train)), Val = accuracy_score(y_val, model.predict(X_val))), index=False)


In [None]:
# XGB_submission = pd.read_csv('./sample_submission.csv')
# XGB_pred = pd.DataFrame(XGB_pred.astype(int))
# XGB_submission['first_party_winner'] = XGB_pred
# XGB_submission.to_csv("./Bert_XGB_submission_{Train:.03f}_{Val:.03f}.csv".format(Train=accuracy_score(y_train, XGB.predict(X_train)), Val = accuracy_score(y_val, XGB.predict(X_val))), index=False)

In [None]:
XGB_submission

In [None]:
LGB_submission

In [None]:
param_xgb_gscv = {
    'max_depth' : [i for i in range(1,3)],
    'min_child_weight' : [i for i in range(1, 3)],
    'n_estimators' : [i for i in range(1, 3)]
}

In [None]:
param_lgb_gscv = {
    'max_depth' : -1,
    'learning_rate' : [i for i in np.arange(0, 1,0.0001)],
    'num_leaves' : [i for i in range(1, 2000)],
    'n_estimators' : [i for i in range(1, 3000)]
}

In [None]:
xgb_classifier = XGBClassifier()

In [None]:
gscv_xgb = GSCV(estimator = xgb_classifier, param_grid = param_xgb_gscv, scoring = 'accuracy', cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42), refit=True, n_jobs=2, return_train_score=True, verbose=10)


In [None]:
gscv_xgb.fit(X_train, y_train)

In [None]:
print("="*30)
print('XGB 파라미터: ', gscv_xgb.best_params_)
print('XGB 예측 정확도: {:.4f}'.format(gscv_xgb.best_score_))

In [None]:
preds = gscv_xgb.predict(X_val)
accuracy = accuracy_score(y_val, preds)

In [None]:
print("-- Best_Model --")
print("Train ACC : %.3f" % accuracy_score(y_train, gscv_xgb.predict(X_train)))
print("Val ACC : %.3f" % accuracy_score(y_val, gscv_xgb.predict(X_val)))

In [None]:
print(classification_report(y_val, preds))


In [None]:
X_test = pd.get_dummies(data=test_X)
print(X_test)
preds = gscv_xgb.predict(X_test)

In [None]:
sample_submission = pd.read_csv('./sample_submission.csv')
final_pred = pd.DataFrame(preds.astype(int))
final_pred = final_pred.rename(columns={0:'first_party_winner'})

In [None]:
final_pred

In [None]:
sample_submission['first_party_winner'] = final_pred['first_party_winner']

In [None]:
sample_submission['first_party_winner'].value_counts()

In [None]:
sample_submission.to_csv("./results/{Model}_submission_{Train:.03f}_{Val:.03f}.csv".format(Model='XGB', Train=accuracy_score(y_train, model.predict(X_train)), Val = accuracy_score(y_val, model.predict(X_val))), index=False)


In [None]:
# 3D to 2D

attention_mask_df = dlc.tensor_2_2d(train_bert_tokenized, 0)
input_ids_df = dlc.tensor_2_2d(train_bert_tokenized, 1)
token_type_ids_df = dlc.tensor_2_2d(train_bert_tokenized, 2)

attention_mask_df.info()
print('\n _______________________________ \n')
input_ids_df.info()
print('\n _______________________________ \n')
token_type_ids_df.info()


In [None]:
# attention_mask_df.info()
attention_mask_df
# input_ids_df.info()
# input_ids_df
# token_type_ids_df.info()
# token_type_ids_df


In [None]:
temp = pd.DataFrame()
temp = pd.concat([train_cleansed['ID'], attention_mask_df], axis=1)
temp = pd.concat([temp, input_ids_df], axis=1)
train_BertToken_df = pd.concat([temp, token_type_ids_df], axis=1)
train_BertToken_df


In [None]:
tBTdf = so.right_merger(train_cleansed, train_BertToken_df, 0)
tBTdf