In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier as lgb
import re
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from sklearn.model_selection import GridSearchCV as GSCV
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import xgboost.sklearn as xgb
import warnings
warnings.filterwarnings('ignore')
import spacy
spacy.prefer_gpu()

from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

import dacon_law_class as dlc
from dacon_law_class import SimpleOps as so


In [None]:
# Read the three CSV files the notebook works with, in the same order as before.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './sample_submission.csv')
)

In [None]:
# Show the first rows of `train`; the commented names are alternative frames to inspect.
train.head()
# test
# sample_submission

## BERT

@article{turc2019,
  title={Well-Read Students Learn Better: On the Importance of Pre-training Compact Models},
  author={Turc, Iulia and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  journal={arXiv preprint arXiv:1908.08962v2},
  year={2019}
}


In [None]:
# Run the project helper over the party and facts columns of each frame.
# NOTE(review): what `alpha_numeric_3_cols` returns, and whether it mutates the
# frame it is given, is not visible here — confirm in `dacon_law_class`.
train_facts = dlc.alpha_numeric_3_cols(train, 'first_party', 'second_party', 'facts')
test_facts = dlc.alpha_numeric_3_cols(test, 'first_party', 'second_party', 'facts')

In [None]:
# Keep only the `facts` column of each frame as a one-column DataFrame.
# The test frame was previously bound to `test_fact` (a typo), which left
# `test_facts` pointing at a different object than its `train_facts` counterpart.
train_facts = pd.DataFrame(train['facts'])
test_facts = pd.DataFrame(test['facts'])

In [None]:
# Build the model-ready train/test frames with the project helper.
# NOTE(review): later cells read `first_party_berted`, `facts_berted` and
# `first_party_winner` from `train_to_ml`; presumably this call creates them — confirm.
train_to_ml, test_ready_to_ml = dlc.rename_tokenized(train, test, 'first_party', 'second_party', 'facts', 'first_party_winner')


In [None]:
# Cache both frames as CSV (without the index) so later sessions can start from
# the reload cell instead of recomputing them.
# NOTE(review): `to_csv` does not create `./embeddings`; the directory must already exist.
train_to_ml.to_csv('./embeddings/1_train_ready_to_ml.csv', index=False)
test_ready_to_ml.to_csv('./embeddings/2_test_ready_to_ml.csv', index=False)

# Start here: reload the cached train/test frames from `./embeddings`

In [None]:
# Reload the cached train/test frames written to ./embeddings.
train_to_ml, test_ready_to_ml = (
    pd.read_csv(cached_csv)
    for cached_csv in (
        './embeddings/1_train_ready_to_ml.csv',
        './embeddings/2_test_ready_to_ml.csv',
    )
)


In [None]:
# `X_temp` is an empty frame that the next cell concatenates onto.
X_temp = pd.DataFrame()
# Drop the two target columns so only feature columns remain.
temp_train = train_to_ml.drop(columns=['first_party_berted', 'first_party_winner'])
temp_train.head()

In [None]:
# Expand the `facts_berted` column with the project helper and place the result
# next to the (empty) `X_temp` frame, then cast every column to float64.
# NOTE(review): the output format of `tensor_separator` is not visible here — confirm.
facts_only_tensor = pd.concat([X_temp, dlc.tensor_separator(temp_train, 'facts_berted')], axis=1)
facts_only_tensor = facts_only_tensor.astype("float64")
facts_only_tensor

In [None]:
# Two candidate target columns taken from the training frame.
first_y = train_to_ml['first_party_berted']
second_y = train_to_ml['first_party_winner']

In [None]:
# `X` was never assigned in this notebook; the feature matrix built earlier is
# `facts_only_tensor`, so bind it explicitly before splitting.
X = facts_only_tensor

# Hold out 30% of the rows, once per target. Both calls share the same features
# and random_state.
X_train, X_val, y_train, y_val = train_test_split(
    X, first_y, test_size=0.3, random_state=42
)
X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
    X, second_y, test_size=0.3, random_state=42
)

In [None]:
# Encode the target as integer class ids. The encoder is fitted on the training
# labels only and then applied to the validation labels. Re-fitting on `y_val`
# (as before) can give the same class a different id whenever the two splits do
# not contain exactly the same set of labels, which silently corrupts scoring.
# `transform` raises ValueError if `y_val` contains a label unseen in training.
xgb_encoder = LabelEncoder()
y_train = xgb_encoder.fit_transform(y_train)
y_val = xgb_encoder.transform(y_val)

In [None]:
def xgb_first(trial):
    """Build the first XGBoost parameter set for one Optuna trial.

    Only ``booster`` is sampled from ``trial``; every other entry is a constant.

    Args:
        trial: an Optuna trial (any object exposing ``suggest_categorical``).

    Returns:
        dict: the parameter dictionary. Previously the dictionary was built and
        then discarded, so the function returned ``None``.
    """
    # NOTE(review): 'reg:linear' is the legacy name of XGBoost's squared-error
    # regression objective and is paired here with the classification metric
    # 'error' — confirm which objective is intended.
    params_1 = {
        'objective': 'reg:linear',
        'eval_metric': 'error',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'max_depth': 15,
        'subsample': 0.2,
        'verbose': False,
    }
    return params_1
    
    
def xgb_second(trial):
    """Build the second XGBoost parameter set for one Optuna trial.

    Samples the same search space as ``xgb_objective`` and returns it.

    Args:
        trial: an Optuna trial (any object exposing ``suggest_int`` and
            ``suggest_float``).

    Returns:
        dict: the sampled parameter dictionary. Previously the dictionary was
        built and then discarded, so the function returned ``None``.
    """
    # `suggest_float` (with log=True where needed) replaces the deprecated
    # `suggest_uniform` / `suggest_loguniform`; the parameter names and ranges
    # are unchanged.
    params_2 = {
        'objective': 'binary:logistic',
        'eval_metric': 'error',
        'booster': 'gbtree',
        'nthread': trial.suggest_int('nthread', 1, 15),
        'n_estimators': trial.suggest_int('n_estimators', 25, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        'subsample': trial.suggest_float('subsample', 0.1, 0.3),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.7),
        'lambda': trial.suggest_float('lambda', 0.2, 200, log=True),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 3),
        # NOTE(review): `warm_start` does not look like an XGBoost parameter — confirm.
        'warm_start': True,
    }
    return params_2

In [None]:
# This cell repeats the label-encoding cell further up.
# The encoder is fitted on the training labels only and then applied to the
# validation labels; re-fitting on `y_val` can map the same class to a different
# id when the two splits do not contain the same set of labels.
xgb_encoder = LabelEncoder()
y_train = xgb_encoder.fit_transform(y_train)
y_val = xgb_encoder.transform(y_val)

In [None]:
def xgb_objective(trial):
    """Optuna objective: fit an XGBoost classifier and score it on the hold-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        float: accuracy on ``(X_val, y_val)``. Higher is better, so a study
        driving this objective should use ``direction='maximize'``.
    """
    # `suggest_float` (with log=True where needed) replaces the deprecated
    # `suggest_uniform` / `suggest_loguniform`; names and ranges are unchanged,
    # so `study.best_params` keeps the same keys.
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'error',
        'booster': 'gbtree',
        'nthread': trial.suggest_int('nthread', 1, 15),
        'n_estimators': trial.suggest_int('n_estimators', 25, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        'subsample': trial.suggest_float('subsample', 0.1, 0.3),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.7),
        'lambda': trial.suggest_float('lambda', 0.2, 200, log=True),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 3),
        # NOTE(review): `warm_start` does not look like an XGBoost parameter — confirm.
        'warm_start': True,
    }

    xgb_model = xgb.XGBClassifier(**xgb_params)
    xgb_model.fit(X_train, y_train)
    xgb_preds = xgb_model.predict(X_val)

    return accuracy_score(y_val, xgb_preds)


def lgb_objective(trial):
    """Optuna objective: fit a LightGBM classifier and score it on the hold-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        float: accuracy on ``(X_val, y_val)``. Higher is better, so a study
        driving this objective should use ``direction='maximize'``.
    """
    lgb_params = {
        'application': 'binary',
        'max_depth': -1,
        # NOTE(review): confirm 'accuracy' is a metric name LightGBM recognises.
        'metric': 'accuracy',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'num_leaves': trial.suggest_int('num_leaves', 10, 2000),
        # The fixed `'num_iteration': 500` entry was removed: it is an alias of
        # `n_estimators`, so passing both made the tuned value ambiguous.
        'n_estimators': trial.suggest_int('n_estimators', 1, 500),
        'n_jobs': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 0.9),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 0.8),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 500),
        'random_state': 42,
    }

    # `lgb` is the LGBMClassifier class itself (imported as
    # `from lightgbm import LGBMClassifier as lgb`), so it is called directly;
    # `lgb.LGBMClassifier` raised AttributeError.
    lgb_model = lgb(**lgb_params)
    lgb_model.fit(X_train, y_train)
    lgb_preds = lgb_model.predict(X_val)

    return accuracy_score(y_val, lgb_preds)


def cat_objective(trial):
    """Optuna objective: fit a CatBoost classifier and score it on the hold-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        float: accuracy on ``(X_val, y_val)``.
    """
    params = {
        'loss_function': 'Logloss',
        # `learning_rate` used to be a bare name that no cell defines (NameError);
        # it is now sampled, using the same range as `xgb_objective`.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'random_state': 42,
    }

    # NOTE(review): `cat` is not imported anywhere in this notebook; an
    # `import catboost as cat` is needed in the import cell before this can run.
    model = cat.CatBoostClassifier(**params)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)

    return accuracy_score(y_val, pred)

In [None]:
# `xgb_objective` returns validation accuracy, so the study has to maximise it;
# with direction='minimize' Optuna would report the worst trial as the best.
# The study is also bound to `xgb_study`, the name the reporting code below
# reads (it was created as `name_fact_study` only, which raised NameError).
xgb_study = optuna.create_study(direction='maximize')
xgb_study.optimize(xgb_objective, n_trials=2)
name_fact_study = xgb_study  # original variable name, kept as an alias

print('Number of finished XGB trials: {}'.format(len(xgb_study.trials)))
print('XGB Best trial:')
xgb_trial = xgb_study.best_trial

print('  Value: {}'.format(xgb_trial.value))
print('  Params: ')

for key, value in xgb_trial.params.items():
    print('    {}: {}'.format(key, value))

In [None]:
# Report the trial count, best value and best parameters of `study`.
# NOTE(review): no visible cell assigns `study`; given that a later cell feeds
# its best params to an LGBM classifier, it is presumably a study over
# `lgb_objective` — confirm and create it explicitly.
print('Number of finished study trials: {}'.format(len(study.trials)))
print('study Best trial:')
study_trial = study.best_trial

print('  Value: {}'.format(study_trial.value))
print('  Params: ')

for key, value in study_trial.params.items():
    print('    {}: {}'.format(key, value))


In [None]:
# Refit a LightGBM classifier on the training split with the best parameters
# found by `study`, pinning the seed for reproducibility.
study_best_params = study.best_params
study_best_params['random_state'] = 42
# if study_best_params['model_type'] == 'lgbm':
# `lgb` is the LGBMClassifier class itself (see the import cell), so it is
# called directly; `lgb.LGBMClassifier` raised AttributeError.
model = lgb(**study_best_params)
# elif study_best_params['model_type'] == 'xgb':
#     model = xgb(**study_best_params)
model.fit(X_train, y_train)


In [None]:
# Score the tuned model on the validation split.
# The previous version overwrote `accuracy` with the score of an `XGB` model
# that no cell in this notebook creates, which raised NameError on a fresh
# kernel; the reported figure now belongs to `model`.
preds = model.predict(X_val)
accuracy = accuracy_score(y_val, preds)
print("\nAccuracy after tuning: %.2f%%" % (accuracy * 100.0))

In [None]:
# Print the chosen parameters and the accuracy of `model` on both splits.
print(f"-- Best_Model: {study_best_params} --")
print("Train ACC : %.3f" % accuracy_score(y_train, model.predict(X_train)))
print("Val ACC : %.3f" % accuracy_score(y_val, model.predict(X_val)))

In [None]:
# Classification report for the validation predictions held in `preds`.
print(classification_report(y_val, preds))


In [None]:
# One-hot encode the test features and predict with `model`.
# NOTE(review): no visible cell assigns `test_X`, and the training features were
# not built with `get_dummies` — confirm the test matrix matches the columns of
# `X_train`.
X_test = pd.get_dummies(data=test_X)
print(X_test)
preds = model.predict(X_test)

In [None]:
# Reload the submission template and wrap the integer predictions in a
# one-column frame named after the target.
sample_submission = pd.read_csv('./sample_submission.csv')
final_pred = pd.DataFrame(preds.astype(int)).rename(
    columns={0: 'first_party_winner'}
)

In [None]:
# Display the prediction frame.
final_pred

In [None]:
# Copy the predictions into the submission template (aligned on the index).
sample_submission['first_party_winner'] = final_pred['first_party_winner']

In [None]:
# Count how many rows were predicted for each class.
sample_submission['first_party_winner'].value_counts()

In [None]:
# Write the submission; the file name embeds the train and validation accuracy
# of `model`.
sample_submission.to_csv(
    "./results/{Model}_submission_{Train:.03f}_{Val:.03f}.csv".format(
        Model='LGBM',
        Train=accuracy_score(y_train, model.predict(X_train)),
        Val=accuracy_score(y_val, model.predict(X_val)),
    ),
    index=False,
)


In [None]:
# XGB_submission = pd.read_csv('./sample_submission.csv')
# XGB_pred = pd.DataFrame(XGB_pred.astype(int))
# XGB_submission['first_party_winner'] = XGB_pred
# XGB_submission.to_csv("./Bert_XGB_submission_{Train:.03f}_{Val:.03f}.csv".format(Train=accuracy_score(y_train, XGB.predict(X_train)), Val = accuracy_score(y_val, XGB.predict(X_val))), index=False)

In [None]:
# NOTE(review): `XGB_submission` is only assigned in the commented-out cell
# above, so this raises NameError on a fresh kernel — confirm and remove.
XGB_submission

In [None]:
# NOTE(review): no visible cell assigns `LGB_submission`, so this raises
# NameError on a fresh kernel — confirm and remove.
LGB_submission

In [None]:
# Grid for the XGBoost search: each listed parameter is tried at 1 and 2.
param_xgb_gscv = {
    param_name: list(range(1, 3))
    for param_name in ('max_depth', 'min_child_weight', 'n_estimators')
}

In [None]:
# Grid for a LightGBM search.
# - Every value must be a sequence: a bare `-1` for `max_depth` is rejected by
#   scikit-learn's parameter-grid validation, so it is wrapped in a list.
# - `learning_rate` no longer includes 0.0 and `num_leaves` no longer includes 1;
#   LightGBM requires learning_rate > 0 and num_leaves > 1.
# NOTE(review): the full Cartesian product of these lists is far too large for
# an exhaustive GridSearchCV; narrow it or use a randomised search before use.
param_lgb_gscv = {
    'max_depth': [-1],
    'learning_rate': np.arange(0, 1, 0.0001)[1:].tolist(),
    'num_leaves': list(range(2, 2000)),
    'n_estimators': list(range(1, 3000)),
}

In [None]:
# Base estimator for the grid search. The bare name `XGBClassifier` is never
# imported; the class is reached through the `xgb` alias
# (`import xgboost.sklearn as xgb`), as `xgb_objective` already does.
xgb_classifier = xgb.XGBClassifier()

In [None]:
# Exhaustive search over `param_xgb_gscv`, scored by accuracy on a seeded,
# shuffled 2-fold stratified split; the best configuration is refitted.
gscv_xgb = GSCV(
    estimator=xgb_classifier,
    param_grid=param_xgb_gscv,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=2, shuffle=True, random_state=42),
    refit=True,
    n_jobs=2,
    return_train_score=True,
    verbose=10,
)


In [None]:
# Run the grid search on the training split.
gscv_xgb.fit(X_train, y_train)

In [None]:
# Print the best parameter combination and its mean cross-validated accuracy.
print("="*30)
print('XGB 파라미터: ', gscv_xgb.best_params_)
print('XGB 예측 정확도: {:.4f}'.format(gscv_xgb.best_score_))

In [None]:
# Validation predictions and accuracy of the refitted best estimator.
preds = gscv_xgb.predict(X_val)
accuracy = accuracy_score(y_val, preds)

In [None]:
# Accuracy of the grid-searched XGBoost model on the train and validation splits.
print("-- Best_Model --")
print("Train ACC : %.3f" % accuracy_score(y_train, gscv_xgb.predict(X_train)))
print("Val ACC : %.3f" % accuracy_score(y_val, gscv_xgb.predict(X_val)))

In [None]:
# Classification report for the validation predictions held in `preds`.
print(classification_report(y_val, preds))


In [None]:
# One-hot encode the test features and predict with the grid-searched model.
# NOTE(review): no visible cell assigns `test_X`, and the training features were
# not built with `get_dummies` — confirm the test matrix matches the columns of
# `X_train`.
X_test = pd.get_dummies(data=test_X)
print(X_test)
preds = gscv_xgb.predict(X_test)

In [None]:
# Reload the submission template and wrap the integer predictions in a
# one-column frame named after the target.
sample_submission = pd.read_csv('./sample_submission.csv')
final_pred = pd.DataFrame(preds.astype(int)).rename(
    columns={0: 'first_party_winner'}
)

In [None]:
# Display the prediction frame.
final_pred

In [None]:
# Copy the predictions into the submission template (aligned on the index).
sample_submission['first_party_winner'] = final_pred['first_party_winner']

In [None]:
# Count how many rows were predicted for each class.
sample_submission['first_party_winner'].value_counts()

In [None]:
# Write the XGB submission. The accuracies embedded in the file name now come
# from `gscv_xgb`, the model that produced these predictions; they were
# previously computed with `model` (the LGBM classifier), so the file name
# advertised another model's scores.
sample_submission.to_csv(
    "./results/{Model}_submission_{Train:.03f}_{Val:.03f}.csv".format(
        Model='XGB',
        Train=accuracy_score(y_train, gscv_xgb.predict(X_train)),
        Val=accuracy_score(y_val, gscv_xgb.predict(X_val)),
    ),
    index=False,
)


In [None]:
# 3D to 2D
# Convert three slices (selected by 0, 1 and 2) of the tokenizer output into
# frames with the project helper, then print a summary of each.
# NOTE(review): no visible cell assigns `train_bert_tokenized`, and the meaning
# of the second argument of `tensor_2_2d` is not visible here — confirm that the
# 0/1/2 order really matches the attention_mask/input_ids/token_type_ids names.

attention_mask_df = dlc.tensor_2_2d(train_bert_tokenized, 0)
input_ids_df = dlc.tensor_2_2d(train_bert_tokenized, 1)
token_type_ids_df = dlc.tensor_2_2d(train_bert_tokenized, 2)

attention_mask_df.info()
print('\n _______________________________ \n')
input_ids_df.info()
print('\n _______________________________ \n')
token_type_ids_df.info()


In [None]:
# Display the attention-mask frame; the commented lines are alternative views
# of the other two frames.
# attention_mask_df.info()
attention_mask_df
# input_ids_df.info()
# input_ids_df
# token_type_ids_df.info()
# token_type_ids_df


In [None]:
# Put the ID column and the three token frames side by side, column-wise.
# The initial `temp = pd.DataFrame()` was a dead store (overwritten on the next
# line) and has been removed.
# NOTE(review): no visible cell assigns `train_cleansed` — confirm its origin.
temp = pd.concat([train_cleansed['ID'], attention_mask_df], axis=1)
temp = pd.concat([temp, input_ids_df], axis=1)
train_BertToken_df = pd.concat([temp, token_type_ids_df], axis=1)
train_BertToken_df


In [None]:
# Merge the cleansed training frame with the token frame via the project helper.
# NOTE(review): the meaning of the third argument (0) of `right_merger` is not
# visible here — confirm in `dacon_law_class.SimpleOps`.
tBTdf = so.right_merger(train_cleansed, train_BertToken_df, 0)
tBTdf