In [14]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier as lgb
import re
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import dacon_law_class as dlc
from dacon_law_class import SimpleOps as so
from sklearn.model_selection import GridSearchCV as GSCV
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification, pipeline
from tqdm import tqdm
import xgboost.sklearn as xgb
import warnings
warnings.filterwarnings('ignore')
import spacy
spacy.prefer_gpu()

from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

In [2]:
# Load the training table, the test table and the sample submission from the working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')

In [3]:
# train.head()
# test
# sample_submission

## BERT

@article{turc2019,
  title={Well-Read Students Learn Better: On the Importance of Pre-training Compact Models},
  author={Turc, Iulia and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  journal={arXiv preprint arXiv:1908.08962v2},
  year={2019}
}


In [22]:
# Pass the 'first_party', 'second_party' and 'facts' columns of each frame to the project helper.
# Presumably it reduces the three text columns to alphanumeric text — confirm in dacon_law_class.
# NOTE(review): `train_facts` is reassigned in the following cell (In [5]), so when the notebook
# runs top to bottom this result is overwritten.
train_facts = dlc.alpha_numeric_3_cols(train, 'first_party', 'second_party', 'facts')
test_facts = dlc.alpha_numeric_3_cols(test, 'first_party', 'second_party', 'facts')

In [5]:
# Keep only the 'facts' column of each frame, wrapped as a one-column DataFrame.
# NOTE(review): this overwrites `train_facts` from the previous cell, and the test-side name here
# is `test_fact` (singular) while the previous cell assigned `test_facts` — confirm which is intended.
train_facts = pd.DataFrame(train['facts'])
test_fact = pd.DataFrame(test['facts'])

# Call the project-module version of rename_tokenized on the three text columns of both frames.
# NOTE(review): `train_all_done` / `test_all_done` are not referenced again in the visible cells,
# and a function with the same name is defined further down in this notebook — verify which one
# is meant to be used.
train_all_done, test_all_done = dlc.rename_tokenized(train, test, 'first_party', 'second_party', 'facts')


# Here (resume from this point)

In [6]:
# Reload the per-column vectors cached as CSV. The last cell of this notebook writes these two
# files, so they must already exist from an earlier run before this cell can execute.
train_tokenized = pd.read_csv('./train_correlations.csv')
test_tokenized = pd.read_csv('./test_correlations.csv')


# Inspect the reloaded frames (a bare name is rendered only when it is the last expression of its cell).
train_tokenized

test_tokenized

In [7]:
# Convert the reloaded CSV frames with the project helper. Presumably it parses the stored
# vectors back into numeric columns — confirm in dacon_law_class. The progress output recorded
# below shows three passes of 2478 items and three passes of 1240 items.
train_token = dlc.token_to_df(train_tokenized)
test_token = dlc.token_to_df(test_tokenized)



100%|███████████████████████████████████| 2478/2478 [00:00<00:00, 972171.48it/s]
100%|██████████████████████████████████| 2478/2478 [00:00<00:00, 2389856.36it/s]
100%|██████████████████████████████████| 2478/2478 [00:00<00:00, 2384920.91it/s]
100%|██████████████████████████████████| 1240/1240 [00:00<00:00, 1678262.98it/s]
100%|██████████████████████████████████| 1240/1240 [00:00<00:00, 1331866.06it/s]
100%|██████████████████████████████████| 1240/1240 [00:00<00:00, 2146486.57it/s]


In [8]:
# Inspect the converted training features; the recorded output shows two numeric columns per text column.
train_token

Unnamed: 0,first_party_1,first_party_2,second_party_1,second_party_2,facts_1,facts_2
0,-0.4463822543621063,-0.8948424458503723,-0.9863601922988892,0.16460131108760834,0.21225066483020782,0.9772152304649353
1,0.9353076219558716,-0.35383573174476624,-0.9495260119438171,0.31368815898895264,0.2700807750225067,0.9628376364707947
2,-0.017951782792806625,-0.9998388290405273,-0.5238009095191956,-0.8518407344818115,0.3102778196334839,0.950645923614502
3,-0.19457940757274628,-0.9808868169784546,-0.9964062571525574,0.08470305055379868,0.17634129524230957,0.984329104423523
4,-0.012838165275752544,-0.9999175667762756,-0.4199574291706085,0.9075437784194946,0.13304223120212555,0.9911103844642639
...,...,...,...,...,...,...
2473,0.7062230110168457,-0.7079894542694092,0.3479677438735962,-0.9375064969062805,0.4467298090457916,0.8946689367294312
2474,0.7081589102745056,-0.7060530781745911,0.9421269297599792,0.3352562487125397,0.2615622580051422,0.965186595916748
2475,0.8284182548522949,-0.5601099133491516,-0.4199574291706085,0.9075437784194946,0.2331738919019699,0.9724350571632385
2476,0.36393409967422485,-0.9314247369766235,-0.7260524034500122,-0.6876394152641296,0.4886071979999542,0.8725038766860962


In [9]:
# Inspect the converted test features; the recorded output has the same six feature columns as the training frame.
test_token

Unnamed: 0,first_party_1,first_party_2,second_party_1,second_party_2,facts_1,facts_2
0,-0.9378802180290222,0.34695932269096375,0.9965190887451172,0.0833643451333046,-0.15676727890968323,0.9876355528831482
1,0.044631924480199814,0.999003529548645,0.9047352075576782,0.4259744882583618,-0.5222179293632507,0.8528120517730713
2,0.6751019358634949,0.7377244234085083,0.9083398580551147,-0.41823291778564453,-0.45306575298309326,0.891477108001709
3,-0.6089559197425842,0.793204128742218,0.9965190887451172,0.0833643451333046,0.07415395230054855,0.9972467422485352
4,-0.6693098545074463,0.7429834604263306,0.999426543712616,-0.033860500901937485,-0.23630794882774353,0.9716782569885254
...,...,...,...,...,...,...
1235,0.48429757356643677,0.8749033212661743,0.661633312702179,0.74982750415802,0.48287805914878845,0.8756875991821289
1236,-0.04792654141783714,0.9988508224487305,0.8590741157531738,0.5118512511253357,-0.6453915238380432,0.7638519406318665
1237,0.11585450917482376,0.993266224861145,0.9999958276748657,-0.002867066999897361,-0.2503175735473633,0.9681637287139893
1238,-0.7357914447784424,0.6772081851959229,0.9894566535949707,-0.14482951164245605,0.003927184734493494,0.999992311000824


In [62]:
def mean_pooling(model_output, attention_mask):
    """Average the per-token vectors of a model output, ignoring masked positions.

    Helper for the tokenizer/embedding function defined below.

    Args:
        model_output: Indexable model output; element 0 is used as the
            per-token tensor.
        attention_mask: Mask tensor; it gets a trailing axis and is expanded
            to the size of the per-token tensor, so positions with value 0
            contribute nothing to the average.

    Returns:
        The mask-weighted sum over axis 1 divided by the mask total over
        axis 1.
    """
    per_token = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(per_token.size()).float()
    weighted_total = torch.sum(per_token * weights, 1)
    # Clamp the denominator so a fully masked row divides by 1e-9 instead of 0.
    weight_count = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_total / weight_count

# Lower-cased U.S. state names that are blanked out of every text before embedding.
_STATE_NAMES = (
    'wyoming', 'wisconsin', 'west virginia', 'washington', 'virginia',
    'vermont', 'utah', 'texas', 'tennessee', 'south dakota',
    'south carolina', 'rhode island', 'pennsylvania', 'oregon', 'oklahoma',
    'ohio', 'north dakota', 'north carolina', 'new york', 'new mexico',
    'new jersey', 'new hampshire', 'nevada', 'nebraska', 'montana',
    'missouri', 'mississippi', 'minnesota', 'michigan', 'massachusetts',
    'maryland', 'maine', 'louisiana', 'kentucky', 'kansas',
    'iowa', 'indiana', 'illinois', 'idaho', 'hawaii',
    'georgia', 'florida', 'delaware', 'connecticut', 'colorado',
    'california', 'arkansas', 'arizona', 'alaska', 'alabama',
)

# Lower-cased ways of referring to the United States, blanked out the same way.
_USA_ALIASES = (
    'usa', 'america', 'u.s.', 'united states', 'the states', 'the us', 'the united states',
    'the united states of america', 'the u.s.', 'the usa',
)


def _collect_party_names(entities):
    """Map each party found in the NER output to its first/family name parts."""
    party_names = {}
    for entity in entities:
        if entity.get('entity_group') == 'LABEL_1':
            if 'word' in entity:
                party = entity['word']
                if party not in party_names:
                    party_names[party] = {'first_name': '', 'family_name': ''}
                    names = re.findall(r'\b\w+\b', party)
                    if len(names) == 2:
                        party_names[party]['first_name'] = names[0]
                        party_names[party]['family_name'] = names[1]
                    elif len(names) == 1:
                        party_names[party]['first_name'] = names[0]
        elif 'party' in entity:
            party = entity['party']
            entry = party_names.setdefault(party, {'first_name': '', 'family_name': ''})
            # The original indexed party_names['party'] (a string literal) here,
            # which raised KeyError; the entry of the current party is the target.
            if 'first_name' in entity:
                entry['first_name'] = entity['first_name']
            if 'family_name' in entity:
                entry['family_name'] = entity['family_name']
    return party_names


def _mask_terms(text, terms):
    """Replace every whole-word occurrence of any term in `text` with '[MASK]'."""
    # Empty terms are dropped: str.replace('', '[MASK]') would insert a mask
    # between every character. Longest-first ordering lets 'west virginia'
    # win over 'virginia' when both could match at the same position.
    ordered = sorted(dict.fromkeys(term for term in terms if term), key=len, reverse=True)
    if not ordered:
        return text
    # Look-arounds instead of \b so terms ending in '.' (e.g. 'u.s.') still
    # match, while 'kansas' inside 'arkansas' or 'usa' inside 'thousand' do not.
    pattern = re.compile(r'(?<!\w)(?:' + '|'.join(re.escape(term) for term in ordered) + r')(?!\w)')
    return pattern.sub('[MASK]', text)


def auto_tokenizer(df, column_name, bert_model='nlpaueb/bert-base-uncased-contracts', max_length=512):
    """Mask names and places in a text column and embed each text with a BERT model.

    For every value in ``df[column_name]`` the text is lower-cased, party names
    reported by the NER pipeline plus U.S. state names and aliases of "United
    States" are replaced by ``[MASK]``, and the masked text is run through the
    model. The model output is mean-pooled over tokens and L2-normalised.

    Args:
        df: Mapping/DataFrame whose ``column_name`` entry iterates over strings.
        column_name: Key of the text column to embed.
        bert_model: Checkpoint name passed to ``from_pretrained``.
        max_length: Padding/truncation length passed to the tokenizer.

    Returns:
        ``np.ndarray`` with exactly one row per input text, in input order.

    NOTE(review): the model is loaded with ``AutoModelForTokenClassification``,
    so ``model_output[0]`` is the classification head's per-token output rather
    than encoder hidden states, and the notebook log reports head weights as not
    loaded from the checkpoint. The pooled vectors therefore come from an
    untrained head — confirm this is intended before relying on them. Likewise,
    the ``'entity_group'`` key presumably only appears when the pipeline is
    created with an aggregation strategy — verify that party names are found.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(bert_model)
    model = AutoModelForTokenClassification.from_pretrained(bert_model)
    model = model.to(device)
    # Run the NER pipeline on the same device the model was moved to; pinning it
    # to CPU (-1) while the model sits on CUDA mixes devices.
    pipeline_device = 0 if device.type == 'cuda' else -1
    nlp = pipeline('ner', model=model, tokenizer=tokenizer, device=pipeline_device)

    place_terms = list(_STATE_NAMES + _USA_ALIASES)
    ei_total_list = []
    for text in tqdm(df[column_name]):
        text = text.lower()

        party_names = _collect_party_names(nlp(text))
        name_terms = [
            part
            for names in party_names.values()
            for part in (names['first_name'], names['family_name'])
        ]
        masked_text = _mask_terms(text, name_terms + place_terms)

        encoded_input = tokenizer(masked_text, padding='max_length', max_length=max_length,
                                  truncation=True, return_tensors='pt')
        encoded_input = {key: value.to(device) for key, value in encoded_input.items()}

        # Embed each text exactly once. The original re-embedded every earlier
        # input on each iteration, producing n(n+1)/2 rows and quadratic work.
        with torch.no_grad():
            model_output = model(**encoded_input)
            sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        # squeeze(0) drops only the batch axis, so a width-1 output stays 1-D.
        ei_total_list.append(sentence_embeddings.squeeze(0).cpu().numpy())

    df_berted = np.array(ei_total_list)

    return df_berted

def rename_tokenized(df_1, df_2, column_1, column_2, column_3):
    """Embed three text columns of two frames and return one frame per input.

    ``auto_tokenizer`` is called once per (frame, column) pair — all columns of
    ``df_1`` first, then all columns of ``df_2``.

    Args:
        df_1: First frame (e.g. the training data).
        df_2: Second frame (e.g. the test data). May be the same object as
            ``df_1``; each argument is processed independently.
        column_1, column_2, column_3: Names of the text columns to embed.

    Returns:
        Tuple ``(df_1_df, df_2_df)``. Each result has one row per input row and
        one column per requested column name; every cell holds that row's
        embedding as a plain Python list of floats.
    """
    column_list = [column_1, column_2, column_3]

    def _embed_columns(df):
        """Embed every requested column of `df` into a rows-by-columns frame of lists."""
        per_column = []
        for col in column_list:
            matrix = np.asarray(auto_tokenizer(df, col))
            if matrix.ndim < 2:
                # An empty input yields a 0-length 1-D array: treat it as "no rows"
                # instead of failing on shape[1]; a lone vector becomes one row.
                matrix = matrix.reshape(0, 0) if matrix.size == 0 else matrix.reshape(1, -1)
            # ndarray.tolist() gives the same nested Python floats as the former
            # per-row round trip through a device tensor, without the transfers.
            per_column.append(matrix.tolist())
        # One row per column name, then transpose so columns are the text columns.
        return pd.DataFrame(per_column, index=column_list).T

    # Each argument gets its own result, so passing the same frame twice no
    # longer funnels all six embeddings into the first list.
    df_1_df = _embed_columns(df_1)
    df_2_df = _embed_columns(df_2)

    return df_1_df, df_2_df


In [63]:
# Embed the three text columns of both frames with the rename_tokenized defined in this notebook.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so in that run neither
# `train_to_ml` nor `test_ready_to_ml` was assigned and the export cell below could not have run.
train_to_ml, test_ready_to_ml = rename_tokenized(train, test, 'first_party', 'second_party', 'facts')


Some weights of the model checkpoint at nlpaueb/bert-base-uncased-contracts were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not in

KeyboardInterrupt: 

In [None]:
# Cache the embedded frames as CSV without the index; these are the two files that the earlier
# cell (In [6]) reads back with pd.read_csv.
# NOTE(review): rename_tokenized returns frames whose cells are Python lists, so each cell is
# presumably written as its string form — confirm the reader parses it back.
pd.DataFrame(train_to_ml).to_csv("./train_correlations.csv", index=False)
pd.DataFrame(test_ready_to_ml).to_csv('./test_correlations.csv', index=False)