In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')

# FEATRUE ENGINEERING

In [2]:
data['first_party'].value_counts()

United States       154
Illinois              9
Maryland              8
Florida               8
New York              7
                   ... 
David Carpenter       1
Larry Gene Heath      1
PGA TOUR, Inc.        1
PPL Montana, LLC      1
Markman               1
Name: first_party, Length: 2110, dtype: int64

In [3]:
data['second_party'].value_counts()

United States                        240
California                            19
United States of America              15
Illinois                              13
Federal Communications Commission     10
                                    ... 
David Boren, Governor of Oklahoma      1
Federal Bureau of Prisons et al.       1
Town of Harrison                       1
Charles Burr et al.                    1
Westview Instruments, Inc.             1
Name: second_party, Length: 1974, dtype: int64

In [4]:
# 출력 옵션 설정
# pd.set_option("display.max_rows", None)  # 모든 행 표시
pd.set_option("display.max_columns", None)  # 모든 열 표시
pd.set_option("display.width", None)  # 줄 바꿈 없이 전체 내용 표시
data['facts']

0       On June 27, 1962, Phil St. Amant, a candidate ...
1       Ramon Nelson was riding his bike when he suffe...
2       An Alabama state court convicted Billy Joe Mag...
3       Victor Linkletter was convicted in state court...
4       On April 24, 1953 in Selma, Alabama, an intrud...
                              ...                        
2473    Congress amended the Clean Air Act through the...
2474    Alliance Bond Fund, Inc., an investment fund, ...
2475    In 1992, the District Court sentenced Manuel D...
2476    On March 8, 1996, Enrico St. Cyr, a lawful per...
2477    Herbert Markman owns the patent to a system th...
Name: facts, Length: 2478, dtype: object

In [5]:
data.iloc[1]['facts']

'Ramon Nelson was riding his bike when he suffered a lethal blow to the back of his head with a baseball bat. After two eyewitnesses identified Lawrence Owens from an array of photos and then a lineup, he was tried and convicted for Nelson’s death. Because Nelson was carrying cocaine and crack cocaine potentially for distribution, the judge at Owens’ bench trial ruled that Owens was probably also a drug dealer and was trying to “knock [Nelson] off.” Owens was found guilty of first-degree murder and sentenced to 25 years in prison.\nOwens filed a petition for a writ of habeas corpus on the grounds that his constitutional right to due process was violated during the trial. He argued that the eyewitness identification should have been inadmissible based on unreliability and that the judge impermissibly inferred a motive when a motive was not an element of the offense. The district court denied the writ of habeas corpus, and Owens appealed. The U.S. Court of Appeals for the Seventh Circuit

In [6]:
data.iloc[0]['facts']


'On June 27, 1962, Phil St. Amant, a candidate for public office, made a television speech in Baton Rouge, Louisiana.  During this speech, St. Amant accused his political opponent of being a Communist and of being involved in criminal activities with the head of the local Teamsters Union.  Finally, St. Amant implicated Herman Thompson, an East Baton Rouge deputy sheriff, in a scheme to move money between the Teamsters Union and St. Amant’s political opponent. \nThompson successfully sued St. Amant for defamation.  Louisiana’s First Circuit Court of Appeals reversed, holding that Thompson did not show St. Amant acted with “malice.”  Thompson then appealed to the Supreme Court of Louisiana.  That court held that, although public figures forfeit some of their First Amendment protection from defamation, St. Amant accused Thompson of a crime with utter disregard of whether the remarks were true.  Finally, that court held that the First Amendment protects uninhibited, robust debate, rather t

In [7]:
data

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


# bert train

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, BertConfig
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import torch.optim.lr_scheduler as lr_scheduler
import torch.nn as nn

def mixup_data(x, y, alpha=2.0):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    index = torch.randperm(batch_size)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]

    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)

# def mixup_data(x, y, alpha = 2.0):
#     batch_size = x.size(0)
#     indices = torch.randperm(batch_size)
#     shuffled_input = x[indices]
#     shuffled_target = y[indices]

#     lam = torch.distributions.beta.Beta(alpha, alpha).sample()
#     lam = torch.max(lam, 1 - lam)

#     mixed_input = lam.view(batch_size, 1) * x.squeeze(1) + (1 - lam).view(batch_size, 1) * shuffled_input.squeeze(1)
#     mixed_target = lam * y + (1 - lam) * shuffled_target

#     return mixed_input, mixed_target

# def mixup_criterion(criterion, pred, target, lam):
#     return lam * criterion(pred, target) + (1 - lam) * criterion(pred, target.flip(0))

weight = 1 / data['first_party_winner'].value_counts().sort_index().values

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# BERT 모델 및 토크나이저 불러오기
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2, hidden_dropout_prob=0.5)

# legal bert
# tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-uncased-contracts")
# model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

class LegalModel(nn.Module):
    def __init__(self, class_weight=None):
        super(LegalModel, self).__init__()
        self.bert_model =  BertForSequenceClassification.from_pretrained(model_name, num_labels=2, hidden_dropout_prob=0.5)
        # self.bert_model =  AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", hidden_dropout_prob=0.5)
        # self.dropout = nn.Dropout(0.5)
        # self.linear = nn.Linear(768, 2)
        
    def forward(self, x, mask):
        pooled_output = self.bert_model(x, mask, return_dict=False)[0]
        # dropout_output = self.dropout(pooled_output)
        # linear_output = self.linear(dropout_output)

        return pooled_output
    
# 판결 내용과 가해자의 승소 여부 데이터셋 클래스 정의
class LegalDataset(Dataset):
    def __init__(self, data, label):
        self.data = data.tolist()
        self.labels = label.tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data[index]
        label = self.labels[index]
        encoding = tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label)
        }


X_train, X_val, y_train, y_val = train_test_split(data['facts'], data['first_party_winner'], 
                                                  test_size=0.2, shuffle=True, stratify=data['first_party_winner'], random_state=23)

def metrics(labels, preds):
    labels, preds = np.array(labels), np.array(preds)
    f1s = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    return f1s, acc

model = LegalModel().to(device)

for param in model.parameters():
    param.requires_grad = True

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
print(model)

LegalModel(
  (bert_model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.5, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=T

In [4]:
train_dataset = LegalDataset(X_train, y_train)
val_dataset = LegalDataset(X_val, y_val)

# 데이터로더 생성
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# 모델 학습 설정
model.to(device)
# 옵티마이저 정의 
loss_fn = nn.CrossEntropyLoss(weight=torch.Tensor(weight).cuda())
# loss_fn = nn.CrossEntropyLoss()
# loss_fn = nn.BCELoss(weight=torch.Tensor(weight).cuda())
# optimizer = torch.optim.Adam(model.parameters(), lr = 1e-5)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1)

optimizer = AdamW(model.parameters(), lr= 1e-3)
# scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=100, T_mult=1)
scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=50, T_mult=2)






In [5]:
from tqdm import tqdm

# 최상의 모델을 저장할 변수 초기화
best_accuracy = 0.0
best_valid_loss = 0
best_score = 0
best_f1_score = 0
best_model_path = 'best_model.pt'
# 진행 바 형식 설정
# 학습 관련 변수 초기화
max_epochs = 500
no_improvement_count = 0
# learning rate decay
# 모델 학습
model.train()
# model.zero_grad()
for epoch in range(max_epochs):
    print('-' * 50)
    total_accuracy = 0
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    step = 0
    # for batch in tqdm(train_loader, bar_format=bar_format):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # mixup
        # if step % 4 == 0:
        #     # if random.random() > 0.5:
        #     x_batch, y_batch_a, y_batch_b, lam = mixup_data(model.forward(input_ids, attention_mask), labels)
        #     # else:
        #         # x_batch, y_batch_a, y_batch_b, lam = cutmix_data(inputs, labels)
                
        #     outputs = model.forward(input_ids, attention_mask)
        #     loss = mixup_criterion(loss_fn, outputs, y_batch_a.to(device), y_batch_b.to(device), lam)

        # else:
        #     outputs = model.forward(input_ids, attention_mask)
        #     loss = loss_fn(outputs, labels)
            
        # no mixup
        outputs = model.forward(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        
        total_loss += loss.item()

        # 정확도 도출
        pred = [torch.argmax(logit).cpu().detach().item() for logit in outputs]
        true = [label for label in labels.cpu().numpy()]
        accuracy = accuracy_score(true, pred)
        total_accuracy += accuracy
        
        # optimizer.zero_grad()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # model.zero_grad()
        step += 1
    # epoch 당 loss 와 정확도 계산
    avg_loss = total_loss / len(train_loader)
    avg_accuracy = total_accuracy/len(train_loader)
    print(f" {epoch+1} Epoch Average train loss :  {avg_loss}")
    print(f" {epoch+1} Epoch Average train accuracy :  {avg_accuracy}")

    # 검증 데이터셋으로 모델 평가
    model.eval()
    valid_loss = 0
    valid_labels = []
    valid_preds = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model.forward(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)

            valid_labels.extend(labels.cpu().tolist()) 
            valid_preds.extend(predicted.cpu().tolist()) 
            valid_loss += loss_fn(outputs, labels).item()
    f1s, acc = metrics(valid_labels, valid_preds)

    # if acc > best_score:
    #     best_score = acc
    #     torch.save(model.state_dict(), best_model_path)
    #     print("Best model saved!")
        
    # else:
    #     no_improvement_count += 1
    
    
    # if f1s > best_f1_score:
    #     best_f1_score = f1s
    #     torch.save(model.state_dict(), best_model_path)
    #     print("Best model saved!")
    # else:
    #     no_improvement_count += 1
    
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), best_model_path)
        print("Best model saved!")
    else:
        no_improvement_count += 1
    
    print(f'EPOCH : {epoch} | valid_loss : {valid_loss/len(val_loader):.4f} | f1s : {f1s:.4f} | acc :{acc:.4f}')

    scheduler.step()

    # early stopping 조건
    if no_improvement_count >= 20:
        print("No improvement in validation accuracy for 20 epochs. Early stopping triggered!")
        break


--------------------------------------------------
 1 Epoch Average train loss :  0.7952160816038808
 1 Epoch Average train accuracy :  0.4879368279569893
EPOCH : 0 | valid_loss : 0.6936 | f1s : 0.1678 | acc :0.3347
--------------------------------------------------
 2 Epoch Average train loss :  0.7312524539809073
 2 Epoch Average train accuracy :  0.5018145161290323
EPOCH : 1 | valid_loss : 0.7320 | f1s : 0.1678 | acc :0.3347
--------------------------------------------------


In [10]:
total_accuracy

10.5625

In [13]:
a

(tensor([[ 0.0468,  0.3738],
         [ 0.0786,  0.2952],
         [ 0.1177,  0.1004],
         [-0.1134,  0.2944],
         [ 0.0028,  0.0403],
         [ 0.0313,  0.2784],
         [ 0.0023,  0.5215],
         [ 0.0843,  0.4572],
         [ 0.1056,  0.3013],
         [ 0.1165,  0.3590],
         [ 0.0798,  0.3035],
         [ 0.0453,  0.3545],
         [-0.0095,  0.5584],
         [ 0.0977,  0.3000],
         [ 0.0086,  0.3212],
         [-0.1732,  0.6333]], device='cuda:0', grad_fn=<AddmmBackward0>),)

# predict

In [4]:
import torch
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, BertConfig
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import torch.optim.lr_scheduler as lr_scheduler
import torch.nn as nn

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
test=  pd.read_csv('test.csv')
class LegalModel(nn.Module):
    def __init__(self, class_weight=None):
        super(LegalModel, self).__init__()
        # self.backbone =  BertForSequenceClassification.from_pretrained(model_name, num_labels=2, hidden_dropout_prob=0.5)
        self.bert_model =  AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", hidden_dropout_prob=0.5)
        # self.dropout = nn.Dropout(0.5)
        # self.linear = nn.Linear(768, 2)
        
    def forward(self, x, mask):
        pooled_output = self.bert_model(x, mask, return_dict=False)[0]
        # dropout_output = self.dropout(pooled_output)
        # linear_output = self.linear(dropout_output)

        return pooled_output
    
model = LegalModel()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)



# 저장된 모델 로드
model.load_state_dict(torch.load('best_model.pt'))
model.eval()

# 모델 학습 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
pred = []

# 예측 수행
with torch.no_grad():
    for input_text in test['facts']:
        # 텍스트를 토큰화하고 텐서로 변환
        inputs = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        
        outputs = model.forward(input_ids, attention_mask)
        # logits = outputs.logits
        pred.append(torch.argmax(outputs, dim=1).item())



Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [6]:
sub = pd.read_csv('sample_submission.csv')
sub['first_party_winner'] = pred
sub

Unnamed: 0,ID,first_party_winner
0,TEST_0000,1
1,TEST_0001,1
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,1
...,...,...
1235,TEST_1235,1
1236,TEST_1236,1
1237,TEST_1237,1
1238,TEST_1238,1


In [7]:
sub['first_party_winner'].value_counts()

1    1240
Name: first_party_winner, dtype: int64

In [27]:
sub.to_csv('submit_230611.csv', index=False)