In [1]:
!pip3 install pytorch-transformers
!pip install transformers

Collecting pytorch-transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting boto3 (from pytorch-transformers)
  Downloading boto3-1.26.162-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.9/135.9 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from pytorch-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses (from pytorch-transformers)
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880

In [2]:
import torch
import random
import numpy as np
from copy import deepcopy
seed=42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [3]:
from google.colab import drive
import pandas as pd
drive.mount('/content/gdrive')
file_path = '/content/gdrive/MyDrive/dacon law/'
train_data = pd.read_csv(file_path+'train.csv')
test_data = pd.read_csv(file_path+'test.csv')

Mounted at /content/gdrive


In [4]:
train_data.drop("ID", axis=1, inplace=True)

In [5]:
from sklearn.model_selection import train_test_split
val_rate=0.2
train_indices, val_indices = train_test_split(range(len(train_data)), test_size=val_rate)

In [6]:
from torch.utils.data import DataLoader, Subset
train_dataset = train_data.iloc[train_indices]
validation_dataset = train_data.iloc[val_indices]

train_dataset.reset_index(drop=False, inplace=True)
validation_dataset.reset_index(drop=False, inplace=True)

In [7]:
from transformers.optimization import get_linear_schedule_with_warmup
device = torch.device("cuda")
from transformers import BertTokenizer, BertModel, AdamW
import transformers
transformers.logging.set_verbosity_error()
from torch import nn
import torch.nn.functional as F

In [8]:
class CaseDataset(torch.utils.data.Dataset):
    def __init__(self, first_parties, second_parties, case_contents, labels):
        self.first_parties = first_parties
        self.second_parties = second_parties
        self.case_contents = case_contents
        self.labels = labels
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.first_parties)

    def __getitem__(self, idx):
        inputs = self.tokenizer.encode_plus(
            self.first_parties[idx],
            self.case_contents[idx],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        label = torch.tensor(self.labels[idx])
        return {
            'input_ids': torch.tensor(inputs['input_ids'].squeeze(), device=device),
            'attention_mask': torch.tensor(inputs['attention_mask'].squeeze(), device=device),
            'labels': torch.tensor(label, device=device)
        }

In [20]:
# Define training parameters
batch_size = 4
num_epochs = 20
learning_rate = 2e-5

In [21]:
#dataset tuning
train_first_parties=train_dataset['first_party']
train_second_parties=train_dataset['second_party']
train_case_contents=train_dataset['facts']
train_labels=train_dataset['first_party_winner']

train_dataset_ = CaseDataset(train_first_parties, train_second_parties, train_case_contents, train_labels)
train_dataloader = DataLoader(train_dataset_, batch_size=batch_size, shuffle=True)

total_steps=len(train_dataloader)*num_epochs

validation_first_parties=validation_dataset['first_party']
validation_second_parties=validation_dataset['second_party']
validation_case_contents=validation_dataset['facts']
validation_labels=validation_dataset['first_party_winner']

validation_dataset_ = CaseDataset(validation_first_parties, validation_second_parties, validation_case_contents, train_labels)
validation_dataloader = DataLoader(validation_dataset_, batch_size=batch_size, shuffle=True)

In [22]:
class CustomModel(nn.Module):
    def __init__(self, base_model):
        super(CustomModel, self).__init__()
        self.bert = base_model
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(base_model.config.hidden_size, 1)  # Adjust the input and output dimensions as needed
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.linear(pooled_output)
        logits = self.sigmoid(logits)
        return logits

In [23]:
criterion = nn.BCELoss().to(device)
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
base_model = BertModel.from_pretrained(model_name)
model=CustomModel(base_model).to(device)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

scheduler=get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_epochs)



In [45]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [46]:
# Training loop

model.train()

valid_losses, lowest_loss= list(), np.inf
early_stop = 100
progress_interval=1

for epoch in range(num_epochs):
    model.train()
    total_loss=0
    total_correct=0
    total_samples = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels'].float()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        outputs=outputs.squeeze()

        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Compute accuracy
        predicted_labels = torch.round(outputs)
        correct_predictions = torch.eq(predicted_labels, labels).sum().item()
        total_correct += correct_predictions
        total_samples += labels.size(0)

        total_loss += loss.item()

    # Compute metrics for the epoch
    epoch_loss = total_loss / len(train_dataloader)
    epoch_accuracy = total_correct / total_samples

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  Loss: {epoch_loss:.4f}")
    print(f"  Accuracy: {epoch_accuracy:.4f}")

    # validate the model
    model.eval()
    val_loss=0
    val_samples=0
    itr=0

    with torch.no_grad():
        for batch in validation_dataloader:
          itr+=1
          input_ids = batch['input_ids']
          attention_mask = batch['attention_mask']
          labels = batch['labels'].float()
          outputs = model(input_ids=input_ids, attention_mask=attention_mask)
          outputs=outputs.squeeze()

          loss = criterion(outputs, labels)
          val_loss+=loss.item()
          val_samples += labels.size(0)
    valid_avg_loss=val_loss / itr
    valid_losses.append(valid_avg_loss)

    if valid_losses[-1] < lowest_loss:
        lowest_loss = valid_losses[-1]
        lowest_epoch = epoch+1
        best_model = deepcopy(model.state_dict())
        torch.save(model.state_dict(), file_path + 'BERT_with_lin_layer_adamw_2e5.pth')  # 모델 객체의 state_dict 저장
    else:
        if early_stop > 0 and lowest_epoch + early_stop < epoch:
            print("Early Stopped", epoch, "epochs")
            model.load_state_dict(best_model)
            break

    if (epoch % progress_interval) == 0:
        print(valid_losses[-1], lowest_loss, lowest_epoch, epoch+1)




  'input_ids': torch.tensor(inputs['input_ids'].squeeze(), device=device),
  'attention_mask': torch.tensor(inputs['attention_mask'].squeeze(), device=device),
  'labels': torch.tensor(label, device=device)
  torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)


Epoch 1/20
  Loss: 0.6430
  Accuracy: 0.6660
0.6334173710596177 0.6334173710596177 1 1
Epoch 2/20
  Loss: 0.6398
  Accuracy: 0.6655
0.6334173698579112 0.6334173698579112 2 2
Epoch 3/20
  Loss: 0.6418
  Accuracy: 0.6665
0.633417374424396 0.6334173698579112 2 3
Epoch 4/20
  Loss: 0.6396
  Accuracy: 0.6670
0.6334173703385938 0.6334173698579112 2 4
Epoch 5/20
  Loss: 0.6424
  Accuracy: 0.6665
0.6334173720209829 0.6334173698579112 2 5
Epoch 6/20
  Loss: 0.6383
  Accuracy: 0.6665
0.6334173741840547 0.6334173698579112 2 6
Epoch 7/20
  Loss: 0.6408
  Accuracy: 0.6665
0.633417371299959 0.6334173698579112 2 7
Epoch 8/20
  Loss: 0.6420
  Accuracy: 0.6660
0.6334173705789351 0.6334173698579112 2 8
Epoch 9/20
  Loss: 0.6411
  Accuracy: 0.6665
0.6334173688965459 0.6334173688965459 9 9
Epoch 10/20
  Loss: 0.6416
  Accuracy: 0.6665
0.6334173686562046 0.6334173686562046 10 10
Epoch 11/20
  Loss: 0.6428
  Accuracy: 0.6665
0.6334173698579112 0.6334173686562046 10 11
Epoch 12/20
  Loss: 0.6412
  Accuracy: 

KeyboardInterrupt: ignored

In [None]:
model.load_state_dict(best_model)

In [47]:
# Inference on the test set
test_first_parties=test_data['first_party']
test_second_parties=test_data['second_party']
test_case_contents=test_data['facts']

test_dataset = CaseDataset(test_first_parties, test_second_parties, test_case_contents)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_labels=round(outputs)
        predictions.extend(predicted_labels.tolist())

TypeError: ignored

In [None]:
predictions

In [None]:
just_test = pd.read_csv(file_path+'test.csv')
df = pd.DataFrame(predictions)
df.transpose()
just_test['first_party_winner']=df

In [None]:
just_test.to_csv('submission_4.csv', sep=',')