## setting

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

In [None]:
!pip install datasets==2.14.0

In [None]:
import os
import pandas as pd
import numpy as np
import random

In [None]:
import torch
import time
from tqdm import tqdm
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import FunnelTokenizer, FunnelForSequenceClassification, AdamW, get_linear_schedule_with_warmup, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Seed configuration
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with `seed`.

    Also sets cuDNN to deterministic mode and turns its benchmark mode off.
    """
    # The generators are independent, so the order of these calls is irrelevant.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed = 0
set_seed(seed)

## data

In [None]:
path = '/kaggle/input/'  # directory prefix; 'data.json' is appended to it when loading
sample_SEED = 0  # seed handed to split_data for sampling the restrict==0 rows

In [None]:
# Read data.json (located under `path`) into a DataFrame.
data = pd.read_json(path+'data.json')

def split_data(data, sample_SEED, fixSEED=0):
  """Balance the two classes by undersampling, then split into train/valid/test.

  Rows with restrict == 0 ("normal") are randomly undersampled to the number of
  rows with restrict == 1 ("bot"). The balanced frame is split 70/30 into
  train+valid / test, and train+valid is split 70/30 again into train / valid,
  both stratified on 'restrict'. Finally one restrict == 1 row is moved from
  valid to test.

  The sampling uses private `random.Random` instances, so the module-level
  `random` generator is not reseeded as a side effect. The drawn samples are
  the same as with `random.seed(seed)` followed by `random.sample(...)`.

  Args:
    data: DataFrame with a 'restrict' column holding 0 or 1.
    sample_SEED: seed selecting which normal rows are kept.
    fixSEED: seed for both stratified splits and for choosing the row moved
      from valid to test.

  Returns:
    (train, valid, test) DataFrames, each with a fresh 0..n-1 index.

  Raises:
    ValueError: if there are more bot rows than normal rows, because normal
      rows cannot then be undersampled to the bot count.
  """
  # normal / bot split
  normal = data[data['restrict']==0].reset_index(drop=True)
  bot = data[data['restrict']==1].reset_index(drop=True)
  if len(bot) > len(normal):
    raise ValueError(
        f"cannot undersample {len(normal)} normal rows down to {len(bot)} bot rows"
    )
  # normal undersampling (local RNG: global random state is left untouched)
  sample = random.Random(sample_SEED).sample(range(len(normal)), len(bot))
  normal = normal.loc[sample].reset_index(drop=True)
  balanced = pd.concat([normal, bot], axis=0)
  # train valid test split
  train, test = train_test_split(balanced, test_size=0.3, random_state=fixSEED, stratify=balanced['restrict'])
  train, valid = train_test_split(train, test_size=0.3, random_state=fixSEED, stratify=train['restrict'])
  train = train.reset_index(drop=True)
  valid = valid.reset_index(drop=True)
  test = test.reset_index(drop=True)
  # 1:1 fix: move a single restrict==1 row from valid to test
  move = random.Random(fixSEED).sample(list(valid[valid['restrict']==1].index), 1)
  test = pd.concat([test, valid.loc[move]], axis=0).reset_index(drop=True)
  valid = valid.drop(move, axis=0).reset_index(drop=True)
  return train, valid, test

# Split the loaded data into train / valid / test frames.
# NOTE(review): the name `train` is rebound to the training function defined further
# down in this notebook, so this cell has to be re-run before the tokenization cell
# can be re-run.
train, valid, test = split_data(data, sample_SEED)

## model setting

In [None]:
# model, tokenizer
tokenizer = FunnelTokenizer.from_pretrained("funnel-transformer/small-base")
# Only register a pad token when the checkpoint's tokenizer does not define one.
# The earlier `pad_token = eos_token` assignment was dead code (it was overridden on
# the next line), and unconditionally adding '[PAD]' can grow the tokenizer beyond the
# model's embedding table.
# NOTE(review): the Funnel tokenizer is expected to ship its own pad token - confirm.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = FunnelForSequenceClassification.from_pretrained("funnel-transformer/small-base", num_labels=2)
# If a token was added above, grow the embedding matrix so the new id is valid.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
# NOTE(review): this only binds a Python variable named CUDA_LAUNCH_BLOCKING; it does
# not set the CUDA_LAUNCH_BLOCKING environment variable. If synchronous kernel launches
# are wanted for debugging, the variable presumably has to be exported (os.environ or
# %env) before CUDA is first used - confirm.
CUDA_LAUNCH_BLOCKING=1

## text

In [None]:
from datasets import Dataset, DatasetDict

# Tokenization function
def tokenize_function(examples):
    """Tokenize the 'text' entries of `examples`, padded/truncated to 1024 tokens."""
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=1024)
    return tokenizer(examples['text'], **tokenizer_options)

# Preprocessing function
def prepare_data(df):
    """Turn the 'text' and 'restrict' columns of `df` into a tokenized Dataset.

    The returned dataset is set to torch format, exposing 'input_ids',
    'attention_mask' and 'restrict'.
    """
    columns = {name: df[name].tolist() for name in ('text', 'restrict')}
    tokenized = Dataset.from_dict(columns).map(tokenize_function, batched=True)
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'restrict'])
    return tokenized

# Separate features (X) and labels (y)
X_train, y_train = train['text'], train['restrict']
X_val, y_val = valid['text'], valid['restrict']
X_test, y_test = test['text'], test['restrict']

# Re-assemble two-column frames: the text plus its label
train_df = X_train.to_frame().assign(restrict=y_train)
val_df = X_val.to_frame().assign(restrict=y_val)
test_df = X_test.to_frame().assign(restrict=y_test)

# Tokenize each split (evaluated in train, valid, test order)
train_dataset, val_dataset, test_dataset = map(prepare_data, (train_df, val_df, test_df))

In [None]:
# batch size = 4
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

# Number of mini-batches whose gradients are accumulated before each optimizer step
gradient_accumulation_steps = 4

# Learning rate scheduler: linear decay with no warm-up.
# The training loop calls scheduler.step() once per accumulation group, not once per
# mini-batch, so the schedule length is counted in optimizer updates. Counting
# mini-batches (as before) made the schedule four times too long, so the learning
# rate never got near the end of its decay. The division is rounded up so the
# schedule cannot run out before training finishes.
updates_per_epoch = -(-len(train_loader) // gradient_accumulation_steps)
num_training_steps = num_epochs * updates_per_epoch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training function
def train(model, train_loader, optimizer, scheduler):
    """Run one training epoch with autocast, loss scaling and gradient accumulation.

    Reads the module-level `device` and `gradient_accumulation_steps`.

    Args:
        model: model called as model(input_ids, attention_mask=..., labels=...)
            whose output exposes `.loss`.
        train_loader: iterable of batches with 'input_ids', 'attention_mask'
            and 'restrict' entries; must support len().
        optimizer: optimizer stepped once per accumulation group.
        scheduler: LR scheduler stepped together with the optimizer.

    Returns:
        Mean loss per mini-batch (not divided by the accumulation factor).
    """
    model.train()
    total_loss = 0
    scaler = torch.cuda.amp.GradScaler()
    num_batches = len(train_loader)

    for step, batch in enumerate(tqdm(train_loader, desc="Training")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['restrict'].to(device)

        with torch.cuda.amp.autocast():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Log the real batch loss; only the copy used for backward is divided by the
        # accumulation factor (previously the reported loss was too small by that factor).
        total_loss += outputs.loss.item()
        loss = outputs.loss / gradient_accumulation_steps

        scaler.scale(loss).backward()

        # Step on every full accumulation group and also on the last batch of the
        # epoch, so a trailing partial group is applied instead of leaking its
        # gradients into the next epoch.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

    return total_loss / len(train_loader)

# Evaluation function
def evaluate(model, data_loader):
    """Collect argmax predictions and reference labels over `data_loader`.

    Puts the model in eval mode, runs it without gradient tracking, and returns
    (preds, true_labels) as two flat lists in batch order.
    """
    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['restrict'].to(device)

            batch_logits = model(ids, attention_mask=mask).logits
            batch_preds = torch.argmax(batch_logits, axis=1)

            predictions.extend(batch_preds.cpu().numpy())
            references.extend(targets.cpu().numpy())

    return predictions, references

# Start the wall-clock timer for the whole run
start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()
    train_loss = train(model, train_loader, optimizer, scheduler)
    epoch_end_time = time.time()

    print(f"Epoch {epoch + 1}, Loss: {train_loss}")
    print(f"Epoch {epoch + 1} Computing Time: {epoch_end_time - epoch_start_time:} seconds")

    # Score the validation split after each epoch
    predicted_labels, true_labels = evaluate(model, val_loader)
    f1, precision, recall = (
        score(true_labels, predicted_labels, zero_division=1)
        for score in (f1_score, precision_score, recall_score)
    )
    accuracy = accuracy_score(true_labels, predicted_labels)
    conf_matrix = confusion_matrix(true_labels, predicted_labels)

    print(f"Epoch {epoch + 1}, Validation F1 Score: {f1:}")
    print(f"Validation Precision: {precision:}")
    print(f"Validation Recall: {recall:}")
    print(f"Validation Accuracy: {accuracy:}")
    print("Validation Confusion Matrix:\n", conf_matrix)

# Stop the timer; the total covers training plus the per-epoch validation passes
end_time = time.time()
total_training_time = end_time - start_time

# Final prediction on the held-out test split
predicted_labels, true_labels = evaluate(model, test_loader)
f1, precision, recall = (
    score(true_labels, predicted_labels, zero_division=1)
    for score in (f1_score, precision_score, recall_score)
)
accuracy = accuracy_score(true_labels, predicted_labels)
conf_matrix = confusion_matrix(true_labels, predicted_labels)

print("Final Test Results:")
print(f"Test F1 Score: {f1:}")
print(f"Test Precision: {precision:}")
print(f"Test Recall: {recall:}")
print(f"Test Accuracy: {accuracy:}")
print("Test Confusion Matrix:\n", conf_matrix)
print(f"Total Computing Time: {total_training_time:} seconds")

## longtext

In [None]:
# Reload data.json for the 'longtext' experiment.
data = pd.read_json(path+'data.json')

# Split into train / valid / test frames.
# NOTE(review): this rebinds `train`, which an earlier cell defined as the training
# function; the training function is defined again further down.
train, valid, test = split_data(data, sample_SEED)

# Re-seed all random generators before the model below is created.
seed = 0
set_seed(seed)

# model, tokenizer
tokenizer = FunnelTokenizer.from_pretrained("funnel-transformer/small-base")
# Only register a pad token when the checkpoint's tokenizer does not define one.
# The earlier `pad_token = eos_token` assignment was dead code (it was overridden on
# the next line), and unconditionally adding '[PAD]' can grow the tokenizer beyond the
# model's embedding table.
# NOTE(review): the Funnel tokenizer is expected to ship its own pad token - confirm.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = FunnelForSequenceClassification.from_pretrained("funnel-transformer/small-base", num_labels=2)
# If a token was added above, grow the embedding matrix so the new id is valid.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
from datasets import Dataset, DatasetDict

# Tokenization function
def tokenize_function(examples):
    """Tokenize the 'longtext' entries of `examples`, padded/truncated to 512 tokens."""
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(examples['longtext'], **tokenizer_options)

# Preprocessing function
def prepare_data(df):
    """Turn the 'longtext' and 'restrict' columns of `df` into a tokenized Dataset.

    The returned dataset is set to torch format, exposing 'input_ids',
    'attention_mask' and 'restrict'.
    """
    columns = {name: df[name].tolist() for name in ('longtext', 'restrict')}
    tokenized = Dataset.from_dict(columns).map(tokenize_function, batched=True)
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'restrict'])
    return tokenized

# Separate features (X) and labels (y)
X_train, y_train = train['longtext'], train['restrict']
X_val, y_val = valid['longtext'], valid['restrict']
X_test, y_test = test['longtext'], test['restrict']

# Re-assemble two-column frames: the text plus its label
train_df = X_train.to_frame().assign(restrict=y_train)
val_df = X_val.to_frame().assign(restrict=y_val)
test_df = X_test.to_frame().assign(restrict=y_test)

# Tokenize each split (evaluated in train, valid, test order)
train_dataset, val_dataset, test_dataset = map(prepare_data, (train_df, val_df, test_df))

Map:   0%|          | 0/5642 [00:00<?, ? examples/s]

Map:   0%|          | 0/2418 [00:00<?, ? examples/s]

Map:   0%|          | 0/3456 [00:00<?, ? examples/s]

In [None]:
# batch size = 4
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

# Number of mini-batches whose gradients are accumulated before each optimizer step
gradient_accumulation_steps = 4

# Learning rate scheduler: linear decay with no warm-up.
# The training loop calls scheduler.step() once per accumulation group, not once per
# mini-batch, so the schedule length is counted in optimizer updates. Counting
# mini-batches (as before) made the schedule four times too long, so the learning
# rate never got near the end of its decay. The division is rounded up so the
# schedule cannot run out before training finishes.
updates_per_epoch = -(-len(train_loader) // gradient_accumulation_steps)
num_training_steps = num_epochs * updates_per_epoch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training function
def train(model, train_loader, optimizer, scheduler):
    """Run one training epoch with autocast, loss scaling and gradient accumulation.

    Reads the module-level `device` and `gradient_accumulation_steps`.

    Args:
        model: model called as model(input_ids, attention_mask=..., labels=...)
            whose output exposes `.loss`.
        train_loader: iterable of batches with 'input_ids', 'attention_mask'
            and 'restrict' entries; must support len().
        optimizer: optimizer stepped once per accumulation group.
        scheduler: LR scheduler stepped together with the optimizer.

    Returns:
        Mean loss per mini-batch (not divided by the accumulation factor).
    """
    model.train()
    total_loss = 0
    scaler = torch.cuda.amp.GradScaler()
    num_batches = len(train_loader)

    for step, batch in enumerate(tqdm(train_loader, desc="Training")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['restrict'].to(device)

        with torch.cuda.amp.autocast():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Log the real batch loss; only the copy used for backward is divided by the
        # accumulation factor (previously the reported loss was too small by that factor).
        total_loss += outputs.loss.item()
        loss = outputs.loss / gradient_accumulation_steps

        scaler.scale(loss).backward()

        # Step on every full accumulation group and also on the last batch of the
        # epoch, so a trailing partial group is applied instead of leaking its
        # gradients into the next epoch.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

    return total_loss / len(train_loader)

# Evaluation function
def evaluate(model, data_loader):
    """Collect argmax predictions and reference labels over `data_loader`.

    Puts the model in eval mode, runs it without gradient tracking, and returns
    (preds, true_labels) as two flat lists in batch order.
    """
    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['restrict'].to(device)

            batch_logits = model(ids, attention_mask=mask).logits
            batch_preds = torch.argmax(batch_logits, axis=1)

            predictions.extend(batch_preds.cpu().numpy())
            references.extend(targets.cpu().numpy())

    return predictions, references

# Start the wall-clock timer for the whole run
start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()
    train_loss = train(model, train_loader, optimizer, scheduler)
    epoch_end_time = time.time()

    print(f"Epoch {epoch + 1}, Loss: {train_loss}")
    print(f"Epoch {epoch + 1} Computing Time: {epoch_end_time - epoch_start_time:} seconds")

    # Score the validation split after each epoch
    predicted_labels, true_labels = evaluate(model, val_loader)
    f1, precision, recall = (
        score(true_labels, predicted_labels, zero_division=1)
        for score in (f1_score, precision_score, recall_score)
    )
    accuracy = accuracy_score(true_labels, predicted_labels)
    conf_matrix = confusion_matrix(true_labels, predicted_labels)

    print(f"Epoch {epoch + 1}, Validation F1 Score: {f1:}")
    print(f"Validation Precision: {precision:}")
    print(f"Validation Recall: {recall:}")
    print(f"Validation Accuracy: {accuracy:}")
    print("Validation Confusion Matrix:\n", conf_matrix)

# Stop the timer; the total covers training plus the per-epoch validation passes
end_time = time.time()
total_training_time = end_time - start_time

# Final prediction on the held-out test split
predicted_labels, true_labels = evaluate(model, test_loader)
f1, precision, recall = (
    score(true_labels, predicted_labels, zero_division=1)
    for score in (f1_score, precision_score, recall_score)
)
accuracy = accuracy_score(true_labels, predicted_labels)
conf_matrix = confusion_matrix(true_labels, predicted_labels)

print("Final Test Results:")
print(f"Test F1 Score: {f1:}")
print(f"Test Precision: {precision:}")
print(f"Test Recall: {recall:}")
print(f"Test Accuracy: {accuracy:}")
print("Test Confusion Matrix:\n", conf_matrix)
print(f"Total Computing Time: {total_training_time:} seconds")

## longlongtext

In [None]:
# Reload data.json for the 'longlongtext' experiment.
data = pd.read_json(path+'data.json')

# Split into train / valid / test frames.
# NOTE(review): this rebinds `train`, which an earlier cell defined as the training
# function; the training function is defined again further down.
train, valid, test = split_data(data, sample_SEED)

# Re-seed all random generators before the model below is created.
seed = 0
set_seed(seed)

# model, tokenizer
tokenizer = FunnelTokenizer.from_pretrained("funnel-transformer/small-base")
# Only register a pad token when the checkpoint's tokenizer does not define one.
# The earlier `pad_token = eos_token` assignment was dead code (it was overridden on
# the next line), and unconditionally adding '[PAD]' can grow the tokenizer beyond the
# model's embedding table.
# NOTE(review): the Funnel tokenizer is expected to ship its own pad token - confirm.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = FunnelForSequenceClassification.from_pretrained("funnel-transformer/small-base", num_labels=2)
# If a token was added above, grow the embedding matrix so the new id is valid.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
from datasets import Dataset, DatasetDict

# Tokenization function
def tokenize_function(examples):
    """Tokenize the 'longlongtext' entries of `examples`, padded/truncated to 512 tokens."""
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(examples['longlongtext'], **tokenizer_options)

# Preprocessing function
def prepare_data(df):
    """Turn the 'longlongtext' and 'restrict' columns of `df` into a tokenized Dataset.

    The returned dataset is set to torch format, exposing 'input_ids',
    'attention_mask' and 'restrict'.
    """
    columns = {name: df[name].tolist() for name in ('longlongtext', 'restrict')}
    tokenized = Dataset.from_dict(columns).map(tokenize_function, batched=True)
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'restrict'])
    return tokenized

# Separate features (X) and labels (y)
X_train, y_train = train['longlongtext'], train['restrict']
X_val, y_val = valid['longlongtext'], valid['restrict']
X_test, y_test = test['longlongtext'], test['restrict']

# Re-assemble two-column frames: the text plus its label
train_df = X_train.to_frame().assign(restrict=y_train)
val_df = X_val.to_frame().assign(restrict=y_val)
test_df = X_test.to_frame().assign(restrict=y_test)

# Tokenize each split (evaluated in train, valid, test order)
train_dataset, val_dataset, test_dataset = map(prepare_data, (train_df, val_df, test_df))

Map:   0%|          | 0/5642 [00:00<?, ? examples/s]

Map:   0%|          | 0/2418 [00:00<?, ? examples/s]

Map:   0%|          | 0/3456 [00:00<?, ? examples/s]

In [None]:
# batch size = 4
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

# Number of mini-batches whose gradients are accumulated before each optimizer step
gradient_accumulation_steps = 4

# Learning rate scheduler: linear decay with no warm-up.
# The training loop calls scheduler.step() once per accumulation group, not once per
# mini-batch, so the schedule length is counted in optimizer updates. Counting
# mini-batches (as before) made the schedule four times too long, so the learning
# rate never got near the end of its decay. The division is rounded up so the
# schedule cannot run out before training finishes.
updates_per_epoch = -(-len(train_loader) // gradient_accumulation_steps)
num_training_steps = num_epochs * updates_per_epoch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training function
def train(model, train_loader, optimizer, scheduler):
    """Run one training epoch with autocast, loss scaling and gradient accumulation.

    Reads the module-level `device` and `gradient_accumulation_steps`.

    Args:
        model: model called as model(input_ids, attention_mask=..., labels=...)
            whose output exposes `.loss`.
        train_loader: iterable of batches with 'input_ids', 'attention_mask'
            and 'restrict' entries; must support len().
        optimizer: optimizer stepped once per accumulation group.
        scheduler: LR scheduler stepped together with the optimizer.

    Returns:
        Mean loss per mini-batch (not divided by the accumulation factor).
    """
    model.train()
    total_loss = 0
    scaler = torch.cuda.amp.GradScaler()
    num_batches = len(train_loader)

    for step, batch in enumerate(tqdm(train_loader, desc="Training")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['restrict'].to(device)

        with torch.cuda.amp.autocast():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Log the real batch loss; only the copy used for backward is divided by the
        # accumulation factor (previously the reported loss was too small by that factor).
        total_loss += outputs.loss.item()
        loss = outputs.loss / gradient_accumulation_steps

        scaler.scale(loss).backward()

        # Step on every full accumulation group and also on the last batch of the
        # epoch, so a trailing partial group is applied instead of leaking its
        # gradients into the next epoch.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

    return total_loss / len(train_loader)

# Evaluation function
def evaluate(model, data_loader):
    """Collect argmax predictions and reference labels over `data_loader`.

    Puts the model in eval mode, runs it without gradient tracking, and returns
    (preds, true_labels) as two flat lists in batch order.
    """
    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['restrict'].to(device)

            batch_logits = model(ids, attention_mask=mask).logits
            batch_preds = torch.argmax(batch_logits, axis=1)

            predictions.extend(batch_preds.cpu().numpy())
            references.extend(targets.cpu().numpy())

    return predictions, references

# Start the wall-clock timer for the whole run
start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()
    train_loss = train(model, train_loader, optimizer, scheduler)
    epoch_end_time = time.time()

    print(f"Epoch {epoch + 1}, Loss: {train_loss}")
    print(f"Epoch {epoch + 1} Computing Time: {epoch_end_time - epoch_start_time:} seconds")

    # Score the validation split after each epoch
    predicted_labels, true_labels = evaluate(model, val_loader)
    f1, precision, recall = (
        score(true_labels, predicted_labels, zero_division=1)
        for score in (f1_score, precision_score, recall_score)
    )
    accuracy = accuracy_score(true_labels, predicted_labels)
    conf_matrix = confusion_matrix(true_labels, predicted_labels)

    print(f"Epoch {epoch + 1}, Validation F1 Score: {f1:}")
    print(f"Validation Precision: {precision:}")
    print(f"Validation Recall: {recall:}")
    print(f"Validation Accuracy: {accuracy:}")
    print("Validation Confusion Matrix:\n", conf_matrix)

# Stop the timer; the total covers training plus the per-epoch validation passes
end_time = time.time()
total_training_time = end_time - start_time

# Final prediction on the held-out test split
predicted_labels, true_labels = evaluate(model, test_loader)
f1, precision, recall = (
    score(true_labels, predicted_labels, zero_division=1)
    for score in (f1_score, precision_score, recall_score)
)
accuracy = accuracy_score(true_labels, predicted_labels)
conf_matrix = confusion_matrix(true_labels, predicted_labels)

print("Final Test Results:")
print(f"Test F1 Score: {f1:}")
print(f"Test Precision: {precision:}")
print(f"Test Recall: {recall:}")
print(f"Test Accuracy: {accuracy:}")
print("Test Confusion Matrix:\n", conf_matrix)
print(f"Total Computing Time: {total_training_time:} seconds")