In [10]:
import pandas as pd

In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Data

In [12]:
data = pd.read_csv('data/data_5474.csv')
data

Unnamed: 0,0,1,2,3,4,5
0,전문가가 알려주는 창의 영재교육의 비법,알차네요,star star-rate5,무엇보다도 빨리들을 수 있어 좋습니다,학습지도,2023.07.28
1,연수로 완성하는 아이스크림 수학 교과서(4학년),명불허전,star star-rate4,명강사라 그런지 말씀을 참 잘하시네요~,교과지도,2023.07.28
2,본격! 청개구리 학생심리 사로잡기,강추,star star-rate5,30년 가까이 일하면서도 놓치고 있었던 것들을 알려주었습니다. 그리고 무조건 교사 ...,학급경영,2023.07.28
3,"가장 쉬운 수업도구, 패들렛과 띵커벨",패들렛과 띵커벨 활용 잘 할 수 있어요,star star-rate5,패들렛과 띵커벨 활용하고 있었으나 다양한 교과에 특히 독서 지도에 많이 활용 할 수...,ICT,2023.07.28
4,신영일과 함께하는 안전교육 365,필수 연수 부담 없이 편안하게,star star-rate4,"안전 연수 올 해 꼭 받아야 하는데, 부담 없이 편안한 마음으로 마쳤어요.~",생활지도,2023.07.27
...,...,...,...,...,...,...
5352,알베르토가 함께 하는 핵심역량 연계 다문화 교육,알베르토가 함께하는 핵심역량 연계 다문화 교육,star star-rate5,여타 연수와 다르게 새로 알게된 내용이 많이 있습니다.\n아이들 가르치는데 많은 도...,생활지도,2022.03.30
5353,전문가가 알려주는 창의 영재교육의 비법,전문가가 알려주는 창의 영재교육의 비법 연수 후기,star star-rate5,영재 교육에 대한 틀을 잡아주시는 넘 좋은 연수였습니다!\n많은 도움이 되어 강력 ...,학습지도,2022.03.30
5354,도도한 교사생활,도도한 교사생활,star star-rate5,연수를 통해 자신감을 더 갖게 되었다.,생활지도,2022.03.30
5355,생각을 코딩하다! 소프트웨어교실,교실에서 활용할 수 있는 앱들 중심으로 재미있게 들었어요.,star star-rate5,교실에서 활용할 수 있는 앱들 중심으로 재미있게 들었어요.\n코로나19로 집콕하며 ...,ICT,2022.03.30


In [5]:
# Example data (replace this with your data)
texts = [
    "This is a positive sentence.",
    "Another positive example.",
    "Negative sentences are no good.",
    "Positivity is the key.",
]

labels = [1, 1, 0, 1]  # 1 for positive, 0 for negative (replace with your labels)

# Tokenize

In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize input texts
input_ids = []
attention_masks = []

for text in texts:
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors="pt",
        truncation=True,
    )

    input_ids.append(encoded_dict["input_ids"])
    attention_masks.append(encoded_dict["attention_mask"])

# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



# Split the dataset into training and testing sets

In [7]:
# Split the data into training and testing sets
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, labels, test_size=0.2, random_state=42
)

train_masks, test_masks, _, _ = train_test_split(
    attention_masks, input_ids, test_size=0.2, random_state=42
)

# Create DataLoader for efficient batching

In [8]:
# Define batch size and create DataLoader
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Fine-tune BERT for classification

In [9]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks)[0]
        loss = loss_fn(outputs, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

# Evaluation
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch

        outputs = model(inputs, attention_mask=masks)[0]
        _, pred_labels = torch.max(outputs, dim=1)

        predictions.extend(pred_labels.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate evaluation metrics
print(classification_report(true_labels, predictions))

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 0.8662
Epoch 2/3, Loss: 0.9342
Epoch 3/3, Loss: 0.7230
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
