In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()

if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [5]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [6]:
!pip install transformers



In [7]:
import json

# JSON 파일 읽기 및 데이터 추출
def extract_captions_labels(json_file_path, num_samples = None):
    with open(json_file_path, "r") as json_file:
        data = json.load(json_file)

    captions_labels = []
    annotations = data["annotations"]

    if num_samples is not None:
        annotations = annotations[:num_samples]

    for annotation in annotations:
        caption = annotation["caption"]
        label = annotation["danger_score"]
        captions_labels.append((caption, label))

    return captions_labels

#데이터 추출
file1_captions_labels = extract_captions_labels("/content/drive/MyDrive/train_abnormal_dataset.json")

captions_labels = file1_captions_labels

print(len(captions_labels))

captions_labels = list(set(captions_labels))

print(len(captions_labels))

2091
1900


In [8]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# 데이터 분리
sentences, labels = zip(*captions_labels)

In [9]:
from transformers import BertTokenizer

print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
# 0번 문장으로 테스트
print(' Original: ', sentences[0])

print('Tokenized: ', tokenizer.tokenize(sentences[0]))

print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

 Original:  A woman and a man are standing next to a car parked in a parking lot.
Tokenized:  ['a', 'woman', 'and', 'a', 'man', 'are', 'standing', 'next', 'to', 'a', 'car', 'parked', 'in', 'a', 'parking', 'lot', '.']
Token IDs:  [1037, 2450, 1998, 1037, 2158, 2024, 3061, 2279, 2000, 1037, 2482, 9083, 1999, 1037, 5581, 2843, 1012]


In [11]:
max_len = 0

for sent in sentences:

    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  35


In [12]:
input_ids = []
attention_masks = []

for sent in sentences:

    encoded_dict = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])


input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  A woman and a man are standing next to a car parked in a parking lot.
Token IDs: tensor([ 101, 1037, 2450, 1998, 1037, 2158, 2024, 3061, 2279, 2000, 1037, 2482,
        9083, 1999, 1037, 5581, 2843, 1012,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])


In [13]:
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)

train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

1,805 training samples
   95 validation samples


In [14]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

In [15]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 8,
    output_attentions = False,
    output_hidden_states = False,
)

model.cuda()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )




In [17]:
from transformers import get_linear_schedule_with_warmup

epochs = 5

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [18]:
import numpy as np

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [19]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    elapsed_rounded = int(round((elapsed)))

    return str(datetime.timedelta(seconds=elapsed_rounded))


In [31]:
import random
import numpy as np
import time
import torch

# Seed 설정
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []  # 학습 통계 저장
total_t0 = time.time()  # 전체 학습 시간 측정 시작

# Epoch 반복
for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
    print("Training...")

    t0 = time.time()  # Epoch 시작 시간
    total_train_loss = 0

    model.train()  # 모델을 학습 모드로 전환

    # Batch 반복
    for step, batch in enumerate(train_dataloader):
        # 진행 상황 출력
        if step % 40 == 0 and step > 0:
            elapsed = format_time(time.time() - t0)
            print(f"  Batch {step:>5} of {len(train_dataloader)}. Elapsed: {elapsed}.")

        # Batch 데이터 준비
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # 그래디언트 초기화
        model.zero_grad()

        # Forward pass
        outputs = model(
            b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels,
            return_dict=True
        )
        loss = outputs.loss
        logits = outputs.logits

        # Loss 축적
        total_train_loss += loss.item()

        # Backward pass
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 옵티마이저 업데이트
        optimizer.step()
        scheduler.step()

    # Epoch 종료 후 평균 손실 계산
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print(f"\n  Average training loss: {avg_train_loss:.2f}")
    print(f"  Training epoch took: {training_time}")



Training...
  Batch    40 of 57. Elapsed: 0:00:13.

  Average training loss: 0.34
  Training epoch took: 0:00:18

Training...
  Batch    40 of 57. Elapsed: 0:00:13.

  Average training loss: 0.34
  Training epoch took: 0:00:18

Training...
  Batch    40 of 57. Elapsed: 0:00:13.

  Average training loss: 0.34
  Training epoch took: 0:00:18

Training...
  Batch    40 of 57. Elapsed: 0:00:12.

  Average training loss: 0.35
  Training epoch took: 0:00:18

Training...
  Batch    40 of 57. Elapsed: 0:00:12.

  Average training loss: 0.35
  Training epoch took: 0:00:18


In [32]:
model.save_pretrained('/content/drive/MyDrive/bert')
tokenizer.save_pretrained('/content/drive/MyDrive/bert')

('/content/drive/MyDrive/bert/tokenizer_config.json',
 '/content/drive/MyDrive/bert/special_tokens_map.json',
 '/content/drive/MyDrive/bert/vocab.txt',
 '/content/drive/MyDrive/bert/added_tokens.json')

# 정답과 예측 값 비교

In [41]:
import json
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# 모델, 토크나이저 로드
model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/bert')
tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/bert')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenizer로 입력 데이터 준비
inputs = tokenizer(list(sentences), padding=True, truncation=True, return_tensors="pt")
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# 모델 예측
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    probabilities = F.softmax(logits, dim=1)
    predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()

# 라벨 맵핑 (예: 8개의 클래스)
label_map = {0: 'Label_0(정상)', 1: 'Label_1(위험)', 2: 'Label_2(위험)',
             3: 'Label_3(위험)', 4: 'Label_4(위험)', 5: 'Label_5(위험)',
             6: 'Label_6(위험)', 7: 'Label_7(위험)'}

# 예측 라벨과 정답 라벨 비교
print("\nComparing Predicted Labels and True Labels:")
for i, text in enumerate(sentences):
    pred_label = predicted_labels[i]
    true_label = true_labels[i]
    print(f"Input: {text}")
    print(f"True Label: {label_map[true_label]} | Predicted Label: {label_map[pred_label]}")
    print("="*50)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Input: A woman wearing a white top has fallen in the snow, and another man is approaching her.
True Label: Label_6(위험) | Predicted Label: Label_6(위험)
Input: A man in a suit is riding a board on the road.
True Label: Label_4(위험) | Predicted Label: Label_4(위험)
Input: A man wearing suit is kicking a person in the room.
True Label: Label_5(위험) | Predicted Label: Label_5(위험)
Input: A man wearing jeans kicking a fallen man next to a car parked on the street.
True Label: Label_5(위험) | Predicted Label: Label_5(위험)
Input: A woman in a black clothes and black car is colliding on a crosswalk.
True Label: Label_6(위험) | Predicted Label: Label_6(위험)
Input: A firefighter comes out of a car on fire holding a child.
True Label: Label_7(위험) | Predicted Label: Label_7(위험)
Input: A person in white clothes is walking on the road during the night.
True Label: Label_3(위험) | Predicted Label: Label_3(위험)
Input: A girl wearing mask is attacking a boy wearing whi

In [42]:
print(f"Total number of captions: {len(sentences)}")

Total number of captions: 1900


In [43]:
import torch.nn.functional as F
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# 분류 성능 평가
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')
accuracy = accuracy_score(true_labels, predicted_labels)

print("\nClassification Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")




Classification Metrics:
Accuracy: 0.9253
Precision: 0.9208
Recall: 0.9253
F1 Score: 0.9209


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 정답과 예측이 다른 캡션 추출

In [38]:
# 정답과 예측이 다른 캡션 추출
mismatched_samples = []  # 정답과 예측이 다른 샘플 저장

for i, text in enumerate(sentences):
    pred_label = predicted_labels[i]
    true_label = true_labels[i]
    if pred_label != true_label:
        mismatched_samples.append({
            "Input": text,
            "True Label": label_map[true_label],
            "Predicted Label": label_map[pred_label]
        })

# 정답과 예측이 다른 샘플 출력
print("\nMismatched Samples:")
for sample in mismatched_samples:
    print(f"Input: {sample['Input']}")
    print(f"True Label: {sample['True Label']} | Predicted Label: {sample['Predicted Label']}")
    print("="*50)

# 정답과 예측이 다른 샘플 총 개수 출력
print(f"\nTotal Mismatched Samples: {len(mismatched_samples)}")



Mismatched Samples:
Input: A woman wearing a blue shirt and sunglasses is walking on the road.
True Label: Label_4(위험) | Predicted Label: Label_3(위험)
Input: Three people are fighting in front of the store, and two people are walking on the street.
True Label: Label_6(위험) | Predicted Label: Label_5(위험)
Input: The person wearing a green top fell on the flower bed.
True Label: Label_6(위험) | Predicted Label: Label_5(위험)
Input: A woman wearing an orange tank top is sitting on the railing and taking pictures.
True Label: Label_0(정상) | Predicted Label: Label_3(위험)
Input: A man is praying with his hands together and a fire is burning behind him.
True Label: Label_6(위험) | Predicted Label: Label_7(위험)
Input: A man wearing a white shirt is sitting on a chair on the balcony.
True Label: Label_3(위험) | Predicted Label: Label_4(위험)
Input: Two people are fighting on the street in front of the cross walk.
True Label: Label_6(위험) | Predicted Label: Label_7(위험)
Input: A building is on fire and a man is 