In [1]:
# ========================================================
# 1. 필수 라이브러리 설치 (최신 버전)
# ========================================================
!pip install --upgrade transformers accelerate seqeval datasets

import pandas as pd
import numpy as np
import re
import torch
import json
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from google.colab import drive

# ========================================================
# 2. 데이터 로드 및 항공사 DB 구축
# ========================================================
drive.mount('/content/drive')
MODEL_SAVE_PATH = '/content/drive/MyDrive/aero_bert_model'

# 1. 뉴스 데이터 로드
try:
    df = pd.read_csv('data/aviation_data.csv')
    df = df[df['source'] == 'AeroRoutes'] # AeroRoutes만 학습
    print(f"✅ 학습용 뉴스 데이터: {len(df)}개")
except:
    print("❌ aviation_data.csv 파일이 없습니다.")

# 2. 항공사 이름 DB 로드 (CSV 활용)
try:
    airline_df = pd.read_csv('data/airlines_list.csv', encoding='cp949')
    # 영문 이름 추출 및 정제
    full_airline_list = airline_df['영문항공사명'].dropna().astype(str).tolist()

    refined_airlines = set()
    for name in full_airline_list:
        # "Delta Air Lines" -> "Delta"도 추가
        parts = name.split()
        if len(parts) > 0:
            refined_airlines.add(parts[0])
            refined_airlines.add(name)

    # 오탐지 방지 (너무 흔한 단어 제외)
    stop_words = {'Air', 'Airlines', 'Airways', 'Aviation', 'International', 'Limited', 'Co.', 'Inc.', 'The', 'Group'}
    final_airline_list = [w for w in refined_airlines if w not in stop_words and len(w) > 2]

    print(f"✅ 항공사 DB 구축 완료: {len(final_airline_list)}개 항공사 이름 학습 반영")

except Exception as e:
    print(f"⚠️ 항공사 CSV 로드 실패 (기본 리스트 사용): {e}")
    final_airline_list = ['Korean', 'Asiana', 'Jeju', 'Jin', 'Cathay', 'Delta', 'United', 'Singapore']

# ========================================================
# 3. 자동 라벨링 함수 (CSV 기반 강화됨)
# ========================================================
def auto_label_text(text):
    text = str(text)
    # 특수문자 보존을 위해 단순 split 대신 토크나이징 흉내
    tokens = text.split()
    ner_tags = ['O'] * len(tokens)

    # 정규식 패턴
    date_pattern = r'^\d{1,2}[A-Z]{3}\d{2}$'  # 18APR25
    aircraft_pattern = r'^(A3\d{2}|B7\d{2}|7\d{7}|[A-Z]?\d{3}[a-z]?(-[A-Z0-9]+)?)$' # 기종

    for i, token in enumerate(tokens):
        # 특수문자 제거 후 단어 비교
        clean_token = re.sub(r'[^a-zA-Z0-9]', '', token)

        if re.match(date_pattern, clean_token):
            ner_tags[i] = 'B-DATE'
        elif re.match(aircraft_pattern, clean_token):
            ner_tags[i] = 'B-AIRCRAFT'
        elif ('–' in token or '-' in token) and len(token) > 3 and not re.match(aircraft_pattern, clean_token):
            ner_tags[i] = 'I-ROUTE' # 노선
        # CSV에 있는 항공사 이름이면 태깅
        elif clean_token in final_airline_list:
            ner_tags[i] = 'B-AIRLINE'

    return tokens, ner_tags

# 데이터셋 변환
data = []
for content in df['content']:
    tokens, tags = auto_label_text(content)
    data.append({'tokens': tokens, 'ner_tags': tags})

# ========================================================
# 4. Dataset 클래스 (Subword 라벨링 완벽 지원)
# ========================================================
class NERDataset(Dataset):
    def __init__(self, data, tokenizer, label_map, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.label_map = label_map
        self.max_len = max_len

    def __len__(self): return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        tokens = item['tokens']
        tags = item['ner_tags']

        # 토큰화
        encoding = self.tokenizer(tokens, is_split_into_words=True, padding='max_length', truncation=True, max_length=self.max_len)
        word_ids = encoding.word_ids()

        # 라벨 확장 (18APR25 -> 18, ##APR, ##25 모두 DATE로)
        labels_ids = []
        for word_idx in word_ids:
            if word_idx is None: labels_ids.append(-100)
            else: labels_ids.append(self.label_map[tags[word_idx]])

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(labels_ids)
        return item

# ========================================================
# 5. 학습 실행
# ========================================================
# 라벨 맵핑
label_list = sorted(list(set([tag for item in data for tag in item['ner_tags']])))
label_map = {label: i for i, label in enumerate(label_list)}
print(f"Labels: {label_map}")

# 데이터셋 준비
train_data, val_data = train_test_split(data, test_size=0.15, random_state=42)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
train_dataset = NERDataset(train_data, tokenizer, label_map)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# 모델 준비
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(label_list))
model.to(device)
optimizer = AdamW(model.parameters(), lr=3e-5)

# 학습 루프 (5 Epoch)
print("=== 학습 시작 ===")
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f}")

# 모델 저장 (라벨 맵 포함)
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)
config = model.config
config.id2label = {i: label for label, i in label_map.items()}
config.label2id = label_map
config.save_pretrained(MODEL_SAVE_PATH)

print(f"✅ 모델 저장 완료: {MODEL_SAVE_PATH}")

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m144.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m47.0 MB/s[0

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


=== 학습 시작 ===
Epoch 1 | Loss: 0.3272
Epoch 2 | Loss: 0.0627
Epoch 3 | Loss: 0.0332
Epoch 4 | Loss: 0.0214
Epoch 5 | Loss: 0.0166
✅ 모델 저장 완료: /content/drive/MyDrive/aero_bert_model
