In [1]:
import json
import pandas as pd
from datasets import Dataset

def load_data(file_path):
    """
    JSON 데이터를 불러와 Pandas DataFrame으로 변환
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
        
    data = []
    for item in json_data['data']:
        if 'text' in item and 'keywords' in item:
            data.append({
                'text': item['text'],
                'keywords': item['keywords']
            })
    
    df = pd.DataFrame(data)
    print(df.info())
    # None 값을 빈 문자열로 대체
    df = df.fillna('')
    return df


# JSON 파일 경로
file_path_train = "/home/yjtech2/Desktop/yurim/LLM/Data/smell_keyword/smell_keyword_train.json"
df = load_data(file_path_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6177900 entries, 0 to 6177899
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   text      object
 1   keywords  object
dtypes: object(2)
memory usage: 94.3+ MB
None


In [2]:
df.head()

Unnamed: 0,text,keywords
0,쓰레기 매립장에서 악취가 쾌쾌하게 발생하고 있습니다. 이로 인해 주민들의 일상생활에...,"[쓰레기 매립장, 악취, 쾌쾌하게]"
1,쓰레기 매립장에서 악취가 쾌쾌하게 발생하고 있습니다. 이는 주변 환경을 심각하게 오...,"[쓰레기 매립장, 악취, 쾌쾌하게]"
2,쓰레기 매립장에서 악취가 쾌쾌하게 발생하고 있습니다. 이로 인해 주민들이 극심한 불...,"[쓰레기 매립장, 악취, 쾌쾌하게]"
3,쓰레기 매립장에서 악취가 쾌쾌하게 발생하고 있습니다. 이 문제로 인해 지역 주민들이...,"[쓰레기 매립장, 악취, 쾌쾌하게]"
4,쓰레기 매립장에서 악취가 쾌쾌하게 발생하고 있습니다. 주변의 자연 생태계까지 악영향...,"[쓰레기 매립장, 악취, 쾌쾌하게]"


In [8]:
print(type(df['keywords']))

<class 'pandas.core.series.Series'>


In [5]:
random_sample = df.sample(n = 100)
random_sample.to_csv('./mini_data.csv', index = False)

In [9]:
from sklearn.model_selection import train_test_split

# 데이터를 70:30 비율로 나누기
def split_data(df, train_ratio = 0.7):
    """
    DataFrame을 train과 val로 나눔
    """
    train_df, val_df = train_test_split(df, train_size = train_ratio, random_state = 42, shuffle = True)
    return train_df, val_df

# train, val로 나누기
train_df, val_df = split_data(df)

train_df.to_csv('/home/yjtech2/Desktop/yurim/LLM/Data/smell_keyword/train_df.csv', index = False)
val_df.to_csv('/home/yjtech2/Desktop/yurim/LLM/Data/smell_keyword/val_df.csv', index = False)

# 나눈 데이터 확인
print("Train Data:")
print(train_df.head())
print("\nValidation Data:")
print(val_df.head())

# 데이터 크기 확인
print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}")


Train Data:
                                                      text  \
5918002  도심 개발 공사장에서 불쾌한 냄새가 오랫동안 남아있는 발생하고 있습니다. 이 문제로...   
5957914  운동장 공사장에서 먼지가 오랫동안 남아있는 발생하고 있습니다. 공사 현장에서 발생한...   
2907278  사료 공장에서 말린 사료 냄새가 압박감 있게 발생하고 있습니다. 악취 문제로 인해 ...   
268308   공원 주변 쓰레기장에서 침투성 냄새가 거슬리게 발생하고 있습니다. 이 문제로 인해 ...   
4675819  집 근처 쓰레기통에서 마른 흙 냄새가 혼합되게 발생하고 있습니다. 냄새로 인해 창문...   

                           keywords  
5918002         [도심 개발 공사장, 불쾌한 냄새]  
5957914               [운동장 공사장, 먼지]  
2907278   [사료 공장, 말린 사료 냄새, 압박감 있게]  
268308   [공원 주변 쓰레기장, 침투성 냄새, 거슬리게]  
4675819  [집 근처 쓰레기통, 마른 흙 냄새, 혼합되게]  

Validation Data:
                                                      text  \
5961388  운동장 공사장에서 기름 냄새가 스산하게 발생하고 있습니다. 공사장 문제로 주민들이 ...   
5604857  공공시설 공사 현장에서 고무 타는 냄새가 역하게 발생하고 있습니다. 환경 단체가 문...   
5996629  철도 터널 공사장에서 고무 타는 냄새가 자극적으로 발생하고 있습니다. 먼지와 냄새가...   
3496443  해가 뜨기 전에서 바람을 타고 퍼지는 냄새가 날카롭게 발생하고 있습니다. 주민들이 ...   
679929   도로 공사 현장 쓰레기장에서 더러운 냄새가 끈질기게 발생하고 있습니다. 악취 문제 ... 

In [10]:
import pandas as pd
from datasets import Dataset

train_df = pd.read_csv('/home/yjtech2/Desktop/yurim/LLM/Data/smell_keyword/train_df.csv')
val_df = pd.read_csv('/home/yjtech2/Desktop/yurim/LLM/Data/smell_keyword/val_df.csv')


# 데이터셋 전체 변환
if train_df['keywords'].dtype == 'object':  # keywords가 문자열인지 확인
    train_df['keywords'] = train_df['keywords'].apply(eval)  # 또는 json.loads

# val_df의 'keywords' 컬럼 문자열 -> 리스트 변환
if val_df['keywords'].dtype == 'object':  # keywords가 문자열인지 확인
    val_df['keywords'] = val_df['keywords'].apply(eval)  # 또는 json.loads

train_data_dict = train_df.to_dict(orient='list')
train_dataset = Dataset.from_dict(train_data_dict)
train_dataset
print(len(train_dataset))

val_data_dict = val_df.to_dict(orient='list')
val_dataset = Dataset.from_dict(val_data_dict)
print(len(val_dataset))

4324530
1853370


In [11]:
print(train_dataset[0]['keywords'])  
print(train_dataset[0]['keywords'][0])

['도심 개발 공사장', '불쾌한 냄새']
도심 개발 공사장


In [12]:
val_dataset[0]

{'text': '운동장 공사장에서 기름 냄새가 스산하게 발생하고 있습니다. 공사장 문제로 주민들이 민원을 계속 제기하고 있습니다',
 'keywords': ['운동장 공사장', '기름 냄새', '스산하게']}

In [13]:
from datasets import Dataset
import random

def sample_dataset(dataset, fraction):
    """
    데이터셋에서 지정된 비율만큼 샘플링
    """
    sample_size = int(len(dataset) * fraction)
    sampled_indices = random.sample(range(len(dataset)), sample_size)  # 랜덤 인덱스 선택
    return dataset.select(sampled_indices)

# Train과 Val 데이터셋 샘플링
train_sampled_dataset = sample_dataset(train_dataset, 0.001)
val_sampled_dataset = sample_dataset(val_dataset, 0.001)

# 샘플링 후 데이터 크기 확인
print("Original Train Dataset Size:", len(train_dataset))
print("Sampled Train Dataset Size:", len(train_sampled_dataset))
print("Original Validation Dataset Size:", len(val_dataset))
print("Sampled Validation Dataset Size:", len(val_sampled_dataset))

Original Train Dataset Size: 4324530
Sampled Train Dataset Size: 4324
Original Validation Dataset Size: 1853370
Sampled Validation Dataset Size: 1853


In [14]:
# 데이터 타입 확인
print(type(val_sampled_dataset[0]['keywords']))  # <class 'str'>일 가능성이 큼

# keywords 내용 확인
print(val_sampled_dataset[0]['keywords'])


<class 'list'>
['폐기물 소각 공장']


In [20]:
import os
import torch
from tqdm import tqdm
from typing import Dict
import time
from datetime import datetime
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AdamW,
    DataCollatorForSeq2Seq,
)

from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast

class CustomKeyBERTTrainer:
    def __init__(self, model_name: str, **kwargs):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        if self.device == "cuda":
            print(f"GPU Model: {torch.cuda.get_device_name(0)}")
            print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr=kwargs.get("learning_rate", 2e-5))
        self.max_length = kwargs.get("max_length", 128)
        self.training_args = kwargs
        self.save_dir = kwargs.get("save_dir", "./best_model")
        
        self.best_model_path = os.path.join(self.save_dir, "pytorch_model.bin")
        self.tokenizer_path = self.save_dir
        
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'epoch_times': [],
            'best_epoch': 0
        }

        os.makedirs(self.save_dir, exist_ok=True)

    def preprocess_data(self, examples: Dict) -> Dict:
        # 입력 텍스트 처리
        inputs = [f"키워드 추출: {text}" for text in examples["text"]]
        model_inputs = self.tokenizer(
            inputs,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors=None  # 텐서 변환을 DataCollator에 맡김
        )

        # 레이블(키워드) 처리
        labels = [", ".join(keywords) if keywords else "" for keywords in examples["keywords"]]
        
        with self.tokenizer.as_target_tokenizer():
            tokenized_labels = self.tokenizer(
                labels,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
                return_tensors=None  # 텐서 변환을 DataCollator에 맡김
            )

        # -100으로 패딩 토큰을 마스킹
        labels = tokenized_labels["input_ids"]
        for i in range(len(labels)):
            for j in range(len(labels[i])):
                if labels[i][j] == self.tokenizer.pad_token_id:
                    labels[i][j] = -100

        model_inputs["labels"] = labels
        return model_inputs
        
    def save_model_and_tokenizer(self, epoch=None, is_best=False):
        """
        최고 성능 모델만 저장하고 이전 모델을 삭제
        """
        if is_best:
            # 이전 최고 모델 디렉토리 삭제
            if os.path.exists(self.best_model_path):
                print(f"Deleting previous best model at {self.best_model_path}")
                os.system(f"rm -rf {self.best_model_path}")
            
            # 새로운 최고 모델 저장
            save_path = os.path.join(self.save_dir, f"best_model_epoch_{epoch}")
            os.makedirs(save_path, exist_ok=True)
            self.model.save_pretrained(save_path)
            self.tokenizer.save_pretrained(save_path)
            torch.save(self.history, os.path.join(save_path, 'training_history.pt'))
            print(f"New best model saved at {save_path}")

            # 최고 모델 경로 업데이트
            self.best_model_path = save_path

    def calculate_metrics(self, predictions, labels):
        predictions = torch.argmax(predictions, dim=-1)
        correct = (predictions == labels).masked_fill(labels == -100, 0)
        accuracy = correct.sum().item() / (labels != -100).sum().item()
        return accuracy

    def train(self, train_dataset, valid_dataset=None):
        start_time = time.time()
        print(f"\nStarting training at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Training parameters:")
        print(f"- Batch size: {self.training_args['batch_size']}")
        print(f"- Learning rate: {self.training_args.get('learning_rate', '2e-5')}")
        print(f"- Max length: {self.max_length}")
        print(f"- Number of epochs: {self.training_args['num_epochs']}")
        print(f"- Training samples: {len(train_dataset)}")
        if valid_dataset:
            print(f"- Validation samples: {len(valid_dataset)}")
        print("\n" + "="*50 + "\n")

        # 데이터셋 전처리
        print("Preprocessing training data...")
        train_dataset = train_dataset.map(
            self.preprocess_data,
            batched=True,
            remove_columns=train_dataset.column_names,
            desc="Processing training data"
        )

        if valid_dataset is not None:
            print("Preprocessing validation data...")
            valid_dataset = valid_dataset.map(
                self.preprocess_data,
                batched=True,
                remove_columns=valid_dataset.column_names,
                desc="Processing validation data"
            )

        # DataCollator 설정
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            padding=True,
            return_tensors="pt"
        )

        # DataLoader 설정 (num_workers=0으로 변경하여 멀티프로세싱 관련 오류 방지)
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.training_args["batch_size"],
            shuffle=True,
            collate_fn=data_collator,
            num_workers=0,
            pin_memory=True
        )

        if valid_dataset is not None:
            valid_dataloader = torch.utils.data.DataLoader(
                valid_dataset,
                batch_size=self.training_args["batch_size"],
                shuffle=False,
                collate_fn=data_collator,
                num_workers=0,
                pin_memory=True
            )
        best_val_loss = float('inf')
        early_stopping_counter = 0
        early_stopping_patience = self.training_args.get('patience', 3)

        for epoch in range(self.training_args["num_epochs"]):
            epoch_start_time = time.time()
            
            self.model.train()
            epoch_loss = 0
            epoch_accuracy = 0
            train_steps = 0
            
            progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}")
            batch_losses = []
            batch_accuracies = []
            
            for batch_idx, batch in enumerate(progress_bar):
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                )
                
                loss = outputs.loss # 손실 계산
                accuracy = self.calculate_metrics(outputs.logits, labels)
                
                loss.backward()# 손실 역전파 
                self.optimizer.step() # 가중치 업데이트
                self.optimizer.zero_grad() # 그래디언트 초기화

                batch_losses.append(loss.item())
                batch_accuracies.append(accuracy)
                
                current_loss = np.mean(batch_losses[-100:])
                current_accuracy = np.mean(batch_accuracies[-100:])
                progress_bar.set_postfix({
                    'loss': f'{current_loss:.4f}',
                    'accuracy': f'{current_accuracy:.4f}',
                    'batch': f'{batch_idx + 1}/{len(train_dataloader)}'
                })

            avg_train_loss = np.mean(batch_losses)
            avg_train_accuracy = np.mean(batch_accuracies)

            if valid_dataset is not None:
                self.model.eval()
                val_losses = []
                val_accuracies = []

                print("\nRunning validation...")
                with torch.no_grad():
                    for batch in tqdm(valid_dataloader, desc="Validating"):
                        input_ids = batch["input_ids"].to(self.device)
                        attention_mask = batch["attention_mask"].to(self.device)
                        labels = batch["labels"].to(self.device)

                        outputs = self.model(
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels,
                        )
                        
                        loss = outputs.loss
                        accuracy = self.calculate_metrics(outputs.logits, labels)
                        
                        val_losses.append(loss.item())
                        val_accuracies.append(accuracy)

                avg_val_loss = np.mean(val_losses)
                avg_val_accuracy = np.mean(val_accuracies)

                if avg_val_loss < best_val_loss:
                    best_val_loss = avg_val_loss
                    early_stopping_counter = 0
                    self.history['best_epoch'] = epoch + 1
                    print(f"\nNew best validation loss: {best_val_loss:.4f}")
                    self.save_model_and_tokenizer(epoch + 1, is_best=True)  # 최고 모델만 저장
                else:
                    early_stopping_counter += 1


            epoch_time = time.time() - epoch_start_time
            self.history['epoch_times'].append(epoch_time)
            self.history['train_loss'].append(avg_train_loss)
            if valid_dataset is not None:
                self.history['val_loss'].append(avg_val_loss)

            # Print epoch summary
            print(f"\nEpoch {epoch + 1} Summary:")
            print(f"Time taken: {epoch_time:.2f} seconds")
            print(f"Average training loss: {avg_train_loss:.4f}")
            print(f"Training accuracy: {avg_train_accuracy:.4f}")
            if valid_dataset is not None:
                print(f"Validation loss: {avg_val_loss:.4f}")
                print(f"Validation accuracy: {avg_val_accuracy:.4f}")
                print(f"Best validation loss so far: {best_val_loss:.4f}")
                print(f"Early stopping counter: {early_stopping_counter}/{early_stopping_patience}")

            if early_stopping_counter >= early_stopping_patience:
                print("\nEarly stopping triggered.")
                break
    def predict(self, text: str) -> str:
        """모델 추론"""
        inputs = self.tokenizer(
            f"키워드 추출: {self._normalize_text(text)}",
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
        ).to(self.device)

        outputs = self.model.generate(
            inputs["input_ids"], max_length=self.max_length, num_beams=5
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def _normalize_text(self, text: str) -> str:
        return text.strip()

2024-11-27 10:16:43.103757: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-27 10:16:43.297316: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # 멀티프로세싱 경고 방지

trainer = CustomKeyBERTTrainer(
    model_name="facebook/bart-base", # t5-base     skt/kobart-base-v2
    max_length=128,
    learning_rate=1e-4,
    batch_size=8,
    num_epochs=10,
    gradient_accumulation_steps=8,
    patience=3 # 몇 에폭마다 체크포인트 저장할지
)


if __name__ == "__main__":
    torch.cuda.empty_cache()  # GPU 메모리 초기화
    trainer.train(train_sampled_dataset, val_sampled_dataset)


Using device: cuda
GPU Model: NVIDIA GeForce RTX 4080
Available GPU memory: 15.59 GB





Starting training at: 2024-11-27 10:16:51
Training parameters:
- Batch size: 8
- Learning rate: 0.0001
- Max length: 128
- Number of epochs: 10
- Training samples: 4324
- Validation samples: 1853


Preprocessing training data...


Processing training data:   0%|          | 0/4324 [00:00<?, ? examples/s]



Preprocessing validation data...


Processing validation data:   0%|          | 0/1853 [00:00<?, ? examples/s]

Training Epoch 1: 100%|██████████| 541/541 [00:29<00:00, 18.29it/s, loss=0.0564, accuracy=0.9812, batch=541/541]



Running validation...


Validating: 100%|██████████| 232/232 [00:03<00:00, 62.62it/s]



New best validation loss: 0.0290




New best model saved at ./best_model/best_model_epoch_1

Epoch 1 Summary:
Time taken: 33.98 seconds
Average training loss: 0.1317
Training accuracy: 0.9630
Validation loss: 0.0290
Validation accuracy: 0.9901
Best validation loss so far: 0.0290
Early stopping counter: 0/3


Training Epoch 2: 100%|██████████| 541/541 [00:29<00:00, 18.32it/s, loss=0.0725, accuracy=0.9834, batch=541/541]



Running validation...


Validating: 100%|██████████| 232/232 [00:03<00:00, 62.40it/s]



New best validation loss: 0.0229
Deleting previous best model at ./best_model/best_model_epoch_1
New best model saved at ./best_model/best_model_epoch_2

Epoch 2 Summary:
Time taken: 33.83 seconds
Average training loss: 0.0404
Training accuracy: 0.9883
Validation loss: 0.0229
Validation accuracy: 0.9943
Best validation loss so far: 0.0229
Early stopping counter: 0/3


Training Epoch 3: 100%|██████████| 541/541 [00:29<00:00, 18.28it/s, loss=0.0214, accuracy=0.9947, batch=541/541]



Running validation...


Validating: 100%|██████████| 232/232 [00:03<00:00, 62.45it/s]



New best validation loss: 0.0052
Deleting previous best model at ./best_model/best_model_epoch_2
New best model saved at ./best_model/best_model_epoch_3

Epoch 3 Summary:
Time taken: 33.86 seconds
Average training loss: 0.0207
Training accuracy: 0.9946
Validation loss: 0.0052
Validation accuracy: 0.9984
Best validation loss so far: 0.0052
Early stopping counter: 0/3


Training Epoch 4: 100%|██████████| 541/541 [00:29<00:00, 18.26it/s, loss=0.0361, accuracy=0.9925, batch=541/541]



Running validation...


Validating: 100%|██████████| 232/232 [00:03<00:00, 62.44it/s]



Epoch 4 Summary:
Time taken: 33.34 seconds
Average training loss: 0.0216
Training accuracy: 0.9955
Validation loss: 0.0062
Validation accuracy: 0.9990
Best validation loss so far: 0.0052
Early stopping counter: 1/3


Training Epoch 5: 100%|██████████| 541/541 [00:29<00:00, 18.25it/s, loss=0.0499, accuracy=0.9899, batch=541/541]



Running validation...


Validating: 100%|██████████| 232/232 [00:03<00:00, 62.53it/s]



New best validation loss: 0.0025
Deleting previous best model at ./best_model/best_model_epoch_3
New best model saved at ./best_model/best_model_epoch_5

Epoch 5 Summary:
Time taken: 33.89 seconds
Average training loss: 0.0349
Training accuracy: 0.9926
Validation loss: 0.0025
Validation accuracy: 0.9993
Best validation loss so far: 0.0025
Early stopping counter: 0/3


Training Epoch 6: 100%|██████████| 541/541 [00:29<00:00, 18.29it/s, loss=0.0066, accuracy=0.9987, batch=541/541]



Running validation...


Validating: 100%|██████████| 232/232 [00:03<00:00, 62.37it/s]



New best validation loss: 0.0003
Deleting previous best model at ./best_model/best_model_epoch_5
New best model saved at ./best_model/best_model_epoch_6

Epoch 6 Summary:
Time taken: 33.86 seconds
Average training loss: 0.0142
Training accuracy: 0.9973
Validation loss: 0.0003
Validation accuracy: 0.9999
Best validation loss so far: 0.0003
Early stopping counter: 0/3


Training Epoch 7: 100%|██████████| 541/541 [00:29<00:00, 18.26it/s, loss=0.0189, accuracy=0.9963, batch=541/541]



Running validation...


Validating: 100%|██████████| 232/232 [00:03<00:00, 62.45it/s]



Epoch 7 Summary:
Time taken: 33.34 seconds
Average training loss: 0.0135
Training accuracy: 0.9973
Validation loss: 0.0042
Validation accuracy: 0.9993
Best validation loss so far: 0.0003
Early stopping counter: 1/3


Training Epoch 8: 100%|██████████| 541/541 [00:29<00:00, 18.25it/s, loss=0.0132, accuracy=0.9974, batch=541/541]



Running validation...


Validating: 100%|██████████| 232/232 [00:03<00:00, 62.28it/s]



Epoch 8 Summary:
Time taken: 33.37 seconds
Average training loss: 0.0144
Training accuracy: 0.9971
Validation loss: 0.0038
Validation accuracy: 0.9994
Best validation loss so far: 0.0003
Early stopping counter: 2/3


Training Epoch 9: 100%|██████████| 541/541 [00:29<00:00, 18.26it/s, loss=0.0086, accuracy=0.9987, batch=541/541]



Running validation...


Validating: 100%|██████████| 232/232 [00:03<00:00, 62.42it/s]


Epoch 9 Summary:
Time taken: 33.34 seconds
Average training loss: 0.0195
Training accuracy: 0.9958
Validation loss: 0.0017
Validation accuracy: 0.9996
Best validation loss so far: 0.0003
Early stopping counter: 3/3

Early stopping triggered.





In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained('/home/yjtech2/Desktop/yurim/LLM/Pre_processing/smell_keyword/best_model/best_model_epoch_4')
tokenizer = AutoTokenizer.from_pretrained("/home/yjtech2/Desktop/yurim/LLM/Pre_processing/smell_keyword/best_model/best_model_epoch_4")
print("Model and tokenizer loaded successfully.")

Model and tokenizer loaded successfully.


In [25]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 저장된 모델 경로
saved_model_path = "/home/yjtech2/Desktop/yurim/LLM/Pre_processing/smell_keyword/best_model/best_model_epoch_6"

# 모델과 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained(saved_model_path, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(saved_model_path, local_files_only=True).to('cpu')

# 예측 테스트
text = '신호등 근처에서 음식물 쓰레기 냄새가 진동합니다.'
inputs = tokenizer(
    f"키워드 추출: {text}",
    return_tensors="pt",
    max_length=128,
    truncation=True
).to("cpu")

outputs = model.generate(
    inputs["input_ids"],
    max_length=128,
    num_beams=5,
    length_penalty=0.7,
    repetition_penalty=1.2,
    early_stopping=True
)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("원래 문장: ", text)
print("키워드 추출: ", result)

원래 문장:  신호등 근처에서 음식물 쓰레기 냄새가 진동합니다.
키워드 추출:  신호등 근처, 음식물 쓰레기 냄새, 진동합니다


In [23]:
val_sampled_dataset[0]

{'text': '폐기물 소각 공장에서 찌르는 냄새가 매캐하게 발생하고 있습니다. 이는 장기적인 건강 문제로 이어질 가능성이 있습니다',
 'keywords': ['폐기물 소각 공장']}