# AI 챗봇 개발 기초

## 챗봇 기본 개념 

### 의도 분류 기본 예제

In [None]:
class BasicChatbot:
    def __init__(self):
        self.intents = {
            "greeting": ["안녕", "hi", "hello", "반가워"],
            "weather": ["날씨", "weather", "비", "맑아"],
            "goodbye": ["안녕히", "bye", "goodbye", "잘가"]
        }

        self.responses = {
            "greeting": ["안녕하세요! 무엇을 도와드릴까요?", "반갑습니다!"],
            "weather": ["날씨 정보를 조회하겠습니다.", "어느 지역의 날씨를 알고 싶으신가요?"],
            "goodbye": ["안녕히 가세요!", "좋은 하루 되세요!"],
            "default": ["죄송합니다. 이해하지 못했습니다.", "다시 말씀해 주시겠어요?"]
        }
    
    # 의도 분류 함수
    def classify_intent(self, user_input):
        for intent, keywords in self.intents.items():
            for keyword in keywords:
                if keyword in user_input.lower():
                    
                    return intent
        return "default"

    # 응답 생성 함수
    def generate_response(self, intent):
        import random
        
        responses = self.responses.get(intent, self.responses["default"])
        return random.choice(responses)
    
    def chat(self, user_input):
        intent = self.classify_intent(user_input)
        response = self.generate_response(intent)
        return response
    
bot = BasicChatbot()
print(bot.chat("안녕"))  
print(bot.chat("오늘 날씨 어때?"))
print(bot.chat("잘가"))

반갑습니다!
날씨 정보를 조회하겠습니다.


### BERT 기반 의도 분류 예제

In [19]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from sklearn.preprocessing import LabelEncoder

class IntentClassifier:
    """BERT 기반 의도 분류기"""
    def __init__(self, model_name = "klue/bert-base"):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = None
        self.label_encoder = LabelEncoder()
        
    def prepare_data(self, texts, labels):
        """데이터 전처리 및 라벨 인코딩"""
        encoded_labels = self.label_encoder.fit_transform(labels)
        # 토큰화
        encodings = self.tokenizer(
            texts, truncation=True, padding=True, max_length=128, return_tensors="pt"
        )
        print(f"encodings: {encodings}")
        print(f"encoded_labels: {encoded_labels}")
        return encodings, encoded_labels
    
    def train(self, train_texts, train_labels):
        """모델 학습"""
        
        num_labels = len(set(train_labels))
        
        self.model = BertForSequenceClassification.from_pretrained(
            "klue/bert-base", num_labels=num_labels
        )
        
        # 데이터 준비
        train_encodings, train_labels = self.prepare_data(train_texts, train_labels)

        class IntentDataset(torch.utils.data.Dataset):
            def __init__(self, encodings, labels):
                self.encodings = encodings
                self.labels = labels

            def __getitem__(self, idx):
                item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
                item["labels"] = torch.tensor(self.labels[idx])
                return item
                
            def __len__(self):
                return len(self.labels)
                
        train_dataset = IntentDataset(train_encodings, train_labels)
            
        batch_size = 16
        epochs = 3
        learning_rate = 2e-5
        weight_decay = 0.01
            
        train_loader = torch.utils.data.DataLoader(
                train_dataset, batch_size=batch_size, shuffle=True
                )
        optimizer = torch.optim.AdamW(
                self.model.parameters(), 
                lr=learning_rate, 
                weight_decay=weight_decay
                )
            
        self.model.train()
        for epoch in range(epochs):
            print(f"Epoch {epoch+1}/{epochs}")
            total_loss = 0

            for batch in train_loader:
                optimizer.zero_grad()

                outputs = self.model(**batch)
                loss = outputs.loss
                    
                loss.backward()
                    
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                    
                optimizer.step()

                total_loss += loss.item()

            avg_loss = total_loss / len(train_loader)
            print(f"Average loss: {avg_loss:.4f}")

            print("훈련 완료")
            
    def predict(self, text):
        """의도 예측"""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        print(f"inputs: {inputs}")
                
        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

        predicted_class = torch.argmax(predictions, dim=1).item()
        confidence = torch.max(predictions).item()

        intent = self.label_encoder.inverse_transform([predicted_class])[0]

        return intent, confidence
        
# 예제 데이터
train_texts = [
    "안녕하세요", "오늘 날씨 어때?", "잘 가", "반가워", "비 올까?", "안녕히 계세요",
    "주문하고싶어요", "배달 시켜줘", "음식 추천해줘", "영화 추천해줘","메뉴 보여줘"
]

train_labels = [
    "greeting", "weather", "goodbye", "greeting", "weather", "goodbye",
    "order_food", "order_food", "recommend_food", "recommend_movie", "order_food"
]

classifier = IntentClassifier()

classifier.train(train_texts, train_labels)

intent, confidence = classifier.predict("오늘 날씨 어때?")
print(f"Predicted intent: {intent}, Confidence: {confidence:.2f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


encodings: {'input_ids': tensor([[    2,  5891,  2205,  5971,     3,     0],
        [    2,  3822,  5792, 10604,    35,     3],
        [    2,  1521,   543,     3,     0,     0],
        [    2, 28542,     3,     0,     0,     0],
        [    2,  1187, 31444,    35,     3,     0],
        [    2, 26382, 20260,     3,     0,     0],
        [    2,  4867, 19521,  2585, 10283,     3],
        [    2,  7301,  4212,  2810,     3,     0],
        [    2,  4182,  4635,  2097,  2810,     3],
        [    2,  3771,  4635,  2097,  2810,     3],
        [    2,  5396,  3897,  2810,     3,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1],

### spacy/ 개체 검출 예제

In [3]:
# pip install spacy

In [6]:
import spacy
import spacy.cli  # ← 함수 밖(전역)에서 임포트 권장

class EntityRecognizer:
    def __init__(self, lang='xx_ent_wiki_sm'):  # 다국어 NER
        self.nlp = self._load_pipeline(lang, fallback='en_core_web_sm')

    def _load_pipeline(self, name, fallback=None):
        # 1차 시도
        try:
            return spacy.load(name)
        except OSError:
            pass

        # 없으면 다운로드 후 재시도
        try:
            spacy.cli.download(name)
            return spacy.load(name)
        except Exception:
            pass

        # 폴백 모델 시도
        if fallback:
            try:
                spacy.cli.download(fallback)
                return spacy.load(fallback)
            except Exception:
                pass

        # 마지막 폴백: 토크나이저만
        return spacy.blank("xx")

    def extract_entities(self, text):
        doc = self.nlp(text)
        return [
            {
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char,
                "description": spacy.explain(ent.label_) or "",
            }
            for ent in doc.ents
        ]

extractor = EntityRecognizer()
text = "내일 서울의 날씨는 어때."
print(extractor.extract_entities(text))


[{'text': '내일 서울의', 'label': 'PER', 'start': 0, 'end': 6, 'description': 'Named person or family.'}]
