In [1]:
# !pip install flask

Collecting flask
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting itsdangerous>=2.1.2 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting blinker>=1.6.2 (from flask)
  Downloading blinker-1.8.2-py3-none-any.whl.metadata (1.6 kB)
Downloading flask-3.0.3-py3-none-any.whl (101 kB)
Downloading blinker-1.8.2-py3-none-any.whl (9.5 kB)
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Installing collected packages: itsdangerous, blinker, flask
Successfully installed blinker-1.8.2 flask-3.0.3 itsdangerous-2.2.0


In [1]:
import os
import json
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import torch
from flask import Flask, request, jsonify

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import json
import pickle
from flask import Flask, request, jsonify
from concurrent.futures import ThreadPoolExecutor
from transformers import BertTokenizer, BertForQuestionAnswering
import torch
from torch.utils.data import Dataset, DataLoader

# Flask 애플리케이션 설정
app = Flask(__name__)

# BERT 모델 및 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# 캐시 파일 경로 설정
train_cache_file = 'qa_data_cache.pkl'
val_cache_file = 'val_data_cache.pkl'

# 캐시에서 데이터 로드
def load_data_from_cache(cache_file):
    if os.path.exists(cache_file):
        print(f"캐시된 데이터를 불러오는 중: {cache_file}")
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    return None

def save_data_to_cache(data, cache_file):
    with open(cache_file, 'wb') as f:
        pickle.dump(data, f)
    print(f"데이터를 캐시에 저장했습니다: {cache_file}")

# JSON 파일에서 데이터 로드
def load_single_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    qa_pairs = []
    for qa in data.get('qa_pairs', []):
        question = qa.get('question')
        answer = qa.get('answer')
        context = qa.get('context')
        answer_start = qa.get('answer_start', 0)
        qa_pairs.append((question, context, answer, answer_start))
    return qa_pairs

# 디렉토리 내 모든 JSON 파일에서 데이터 로드
def load_data_parallel(data_folder):
    qa_pairs = []
    files = [os.path.join(root, file)
             for root, _, files in os.walk(data_folder)
             for file in files if file.endswith('.json')]

    with ThreadPoolExecutor() as executor:
        results = list(executor.map(load_single_file, files))

    qa_pairs = [result for result in results if result is not None]
    return qa_pairs

# 캐시 활용 데이터 로드 함수
def load_data_with_cache(data_folder, cache_file):
    data = load_data_from_cache(cache_file)
    if data is not None:
        return data

    data = load_data_parallel(data_folder)
    save_data_to_cache(data, cache_file)
    return data

# 데이터셋 클래스 정의
class QADataset(Dataset):
    def __init__(self, qa_pairs, tokenizer, max_len=512):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        question, context, answer, answer_start = self.qa_pairs[idx]

        inputs = self.tokenizer.encode_plus(
            question, context, add_special_tokens=True, max_length=self.max_len,
            truncation=True, padding="max_length", return_tensors="pt"
        )

        answer_end = answer_start + len(self.tokenizer.encode(answer, add_special_tokens=False))
        input_ids = inputs["input_ids"].squeeze()
        attention_mask = inputs["attention_mask"].squeeze()
        token_type_ids = inputs["token_type_ids"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "start_positions": torch.tensor(answer_start),
            "end_positions": torch.tensor(answer_end)
        }

# API 기본 경로 (서버 테스트용)
@app.route('/')
def home():
    return "QA 챗봇 서버가 실행 중입니다."

# 질문에 대한 답변 처리 엔드포인트
@app.route('/ask', methods=['POST'])
def ask():
    data = request.json
    question = data.get('question', '')

    if not question:
        return jsonify({'error': 'No question provided'}), 400

    # 가장 최근 데이터셋에서 답변 가져오기
    context = qa_pairs[0][1]  # 예시로 첫 번째 context를 사용
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**inputs)
        answer_start = torch.argmax(outputs.start_logits)
        answer_end = torch.argmax(outputs.end_logits) + 1
        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

    return jsonify({'question': question, 'answer': answer})

# 데이터 로드 및 캐시 적용
train_data_folder = 'C:/fintech_service/08_AI_Serving/extracted_files/Training/unzipped'
val_data_folder = 'C:/fintech_service/08_AI_Serving/extracted_files/Validation/unzipped'

print("캐시에서 데이터를 로드 중...")
train_qa_pairs = load_data_with_cache(train_data_folder, train_cache_file)
val_qa_pairs = load_data_with_cache(val_data_folder, val_cache_file)

if __name__ == '__main__':
    app.run(debug=False, host='0.0.0.0', port=5000)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


캐시에서 데이터를 로드 중...
캐시된 데이터를 불러오는 중: qa_data_cache.pkl
캐시된 데이터를 불러오는 중: val_data_cache.pkl
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.30.26.143:5000
[33mPress CTRL+C to quit[0m
172.30.26.143 - - [14/Oct/2024 15:15:28] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Oct/2024 15:20:31] "GET / HTTP/1.1" 200 -
172.30.26.143 - - [14/Oct/2024 15:34:55] "GET / HTTP/1.1" 200 -


In [3]:
from flask import Flask, render_template, request, jsonify
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

app = Flask(__name__)

# BERT 모델 및 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

@app.route('/')
def home():
    return render_template('index.html')

@app.route('/get_answer', methods=['POST'])
def get_answer():
    data = request.json
    question = data['question']
    context = data['context']

    # BERT 모델을 이용해 질문-답변 예측
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True)
    input_ids = inputs["input_ids"].tolist()[0]

    outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    return jsonify({'answer': answer})

if __name__ == '__main__':
    app.run(debug=True)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
 * Restarting with stat
Traceback (most recent call last):
  File "/home/user/miniforge3/envs/dml/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/user/miniforge3/envs/dml/lib/python3.9/site-packages/traitlets/config/application.py", line 1074, in launch_instance
    app.initialize(argv)
  File "/home/user/miniforge3/envs/dml/lib/python3.9/site-packages/traitlets/config/application.py", line 118, in inner
    return method(app, *args, **kwargs)
  File "/home/user/miniforge3/envs/dml/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 692, in initialize
    self.init_sockets()
  File "/home/user/miniforge3/envs/dml/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 331, in init_sockets
    self.shell_port = self._bind_socket(self.shell_socket, self.shell_port)
  File "/home/user/miniforge3/envs/dml/lib/python3.9/site-packages/ipykernel/kernel

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
import gradio as gr
import torch
import os
import pickle
from transformers import BertTokenizer, BertForQuestionAnswering

# BERT 모델과 토크나이저 로드
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# 캐시에서 데이터를 로드하는 함수
def load_data_from_cache(cache_file):
    if os.path.exists(cache_file):
        print(f"캐시된 데이터를 불러오는 중: {cache_file}")
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    return None

# 데이터를 캐시에 저장하는 함수
def save_data_to_cache(data, cache_file):
    with open(cache_file, 'wb') as f:
        pickle.dump(data, f)
    print(f"데이터를 캐시에 저장했습니다: {cache_file}")

# JSON 파일에서 데이터 로드 (여기서는 단순화된 형태로 사용)
def load_data_from_json_files(data_folder):
    qa_pairs = []
    for root, dirs, files in os.walk(data_folder):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                # 파일을 읽고, 질문과 답변 쌍을 로드합니다 (여기서는 단순 텍스트로 가정)
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = f.read()
                    # JSON에서 질문과 답변을 추출하여 추가합니다.
                    question = "What is the main issue discussed?"  # 임의로 설정
                    answer = data  # 파일 전체를 answer로 설정 (실제 구현에서는 JSON 파싱 필요)
                    qa_pairs.append((question, answer))
    return qa_pairs

# 캐시를 활용한 데이터 로드 함수
def load_data_with_cache(data_folder, cache_file):
    data = load_data_from_cache(cache_file)
    if data is not None:
        return data
    data = load_data_from_json_files(data_folder)
    save_data_to_cache(data, cache_file)
    return data

# 질문에 대한 답변을 생성하는 함수 정의
def answer_question(question, context):
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
    
    input_ids = inputs["input_ids"].tolist()[0]

    # 모델 예측
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    # 가장 높은 점수를 받은 토큰의 인덱스
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1

    # 토큰을 문자열로 변환
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    return answer

# Gradio 인터페이스 정의
def chatbot_interface(question):
    # 캐시된 데이터 로드
    data_folder = "C:/fintech_service/08_AI_Serving/extracted_files/Training/unzipped"  # 데이터가 저장된 폴더
    cache_file = "qa_data_cache.pkl"  # 캐시 파일
    qa_data = load_data_with_cache(data_folder, cache_file)

    # 데이터에서 임의의 컨텍스트를 선택 (여기서는 첫 번째 데이터 사용)
    context = qa_data[0][1]  # 첫 번째 데이터의 답변을 컨텍스트로 사용

    answer = answer_question(question, context)
    return answer

# Gradio 인터페이스 설정
interface = gr.Interface(fn=chatbot_interface, 
                         inputs="text", 
                         outputs="text",
                         title="정책/법률 현안 QA 챗봇",
                         description="질문을 입력하면 해당 현안에 대한 답변을 제공합니다.")

# 인터페이스 실행
interface.launch()


In [5]:
import pandas as pd

# JSON 파일을 불러오거나 CSV로 변경하여 처리 가능
# 예시로 CSV 파일을 불러옴 (질문과 답변이 있는 파일)
def load_data(file_path):
    df = pd.read_csv(file_path)
    df = df[['question', 'answer']]  # 질문과 답변만 사용
    return df

# 데이터를 전처리하여 학습 가능한 텍스트 형태로 변환
def preprocess_data(df):
    processed_data = []
    for idx, row in df.iterrows():
        question = row['question']
        answer = row['answer']
        # 질문과 답변을 이어붙여서 하나의 학습 데이터로 만듦
        text = f"질문: {question}\n답변: {answer}"
        processed_data.append(text)
    return processed_data

# 데이터 로드 및 전처리
file_path = './data/qa_data.csv'
df = load_data(file_path)
processed_data = preprocess_data(df)

print(processed_data[:3])  # 첫 3개 데이터 출력


FileNotFoundError: [Errno 2] No such file or directory: './data/qa_data.csv'