In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer, util

# 한글 폰트 설정
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

# 데이터 불러오기
individual_data = pd.read_excel('C:\\Users\\dbwld\\OneDrive\\바탕 화면\\Project\\individual_data.xlsx')
nonindividual_data = pd.read_excel('C:\\Users\\dbwld\\OneDrive\\바탕 화면\\Project\\nonindividual_data.xlsx')
symptom_data = pd.read_excel('C:\\Users\\dbwld\\OneDrive\\바탕 화면\\Project\\symptom_data.xlsx')

# 데이터 전처리
texts = symptom_data['text']
symptom_data['건강 카테고리'] = symptom_data['건강 카테고리'].replace('정신 및 건강', '수면 및 정신')
labels = symptom_data['건강 카테고리']

# TF-IDF 벡터화
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
text_vectors = vectorizer.fit_transform(texts)

# 라벨 인코딩
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# 오버샘플링 적용 (데이터 불균형 해결)
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(text_vectors, encoded_labels)

# 데이터 분할 (Train: 70%, Valid: 15%, Test: 15%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# 모델 학습
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# 모델 평가
y_pred = model.predict(X_test)
print('📈 Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# 모델 및 벡터 저장
joblib.dump(model, 'health_recommendation_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

# SBERT 모델 로드 (증상 유사도 분석용 및 섭취 주의사항 검토용 각각 다르게 설정)
symptom_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
warning_model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')

# 개인형 & 비개인형 데이터 결합
individual_data['형태'] = '개별 인정형 품목'
nonindividual_data['형태'] = '고시형 품목'
combined_data = pd.concat([individual_data, nonindividual_data], ignore_index=True)

def recommend_health_products(user_input, user_condition, top_n=5):
    input_vector = symptom_model.encode(user_input)
    symptom_similarities = []
    
    for idx, row in symptom_data.iterrows():
        symptom_embedding = symptom_model.encode(row['text'])
        sim = util.cos_sim(np.array([input_vector]), np.array([symptom_embedding]))[0][0].item()
        symptom_similarities.append((sim, row['text'], row['건강 카테고리']))
    
    top_similar_symptoms = sorted(symptom_similarities, key=lambda x: x[0], reverse=True)[:2]
    similar_categories = list(set([item[2] for item in top_similar_symptoms]))
    
    recommended_products = []
    for idx, row in combined_data.iterrows():
        if row['건강 카테고리'] in similar_categories:
            product_embedding = symptom_model.encode(row['주요 기능'])
            sim_score = util.cos_sim(input_vector.reshape(1, -1), product_embedding.reshape(1, -1))[0][0].item()
            recommended_products.append((row['품목명'], sim_score, row['섭취 주의사항'], row['형태']))
    
    recommended_products = sorted(recommended_products, key=lambda x: x[1], reverse=True)[:top_n]
    
    final_recommendations = []
    for prod_name, score, warning, kind in recommended_products:
        if is_safe_for_condition(warning, user_condition, threshold=0.45):
            final_recommendations.append((prod_name, kind))
    
    print(f"\n🧠 예측된 건강 카테고리: {', '.join(similar_categories)}")
    print(f"💊 최종 추천 건강기능식품 (상위 {top_n}개):")
    for idx, (name, kind) in enumerate(final_recommendations):
        print(f"{idx+1}. {name} ({kind})")

def is_safe_for_condition(product_warning, user_condition, threshold=0.45):
    if isinstance(product_warning, float) and pd.isna(product_warning):
        return True
    bullets = product_warning.split('\n')
    for bullet in bullets:
        bullet = bullet.strip()
        if bullet == "":
            continue
        sim = util.cos_sim(warning_model.encode(bullet).reshape(1, -1), warning_model.encode(user_condition).reshape(1, -1))[0][0].item()
        if sim >= threshold:
            return False
    return True

# 예제 실행
user_input = "스트레스를 많이 받아요"
user_condition = "임산부에요"
recommend_health_products(user_input, user_condition, top_n=3)


📈 Accuracy: 0.9
              precision    recall  f1-score   support

      간 및 소화       1.00      1.00      1.00        10
     면역 및 체력       1.00      0.70      0.82        10
      뼈 및 구조       0.83      1.00      0.91        10
     수면 및 정신       0.90      0.90      0.90        10
    심혈관 및 대사       0.82      0.90      0.86        10

    accuracy                           0.90        50
   macro avg       0.91      0.90      0.90        50
weighted avg       0.91      0.90      0.90        50


🧠 예측된 건강 카테고리: 수면 및 정신
💊 최종 추천 건강기능식품 (상위 3개):
1. 홍경천 추출물 (고시형 품목)
2. 유단백가수분해물 (고시형 품목)
3. 돌외잎추출물(제2015-7호) (고시형 품목)


In [2]:
pip install --upgrade numexpr bottleneck

Collecting numexpr
  Downloading numexpr-2.10.2-cp39-cp39-win_amd64.whl.metadata (8.3 kB)
Collecting bottleneck
  Downloading Bottleneck-1.4.2-cp39-cp39-win_amd64.whl.metadata (7.9 kB)
Downloading numexpr-2.10.2-cp39-cp39-win_amd64.whl (144 kB)
Downloading Bottleneck-1.4.2-cp39-cp39-win_amd64.whl (111 kB)
Installing collected packages: numexpr, bottleneck
  Attempting uninstall: numexpr
    Found existing installation: numexpr 2.8.3
    Uninstalling numexpr-2.8.3:
      Successfully uninstalled numexpr-2.8.3
  Attempting uninstall: bottleneck
    Found existing installation: Bottleneck 1.3.5
    Uninstalling Bottleneck-1.3.5:
      Successfully uninstalled Bottleneck-1.3.5
Successfully installed bottleneck-1.4.2 numexpr-2.10.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install --upgrade pandas


