In [2]:
# 1. 필요한 패키지 설치
!pip install sentence-transformers
!pip install sklearn
!pip install torch

# 2. 필요한 모듈 임포트
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# 3. 데이터 전처리 함수
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z가-힣0-9\s]', '', text)  # 특수문자 제거
    text = text.lower()  # 소문자 변환
    return text

# 4. 키워드 정의
keywords = {
    "연봉": ["돈", "급여", "연봉", "보상", "인센티브"],
    "수평적문화": ["수평적", "자유로운", "열린", "소통", "의견"],
    "워라벨": ["밸런스", "칼퇴", "유연", "쉬다", "휴식"],
    "복지": ["복지", "지원", "혜택", "제도"],
    "안정성": ["안정", "안정성", "오래", "튼튼"],
    "성장기회": ["성장", "배움", "발전", "기회", "교육"]
}

# 5. 리뷰 데이터 예시
reviews = [
    "돈을 많이 주고 다들 으쌰으쌰하는 분위기이다.",
    "소통이 잘 되고 자유로운 회사 분위기가 좋다.",
    "워라벨이 뛰어나고 복지 혜택도 많다."
]

# 6. BERT 모델 로드
print("모델 로드 중... 시간이 걸릴 수 있습니다.")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# 7. 키워드와 리뷰 분석 함수
def assign_keywords(reviews, keywords):
    processed_reviews = [preprocess_text(review) for review in reviews]
    review_embeddings = model.encode(processed_reviews)

    # 키워드 매칭
    results = []
    for review, embedding in zip(reviews, review_embeddings):
        keyword_scores = {}
        for key, key_phrases in keywords.items():
            key_embeddings = model.encode(key_phrases)
            scores = cosine_similarity([embedding], key_embeddings)
            avg_score = scores.mean()
            keyword_scores[key] = avg_score
        # 상위 3개 키워드 추출
        top_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)[:3]
        results.append((review, [kw[0] for kw in top_keywords]))
    return results

# 8. 분석 실행 및 결과 출력
results = assign_keywords(reviews, keywords)
for review, top_keywords in results:
    print(f"리뷰: {review}")
    print(f"키워드: {top_keywords}")


Collecting sentence-transformers
  Using cached sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.26.3-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.20.3-cp312-none-win_amd64.whl.metadata (6.9 kB)
Using cached sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
Using cached huggingface_hub-0.26.3-py3-none-any.whl (447 kB)
Using cached transformers-4.46.3-py3-none-any.whl (10.0 MB)
Using cached tokenizers-0.20.3-cp312-none-win_amd64.whl (2.4 MB)
Installing collected packages: huggingface-hub, tokenizers, transformers, sentence-transformers
Successfully installed huggingface-hub-0.26.3 sentence-transformers-3.3.1 tokenizers-0.20.3 transfo

  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [15 lines of output]
  The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  rather than 'sklearn' for pip commands.
  
  Here is how to fix this error in the main use cases:
  - use 'pip install scikit-learn' rather than 'pip install sklearn'
  - replace 'sklearn' by 'scikit-learn' in your pip requirements files
    (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  - if the 'sklearn' package is used by one of your dependencies,
    it would be great if you take some time to track which package uses
    'sklearn' instead of 'scikit-learn' and report it to their issue tracker
  - as a last resort, set the environment variable
    SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
  
  More information is available at
  https://github.com/scikit-learn/sklearn-pypi-package
  [end of output]
  
  note: This error originates from a subpr

모델 로드 중... 시간이 걸릴 수 있습니다.


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

리뷰: 돈을 많이 주고 다들 으쌰으쌰하는 분위기이다.
키워드: ['안정성', '연봉', '수평적문화']
리뷰: 소통이 잘 되고 자유로운 회사 분위기가 좋다.
키워드: ['수평적문화', '안정성', '연봉']
리뷰: 워라벨이 뛰어나고 복지 혜택도 많다.
키워드: ['수평적문화', '안정성', '복지']
