In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 데이터 로드
file_path = "C:\\Users\\jhpar\\Desktop\\머신러닝의 기초\\구현\\텍스트_데이터.xlsx"
data = pd.read_excel(file_path)

# 1. 필요한 열만 추출
data = data[['요약', '고용 현황']]
data = data.dropna()  # 결측값 제거
data['퇴사여부'] = data['고용 현황'].apply(lambda x: 1 if x == '전직원' else 0)  # 전직원: 1, 현직원: 0

# 2. 텍스트 벡터화
X = data['요약']
y = data['퇴사여부']
vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(X)

# 3. 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. 모델 훈련
model = LogisticRegression()
model.fit(X_train, y_train)

# 5. 예측 및 정확도 평가
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("모델 정확도:", accuracy)


모델 정확도: 0.612


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# 데이터 전처리
data = data[['요약', '고용 현황']].dropna()
data['퇴사여부'] = data['고용 현황'].apply(lambda x: 1 if x == '전직원' else 0)

# 텍스트 벡터화
X = data['요약']
y = data['퇴사여부']
vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(X)

# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 리스트
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB()
}

# 모델 성능 비교
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} 정확도: {accuracy:.4f}")


Logistic Regression 정확도: 0.6120
Random Forest 정확도: 0.5720
SVM 정확도: 0.6040
Naive Bayes 정확도: 0.6280


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# 1. 필요한 열만 추출
data = data[['요약', '고용 현황']].dropna()
data['퇴사여부'] = data['고용 현황'].apply(lambda x: 1 if x == '전직원' else 0)

# 2. 텍스트 벡터화
X = data['요약']
y = data['퇴사여부']
vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(X)

# 3. 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. 모델 훈련
model = LogisticRegression()
model.fit(X_train, y_train)

# 5. 예측 및 정확도 평가
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("모델 정확도:", accuracy)

# 6. 각 단어의 가중치 확인 (추가 코드)
feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_.flatten()

# 가중치가 높은 순으로 정렬하여 상위 10개 단어 추출
top_positive_coefficients = np.argsort(coefficients)[-10:]  # 퇴사에 긍정적인 영향
top_negative_coefficients = np.argsort(coefficients)[:10]   # 퇴사에 부정적인 영향

# 상위 단어들 출력
top_positive_words = [(feature_names[i], coefficients[i]) for i in top_positive_coefficients]
top_negative_words = [(feature_names[i], coefficients[i]) for i in top_negative_coefficients]

print("퇴사에 긍정적 영향을 미치는 단어들:", top_positive_words)
print("퇴사에 부정적 영향을 미치는 단어들:", top_negative_words)


모델 정확도: 0.612
퇴사에 긍정적 영향을 미치는 단어들: [('좋았습니다', 0.8429102509758447), ('복지와', 0.846603504257779), ('같음', 0.8471754898484076), ('건물이', 0.8676054235260595), ('같습니다', 0.9052196445961774), ('복지포인트', 0.9103476299777947), ('다니기', 0.9781449091544405), ('사내', 1.0282160973256884), ('좋았고', 1.0499517259212958), ('높음', 1.0723773963374637)]
퇴사에 부정적 영향을 미치는 단어들: [('없음', -1.4499370603671664), ('워라밸', -1.3263143388722956), ('대체적으로', -1.3215224610235983), ('따라', -1.1731464797889524), ('휴가사용', -1.1670882680790753), ('않다', -1.0530654343695445), ('연봉', -1.0205451100967475), ('워라밸은', -1.0095082704858411), ('안정성이', -1.0068819603446129), ('급여', -1.0045033623581543)]
