In [89]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from konlpy.tag import Okt

# 데이터 로드
df = pd.read_csv("movie_sample.csv", encoding="utf-8")

# 텍스트 전처리 함수
def clean_text(text):
    text = re.sub(r"[^가-힣\s]", "", str(text))  # 한글과 공백만 남기기
    return text.strip()

df["document"] = df["document"].fillna("").apply(clean_text)

# 학습, 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(df["document"], df["label"], test_size=0.2, random_state=42)

In [90]:
# model은 Logistic으로 고정
#============================
# 1. One-hot Encoding
#============================
vectorizer_onehot = CountVectorizer(binary=True)
X_train_onehot=vectorizer_onehot.fit_transform(X_train)
X_test_onehot=vectorizer_onehot.transform(X_test)
#============================
# 2. Count Vectorizer
#============================
vectorizer_count = CountVectorizer()
X_train_count = vectorizer_count.fit_transform(X_train)
X_test_count = vectorizer_count.transform(X_test)
#============================
# 3. TF-IDF
#============================
vectorizer_tfidf=TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)
#============================
# 4. Word2Vec
#============================
X_train_tokens = [sentence.split() for sentence in X_train]
X_test_tokens = [sentence.split() for sentence in X_test]

w2v_model = Word2Vec(sentences = X_train_tokens, vector_size = 100, window=5, min_count=1)
w2v_model.train(X_train_tokens, total_examples = len(X_train_tokens), epochs=10)

def sentence_vector(tokens, model, vector_size= 100):
    vectors =[model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis =0) if vectors else np.zeros(vector_size)

X_train_w2v =np.array([sentence_vector(tokens, w2v_model) for tokens in X_train_tokens])
X_test_w2v =np.array([sentence_vector(tokens, w2v_model) for tokens in X_test_tokens])


In [3]:
## 각 임베딩별로 성능 평가

def train_eval_model(X_train, X_test, y_train, y_test):
    model = LogisticRegression(max_iter=500)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

## 임베딩별로 성능평가

result={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test)       
}

#성능비교 결과

print('Model 임베딩별 성능 결과')
for method,accuracy in result.items():
    print(f'{method}:{accuracy:.4f}')

Model 임베딩별 성능 결과
one-hot:0.7566
count:0.7553
tfidf:0.7563
Word2Vec:0.6390


### (방법1) 성능을 올리기 위해서는?
- 형태소 분석 추가 ( 어간 추출 추가 )
- 한국어 불용어 제거 ( 단어 사전을 사용하진 않음 )

In [91]:
# 데이터 로드
df = pd.read_csv("movie_sample.csv", encoding="utf-8")

# 텍스트 전처리 함수
def clean_text(text):
    okt = Okt()
    text = re.sub(r"[^가-힣\s]", "", str(text))  # 한글과 공백만 남기기
    text = text.strip()
    tokens = okt.morphs(text, stem=True) #형태소 분석(어간추출)
    ## 불용어 제거
    stopwords = set(['은','는','가','을','를','의','에','과','와','도','한'])
    return " ".join([word for word in tokens if word not in stopwords])

df["document"] = df["document"].fillna("").apply(clean_text)

# 학습, 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(df["document"], df["label"], test_size=0.2, random_state=42)

KeyboardInterrupt: 

In [5]:
# model은 Logistic으로 고정
#============================
# 1. One-hot Encoding
#============================
vectorizer_onehot = CountVectorizer(binary=True)
X_train_onehot=vectorizer_onehot.fit_transform(X_train)
X_test_onehot=vectorizer_onehot.transform(X_test)
#============================
# 2. Count Vectorizer
#============================
vectorizer_count = CountVectorizer()
X_train_count = vectorizer_count.fit_transform(X_train)
X_test_count = vectorizer_count.transform(X_test)
#============================
# 3. TF-IDF
#============================
vectorizer_tfidf=TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)
#============================
# 4. Word2Vec
#============================
X_train_tokens = [sentence.split() for sentence in X_train]
X_test_tokens = [sentence.split() for sentence in X_test]

w2v_model = Word2Vec(sentences = X_train_tokens, vector_size = 100, window=5, min_count=1)
w2v_model.train(X_train_tokens, total_examples = len(X_train_tokens), epochs=10)

def sentence_vector(tokens, model, vector_size= 100):
    vectors =[model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis =0) if vectors else np.zeros(vector_size)

X_train_w2v =np.array([sentence_vector(tokens, w2v_model) for tokens in X_train_tokens])
X_test_w2v =np.array([sentence_vector(tokens, w2v_model) for tokens in X_test_tokens])


In [6]:
## 임베딩별로 성능평가

result_rvs1={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test)       
}

#성능비교 결과

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs1.items():
    print(f'{method}:{accuracy:.4f}')

Model 임베딩별 성능 결과
one-hot:0.8047
count:0.8042
tfidf:0.8077
Word2Vec:0.7753


### (방법2) 하이퍼파라미터 추가
- TF-IDF 성능을 좀 더 개선 시킬 수 있는 방법은?
- TFIDF를 하이퍼파라미터를 추가해서 성능을 더 개선해 볼까?

In [7]:
# model은 Logistic으로 고정
#============================
# 1. One-hot Encoding
#============================
vectorizer_onehot = CountVectorizer(binary=True)
X_train_onehot=vectorizer_onehot.fit_transform(X_train)
X_test_onehot=vectorizer_onehot.transform(X_test)
#============================
# 2. Count Vectorizer
#============================
vectorizer_count = CountVectorizer()
X_train_count = vectorizer_count.fit_transform(X_train)
X_test_count = vectorizer_count.transform(X_test)
#============================
# 3. TF-IDF
#============================
vectorizer_tfidf=TfidfVectorizer(min_df=3, max_df=0.85, ngram_range=(1,3))
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)
#============================
# 4. Word2Vec
#============================
X_train_tokens = [sentence.split() for sentence in X_train]
X_test_tokens = [sentence.split() for sentence in X_test]

w2v_model = Word2Vec(sentences = X_train_tokens, vector_size = 100, window=5, min_count=1)
w2v_model.train(X_train_tokens, total_examples = len(X_train_tokens), epochs=10)

def sentence_vector(tokens, model, vector_size= 100):
    vectors =[model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis =0) if vectors else np.zeros(vector_size)

X_train_w2v =np.array([sentence_vector(tokens, w2v_model) for tokens in X_train_tokens])
X_test_w2v =np.array([sentence_vector(tokens, w2v_model) for tokens in X_test_tokens])


In [8]:
result_rvs2={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test)       
}

#성능비교 결과

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs2.items():
    print(f'{method}:{accuracy:.4f}')

Model 임베딩별 성능 결과
one-hot:0.8047
count:0.8042
tfidf:0.8159
Word2Vec:0.7759


- base 전처리 (tfidf) : 0.7662
- Model 텍스트 전처리 (tfidf) : 0.8077
- TFIDF 하이퍼파라미터 튜닝 : 0.8159

### (방법3) Model을 여러 개 섞어서 성능을 개선시키자!

- voting(xgb, randomforest, log)

In [9]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from xgboost import XGBClassifier

In [10]:
## Voting추가하여 성능평가
log_rg = LogisticRegression(max_iter=1000)
rf_clf= RandomForestClassifier(n_estimators=100)
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

ensemble_model = VotingClassifier(estimators=[
    ('lr',log_rg),
    ('rf',rf_clf),
    ('xgb',xgb_clf)
],voting='hard')

ensemble_model.fit(X_train_tfidf,y_train)
y_pred_ensemble = ensemble_model.predict(X_test_tfidf)
ensemble_accuracy=accuracy_score(y_test, y_pred_ensemble)
#성능비교 결과

result_rvs3={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test),
    'Ensemble': ensemble_accuracy
}

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs3.items():
    print(f'{method}:{accuracy:.4f}')

Parameters: { "use_label_encoder" } are not used.



Model 임베딩별 성능 결과
one-hot:0.8047
count:0.8042
tfidf:0.8159
Word2Vec:0.7759
Ensemble:0.7953


## 필수과제1
- 정규식, 형태소 등의 전처리를 사용 및 TFIDF 하이퍼파라미터 튜닝 -> tfidf:0.8190 가 나왔음 
- word2vec 튜닝 작업 진행해서 -> 0.8190보다 성능이 더 높게 나와야 합니다.
    - 모델의 앙상블 형태의 결합 등을 같이 이용해서 word2vec의 전처리로만 성능이 0.8190가 넘으면 됩니다.
- 0.8190 넘지 않으면 과제 미제출(Word2vec)
- **만약 성능이 너무 개선되지 않으면 데이터셋을 추가로 더 사용하셔도 됩니다!**

In [None]:
# 데이터 로드
df = pd.read_csv("movie_sample.csv", encoding="utf-8")

# 텍스트 전처리 함수
def clean_text(text):
    okt = Okt()
    text = re.sub(r"[^가-힣\s]", "", str(text))  # 한글과 공백만 남기기
    text = text.strip()
    tokens = okt.morphs(text, stem=True) #형태소 분석(어간추출)
    ## 불용어 제거
    stopwords = set(['은','는','가','을','를','의','에','과','와','도','한'])
    return " ".join([word for word in tokens if word not in stopwords])

df["document"] = df["document"].fillna("").apply(clean_text)

# 학습, 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(df["document"], df["label"], test_size=0.2, random_state=42)

In [19]:
X_train_tokens = [sentence.split() for sentence in X_train]
X_test_tokens = [sentence.split() for sentence in X_test]

w2v_model = Word2Vec(sentences = X_train_tokens, vector_size = 300, window=10, min_count=5)
w2v_model.train(X_train_tokens, total_examples = len(X_train_tokens), epochs=30)

def sentence_vector(tokens, model, vector_size= 300):
    vectors =[model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis =0) if vectors else np.zeros(vector_size)

X_train_w2v =np.array([sentence_vector(tokens, w2v_model) for tokens in X_train_tokens])
X_test_w2v =np.array([sentence_vector(tokens, w2v_model) for tokens in X_test_tokens])

In [20]:
result_rvs4={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test)       
}

#성능비교 결과

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs4.items():
    print(f'{method}:{accuracy:.4f}')

Model 임베딩별 성능 결과
one-hot:0.8047
count:0.8042
tfidf:0.8159
Word2Vec:0.7930


In [25]:
X_train_tokens = [sentence.split() for sentence in X_train]
X_test_tokens = [sentence.split() for sentence in X_test]

w2v_model_sg = Word2Vec(sentences = X_train_tokens, vector_size = 300, window=10, min_count=5, sg=1)
w2v_model_sg.train(X_train_tokens, total_examples = len(X_train_tokens), epochs=30)

def sentence_vector(tokens, model, vector_size= 300):
    vectors =[model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis =0) if vectors else np.zeros(vector_size)

X_train_w2v =np.array([sentence_vector(tokens, w2v_model_sg) for tokens in X_train_tokens])
X_test_w2v =np.array([sentence_vector(tokens, w2v_model_sg) for tokens in X_test_tokens])

In [26]:
result_rvs5={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test)       
}

#성능비교 결과

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs5.items():
    print(f'{method}:{accuracy:.4f}')

Model 임베딩별 성능 결과
one-hot:0.8047
count:0.8042
tfidf:0.8159
Word2Vec:0.8047


In [30]:
ensemble_model = VotingClassifier(estimators=[
    ('lr',log_rg),
    ('rf',rf_clf),
    ('xgb',xgb_clf)
],voting='hard')

ensemble_model.fit(X_train_w2v,y_train)
y_pred_ensemble = ensemble_model.predict(X_test_w2v)
ensemble_accuracy=accuracy_score(y_test, y_pred_ensemble)
#성능비교 결과

result_rvs6={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test),
    'Ensemble': ensemble_accuracy
}

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs6.items():
    print(f'{method}:{accuracy:.4f}')

Parameters: { "use_label_encoder" } are not used.



Model 임베딩별 성능 결과
one-hot:0.8047
count:0.8042
tfidf:0.8159
Word2Vec:0.8047
Ensemble:0.8068


In [92]:
# 데이터 로드
df = pd.read_csv("movie_sample.csv", encoding="utf-8")

def clean_text(text):
    okt = Okt()
    
    # 특수문자, 숫자 제거, 공백 정리
    text = re.sub(r"[^가-힣\s]", "", str(text))  
    text = re.sub(r"\d+", "", text)  
    text = re.sub(r"\s+", " ", text).strip()  
    
    # 형태소 분석 (어간 추출 적용)
    tokens = okt.morphs(text, stem=True)  
    
    # 불용어 제거 
    stopwords = set([
        '은', '는', '이', '가', '을', '를', '의', '에', '과', '와', '도', '한',
        '에서', '부터', '까지', '하고', '보다', '인데', '때문', '였다', '다면', '처럼'
    ])
    tokens = [word for word in tokens if word not in stopwords]
    
    return " ".join(tokens)

# 데이터 적용
df["document"] = df["document"].fillna("").apply(clean_text)

# 학습, 검증 데이터 분리s
X_train, X_test, y_train, y_test = train_test_split(df["document"], df["label"], test_size=0.2, random_state=42)

In [36]:
X_train_tokens = [sentence.split() for sentence in X_train]
X_test_tokens = [sentence.split() for sentence in X_test]

w2v_model_sg = Word2Vec(sentences = X_train_tokens, vector_size = 300, window=10, min_count=5, sg=1)
w2v_model_sg.train(X_train_tokens, total_examples = len(X_train_tokens), epochs=30)

def sentence_vector(tokens, model, vector_size= 300):
    vectors =[model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis =0) if vectors else np.zeros(vector_size)

X_train_w2v =np.array([sentence_vector(tokens, w2v_model_sg) for tokens in X_train_tokens])
X_test_w2v =np.array([sentence_vector(tokens, w2v_model_sg) for tokens in X_test_tokens])

In [37]:
result_rvs7={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test)       
}

#성능비교 결과

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs7.items():
    print(f'{method}:{accuracy:.4f}')

Model 임베딩별 성능 결과
one-hot:0.8047
count:0.8042
tfidf:0.8159
Word2Vec:0.8047


In [38]:
ensemble_model = VotingClassifier(estimators=[
    ('lr',log_rg),
    ('rf',rf_clf),
    ('xgb',xgb_clf)
],voting='hard')

ensemble_model.fit(X_train_w2v,y_train)
y_pred_ensemble = ensemble_model.predict(X_test_w2v)
ensemble_accuracy=accuracy_score(y_test, y_pred_ensemble)
#성능비교 결과

result_rvs8={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test),
    'Ensemble': ensemble_accuracy
}

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs8.items():
    print(f'{method}:{accuracy:.4f}')

Parameters: { "use_label_encoder" } are not used.



Model 임베딩별 성능 결과
one-hot:0.8047
count:0.8042
tfidf:0.8159
Word2Vec:0.8047
Ensemble:0.8079


In [39]:
X_train_tokens = [sentence.split() for sentence in X_train]
X_test_tokens = [sentence.split() for sentence in X_test]

w2v_model_sg = Word2Vec(sentences = X_train_tokens, vector_size = 300, window=10, min_count=5, sg=1)
w2v_model_sg.train(X_train_tokens, total_examples = len(X_train_tokens), epochs=30)

def sentence_vector(tokens, model, vector_size= 300):
    vectors =[model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis =0) if vectors else np.zeros(vector_size)

X_train_w2v =np.array([sentence_vector(tokens, w2v_model_sg) for tokens in X_train_tokens])
X_test_w2v =np.array([sentence_vector(tokens, w2v_model_sg) for tokens in X_test_tokens])

In [40]:
result_rvs8={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test)       
}

#성능비교 결과

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs8.items():
    print(f'{method}:{accuracy:.4f}')

Model 임베딩별 성능 결과
one-hot:0.8047
count:0.8042
tfidf:0.8159
Word2Vec:0.8056


In [42]:
log_rg = LogisticRegression(C=5.0, max_iter=1000)
rf_clf = RandomForestClassifier(n_estimators=300, max_depth=20, random_state=1)
xgb_clf = XGBClassifier(learning_rate=0.05, n_estimators=500, max_depth=6, random_state=1)

# 최적화된 앙상블 모델
ensemble_model = VotingClassifier(estimators=[
    ('lr', log_rg),
    ('rf', rf_clf),
    ('xgb', xgb_clf)
], voting='hard')

# 학습 및 평가
ensemble_model.fit(X_train_w2v, y_train)
y_pred_ensemble = ensemble_model.predict(X_test_w2v)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)

result_rvs9={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test),
    'Ensemble': ensemble_accuracy
}

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs9.items():
    print(f'{method}:{accuracy:.4f}')

Model 임베딩별 성능 결과
one-hot:0.8047
count:0.8042
tfidf:0.8159
Word2Vec:0.8056
Ensemble:0.8113


In [93]:
X_train_tokens = [sentence.split() for sentence in X_train]
X_test_tokens = [sentence.split() for sentence in X_test]

w2v_model_sg = Word2Vec(sentences = X_train_tokens, vector_size = 500, window=20, min_count=3, sg=1, negative=15, sample=0.0001, epochs=100)
w2v_model_sg.train(X_train_tokens, total_examples = len(X_train_tokens), epochs=100)

def sentence_vector(tokens, model, vector_size= 500):
    vectors =[model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis =0) if vectors else np.zeros(vector_size)

X_train_w2v =np.array([sentence_vector(tokens, w2v_model_sg) for tokens in X_train_tokens])
X_test_w2v =np.array([sentence_vector(tokens, w2v_model_sg) for tokens in X_test_tokens])

In [94]:
result_rvs10={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test)       
}

#성능 비교 결과

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs10.items():
    print(f'{method}:{accuracy:.4f}')

Model 임베딩별 성능 결과
one-hot:0.7653
count:0.7662
tfidf:0.7675
Word2Vec:0.8097


In [95]:
log_rg = LogisticRegression(C=10.0, max_iter=1000)
rf_clf = RandomForestClassifier(n_estimators=500, max_depth=None, random_state=1)
xgb_clf = XGBClassifier(learning_rate=0.03, n_estimators=700, max_depth=8, random_state=1)

# 최적화된 앙상블 모델
ensemble_model = VotingClassifier(estimators=[
    ('lr', log_rg),
    ('rf', rf_clf),
    ('xgb', xgb_clf)
], voting='hard')

ensemble_model.fit(X_train_w2v, y_train)
y_pred_ensemble = ensemble_model.predict(X_test_w2v)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)

result_rvs11={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test),
    'Ensemble': ensemble_accuracy
}

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs11.items():
    print(f'{method}:{accuracy:.4f}')

Model 임베딩별 성능 결과
one-hot:0.7653
count:0.7662
tfidf:0.7675
Word2Vec:0.8097
Ensemble:0.8150


In [96]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# 최적화된 개별 모델
log_rg = LogisticRegression(C=10.0, max_iter=1000)
rf_clf = RandomForestClassifier(n_estimators=500, max_depth=6, n_jobs=-1, random_state=1)
xgb_clf = XGBClassifier(learning_rate=0.05, n_estimators=500, max_depth=6, n_jobs=-1, random_state=1)
lgb_clf = LGBMClassifier(learning_rate=0.05, n_estimators=500, max_depth=6, n_jobs=-1, random_state=1)

ensemble_model = VotingClassifier(estimators=[
    ('lr', log_rg),
    ('rf', rf_clf),
    ('xgb', xgb_clf),
    ('lgb', lgb_clf),
], voting='soft')

# 학습 및 평가
ensemble_model.fit(X_train_w2v, y_train)
y_pred_ensemble = ensemble_model.predict(X_test_w2v)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)

result_rvs12={
    'one-hot' : train_eval_model(X_train_onehot, X_test_onehot, y_train, y_test),
    'count' : train_eval_model(X_train_count, X_test_count, y_train, y_test),
    'tfidf' : train_eval_model(X_train_tfidf, X_test_tfidf, y_train, y_test),    
    'Word2Vec' : train_eval_model(X_train_w2v, X_test_w2v, y_train, y_test),
    'Ensemble': ensemble_accuracy
}

print('Model 임베딩별 성능 결과')
for method,accuracy in result_rvs12.items():
    print(f'{method}:{accuracy:.4f}')


[LightGBM] [Info] Number of positive: 11908, number of negative: 12092
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496167 -> initscore=-0.015334
[LightGBM] [Info] Start training from score -0.015334
Model 임베딩별 성능 결과
one-hot:0.7653
count:0.7662
tfidf:0.7675
Word2Vec:0.8097
Ensemble:0.8200
