In [4]:
# algorithm: naive bayes sentiment classifier + TD / TF-IDF
# data: imdb movie reviews

import sys
import os
import math
import re
import json
import random
import numpy as np
from collections import Counter
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import json

# Load the IMDB movie-review dataset.
imdb = load_dataset('imdb')

# Build a 2000-review subset from the first 1000 and the last 1000 rows of the
# train split, concatenated column by column.
# NOTE(review): intended as 1000 reviews per sentiment class, which assumes the
# train split is ordered by label -- confirm against the dataset.
first_rows = imdb['train'][:1000]
last_rows = imdb["train"][-1000:]
imdb_select = {
    column: first_rows[column] + last_rows[column] for column in last_rows
}

# Random seeds and hyperparameter grids for the repeated experiments.
seeds = [10, 20, 30, 40, 50]
alphas = [0.1, 0.2, 0.3]
gammas = [0.8, 0.9, 1.0]

# One list per result column; entry i of every list describes the i-th run.
result_dict = {
    "설정": [],
    "count": [],
    "tfidf": [],
    "RL": [],
}
# Repeated experiment: for every (seed, alpha, gamma) configuration record the
# test accuracy of three Multinomial Naive Bayes models -- raw term counts,
# TF-IDF features, and term counts re-weighted by TD-style learned weights.
for seed in seeds:
    # The split and both baseline models depend only on the seed, so they are
    # built once per seed instead of once per (alpha, gamma) pair.
    train_text, test_text, train_label, test_label = train_test_split(
        imdb_select["text"],
        imdb_select["label"],
        test_size=0.2,
        train_size=0.8,
        shuffle=True,
        random_state=seed,
    )

    # Baseline 1: Naive Bayes on the document-term count matrix.
    dtmvector = CountVectorizer()
    dtm = dtmvector.fit_transform(train_text)
    dtm_test = dtmvector.transform(test_text)
    print(dtm.shape)

    model = MultinomialNB()
    model.fit(dtm, train_label)
    count_accuracy = accuracy_score(test_label, model.predict(dtm_test))
    print("정확도:", count_accuracy)

    # Baseline 2: Naive Bayes on TF-IDF features.
    tfidf_vector = TfidfVectorizer()
    tfidfv = tfidf_vector.fit_transform(train_text)
    tfidfv_test = tfidf_vector.transform(test_text)
    print(tfidfv.shape)

    model = MultinomialNB()
    model.fit(tfidfv, train_label)
    tfidf_accuracy = accuracy_score(test_label, model.predict(tfidfv_test))
    print("정확도:", tfidf_accuracy)

    num_episodes = 1000  # number of passes over the training documents
    num_timesteps = dtm.shape[1]  # one weight per vocabulary term

    for alpha in alphas:
        for gamma in gammas:
            print(f"Running experiment with seed={seed}, alpha={alpha}, gamma={gamma}")

            # Initial weights are |N(0, 1)| draws. They are drawn from a
            # generator seeded with the experiment seed so that each
            # configuration is reproducible and all (alpha, gamma) pairs of
            # one seed start from the same weights.
            rng = np.random.default_rng(seed)
            weights = np.abs(rng.normal(size=num_timesteps))

            for _ in tqdm(range(num_episodes)):
                # The last training document is never used for an update
                # (the original loop skipped index len(train_text) - 1).
                for i in range(len(train_text) - 1):
                    state = dtm.getrow(i).toarray().flatten()
                    # NOTE(review): both the TD target and the current
                    # estimate are computed from the same state, so the update
                    # reduces to alpha * (reward + (gamma - 1) * value).
                    # A TD target would normally use the next state -- confirm
                    # whether row i + 1 was intended here.
                    value = state * weights
                    reward = 1 if train_label[i] == 1 else -1
                    weights += alpha * (reward + gamma * value - value)

                    # Clip negatives: MultinomialNB needs non-negative features.
                    weights[weights < 0] = 0

            # Scale every vocabulary column by its learned weight. multiply()
            # broadcasts the 1-D weight vector across rows and yields a float
            # matrix; assigning column by column into the integer count matrix
            # would cast the products back to the count dtype and is very slow.
            weighted_dtm = dtm.multiply(weights).tocsr()
            weighted_dtm_test = dtm_test.multiply(weights).tocsr()

            # Naive Bayes on the re-weighted counts.
            model = MultinomialNB()
            model.fit(weighted_dtm, train_label)
            predicted = model.predict(weighted_dtm_test)
            rl_accuracy = accuracy_score(test_label, predicted)
            print("Counter Vectorizer 정확도:", rl_accuracy)

            # Append all four columns together so the lists stay aligned even
            # if a later configuration fails.
            result_dict["설정"].append(f"seed={seed}, alpha={alpha}, gamma={gamma}")
            result_dict["count"].append(count_accuracy)
            result_dict["tfidf"].append(tfidf_accuracy)
            result_dict["RL"].append(rl_accuracy)

            # Rewrite the results file after every configuration so partial
            # results survive an interrupted run.
            with open("./result_rl.json", "w", encoding="utf-8") as f:
                json.dump(result_dict, f, indent=4, sort_keys=True, ensure_ascii=False)